## Data Preprocessing

In [1]:
import torch
print(torch.cuda.is_available())  # Should print True
print(torch.cuda.get_device_name(0))  # Shows the GPU name

True
Tesla T4


### Imports

In [2]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('punkt')
    nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Text Preprocessor Class

In [3]:
class TextPreprocessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def clean_text(self, text):
        """Comprehensive text cleaning for product search"""
        if pd.isna(text):
            return ""

        text = str(text).lower()

        # Remove special characters but keep important ones for products
        text = re.sub(r'[^\w\s\-\.]', ' ', text)

        # Handle measurements and units (keep them meaningful)
        text = re.sub(r'(\d+)\s*-\s*(\w+)', r'\1\2', text)  # "12-gauge" -> "12gauge"
        text = re.sub(r'(\d+)\s*(\w+)', r'\1\2', text)      # "1 gal" -> "1gal"

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def extract_features(self, text):
        """Extract meaningful features from product text"""
        if pd.isna(text):
            return ""

        text = str(text).lower()
        features = []

        # Extract brand information (first few words often contain brand)
        words = text.split()
        if len(words) > 0:
            potential_brand = words[0]
            features.append(f"brand_{potential_brand}")

        # Extract measurements
        measurements = re.findall(r'\d+(?:\.\d+)?(?:inch|in|ft|gal|gauge|lb)', text)
        features.extend([f"measure_{m}" for m in measurements])

        colors = re.findall(r'\b(?:black|white|brown|gray|grey|red|blue|green|yellow|silver|gold)\b', text)
        features.extend([f"color_{c}" for c in colors])

        return " ".join(features)

    def remove_stopwords_and_stem(self, text):
        """Remove stopwords and apply stemming"""
        if pd.isna(text):
            return ""

        words = text.split()
        important_words = {'with', 'for', 'in', 'on', 'over', 'under'}
        filtered_words = []

        for word in words:
            if word not in self.stop_words or word in important_words:
                stemmed = self.stemmer.stem(word)
                filtered_words.append(stemmed)

        return " ".join(filtered_words)


### Data Preprocessor Class

In [8]:
class DataPreprocessor:
    def __init__(self, csv_path, name):
        self.df = pd.read_csv(csv_path)
        self.name = name

    def __remove_duplicates_and_nulls(self):
        print(f"Before removing duplicates and nulls: {self.df.shape}")
        self.df = self.df.drop_duplicates()
        required_columns = ['product_title', 'search_term']
        if 'relevance' in self.df.columns:
            required_columns.append('relevance')
        self.df = self.df.dropna(subset=required_columns)
        print(f"After removing duplicates and nulls: {self.df.shape}")

    def __preprocess_text(self):
        text_preprossor = TextPreprocessor()
        self.df['search_term_clean'] = self.df['search_term'].apply(text_preprossor.clean_text)
        self.df['product_title_clean'] = self.df['product_title'].apply(text_preprossor.clean_text)

        self.df['search_term_features'] = self.df['search_term'].apply(text_preprossor.extract_features)
        self.df['product_title_features'] = self.df['product_title'].apply(text_preprossor.extract_features)

        self.df['search_term'] = self.df['search_term_clean'] + " " + self.df['search_term_features']
        self.df['product_title'] = self.df['product_title_clean'] + " " + self.df['product_title_features']

        self.df['search_term'] = self.df['search_term'].apply(text_preprossor.remove_stopwords_and_stem)
        self.df['product_title'] = self.df['product_title'].apply(text_preprossor.remove_stopwords_and_stem)

    def __remove_outliers(self):
        Q1 = self.df['relevance'].quantile(0.25)
        Q3 = self.df['relevance'].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        self.df['relevance'] = self.df['relevance'].clip(lower_bound, upper_bound)

        outliers_count = len(self.df[(self.df['relevance'] < lower_bound) | (self.df['relevance'] > upper_bound)])
        print(f"Number of outliers: {outliers_count}")

    def __remove_unnecessary_columns(self):
        self.df = self.df.drop(
            columns=[
                'search_term_clean',
                'product_title_clean',
                'search_term_features',
                'product_title_features',
                'product_uid',
                'id'
            ]
        )

    def __normalize_relevance(self):
        scaler = MinMaxScaler()
        self.df['relevance'] = scaler.fit_transform(self.df[['relevance']])

    def set_df_and_name(self, csv_path, name):
        self.df = pd.read_csv(csv_path)
        self.name = name

    def get_df(self):
        return self.df

    def preprocess(self):
        self.__remove_duplicates_and_nulls()
        self.__preprocess_text()
        self.__remove_unnecessary_columns()
        if self.name == 'train':
            self.__remove_outliers()
            self.__normalize_relevance()
        print(f"Final {self.name} dataset shape: {self.df.shape}")
        print(self.df.head())
        print("--" * 80)

### Preprocessing Steps

In [9]:
data_preprocessor = DataPreprocessor('csv/clean/train.csv', 'train')
data_preprocessor.preprocess()
train_df = data_preprocessor.get_df()

data_preprocessor.set_df_and_name('csv/clean/test.csv', 'test')
data_preprocessor.preprocess()
test_df = data_preprocessor.get_df()

Before removing duplicates and nulls: (74067, 5)
After removing duplicates and nulls: (74067, 5)
Number of outliers: 0
Final train dataset shape: (74067, 3)
                                       product_title  \
0        simpson strong-ti 12gaug angl brand_simpson   
1        simpson strong-ti 12gaug angl brand_simpson   
2  behr premium textur deckov 1gal. sc-141tugboat...   
3  delta vero 1handl shower faucet trim kit in ch...   
4  delta vero 1handl shower faucet trim kit in ch...   

                   search_term  relevance  
0      angl bracket brand_angl      1.000  
1            l bracket brand_l      0.750  
2         deck over brand_deck      1.000  
3  rain shower head brand_rain      0.665  
4     shower faucet brand_show      0.835  
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Before removing duplicates and nulls: (166691, 4)
After removing duplicates and n

## Model Training

In [10]:
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import os
os.environ["WANDB_DISABLED"] = "true"

In [11]:
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

train_examples = [
    InputExample(texts=[s, p], label=float(r))
    for s, p, r in zip(train_df['search_term'], train_df['product_title'], train_df['relevance'])
]

val_examples = [
    InputExample(texts=[s, p], label=float(r))
    for s, p, r in zip(val_df['search_term'], val_df['product_title'], val_df['relevance'])
]

print("Train examples size: ", len(train_examples))
print("Val examples size: ", len(val_examples))

Train examples size:  59253
Val examples size:  14814


In [12]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
val_dataloader = DataLoader(val_examples, shuffle=False, batch_size=16)

val_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_examples, name='val')

model = SentenceTransformer('all-MiniLM-L6-v2')
train_loss = losses.CosineSimilarityLoss(model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=val_evaluator,
    epochs=3,
    warmup_steps=100,
    evaluation_steps=100,
    show_progress_bar=True,
    output_path='fine_tuned_model'
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Val Pearson Cosine,Val Spearman Cosine
100,No log,No log,0.385437,0.371214
200,No log,No log,0.423255,0.414559
300,No log,No log,0.436013,0.425924
400,No log,No log,0.443393,0.433508
500,0.062400,No log,0.453244,0.444931
600,0.062400,No log,0.453867,0.445294
700,0.062400,No log,0.459225,0.451316
800,0.062400,No log,0.460117,0.451509
900,0.062400,No log,0.464365,0.456068
1000,0.058300,No log,0.461539,0.454369


## Model Testing

### Using test dataframe

In [13]:
model = SentenceTransformer('fine_tuned_model')

def predict_relevance_scores(search_terms, product_titles, batch_size=32):
    """
    Predict relevance scores between search terms and product titles
    Returns scores between 0 and 1 (higher = more relevant)
    """
    relevance_scores = []

    for i in range(0, len(search_terms), batch_size):
        batch_search = search_terms[i:i+batch_size]
        batch_products = product_titles[i:i+batch_size]

        search_embeddings = model.encode(batch_search, convert_to_tensor=True)
        product_embeddings = model.encode(batch_products, convert_to_tensor=True)

        similarities = util.cos_sim(search_embeddings, product_embeddings)

        batch_scores = [similarities[j][j].item() for j in range(len(batch_search))]
        relevance_scores.extend(batch_scores)

    return relevance_scores

print("Predicting relevance scores...")
predicted_scores = predict_relevance_scores(
    test_df['search_term'].tolist(),
    test_df['product_title'].tolist()
)

test_df['predicted_relevance'] = predicted_scores

print(f"\\nPredicted relevance scores (sample):")
print(test_df[['search_term', 'product_title', 'predicted_relevance']].head(10))

print(f"\\nScore statistics:")
print(f"Mean: {np.mean(predicted_scores):.4f}")
print(f"Std: {np.std(predicted_scores):.4f}")
print(f"Min: {np.min(predicted_scores):.4f}")
print(f"Max: {np.max(predicted_scores):.4f}")


Predicting relevance scores...
\nPredicted relevance scores (sample):
                               search_term  \
0                 90degre bracket brand_90   
1                metal l bracket brand_met   
2            simpson sku abl brand_simpson   
3         simpson strong tie brand_simpson   
4  simpson strong tie hcc668 brand_simpson   
5                wood connector brand_wood   
6               bath shower kit brand_bath   
7                bath drain kit brand_bath   
8             one piec tub shower brand_on   
9                  solar panel brand_solar   

                                       product_title  predicted_relevance  
0        simpson strong-ti 12gaug angl brand_simpson             0.626305  
1        simpson strong-ti 12gaug angl brand_simpson             0.633799  
2        simpson strong-ti 12gaug angl brand_simpson             0.645359  
3        simpson strong-ti 12gaug angl brand_simpson             0.640294  
4        simpson strong-ti 12gaug angl bran

In [14]:
def analyze_predictions(df, top_n=10):
    """
    Analyze the highest and lowest predicted relevance scores
    """
    df_sorted = df.sort_values('predicted_relevance', ascending=False)

    print("=" * 100)
    print(f"TOP {top_n} HIGHEST PREDICTED RELEVANCE SCORES")
    print("=" * 100)

    for i, (_, row) in enumerate(df_sorted.head(top_n).iterrows()):
        print(f"{i+1}. Score: {row['predicted_relevance']:.4f}")
        print(f"   Search: {row['search_term']}")
        print(f"   Product: {row['product_title']}")
        print("-" * 80)

    print("\\n" + "=" * 100)
    print(f"TOP {top_n} LOWEST PREDICTED RELEVANCE SCORES")
    print("=" * 100)

    for i, (_, row) in enumerate(df_sorted.tail(top_n).iterrows()):
        print(f"{i+1}. Score: {row['predicted_relevance']:.4f}")
        print(f"   Search: {row['search_term']}")
        print(f"   Product: {row['product_title']}")
        print("-" * 80)

analyze_predictions(test_df, top_n=5)


TOP 5 HIGHEST PREDICTED RELEVANCE SCORES
1. Score: 1.0000
   Search: emerald green arborvita brand_emerald color_green
   Product: emerald green arborvita brand_emerald color_green
--------------------------------------------------------------------------------
2. Score: 1.0000
   Search: brigg stratton carburetor brand_brigg
   Product: brigg stratton carburetor brand_brigg
--------------------------------------------------------------------------------
3. Score: 1.0000
   Search: electr box extend brand_electr
   Product: electr box extend brand_electr
--------------------------------------------------------------------------------
4. Score: 1.0000
   Search: emerald green arborvita brand_emerald color_green
   Product: emerald green arborvita brand_emerald color_green
--------------------------------------------------------------------------------
5. Score: 1.0000
   Search: samsung refriger water filter brand_samsung
   Product: samsung refriger water filter brand_samsung
---------

In [16]:
output_df = test_df[['product_title', 'search_term', 'predicted_relevance']].copy()

output_df = output_df.sort_values('predicted_relevance', ascending=False)

output_df.to_csv('test_predictions.csv', index=False)
print(f"✅ Saved predictions to 'test_predictions.csv'")
print(f"Total predictions: {len(output_df)}")

print(f"\\n📊 PREDICTION SUMMARY:")
print(f"Mean relevance score: {output_df['predicted_relevance'].mean():.4f}")
print(f"Median relevance score: {output_df['predicted_relevance'].median():.4f}")
print(f"Standard deviation: {output_df['predicted_relevance'].std():.4f}")

print(f"\\n📈 SCORE DISTRIBUTION:")
score_ranges = [
    (0.0, 0.2, "Very Low"),
    (0.2, 0.4, "Low"),
    (0.4, 0.6, "Medium"),
    (0.6, 0.8, "High"),
    (0.8, 1.0, "Very High")
]

for min_score, max_score, label in score_ranges:
    count = len(output_df[(output_df['predicted_relevance'] >= min_score) &
                         (output_df['predicted_relevance'] < max_score)])
    percentage = (count / len(output_df)) * 100
    print(f"{label} ({min_score}-{max_score}): {count} pairs ({percentage:.1f}%)")

print(f"\\n🎯 Top 5 most relevant search-product pairs:")
print(output_df[['search_term', 'product_title', 'predicted_relevance']].head())


✅ Saved predictions to 'test_predictions.csv'
Total predictions: 166691
\n📊 PREDICTION SUMMARY:
Mean relevance score: 0.7077
Median relevance score: 0.7257
Standard deviation: 0.1398
\n📈 SCORE DISTRIBUTION:
Very Low (0.0-0.2): 89 pairs (0.1%)
Low (0.2-0.4): 3966 pairs (2.4%)
Medium (0.4-0.6): 33265 pairs (20.0%)
High (0.6-0.8): 79129 pairs (47.5%)
Very High (0.8-1.0): 50221 pairs (30.1%)
\n🎯 Top 5 most relevant search-product pairs:
                                              search_term  \
110907  emerald green arborvita brand_emerald color_green   
72964               brigg stratton carburetor brand_brigg   
106252                     electr box extend brand_electr   
110906  emerald green arborvita brand_emerald color_green   
134128        samsung refriger water filter brand_samsung   

                                            product_title  predicted_relevance  
110907  emerald green arborvita brand_emerald color_green                  1.0  
72964               brigg stratton

### Manual Testing

In [19]:
def semantic_search_test_data(query, top_k=5):
    """
    Search for most relevant products in test data for a given query
    """
    text_preprocessor = TextPreprocessor()
    query_clean = text_preprocessor.clean_text(query)
    query_features = text_preprocessor.extract_features(query)
    query_processed = text_preprocessor.remove_stopwords_and_stem(query_clean + " " + query_features)

    unique_products = test_df['product_title'].unique()

    query_embedding = model.encode(query_processed, convert_to_tensor=True)
    product_embeddings = model.encode(unique_products, convert_to_tensor=True)

    hits = util.semantic_search(query_embedding, product_embeddings, top_k=top_k)[0]

    print(f"\\nTop {top_k} products for query: '{query}'")
    print("-" * 80)
    for i, hit in enumerate(hits):
        product_idx = hit['corpus_id']
        score = hit['score']
        product = unique_products[product_idx]

        original_product = test_df[test_df['product_title'] == product]['product_title'].iloc[0]

        print(f"{i+1}. Score: {score:.4f}")
        print(f"   Product: {original_product}")
        print()

test_queries = [
    "samsung washer",
    "concrete mix",
    "led lights",
    "fence post",
    "shower faucet"
]

for query in test_queries:
    semantic_search_test_data(query, top_k=3)

\nTop 3 products for query: 'samsung washer'
--------------------------------------------------------------------------------
1. Score: 0.9214
   Product: samsung 4.2cu. ft. front load washer with steam in platinum energi star brand_samsung

2. Score: 0.9206
   Product: samsung 4.8cu. ft. high-effici top load washer with activewash in white energi star brand_samsung color_whit

3. Score: 0.9205
   Product: samsung 4.2cu. ft. front load washer with steam in white energi star brand_samsung color_whit

\nTop 3 products for query: 'concrete mix'
--------------------------------------------------------------------------------
1. Score: 0.9024
   Product: quikret 50lb. fast-set concret mix brand_quikret

2. Score: 0.8991
   Product: quikret 80lb. concret mix brand_quikret

3. Score: 0.8825
   Product: rapid set 60lb. concret mix brand_rapid

\nTop 3 products for query: 'led lights'
--------------------------------------------------------------------------------
1. Score: 0.9345
   Product: l

## Saving model from colab to local

In [20]:
!zip -r fine_tuned_model.zip fine_tuned_model

  adding: fine_tuned_model/ (stored 0%)
  adding: fine_tuned_model/tokenizer.json (deflated 71%)
  adding: fine_tuned_model/README.md (deflated 69%)
  adding: fine_tuned_model/special_tokens_map.json (deflated 80%)
  adding: fine_tuned_model/tokenizer_config.json (deflated 73%)
  adding: fine_tuned_model/1_Pooling/ (stored 0%)
  adding: fine_tuned_model/1_Pooling/config.json (deflated 57%)
  adding: fine_tuned_model/config_sentence_transformers.json (deflated 34%)
  adding: fine_tuned_model/sentence_bert_config.json (deflated 4%)
  adding: fine_tuned_model/eval/ (stored 0%)
  adding: fine_tuned_model/eval/similarity_evaluation_val_results.csv (deflated 32%)
  adding: fine_tuned_model/model.safetensors (deflated 8%)
  adding: fine_tuned_model/modules.json (deflated 62%)
  adding: fine_tuned_model/vocab.txt (deflated 53%)
  adding: fine_tuned_model/config.json (deflated 48%)
  adding: fine_tuned_model/2_Normalize/ (stored 0%)


In [22]:
from google.colab import files
files.download('fine_tuned_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>