## Data Preprocessing

In [None]:
import torch
print(torch.cuda.is_available())  # Should print True
print(torch.cuda.get_device_name(0))  # Shows the GPU name

True
Tesla T4


### Imports

In [None]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('punkt')
    nltk.download('stopwords')

### Preprocessor Class

In [None]:
class Preprocessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def clean_text(self, text):
        """Comprehensive text cleaning for product search"""
        if pd.isna(text):
            return ""

        text = str(text).lower()

        # Remove special characters but keep important ones for products
        text = re.sub(r'[^\w\s\-\.]', ' ', text)

        # Handle measurements and units (keep them meaningful)
        text = re.sub(r'(\d+)\s*-\s*(\w+)', r'\1\2', text)  # "12-gauge" -> "12gauge"
        text = re.sub(r'(\d+)\s*(\w+)', r'\1\2', text)      # "1 gal" -> "1gal"

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def extract_features(self, text):
        """Extract meaningful features from product text"""
        if pd.isna(text):
            return ""

        text = str(text).lower()
        features = []

        # Extract brand information (first few words often contain brand)
        words = text.split()
        if len(words) > 0:
            potential_brand = words[0]
            features.append(f"brand_{potential_brand}")

        # Extract measurements
        measurements = re.findall(r'\d+(?:\.\d+)?(?:inch|in|ft|gal|gauge|lb)', text)
        features.extend([f"measure_{m}" for m in measurements])

        colors = re.findall(r'\b(?:black|white|brown|gray|grey|red|blue|green|yellow|silver|gold)\b', text)
        features.extend([f"color_{c}" for c in colors])

        return " ".join(features)

    def remove_stopwords_and_stem(self, text):
        """Remove stopwords and apply stemming"""
        if pd.isna(text):
            return ""

        words = text.split()
        important_words = {'with', 'for', 'in', 'on', 'over', 'under'}
        filtered_words = []

        for word in words:
            if word not in self.stop_words or word in important_words:
                stemmed = self.stemmer.stem(word)
                filtered_words.append(stemmed)

        return " ".join(filtered_words)


### Preprocessing Steps

In [None]:
df = pd.read_csv('csv/small/train.csv')

df = df.drop_duplicates()
df = df.dropna(subset=['product_title', 'search_term', 'relevance'])
print(f"After removing duplicates and nulls: {df.shape}")

preprocessor = Preprocessor()
df['search_term_clean'] = df['search_term'].apply(preprocessor.clean_text)
df['product_title_clean'] = df['product_title'].apply(preprocessor.clean_text)

df['search_term_features'] = df['search_term'].apply(preprocessor.extract_features)
df['product_title_features'] = df['product_title'].apply(preprocessor.extract_features)

df['search_term'] = df['search_term_clean'] + " " + df['search_term_features']
df['product_title'] = df['product_title_clean'] + " " + df['product_title_features']
df = df.drop(columns=['search_term_clean', 'product_title_clean', 'search_term_features', 'product_title_features', 'product_uid', 'id'])

df['search_term'] = df['search_term'].apply(preprocessor.remove_stopwords_and_stem)
df['product_title'] = df['product_title'].apply(preprocessor.remove_stopwords_and_stem)

print(f"Final dataset shape: {df.shape}")
print(df.head())

After removing duplicates and nulls: (5000, 5)
Final dataset shape: (5000, 3)
                                       product_title  \
0        simpson strong-ti 12gaug angl brand_simpson   
1        simpson strong-ti 12gaug angl brand_simpson   
2  behr premium textur deckov 1gal. sc-141tugboat...   
3  delta vero 1handl shower faucet trim kit in ch...   
4  delta vero 1handl shower faucet trim kit in ch...   

                   search_term  relevance  
0      angl bracket brand_angl       3.00  
1            l bracket brand_l       2.50  
2         deck over brand_deck       3.00  
3  rain shower head brand_rain       2.33  
4     shower faucet brand_show       2.67  


In [None]:
Q1 = df['relevance'].quantile(0.25)
Q3 = df['relevance'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(f"Outlier bounds: [{lower_bound:.2f}, {upper_bound:.2f}]")
outliers_count = len(df[(df['relevance'] < lower_bound) | (df['relevance'] > upper_bound)])
print(f"Number of outliers: {outliers_count}")

df['relevance'] = df['relevance'].clip(lower_bound, upper_bound)

scaler = MinMaxScaler()
df['relevance'] = scaler.fit_transform(df[['relevance']])

print(df.head())

Outlier bounds: [-0.25, 1.75]
Number of outliers: 0
                                       product_title  \
0        simpson strong-ti 12gaug angl brand_simpson   
1        simpson strong-ti 12gaug angl brand_simpson   
2  behr premium textur deckov 1gal. sc-141tugboat...   
3  delta vero 1handl shower faucet trim kit in ch...   
4  delta vero 1handl shower faucet trim kit in ch...   

                   search_term  relevance  
0      angl bracket brand_angl      1.000  
1            l bracket brand_l      0.750  
2         deck over brand_deck      1.000  
3  rain shower head brand_rain      0.665  
4     shower faucet brand_show      0.835  


## Model Training

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_examples = [
    InputExample(texts=[s, p], label=float(r))
    for s, p, r in zip(train_df['search_term'], train_df['product_title'], train_df['relevance'])
]

val_examples = [
    InputExample(texts=[s, p], label=float(r))
    for s, p, r in zip(val_df['search_term'], val_df['product_title'], val_df['relevance'])
]

print("Train examples size: ", len(train_examples))
print("Val examples size: ", len(val_examples))

Train examples size:  4000
Val examples size:  1000


In [None]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
val_dataloader = DataLoader(val_examples, shuffle=False, batch_size=16)

val_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_examples, name='val')

model = SentenceTransformer('all-MiniLM-L6-v2')
train_loss = losses.CosineSimilarityLoss(model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=val_evaluator,
    epochs=3,
    warmup_steps=100,
    evaluation_steps=100,
    show_progress_bar=True,
    output_path='fine_tuned_model'
)

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Val Pearson Cosine,Val Spearman Cosine
100,No log,No log,0.391603,0.375935
200,No log,No log,0.427199,0.412341
250,No log,No log,0.431074,0.416758
300,No log,No log,0.440604,0.426996
400,No log,No log,0.435584,0.421595
500,0.061000,No log,0.444919,0.436591
600,0.061000,No log,0.445679,0.437631
700,0.061000,No log,0.450858,0.442915
750,0.061000,No log,0.450748,0.442925


## Model Testing Manual

In [None]:
from sentence_transformers import SentenceTransformer, util

documents = df['product_title'].tolist()

model = SentenceTransformer('fine_tuned_model')

doc_embeddings = model.encode(documents, convert_to_tensor=True)

def semantic_search(query: str, top_k: int = 5):
    query_embedding = model.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, doc_embeddings, top_k=top_k)[0]

    print(f"\nTop {top_k} results for your query: \"{query}\"")
    for i, hit in enumerate(hits):
        doc = documents[hit['corpus_id']]
        print(f"{i+1}. {doc} (score: {hit['score']:.4f})")

user_query = "simpson"
semantic_search(user_query)


Top 5 results for your query: "simpson"
1. simpson strong-ti cb 6x6 galvan column base brand_simpson (score: 0.4448)
2. simpson strong-ti 2in. 12gaug pipe grip tie brand_simpson (score: 0.4354)
3. simpson strong-ti 2in. 12gaug pipe grip tie brand_simpson (score: 0.4354)
4. simpson strong-ti 2in. 12gaug pipe grip tie brand_simpson (score: 0.4354)
5. simpson strong-ti 12gaug black powder-co e-z base brand_simpson color_black (score: 0.4236)
