# Imports & Setup

In [1]:
!pip -q install torchsummary
!pip -q install transformers
!pip -q install tokenizers

In [3]:
# Standard library imports
import re
from collections import Counter
import os
from time import time

# Third-party library imports for data manipulation and analysis
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from tokenizers import ByteLevelBPETokenizer

# PyTorch and related library imports for deep learning
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset, Subset

import pytorch_lightning as pl

# Transformers library imports for NLP
from transformers import BertModel, BertTokenizer

# Gensim library imports for word embeddings and nltk for text processing
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize

# Matplotlib library imports for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Wandb library imports for experiment tracking
import wandb

# Other utility and helper imports
from torchsummary import summary
from tqdm import tqdm
from kaggle_secrets import UserSecretsClient

In [4]:
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb_key")
wandb.login(key=secret_value_0)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
class Hyperparameters:
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self, k, v)

hp = Hyperparameters(
    # setup
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    seed=2024,

    # data
    val_ratio=0.1,
    batch_size=128,
    max_len=512,
    num_workers=0,
    
    # model
    hidden_size=128,
    
    # training
    learning_rate=1e-3,
    max_lr=0.01,
    num_epochs=30,
    patience=3,
    
    criterion=nn.CrossEntropyLoss(),
    optimizer=optim.AdamW,
)

In [6]:
torch.multiprocessing.set_start_method('spawn')

# Data

In [7]:
root = '/kaggle/input/home-depot-product-search-relevance'

train_path = f'{root}/train.csv.zip'
test_path = f'{root}/test.csv.zip'
test_labels_path = f'{root}/sample_submission.csv.zip'
product_path = f'{root}/product_descriptions.csv.zip'
attributes_path = f'{root}/attributes.csv.zip'

In [8]:
train_set = pd.read_csv(train_path, encoding='ISO-8859-1')
test_set = pd.read_csv(test_path, encoding='ISO-8859-1')
test_labels =  pd.read_csv(test_labels_path)
product_df = pd.read_csv(product_path)
attributes_df = pd.read_csv(attributes_path)

test_set = pd.merge(test_set, test_labels, on='id')
test_set = test_set[test_set['relevance'] != -1]

# EDA

In [None]:
# Display a few rows of the loaded data
print("Training Data:")
print(train_set.head())

print("\nTesting Data:")
print(test_set.head())

print("\nAttributes Data:")
print(attributes_df.head())

print("\nDescriptions Data:")
print(product_df.head())

In [None]:
print(train_set.info())
print()
print(train_set.isna().sum())

In [None]:
print("training data shape is:", train_set.shape)
print("testing data shape is:", test_set.shape)
print("attribute data shape is:", attributes_df.shape)
print("description data shape is:", product_df.shape)

In [None]:
# Number of unique product
len(train_set['product_uid'].unique()) # 54667 rather than 74067

In [None]:
train_set['search_len'] = train_set['search_term'].apply(len)  # length of search term
train_set['product_len'] = train_set['product_title'].apply(len)  # length of pro

In [None]:
nltk.download('punkt')

# tokenize the words in the 'search_term' column
search_term_words = train_set['search_term'].apply(word_tokenize).tolist()
search_term_words = [word for sublist in search_term_words for word in sublist]

# calculate the frequency of each word
search_term_freq = Counter(search_term_words)

# print the most common words
search_term_freq.most_common(10)

In [None]:
train_set[['search_len', 'product_len', 'relevance']].corr()  # correlation between length

In [None]:
# Some statistic about relevance score
print(train_set['relevance'].value_counts())
train_set['relevance'].hist(bins=13)

In [None]:
# Explore distribution of relevance scores
plt.figure(figsize=(10, 6))
sns.histplot(train_set['relevance'], bins=30, kde=True)
plt.title('Distribution of Relevance Scores on Training Data')
plt.xlabel('Relevance Score')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Histogram for 'search_len'
plt.hist(train_set['search_len'], bins=20, color='skyblue', edgecolor='black')
plt.xlabel('Search Length')
plt.ylabel('Frequency')
plt.title('Histogram of Search Length')
plt.show()

# Histogram for 'product_len'
plt.hist(train_set['product_len'], bins=20, color='salmon', edgecolor='black')
plt.xlabel('Product Title Length')
plt.ylabel('Frequency')
plt.title('Histogram of Product Length')
plt.show()

In [None]:
# Explore the relationship between relevance and the length of the search term
plt.figure(figsize=(10, 6))
sns.scatterplot(x='search_len', y='relevance', data=train_set)
plt.title('Relevance vs. Search Term Length')
plt.xlabel('Search Term Length')
plt.ylabel('Relevance Score')
plt.show()

plt.figure(figsize=(10, 6))
sns.scatterplot(x='product_len', y='relevance', data=train_set)
plt.title('Relevance vs. Search Term Length')
plt.xlabel('Product Title Length')
plt.ylabel('Relevance Score')
plt.show()

In [None]:
# Calculate token lengths for each description
pd_token_len = pd.Series(map(lambda x: len(x.split()), product_df['product_description'].tolist()))

# Print value counts
print(pd_token_len.value_counts())

# Create histogram
pd_token_len.hist()

In [None]:
# 155 entries have no data
attributes_df[attributes_df['product_uid'].isnull()]

# Baseline Naive model

In [None]:
# Create features using CountVectorizer
vectorizer = CountVectorizer(analyzer='char')
X_product = vectorizer.fit_transform(train_set['product_title'])
X_search = vectorizer.transform(train_set['search_term'])

# Combine the features
X = X_product + X_search

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, train_set['relevance'], test_size=0.2, random_state=42)

# test set
X_test_product = vectorizer.transform(test_set['product_title'])
X_test_search = vectorizer.transform(test_set['search_term'])
X_test = X_test_product + X_test_search
y_test = test_set['relevance']

# # Train a Random Forest model
# t = time()

# regressor_rf = RandomForestRegressor(n_estimators=100, random_state=42)
# regressor_rf.fit(X_train, y_train)

# print(f"RF training time: {time() - t:.2f}s")

# Train an XGBoost model
t = time()

regressor_xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
regressor_xgb.fit(X_train, y_train)

print(f"XGB training time: {time() - t:.2f}s")


In [None]:
# Make predictions
# y_train_pred_rf = regressor_rf.predict(X_train)
# y_val_pred_rf = regressor_rf.predict(X_val)
# y_test_pred_rf = regressor_rf.predict(X_test)

y_train_pred_xgb = regressor_xgb.predict(X_train)
y_val_pred_xgb = regressor_xgb.predict(X_val)
y_test_pred_xgb = regressor_xgb.predict(X_test)

# Evaluate the models
# train_rmse_rf = np.sqrt(mean_squared_error(y_train, y_train_pred_rf))
# val_rmse_rf = np.sqrt(mean_squared_error(y_val, y_val_pred_rf))
# test_rmse_rf = np.sqrt(mean_squared_error(y_test, y_test_pred_rf))

# train_mae_rf = mean_absolute_error(y_train, y_train_pred_rf)
# val_mae_rf = mean_absolute_error(y_val, y_val_pred_rf)
# test_mae_rf = mean_absolute_error(y_test, y_test_pred_rf)

train_rmse_xgb = np.sqrt(mean_squared_error(y_train, y_train_pred_xgb))
val_rmse_xgb = np.sqrt(mean_squared_error(y_val, y_val_pred_xgb))
test_rmse_xgb = np.sqrt(mean_squared_error(y_test, y_test_pred_xgb))

train_mae_xgb = mean_absolute_error(y_train, y_train_pred_xgb)
val_mae_xgb = mean_absolute_error(y_val, y_val_pred_xgb)
test_mae_xgb = mean_absolute_error(y_test, y_test_pred_xgb)

# print('\nRandom Forest:')
# print(f'Training RMSE: {train_rmse_rf:.4f}')
# print(f'Validation RMSE: {val_rmse_rf:.4f}')
# print(f'Test RMSE: {test_rmse_rf:.4f}')
# print(f'Training MAE: {train_mae_rf:.4f}')
# print(f'Validation MAE: {val_mae_rf:.4f}')
# print(f'Test MAE: {test_mae_rf:.4f}')

print('\nXGBoost:')
print(f'Training RMSE: {train_rmse_xgb:.4f}')
print(f'Validation RMSE: {val_rmse_xgb:.4f}')
print(f'Test RMSE: {test_rmse_xgb:.4f}')
print(f'Training MAE: {train_mae_xgb:.4f}')
print(f'Validation MAE: {val_mae_xgb:.4f}')
print(f'Test MAE: {test_mae_xgb:.4f}')

# Character Level Model

## Data Pre-Processing

In [None]:
# Remove 155 null entries in attribute table
attributes_df = attributes_df.drop(attributes_df[attributes_df['product_uid'].isnull()].index, axis=0)

In [None]:
# Merge dataframes
train_data = pd.merge(train_set, product_df, how='left', on='product_uid')
test_data = pd.merge(test_set, product_df, how='left', on='product_uid')

In [None]:
train_data['search_term'] = train_data['search_term'].apply(list)
train_data['product_title'] = train_data['product_title'].apply(list)
train_data['product_description'] = train_data['product_description'].apply(list)
train_data.head(1)

In [None]:
# same for test data
test_data['search_term'] = test_data['search_term'].apply(list)
test_data['product_title'] = test_data['product_title'].apply(list)
test_data['product_description'] = test_data['product_description'].apply(list)
test_data.head(1)

In [None]:
# unique characters in the data
unique_chars = set()
for s in train_data['search_term']:
    unique_chars.update(s)
for s in train_data['product_title']:
    unique_chars.update(s)
for s in train_data['product_description']:
    unique_chars.update(s)

for s in test_data['search_term']:
    unique_chars.update(s)
for s in test_data['product_title']:
    unique_chars.update(s)
for s in test_data['product_description']:
    unique_chars.update(s)

len(unique_chars)

## Dataset

In [9]:
class SearchRelevanceDataset(Dataset):
    def __init__(self, df, unique_chars, max_len=512):
        self.df = df
        self.unique_chars = unique_chars
        self.max_len = max_len

        self.char2idx = {char: idx for idx, char in enumerate(self.unique_chars)}
        self.idx2char = {idx: char for char, idx in self.char2idx.items()}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        product = self.df.iloc[idx]['product_title']
        search = self.df.iloc[idx]['search_term']

        # ids
        product = [self.char2idx[char] for char in product]
        search = [self.char2idx[char] for char in search]

        # tensorize
        product = torch.tensor(product).float()
        search = torch.tensor(search).float()

        # pad to max_len or truncate
        if len(product) < self.max_len:
            product = F.pad(product, (0, self.max_len - len(product)))
        else:
            product = product[:self.max_len]

        if len(search) < self.max_len:
            search = F.pad(search, (0, self.max_len - len(search)))
        else:
            search = search[:self.max_len]

        relevance = torch.tensor(self.df.iloc[idx]['relevance']).float()
        
        return product.unsqueeze(-1), search.unsqueeze(-1), relevance

In [9]:
dataset = SearchRelevanceDataset(train_data, unique_chars)

# Split the dataset into training and validation sets
val_size = int(hp.val_ratio * len(dataset))
train_size = len(dataset) - val_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset,
                          batch_size=hp.batch_size,
                          num_workers=hp.num_workers,
                          shuffle=True)
val_loader = DataLoader(val_dataset,
                        batch_size=hp.batch_size,
                        num_workers=hp.num_workers,
                        shuffle=False)

test_dataset = SearchRelevanceDataset(test_data, unique_chars)
test_loader = DataLoader(test_dataset,
                         batch_size=hp.batch_size,
                         num_workers=hp.num_workers,
                         shuffle=False)

NameError: name 'train_data' is not defined

In [None]:
train_dataset[0][0].shape, train_dataset[1][0].shape

In [None]:
next(iter(train_loader))[0].shape

## Model

In [10]:
class SiameseLSTM(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim):
        super(SiameseLSTM, self).__init__()

        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True,dropout=0.4)
        self.fc_out = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

        self.loss_fn = nn.MSELoss()
        self.mae = nn.L1Loss()
        self.bn = nn.BatchNorm1d(hidden_dim)  # Batch normalization layer

    def forward_net(self, x):
        _, (h, _) = self.lstm(x)  # h: (1, batch_size, hidden_dim)
        
        return h.squeeze(0)  # (batch_size, hidden_dim)

    def forward(self, input1, input2):
        output1 = self.forward_net(input1)
        output2 = self.forward_net(input2)

        diff = torch.abs(output1 - output2)
        output = self.fc_out(diff)

        output = self.sigmoid(output)

        return output  # (batch_size, 1)

    def training_step(self, batch, batch_idx):
        product, search, relevance = batch
        output = self(product, search)  # (batch_size, 1)

        # normalize relevance - min is 1, max is 3
        relevance = (relevance - 1) / 2  # (batch_size)

        loss = self.loss_fn(output, relevance.unsqueeze(-1))
        mae = self.mae(output, relevance.unsqueeze(-1))
        
        self.log('train_loss', loss)
        self.log('train_mae', mae)

        return loss

    def validation_step(self, batch, batch_idx):
        product, search, relevance = batch
        output = self(product, search)

        # normalize relevance - min is 1, max is 3
        relevance = (relevance - 1) / 2

        loss = self.loss_fn(output, relevance.unsqueeze(-1))
        mae = self.mae(output, relevance.unsqueeze(-1))

        self.log('val_loss', loss)
        self.log('val_mae', mae)

        return loss

    def test_step(self, batch, batch_idx):
        product, search, relevance = batch
        output = self(product, search)

        # normalize relevance - min is 1, max is 3
        relevance = (relevance - 1) / 2

        loss = self.loss_fn(output, relevance.unsqueeze(-1))
        mae = self.mae(output, relevance.unsqueeze(-1))
  
        self.log('test_loss', loss)
        self.log('test_mae', mae)

        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)

In [None]:
input_dim = 1  # one character (input_id) at a time
hidden_dim = hp.hidden_size
model_char = SiameseLSTM(input_dim, hidden_dim)

model_char.train()

In [None]:
# check feedforward
with torch.no_grad():
    product, search, relevance = train_dataset[0]

    print(product.shape, search.shape)
    
    output = model_char(product.unsqueeze(0), search.unsqueeze(0))

    print(output)

## Training

In [24]:
logger = pl.loggers.WandbLogger(entity='questgen', project='dlw-ass3', log_model=True)
checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor='val_loss', save_top_k=1, mode='min')
early_stopping = pl.callbacks.EarlyStopping(monitor='val_loss', patience=hp.patience, mode='min')

trainer = pl.Trainer(
    accelerator='auto',
    max_epochs=hp.num_epochs,
    logger=logger,
    callbacks=[checkpoint_callback, early_stopping],
)

In [25]:
trainer.fit(model_char, train_loader, val_loader)

[34m[1mwandb[0m: Currently logged in as: [33mkatzmax[0m ([33mquestgen[0m). Use [1m`wandb login --relogin`[0m to force relogin


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [26]:
# load the best model
model_char = SiameseLSTM.load_from_checkpoint(checkpoint_callback.best_model_path, hidden_dim=hp.hidden_size, input_dim=1)

trainer.test(model_char, test_loader)

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.27223077416419983, 'test_mae': 0.5217583179473877}]

In [27]:
wandb.finish()

VBox(children=(Label(value='0.786 MB of 0.833 MB uploaded\r'), FloatProgress(value=0.9438638112336497, max=1.0…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
test_loss,▁
test_mae,▁
train_loss,▆▄▃▃▄█▃▃▁▇▂▄▃▃▇▅▄▅▅▃▅▆▄▄▃▁▄▆▆▂▃▇▂▃▅▃▅▄▄▂
train_mae,▅▄▃▃▅█▃▃▁█▃▃▄▃▆▄▄▆▄▂▄▅▅▃▂▁▄▅▆▂▃▆▁▃▅▃▅▄▃▂
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,███▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▁▁▁
val_mae,████▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▁▁▁

0,1
epoch,30.0
test_loss,0.27223
test_mae,0.52176
train_loss,0.06254
train_mae,0.20514
trainer/global_step,7830.0
val_loss,0.09912
val_mae,0.26765


# Word Level Model

## Word Preproccesing

In [11]:
# read the data again
train_set = pd.read_csv(train_path, encoding='ISO-8859-1')
test_set = pd.read_csv(test_path, encoding='ISO-8859-1')
test_labels = pd.read_csv(test_labels_path)
product_df = pd.read_csv(product_path)
attributes_df = pd.read_csv(attributes_path)

test_set = pd.merge(test_set, test_labels, on='id')
test_set = test_set[test_set['relevance'] != -1]

# merge the dataframes
train_set = pd.merge(train_set, product_df, how='left', on='product_uid')
test_set = pd.merge(test_set, product_df, how='left', on='product_uid')

# remove the 155 null entries in the attribute table
attributes_df = attributes_df.drop(attributes_df[attributes_df['product_uid'].isnull()].index, axis=0)

# merge the dataframes
train_set = pd.merge(train_set, attributes_df, how='left', on='product_uid')
test_set = pd.merge(test_set, attributes_df, how='left', on='product_uid')

In [12]:
train_text = train_set['product_title'] + ' ' + train_set['product_description'] + ' ' + train_set['search_term']
train_text = train_text.tolist()

In [13]:
# create a tokenizer
bpe_tokenizer = ByteLevelBPETokenizer()

# train the tokenizer without using parallel processing
# os.environ['TOKENIZERS_PARALLELISM'] = 'false'
bpe_tokenizer.train_from_iterator(train_text, min_frequency=2)






In [14]:
bpe_tokenizer

Tokenizer(vocabulary_size=30000, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)

In [15]:
tokenized_train = bpe_tokenizer.encode_batch(train_text[:150_000])
tokenized_train = [t.tokens for t in tokenized_train]

In [16]:
# train the word2vec model on the tokenized text
word2vec_model = Word2Vec(tokenized_train, vector_size=100, window=5, min_count=1, workers=0)  # workers=0 for single core

## Dataset

In [17]:
class WordSearchRelevanceDataset(Dataset):
    def __init__(self, df, tokenizer, word_embeddings, max_len=512):
        self.df = df
        self.tokenizer = tokenizer
        self.word_embeddings = word_embeddings
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        product = self.df.iloc[idx]['product_title']
        search = self.df.iloc[idx]['search_term']

        # convert to word embeddings
        product = self.embed(product)  # shape: (seq_len, embedding_dim)
        search = self.embed(search)  # shape: (seq_len, embedding_dim)

        # pad to max_len or truncate
        if product.shape[0] < self.max_len:
            product = F.pad(product, (0, 0, 0, self.max_len - product.shape[0]))
        else:
            product = product[:self.max_len]

        if search.shape[0] < self.max_len:
            search = F.pad(search, (0, 0, 0, self.max_len - search.shape[0]))
        else:
            search = search[:self.max_len]

        relevance = torch.tensor(self.df.iloc[idx]['relevance']).float()

        return product, search, relevance
    
    def embed(self, text):
        tokens = self.tokenizer.encode(text).tokens
        embed = torch.tensor([self.word_embeddings[word] for word in tokens if word in self.word_embeddings])

        if not embed.shape[0]:
            embed = torch.zeros(self.word_embeddings.vector_size).unsqueeze(0)

        return embed  # shape: (seq_len, embedding_dim)

In [18]:
dataset = WordSearchRelevanceDataset(train_set, bpe_tokenizer, word2vec_model.wv)

# Split the dataset into training and validation sets
val_size = int(hp.val_ratio * len(dataset))
train_size = len(dataset) - val_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset,
                          batch_size=hp.batch_size,
                          num_workers=hp.num_workers,
                          shuffle=True)
val_loader = DataLoader(val_dataset,
                        batch_size=hp.batch_size,
                        num_workers=hp.num_workers,
                        shuffle=False)

test_dataset = WordSearchRelevanceDataset(test_set, bpe_tokenizer, word2vec_model.wv)
test_loader = DataLoader(test_dataset,
                         batch_size=hp.batch_size,
                         num_workers=hp.num_workers,
                         shuffle=False)

In [19]:
train_dataset[0][0].shape, train_dataset[1][0].shape

  embed = torch.tensor([self.word_embeddings[word] for word in tokens if word in self.word_embeddings])


(torch.Size([512, 100]), torch.Size([512, 100]))

In [20]:
next(iter(train_loader))[0].shape

torch.Size([128, 512, 100])

## Training Word Model

In [21]:
model_word = SiameseLSTM(word2vec_model.vector_size,
                         hp.hidden_size)
model_word.train()



SiameseLSTM(
  (lstm): LSTM(100, 128, batch_first=True, dropout=0.4)
  (fc_out): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
  (loss_fn): MSELoss()
  (mae): L1Loss()
  (bn): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [22]:
logger = pl.loggers.WandbLogger(entity='questgen', project='dlw-ass3', log_model=True)
checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor='val_loss', save_top_k=1, mode='min')
early_stopping = pl.callbacks.EarlyStopping(monitor='val_loss', patience=hp.patience, mode='min')

trainer = pl.Trainer(
    accelerator='auto',
    max_epochs=hp.num_epochs,
    logger=logger,
    callbacks=[checkpoint_callback, early_stopping],
)

In [None]:
trainer.fit(model_word, train_loader, val_loader)

[34m[1mwandb[0m: Currently logged in as: [33mkatzmax[0m ([33mquestgen[0m). Use [1m`wandb login --relogin`[0m to force relogin


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [None]:
# load the best model
model_word = SiameseLSTM.load_from_checkpoint(checkpoint_callback.best_model_path, hidden_dim=hp.hidden_size, input_dim=word2vec_model.vector_size)

trainer.test(model_word, test_loader)

In [None]:
wandb.finish()

# Pretrained Bert Model

## Dataset

In [None]:
class BertSearchRelevanceDataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertModel.from_pretrained('bert-base-uncased').to(hp.device)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        product = self.df.iloc[idx]['product_title']
        search = self.df.iloc[idx]['search_term']

        product = self.embed(product)
        search = self.embed(search)

        relevance = torch.tensor(self.df.iloc[idx]['relevance']).float()

        return product, search, relevance
    
    def embed(self, text):
        tokens = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(hp.device)
        with torch.no_grad():
            output = self.model(**tokens)
        embed = output.last_hidden_state.mean(dim=1).squeeze()

        return embed

In [None]:
dataset = BertSearchRelevanceDataset(train_set)

# Split the dataset into training and validation sets
val_size = int(hp.val_ratio * len(dataset))
train_size = len(dataset) - val_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset,
                          batch_size=hp.batch_size,
                          num_workers=hp.num_workers,
                          shuffle=True)
val_loader = DataLoader(val_dataset,
                        batch_size=hp.batch_size,
                        num_workers=hp.num_workers,
                        shuffle=False)

test_dataset = BertSearchRelevanceDataset(test_set)
test_loader = DataLoader(test_dataset,
                         batch_size=hp.batch_size,
                         num_workers=hp.num_workers,
                         shuffle=False)

In [None]:
train_dataset[0][0].shape

## Model

In [None]:
class SiameseLinear(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim):
        super(SiameseLinear, self).__init__()

        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 128)
        self.fc_out = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

        self.loss_fn = nn.MSELoss()
        self.mae = nn.L1Loss()

    def forward_net(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))

        return x
    
    def forward(self, input1, input2):
        output1 = self.forward_net(input1)
        output2 = self.forward_net(input2)

        diff = torch.abs(output1 - output2)
        output = self.fc_out(diff)

        output = self.sigmoid(output)

        return output

    def training_step(self, batch, batch_idx):
        product, search, relevance = batch
        output = self(product, search)

        # normalize relevance - min is 1, max is 3
        relevance = (relevance - 1) / 2

        loss = self.loss_fn(output, relevance.unsqueeze(1))
        mae = self.mae(output, relevance.unsqueeze(1))
        
        self.log('train_loss', loss)
        self.log('train_mae', mae)

        return loss

    def validation_step(self, batch, batch_idx):
        product, search, relevance = batch
        output = self(product, search)

        # normalize relevance - min is 1, max is 3
        relevance = (relevance - 1) / 2

        loss = self.loss_fn(output, relevance.unsqueeze(1))
        mae = self.mae(output, relevance.unsqueeze(1))

        self.log('val_loss', loss)
        self.log('val_mae', mae)

        return loss

    def test_step(self, batch, batch_idx):
        product, search, relevance = batch
        output = self(product, search)

        # normalize relevance - min is 1, max is 3
        relevance = (relevance - 1) / 2

        loss = self.loss_fn(output, relevance.unsqueeze(1))
        mae = self.mae(output, relevance.unsqueeze(1))
  
        self.log('test_loss', loss)
        self.log('test_mae', mae)

        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)

## Training Model

In [None]:
model_bert = SiameseLinear(dataset.model.config.hidden_size,
                           hp.hidden_size)
model_bert.train()

In [None]:
logger = pl.loggers.WandbLogger(entity='questgen', project='dlw-ass3', log_model=True)
checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor='val_loss', save_top_k=1, mode='min')
early_stopping = pl.callbacks.EarlyStopping(monitor='val_loss', patience=hp.patience, mode='min')

trainer = pl.Trainer(
    accelerator='auto',
    max_epochs=hp.num_epochs,
    logger=logger,
    callbacks=[checkpoint_callback, early_stopping],
)

In [None]:
trainer.fit(model_bert, train_loader, val_loader)

In [None]:
# load the best model
model_bert = SiameseLinear.load_from_checkpoint(checkpoint_callback.best_model_path, hidden_dim=hp.hidden_size, input_dim=bert_model.config.hidden_size)

trainer.test(model_bert, test_loader)

In [None]:
wandb.finish()