In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
#!pip install tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
# Load data
train_df = pd.read_csv('data/train.csv', encoding='ISO-8859-1')
test_df = pd.read_csv('data/test.csv', encoding='ISO-8859-1')
product_descriptions_df = pd.read_csv('data/product_descriptions.csv', encoding='ISO-8859-1')

# Merge product descriptions with training and test data
train_df = pd.merge(train_df, product_descriptions_df, on='product_uid', how='left')
test_df = pd.merge(test_df, product_descriptions_df, on='product_uid', how='left')

# Combine text columns
train_df['text'] = train_df['product_title'] + " " + train_df['product_description'] + " " + train_df['search_term']
test_df['text'] = test_df['product_title'] + " " + test_df['product_description'] + " " + test_df['search_term']

# Tokenize the text
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_df['text'])

X_train = tokenizer.texts_to_sequences(train_df['text'])
X_test = tokenizer.texts_to_sequences(test_df['text'])

# Pad sequences to ensure uniform input length
max_seq_length = 100
X_train = pad_sequences(X_train, maxlen=max_seq_length)
X_test = pad_sequences(X_test, maxlen=max_seq_length)

# Get the target variable
y_train = train_df['relevance'].values

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [3]:

from torch.utils.data import TensorDataset, DataLoader
# Create TensorDatasets for training, validation, and test sets
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.long), torch.tensor(y_train, dtype=torch.float))
val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.long), torch.tensor(y_val, dtype=torch.float))
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.long))

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [7]:
import torch.nn as nn

class RelevanceModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, lstm_units):
        super(RelevanceModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, lstm_units, batch_first=True)
        self.fc1 = nn.Linear(lstm_units, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # Take the output of the last time step
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Define the model
vocab_size = 10000
embed_dim = 256
lstm_units = 256
model = RelevanceModel(vocab_size, embed_dim, lstm_units)

In [5]:
import torch.optim as optim
import torch
from tqdm import tqdm 
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Training loop
num_epochs = 50

for epoch in tqdm(range(num_epochs)):
    model.train()
    train_loss = 0
    for texts, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(texts).squeeze()
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    val_loss = 0
    model.eval()
    with torch.no_grad():
        for texts, targets in val_loader:
            outputs = model(texts).squeeze()
            loss = criterion(outputs, targets)
            val_loss += loss.item()

    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss/len(train_loader)}, Val Loss: {val_loss/len(val_loader)}')

  2%|▏         | 1/50 [02:17<1:52:40, 137.98s/it]

Epoch 1/50, Train Loss: 0.30377683398307787, Val Loss: 0.2922657231114338


  4%|▍         | 2/50 [04:43<1:54:06, 142.64s/it]

Epoch 2/50, Train Loss: 0.24882313757169325, Val Loss: 0.2504666202397892


  6%|▌         | 3/50 [07:07<1:51:56, 142.91s/it]

Epoch 3/50, Train Loss: 0.22490353297772084, Val Loss: 0.25020063658834274


  8%|▊         | 4/50 [09:47<1:54:54, 149.89s/it]

Epoch 4/50, Train Loss: 0.2069965683277817, Val Loss: 0.25554341393207114


 10%|█         | 5/50 [12:00<1:47:41, 143.59s/it]

Epoch 5/50, Train Loss: 0.19231939729359443, Val Loss: 0.245989790166171


 12%|█▏        | 6/50 [14:13<1:42:38, 139.96s/it]

Epoch 6/50, Train Loss: 0.18052711274643826, Val Loss: 0.2571100928935325


 14%|█▍        | 7/50 [16:30<1:39:45, 139.19s/it]

Epoch 7/50, Train Loss: 0.17213267310407843, Val Loss: 0.2520551906806089


 16%|█▌        | 8/50 [18:50<1:37:32, 139.36s/it]

Epoch 8/50, Train Loss: 0.1640409476870015, Val Loss: 0.2641716411349325


 18%|█▊        | 9/50 [21:04<1:34:04, 137.66s/it]

Epoch 9/50, Train Loss: 0.1566698838701194, Val Loss: 0.26309581149216604


 20%|██        | 10/50 [23:21<1:31:38, 137.47s/it]

Epoch 10/50, Train Loss: 0.15184843048380917, Val Loss: 0.2606661297847591


 22%|██▏       | 11/50 [25:51<1:31:50, 141.29s/it]

Epoch 11/50, Train Loss: 0.1462413924193054, Val Loss: 0.25588375484454196


 24%|██▍       | 12/50 [28:08<1:28:45, 140.14s/it]

Epoch 12/50, Train Loss: 0.14225725261957162, Val Loss: 0.262502315685507


 26%|██▌       | 13/50 [30:47<1:29:54, 145.80s/it]

Epoch 13/50, Train Loss: 0.1392543392807052, Val Loss: 0.2614190470000838


 28%|██▊       | 14/50 [33:20<1:28:51, 148.09s/it]

Epoch 14/50, Train Loss: 0.13607086757998552, Val Loss: 0.27817015288715485


 30%|███       | 15/50 [35:51<1:26:46, 148.76s/it]

Epoch 15/50, Train Loss: 0.13325207906757072, Val Loss: 0.27620561868274185


 32%|███▏      | 16/50 [38:06<1:22:02, 144.77s/it]

Epoch 16/50, Train Loss: 0.13131377447726403, Val Loss: 0.26449040159357806


 34%|███▍      | 17/50 [40:13<1:16:42, 139.48s/it]

Epoch 17/50, Train Loss: 0.12913552760981328, Val Loss: 0.2682052879527887


 36%|███▌      | 18/50 [42:29<1:13:48, 138.39s/it]

Epoch 18/50, Train Loss: 0.12808343461567845, Val Loss: 0.27537525372809


 38%|███▊      | 19/50 [44:40<1:10:16, 136.03s/it]

Epoch 19/50, Train Loss: 0.12709181337792338, Val Loss: 0.2724733661305055


 40%|████      | 20/50 [46:52<1:07:21, 134.71s/it]

Epoch 20/50, Train Loss: 0.12543310206303007, Val Loss: 0.2793133260267857


 42%|████▏     | 21/50 [49:10<1:05:43, 136.00s/it]

Epoch 21/50, Train Loss: 0.1243169274458714, Val Loss: 0.2734121423270996


 44%|████▍     | 22/50 [51:32<1:04:10, 137.51s/it]

Epoch 22/50, Train Loss: 0.1242822749926417, Val Loss: 0.277284738199371


 44%|████▍     | 22/50 [51:48<1:05:56, 141.31s/it]


KeyboardInterrupt: 

In [6]:
#RMSE
from sklearn.metrics import mean_squared_error
import numpy as np
model.eval()
val_predictions = []
with torch.no_grad():
    for batch in val_loader:
        texts = batch[0]
        outputs = model(texts).squeeze()
        val_predictions.extend(outputs.tolist())

# Convert predictions and true values to NumPy arrays
val_predictions = np.array(val_predictions)
y_true = y_val

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_true, val_predictions))
print(f'RMSE on Validation Set: {rmse}')

RMSE on Validation Set: 0.5265764968395983
