In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
#!pip install tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
# Load data
# All three CSVs are decoded as ISO-8859-1 rather than pandas' default UTF-8.
train_df = pd.read_csv('data/train.csv', encoding='ISO-8859-1')
test_df = pd.read_csv('data/test.csv', encoding='ISO-8859-1')
product_descriptions_df = pd.read_csv('data/product_descriptions.csv', encoding='ISO-8859-1')


def _with_combined_text(frame, descriptions):
    """Attach product descriptions to ``frame`` and build its ``text`` column.

    Args:
        frame: DataFrame with ``product_uid``, ``product_title`` and
            ``search_term`` columns.
        descriptions: DataFrame with ``product_uid`` and
            ``product_description`` columns.

    Returns:
        A new DataFrame: ``frame`` left-merged with ``descriptions`` on
        ``product_uid``, plus a ``text`` column holding the title, the
        description and the search term joined by single spaces.
    """
    merged = pd.merge(frame, descriptions, on='product_uid', how='left')
    # A left merge leaves NaN for products without a description, and
    # str + NaN is NaN, so one missing field would turn the whole `text`
    # value into a float NaN. Substitute "" for missing pieces instead.
    title, description, query = (
        merged[column].fillna('').astype(str)
        for column in ('product_title', 'product_description', 'search_term')
    )
    merged['text'] = title + " " + description + " " + query
    return merged


# Merge product descriptions with training and test data and combine text columns
train_df = _with_combined_text(train_df, product_descriptions_df)
test_df = _with_combined_text(test_df, product_descriptions_df)

# Tokenize the text
# The vocabulary is fitted on the training text only; the same fitted tokenizer
# is then applied to the test text. num_words=10000 matches the vocab_size used
# for the model's embedding table later in the notebook.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_df['text'])

X_train = tokenizer.texts_to_sequences(train_df['text'])
X_test = tokenizer.texts_to_sequences(test_df['text'])

# Pad sequences to ensure uniform input length
# NOTE(review): pad_sequences is called with its default padding/truncating
# modes ('pre' per the Keras docs) - confirm that is the intended behaviour.
max_seq_length = 100
X_train = pad_sequences(X_train, maxlen=max_seq_length)
X_test = pad_sequences(X_test, maxlen=max_seq_length)

# Get the target variable
y_train = train_df['relevance'].values

# Split the data into training and validation sets (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [15]:

from torch.utils.data import TensorDataset, DataLoader
# Wrap the NumPy splits in TensorDatasets. Token ids are converted with
# dtype=torch.long and relevance targets with dtype=torch.float; the test
# set carries inputs only.
def _as_tensor_dataset(*columns):
    """Build a TensorDataset from ``(array, dtype)`` pairs, one tensor per pair."""
    return TensorDataset(*(torch.tensor(values, dtype=dtype) for values, dtype in columns))


train_dataset = _as_tensor_dataset((X_train, torch.long), (y_train, torch.float))
val_dataset = _as_tensor_dataset((X_val, torch.long), (y_val, torch.float))
test_dataset = _as_tensor_dataset((X_test, torch.long))

# Batch every split in groups of 32; only the training split is reshuffled.
train_loader, val_loader, test_loader = (
    DataLoader(dataset, batch_size=32, shuffle=reshuffle)
    for dataset, reshuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

In [16]:
import torch.nn as nn

class RelevanceModel(nn.Module):
    """Embedding -> LSTM -> two linear layers, producing one score per sequence.

    Args:
        vocab_size: Number of rows in the embedding table; token ids fed to
            the model must be smaller than this.
        embed_dim: Width of each token embedding (also the LSTM input size).
        lstm_units: Hidden size of the LSTM.
    """

    def __init__(self, vocab_size, embed_dim, lstm_units):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=lstm_units, batch_first=True)
        self.fc1 = nn.Linear(in_features=lstm_units, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=1)

    def forward(self, x):
        """Score a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, 1)`` with one raw score per sequence.
        """
        embedded = self.embedding(x)
        lstm_out, _state = self.lstm(embedded)
        # Only the LSTM output at the final time step feeds the head.
        final_step = lstm_out[:, -1, :]
        hidden = torch.relu(self.fc1(final_step))
        return self.fc2(hidden)

# Hyperparameters and model instance.
# vocab_size equals the num_words=10000 limit given to the tokenizer earlier.
vocab_size, embed_dim, lstm_units = 10000, 128, 128
model = RelevanceModel(vocab_size=vocab_size, embed_dim=embed_dim, lstm_units=lstm_units)

In [22]:
import torch.optim as optim
import torch

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    # --- training pass: one optimizer step per batch ---
    model.train()
    train_loss = 0.0
    for texts, targets in train_loader:
        optimizer.zero_grad()
        # squeeze(-1) removes only the trailing size-1 feature dimension.
        # A bare squeeze() would also collapse the batch dimension of a
        # size-1 batch, leaving a 0-d tensor to be broadcast against targets.
        outputs = model(texts).squeeze(-1)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by the batch size so a
        # shorter final batch does not count as much as a full one.
        train_loss += loss.item() * targets.size(0)

    # --- validation pass: no gradients, no parameter updates ---
    val_loss = 0.0
    model.eval()
    with torch.no_grad():
        for texts, targets in val_loader:
            outputs = model(texts).squeeze(-1)
            loss = criterion(outputs, targets)
            val_loss += loss.item() * targets.size(0)

    # Dividing by the sample count gives the per-sample MSE for the epoch.
    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss/len(train_loader.dataset)}, Val Loss: {val_loss/len(val_loader.dataset)}')

Epoch 1/10, Train Loss: 0.04686620123593889, Val Loss: 0.2951247078360543
Epoch 2/10, Train Loss: 0.04487797481004857, Val Loss: 0.2892449029900343
Epoch 3/10, Train Loss: 0.043869834514958364, Val Loss: 0.2877040769961174
Epoch 4/10, Train Loss: 0.043042746355171774, Val Loss: 0.2868151162986364
Epoch 5/10, Train Loss: 0.04221641616599099, Val Loss: 0.29377011761680794
Epoch 6/10, Train Loss: 0.04109511586091206, Val Loss: 0.29185846101967355


KeyboardInterrupt: 

In [None]:
#RMSE
from sklearn.metrics import mean_squared_error
import numpy as np
# Score the validation split with the trained model and report RMSE.
model.eval()
val_predictions = []
with torch.no_grad():
    # val_loader wraps a TensorDataset, so every batch is an
    # (inputs, targets) pair of tensors - it cannot be indexed by string keys.
    for texts, _targets in val_loader:
        # squeeze(-1) keeps the batch dimension even for a size-1 batch, so
        # tolist() always returns a list that extend() can consume.
        outputs = model(texts).squeeze(-1)
        val_predictions.extend(outputs.tolist())

# Convert predictions and true values to NumPy arrays
val_predictions = np.array(val_predictions)
# val_loader is built with shuffle=False, so predictions line up with y_val.
y_true = y_val

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_true, val_predictions))
print(f'RMSE on Validation Set: {rmse}')

RMSE on Validation Set: 0.5387248756506721
