In [8]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, Dataset
import copy
from sklearn.model_selection import train_test_split
from model import FactorizationMachine
from evaluation import evaluation


In [10]:
users_features = pd.read_csv("../../preprocessed_data/users_features.csv")
games_features = pd.read_csv("../../preprocessed_data/games_features.csv")
explicit_data = pd.read_csv("../../preprocessed_data/ratings.csv")

In [11]:
users_list = []
add_user_index = len(users_features)
user_index_by_id = {id : idx for idx, id in enumerate(users_features['user_id'])}
for  index,row in users_features.iterrows():
    true_indices = row[row == True].index.tolist()
    column_indices = [index] + [users_features.columns.get_loc(col) + add_user_index -1 for col in true_indices]
    users_list.append(column_indices)


In [12]:
games_list = []
add_game_index = len(games_features)
games_index_by_id = {id : (idx) for idx, id in enumerate(games_features['app_id'])}
for  index,row in games_features.iterrows():
    true_indices = row[row == True].index.tolist()
    column_indices = [index + add_user_index + 15] + [games_features.columns.get_loc(col)+add_user_index+add_game_index+15 -1 for col in true_indices]
    games_list.append(column_indices)

In [13]:
train_ratings, validation_ratings = train_test_split(explicit_data, test_size= 0.2, random_state= 42)

In [14]:
total_inputs = games_list[-1][0] + len(games_features.columns)
NUM_MOVIES = len(games_list)
NUM_USERS = len(users_list)
padding_idx = total_inputs

94469

In [15]:
class FactorizationMachineDataset(Dataset):
    def __init__(self, rating_df):
        self.rating_df = rating_df
        self.max_size = 20

    def __len__(self):
        return len(self.rating_df)

    def __getitem__(self, index):

        users_index = user_index_by_id[self.rating_df["user_id"].iloc[index]]

        games_index = games_index_by_id[self.rating_df["app_id"].iloc[index]]
        rating = self.rating_df["explicit_rating"].iloc[index]
        users_feature = users_list[users_index]

        games_feature = games_list[games_index]

        padding_size = self.max_size - len(users_feature) - len(games_feature)
        feature = users_feature + games_feature + [padding_idx] * padding_size

        return torch.IntTensor(feature), rating




In [None]:
training_data = FactorizationMachineDataset(train_ratings)
validation_data = FactorizationMachineDataset(validation_ratings)

batch_size = 1024

train_dataloader = DataLoader(
    training_data, batch_size=batch_size, shuffle=True, num_workers=10
)

validation_dataloader = DataLoader(
    validation_data, batch_size=batch_size, shuffle=False, num_workers=10
)

In [16]:
def train(model, train_dataloader, validation_dataloader, epochs=20, k=5, threshold=3.5):
    torch.manual_seed(42)
    model.train()
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.05, momentum=0.9)

    best_score = float('inf')
    best_model_wts = copy.deepcopy(model.state_dict())

    for epoch in range(epochs):
        train_loss = 0

        for batch_x, batch_y in train_dataloader:
            batch_x = batch_x.to(torch.long)
            batch_y = batch_y.to(torch.float)
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs.squeeze(), batch_y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * batch_x.size(0)
        train_loss /= len(train_dataloader.dataset)

        val_loss = 0
        all_preds = []
        all_labels = []
        all_user_ids = []
        with torch.no_grad():
            model.eval()
            for batch_x, batch_y in validation_dataloader:
                outputs = model(batch_x)
                loss = criterion(outputs.squeeze(), batch_y)
                val_loss += loss.item() * batch_x.size(0)
                all_preds.append(outputs.squeeze().cpu().numpy())
                all_labels.append(batch_y.cpu().numpy())
                all_user_ids.append(batch_x[:, 0].cpu().numpy())
        val_loss /= len(validation_dataloader.dataset)

        all_preds = np.concatenate(all_preds)
        all_labels = np.concatenate(all_labels)
        all_user_ids = np.concatenate(all_user_ids)

        precision, recall, f1_k, ndcg_k = evaluation(all_preds, all_labels,all_user_ids, k, threshold)

        if val_loss < best_score:
            best_model_wts = copy.deepcopy(model.state_dict())
            best_score = val_loss

        print(f'Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f} - Valid RMSE: {val_loss:.4f} - F1@{10}: {f1_k:.4f} - NDCG@{k}: {ndcg_k:.4f} - Precision@{10}: {precision:.4f} - Recall@{10}: {recall:.4f}')

    model.load_state_dict(best_model_wts)

    return model

In [17]:
n_factors = 80
model = FactorizationMachine(num_input=total_inputs, num_factor=n_factors)
train(model, train_dataloader, validation_dataloader, epochs = 15)

Epoch 1/15 - Train Loss: 1.9912 - Valid RMSE: 1.2417 - F1@10: 0.6486 - NDCG@5: 0.8060 - Precision@10: 0.7686 - Recall@10: 0.6248
Epoch 2/15 - Train Loss: 1.2862 - Valid RMSE: 1.0345 - F1@10: 0.7014 - NDCG@5: 0.8168 - Precision@10: 0.7973 - Recall@10: 0.6759
Epoch 3/15 - Train Loss: 1.0946 - Valid RMSE: 1.6192 - F1@10: 0.7495 - NDCG@5: 0.7895 - Precision@10: 0.7784 - Recall@10: 0.7800
Epoch 4/15 - Train Loss: 1.0386 - Valid RMSE: 1.0519 - F1@10: 0.5882 - NDCG@5: 0.8196 - Precision@10: 0.7922 - Recall@10: 0.5096
Epoch 5/15 - Train Loss: 0.9641 - Valid RMSE: 1.1831 - F1@10: 0.5274 - NDCG@5: 0.8197 - Precision@10: 0.7764 - Recall@10: 0.4390
Epoch 6/15 - Train Loss: 0.9250 - Valid RMSE: 1.1323 - F1@10: 0.7460 - NDCG@5: 0.8085 - Precision@10: 0.7924 - Recall@10: 0.7557
Epoch 7/15 - Train Loss: 0.9043 - Valid RMSE: 1.0150 - F1@10: 0.5829 - NDCG@5: 0.8217 - Precision@10: 0.7857 - Recall@10: 0.5073
Epoch 8/15 - Train Loss: 0.8928 - Valid RMSE: 1.0095 - F1@10: 0.7314 - NDCG@5: 0.8169 - Precision

FactorizationMachine(
  (embedding): Embedding(94470, 80, padding_idx=94469)
  (linear_layer): Embedding(94470, 1, padding_idx=94469)
)