In [2]:
import torch
import torch.nn as nn 
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, Dataset
import copy
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl

In [4]:
users_features = pd.read_csv("../preprocessed_data/users_features.csv")
games_features = pd.read_csv("../preprocessed_data/games_features.csv")
explicit_data = pd.read_csv("../preprocessed_data/ratings.csv")

In [5]:
users_list = []
add_user_index = len(users_features)
user_index_by_id = {id : idx for idx, id in enumerate(users_features['user_id'])}
for  index,row in users_features.iterrows():
    true_indices = row[row == True].index.tolist()
    column_indices = [index] + [users_features.columns.get_loc(col) + add_user_index -1 for col in true_indices]
    users_list.append(column_indices)


In [6]:
games_list = []
add_game_index = len(games_features)
games_index_by_id = {id : (idx) for idx, id in enumerate(games_features['app_id'])}
for  index,row in games_features.iterrows():
    true_indices = row[row == True].index.tolist()
    column_indices = [index + add_user_index + 20] + [games_features.columns.get_loc(col)+add_user_index+add_game_index+20 -1 for col in true_indices]
    games_list.append(column_indices)

In [7]:
train_ratings, validation_ratings = train_test_split(explicit_data, test_size= 0.1, random_state= 42)

In [8]:
total_inputs = games_list[-1][0] + len(games_features.columns)
total_inputs

94469

In [9]:
NUM_MOVIES = len(games_list)
NUM_USERS = len(users_list)
padding_idx = total_inputs


class FactorizationMachineDataset(Dataset):
    def __init__(self, rating_df):
        self.rating_df = rating_df
        self.max_size = 5  

    def __len__(self):
        return len(self.rating_df)

    def __getitem__(self, index):
        
        user_index = user_index_by_id[self.rating_df["user_id"].iloc[index]]
      
        games_index = games_index_by_id[self.rating_df["app_id"].iloc[index]]
     
        rating = self.rating_df["rating"].iloc[index]
      
        users_feature = users_features.iloc[user_index]
    
        games_feature = games_features[games_index]
      
        padding_size = self.max_size - len(users_feature) - len(games_feature)
        feature = users_feature + games_feature + [padding_idx] * padding_size
        
        return torch.IntTensor(feature), rating


training_data = FactorizationMachineDataset(train_ratings)
validation_data = FactorizationMachineDataset(validation_ratings)

batch_size = 128
num_workers = 10

train_dataloader = DataLoader(
    training_data, batch_size=batch_size, shuffle=True, num_workers=num_workers
)

validation_dataloader = DataLoader(
    validation_data, batch_size=batch_size, shuffle=False, num_workers=num_workers
)

In [10]:

class FactorizationMachine(nn.Module):
    def __init__(self,num_input, num_factor):
        super(FactorizationMachine, self).__init__()
        self.embedding = nn.Embedding(num_input + 1, num_factor, padding_idx= padding_idx)
        self.embedding.weight.data.uniform_(-1,1)
        torch.nn.init.xavier_normal_(self.embedding.weight.data,gain = 1e-3)
        self.linear_layer = nn.Embedding(num_input+1, 1, padding_idx= padding_idx)
        self.bias = nn.Parameter(data = torch.rand(1))

    def forward(self, x):
        emb = self.embedding(x)
        pow_of_sum = emb.sum(dim = 1 , keepdim = True).pow(2).sum(dim = 2)
        sum_of_pow = emb.pow(2).sum(dim = 1 , keepdim = True).sum(dim = 2)
        out_inter = 0.5 * (pow_of_sum - sum_of_pow)
        out_lin = self.linear_layer(x).sum(dim = 1)
        out = out_inter + out_lin + self.bias
        return out
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())
def train(model, train_dataloader, validation_dataloader, epochs=10, device='cpu'):
    torch.manual_seed(42)
    print("he")
    model = model.to(device)
    print("he")
    criterion = nn.MSELoss()
    print("he")
    optimizer = torch.optim.SGD(model.parameters(), lr=0.05, momentum=0.9)
    print("he")

    best_score = float('inf')
    best_model_wts = copy.deepcopy(model.state_dict())
    print("he")

    for epoch in range(epochs):
        model.train()
        print("he")
        train_loss = 0
        i =0 
        for batch_x, batch_y in train_dataloader:
            print(i)
            i+=1
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs.squeeze(), batch_y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * batch_x.size(0)
        train_loss /= len(train_dataloader.dataset)

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_x, batch_y in validation_dataloader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                outputs = model(batch_x)
                loss = criterion(outputs.squeeze(), batch_y)
                val_loss += loss.item() * batch_x.size(0)
        val_loss /= len(validation_dataloader.dataset)

        if val_loss < best_score:
            best_model_wts = copy.deepcopy(model.state_dict())
            best_score = val_loss

        print(f'Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f} - Valid RMSE: {val_loss:.4f}')

    model.load_state_dict(best_model_wts)

    return model


In [11]:

n_factors = 80
model = FactorizationMachine(num_input=total_inputs, num_factor=n_factors)
train(model, train_dataloader, validation_dataloader)

he
he
he
he
he
he
