In [None]:
!pip install skorch

Collecting skorch
  Downloading skorch-0.15.0-py3-none-any.whl (239 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/239.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/239.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.3/239.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: skorch
Successfully installed skorch-0.15.0


In [None]:
import datetime
import itertools
import numpy as np
import os
import pandas as pd
import patsy
import time

import sklearn
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch import optim
from torch.autograd import Variable

from skorch import NeuralNet
from skorch.helper import predefined_split, SliceDataset
from skorch.callbacks import BatchScoring, Checkpoint, EarlyStopping, EpochScoring, LRScheduler, TensorBoard, ProgressBar

import tensorflow as tf
from tensorflow import summary


In [None]:
identifier = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(identifier)
print(device)

cuda


In [None]:
if not os.path.exists('ml-100k'):
    !wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
    !unzip -o ml-100k.zip

--2024-01-11 02:00:56--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2024-01-11 02:00:57 (9.92 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base    

### Dataset

In [None]:
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "FilmNoir", "Horror",
    "Musical", "Mystery", "Romance", "SciFi", "Thriller", "War", "Western"
]

In [None]:
class rsdataset(Dataset):
    def __init__(self, usersfile, moviesfile, ratingsfile, nrows=None):

        # Read files
        self.movies = pd.read_csv(moviesfile, sep='|', names=['MovieID', 'Title', 'date', 'video_rl_date', 'link']+genre_cols, engine='python', encoding='latin-1')
        self.users = pd.read_csv(usersfile, sep='|', names=['UserID', 'Age', 'Gender', 'Occupation', 'Zipcode'], engine='python', encoding='latin-1')
        self.ratings = pd.read_csv(ratingsfile, sep='\t', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python', nrows=nrows, encoding='latin-1')

        df2 = self.movies[genre_cols]
        df2['Genre'] = df2.apply(lambda row: ', '.join(row.index[row == 1]), axis=1)
        self.movies['Genre'] = df2['Genre']
        self.movies = self.movies.drop(genre_cols, axis = 1)
        # self.movies['Genre'] = self.movies['Genre'].map(genre_dict)
        bins = [0, 18, 25, 35, 45, 50, 56, 100]
        labels = [1, 18, 25, 35, 45, 50, 56]
        self.users['Age'] = pd.cut(self.users['Age'], bins=bins, labels=labels, right=False)
        assert self.users['UserID'].nunique() >= self.ratings['UserID'].nunique(), 'UserID with unknown information'
        assert self.movies['MovieID'].nunique() >= self.ratings['MovieID'].nunique(), 'Movies with unknown information'

        self.users_emb_columns = []
        self.users_ohe_columns = []
        self.movies_emb_columns = []
        self.movies_ohe_columns = []
        self.interact_columns = []

        self.movies = self.movies.drop(['date', 'video_rl_date', 'link'], axis= 1)
        self.nusers = self.ratings['UserID'].nunique()
        self.nmovies = self.ratings['MovieID'].nunique()

        self.y_range = (self.ratings['Rating'].min(), self.ratings['Rating'].max())

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):

        return (((self.users_emb[idx])),
                ((self.users_ohe[idx])),
                ((self.movies_emb[idx])),
                ((self.movies_ohe[idx])),
                ((self.interact[idx]))), (self.y[idx])

    def to_tensor(self):
        self.users_emb = torch.from_numpy(self.ratings[self.users_emb_columns].values)
        self.users_ohe = torch.tensor(self.ratings[self.users_ohe_columns].values, dtype=torch.float)
        self.movies_emb = torch.from_numpy(self.ratings[self.movies_emb_columns].values)
        self.movies_ohe = torch.tensor(self.ratings[self.movies_ohe_columns].values, dtype=torch.float)
        self.interact = torch.from_numpy(self.ratings[self.interact_columns].values)
        self.y = torch.tensor(self.y.values, dtype=torch.float)

In [None]:
train = rsdataset('ml-100k/u.user', 'ml-100k/u.item', 'ml-100k/u.data', nrows=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Genre'] = df2.apply(lambda row: ', '.join(row.index[row == 1]), axis=1)


In [None]:
train.users

Unnamed: 0,UserID,Age,Gender,Occupation,Zipcode
0,1,18,M,technician,85711
1,2,50,F,other,94043
2,3,18,M,writer,32067
3,4,18,M,technician,43537
4,5,25,F,other,15213
...,...,...,...,...,...
938,939,25,F,student,33319
939,940,25,M,administrator,02215
940,941,18,M,student,97229
941,942,45,F,librarian,78209


In [None]:
train.movies

Unnamed: 0,MovieID,Title,Genre
0,1,Toy Story (1995),"Animation, Children, Comedy"
1,2,GoldenEye (1995),"Action, Adventure, Thriller"
2,3,Four Rooms (1995),Thriller
3,4,Get Shorty (1995),"Action, Comedy, Drama"
4,5,Copycat (1995),"Crime, Drama, Thriller"
...,...,...,...
1677,1678,Mat' i syn (1997),Drama
1678,1679,B. Monkey (1998),"Romance, Thriller"
1679,1680,Sliding Doors (1998),"Drama, Romance"
1680,1681,You So Crazy (1994),Comedy


In [None]:
train.ratings

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


### Preprocessing of dataset

In [None]:
train.ratings = train.ratings.merge(train.movies, left_on='MovieID', right_on='MovieID')
train.movies = train.ratings[train.movies.columns]

train.ratings = train.ratings.merge(train.users, left_on='UserID', right_on='UserID')
train.users = train.ratings[train.users.columns]

train.y = train.ratings['Rating']

In [None]:
# Label Encode users
columns = ['UserID', 'Gender', 'Age', 'Occupation']
train.ratings[columns] = train.ratings[columns].apply(preprocessing.LabelEncoder().fit_transform)
train.users_emb_columns = train.users_emb_columns + columns

In [None]:
# Label Encode movies
columns = ['MovieID']
train.ratings[columns] = train.ratings[columns].apply(preprocessing.LabelEncoder().fit_transform)
train.movies_emb_columns = train.movies_emb_columns + columns

In [None]:
# One Hot Encode users
columns = ['Gender', 'Age', 'Occupation']
ohe = preprocessing.OneHotEncoder(categories='auto', sparse=False, dtype='uint8')
ohe.fit(train.ratings[columns])
train.ratings = pd.concat([train.ratings, pd.DataFrame(data=ohe.transform(train.ratings[columns]), columns=ohe.get_feature_names_out(columns))], axis=1)
train.users_ohe_columns = ohe.get_feature_names_out(columns)

assert train.ratings[train.users_ohe_columns].max().max()<=1, 'Error with ohe columns'



In [None]:
# One Hot Encode movies
genres = ["genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "FilmNoir", "Horror",
    "Musical", "Mystery", "Romance", "SciFi", "Thriller", "War", "Western"]

for genre in genres:
    genre = genre.replace('-', '')
    column = str(genre)
    train.ratings[column] = train.ratings['Genre'].apply(lambda x: 1 if genre in x else 0)
    train.movies_ohe_columns.append(column)

assert train.ratings[train.movies_ohe_columns].max().max()<=1, 'Error with ohe columns'

In [None]:
int_genres_gender = ""
for genre in train.movies_ohe_columns:
    int_genres_gender = int_genres_gender + '+' +genre + ':Gender'

int_genres_age = ""
for genre in train.movies_ohe_columns:
    int_genres_age = int_genres_age + '+' + genre + ':Age'

interact = patsy.dmatrix("0 + Gender:Age + Gender:Occupation + Age:Occupation"+int_genres_gender+int_genres_age, data=train.ratings.astype('object'), return_type='dataframe').astype('int8')
interact = interact.astype('uint8')
train.ratings = pd.concat([train.ratings, interact], axis=1)
train.interact_columns = interact.columns

In [None]:
interact.columns[interact.iloc[0] == 1]


Index(['Gender[1]:Age[4]', 'Gender[1]:Occupation[T.20]',
       'Age[T.4]:Occupation[T.20]', 'Comedy[T.1]:Gender[1]',
       'Comedy[T.1]:Age[T.4]'],
      dtype='object')

In [None]:
# Drop unused columns
train.movies.drop(['Title', 'Genre'], inplace=True, axis=1)
train.ratings.drop(['Title', 'Genre', 'Zipcode'], inplace=True, axis=1)

In [None]:
train.to_tensor()

### DataLoaders

In [None]:
# Split
train_size = int(0.8 * len(train))
test_size = len(train) - train_size
train_dataset, valid_dataset = torch.utils.data.random_split(train, [train_size, test_size])

# Create dataloaders
dataloaders = {}
dataloaders['train'] = torch.utils.data.DataLoader(train_dataset, batch_size=4096, shuffle=True)
dataloaders['valid'] = torch.utils.data.DataLoader(valid_dataset, batch_size=4096, shuffle=True)

### Define Pytorch models

In [None]:
class deepnwide(nn.Module):

    def __init__(self, users_emb, movies_emb, users_ohe, movies_ohe, interact, size_emb, y_range, dropout, linear_size= 500):
        super().__init__()

        self.name = 'deepnwide'
        self.y_range = y_range

        # wide part

        # deep
        self.emb_UserID = nn.Embedding(len(torch.unique(users_emb[:, 0])), size_emb)
        self.emb_UserID.weight.data.uniform_(-.01, .01)
        self.emb_Gender = nn.Embedding(len(torch.unique(users_emb[:, 1])), size_emb)
        self.emb_Gender.weight.data.uniform_(-.01, .01)
        self.emb_Age = nn.Embedding(len(torch.unique(users_emb[:, 2])), size_emb)
        self.emb_Age.weight.data.uniform_(-.01, .01)
        self.emb_Occupation = nn.Embedding(len(torch.unique(users_emb[:, 3])), size_emb)
        self.emb_Occupation.weight.data.uniform_(-.01, .01)
        self.emb_MovieID = nn.Embedding(len(torch.unique(movies_emb[:, 0])), size_emb)
        self.emb_MovieID.weight.data.uniform_(-.01, .01)

        # hidden layers
        self.h1 = nn.Linear(5 * size_emb, linear_size)
        self.h2 = nn.Linear(linear_size, linear_size)
        self.h3 = nn.Linear(linear_size, linear_size)

        # Dropout layers
        self.dropout1 = nn.Dropout(p=dropout)
        self.dropout2 = nn.Dropout(p=dropout)
        self.dropout3 = nn.Dropout(p=dropout)

        # final dense layer
        self.last_layer = nn.Linear((interact.shape[1]) + (movies_ohe.shape[1]) + (linear_size), 1)


    def forward(self, X):
        # Assign data
        user_emb = X[0]
        user_ohe = X[1]
        movie_emb = X[2]
        movie_ohe = X[3]
        interact = X[4]

        UserID = user_emb[:, 0]
        Gender = user_emb[:, 1]
        Age = user_emb[:, 2]
        Occupation = user_emb[:, 3]
        MovieID = movie_emb[:, 0]

        UserID = self.emb_UserID(UserID)
        Gender = self.emb_Gender(Gender)
        Age = self.emb_Age(Age)
        Occupation = self.emb_Occupation(Occupation)
        MovieID = self.emb_MovieID(MovieID)

        emb = torch.cat([UserID,
                         Age,
                         Gender,
                         Occupation,
                         MovieID],
                         dim=1)

        emb = F.relu(self.dropout1(self.h1(emb)))
        emb = F.relu(self.dropout2(self.h2(emb)))
        emb = F.relu(self.dropout3(self.h3(emb)))

        result = self.last_layer(torch.cat([interact.float(), movie_ohe.float(), emb.float()], dim=1))

        return (torch.sigmoid(result) * (self.y_range[1]-self.y_range[0]) + self.y_range[0]).squeeze()


model = deepnwide(train.users_emb, train.movies_emb, train.users_ohe, train.movies_ohe, train.interact, 60, train.y_range, 0.5, 100)
model.to(device)
print(model)

deepnwide(
  (emb_UserID): Embedding(943, 60)
  (emb_Gender): Embedding(2, 60)
  (emb_Age): Embedding(7, 60)
  (emb_Occupation): Embedding(21, 60)
  (emb_MovieID): Embedding(1682, 60)
  (h1): Linear(in_features=300, out_features=100, bias=True)
  (h2): Linear(in_features=100, out_features=100, bias=True)
  (h3): Linear(in_features=100, out_features=100, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (dropout2): Dropout(p=0.5, inplace=False)
  (dropout3): Dropout(p=0.5, inplace=False)
  (last_layer): Linear(in_features=445, out_features=1, bias=True)
)


In [None]:
class twoembeds(torch.nn.Module):

    def __init__(self, size_emb, y_range):
        super().__init__()

        # set name of model
        self.name = 'twoembeds'
        self.y_range = y_range

        # User and movie embeddings
        self.emb_UserID = nn.Embedding(train.nusers, size_emb)
        self.emb_MovieID = nn.Embedding(train.nmovies, size_emb)
        self.emb_UserID.weight.data.uniform_(-.01, .01)
        self.emb_MovieID.weight.data.uniform_(-.01, .01)

        # User and movie embeddings bía
        self.emb_UserID_b = nn.Embedding(train.nusers, 1)
        self.emb_MovieID_b = nn.Embedding(train.nmovies, 1)
        self.emb_UserID_b.weight.data.uniform_(-.01, .01)
        self.emb_MovieID_b.weight.data.uniform_(-.01, .01)


    def forward(self, X):
        user_emb = X[0]
        user_ohe = X[1]
        movie_emb = X[2]
        movie_ohe = X[3]

        UserID = user_emb[:, 0]
        MovieID = movie_emb[:, 0]

        user_emb = self.emb_UserID(UserID)
        movie_emb = self.emb_MovieID(MovieID)

        mult = (user_emb * movie_emb).sum(1)

        # add bias
        multb = mult + self.emb_UserID_b(UserID).squeeze() + self.emb_MovieID_b(MovieID).squeeze()

        multb = multb.float()

        return (torch.sigmoid(multb) * (self.y_range[1]-self.y_range[0]) + self.y_range[0]).squeeze()

        return multb


model = twoembeds(15, train.y_range)
model.to(device)
print(model)

twoembeds(
  (emb_UserID): Embedding(943, 15)
  (emb_MovieID): Embedding(1682, 15)
  (emb_UserID_b): Embedding(943, 1)
  (emb_MovieID_b): Embedding(1682, 1)
)


In [None]:
class ncf(torch.nn.Module):

    def __init__(self, users_emb, movies_emb, users_ohe, movies_ohe, interact, size_emb, dropout, linear_size, y_range):
        super().__init__()

        # set name of model
        self.name = 'ncf'
        self.y_range = y_range

        ### GMF part
        # user embeddings
        self.gmf_embuserid = nn.Embedding(len(torch.unique(users_emb[:, 0])), size_emb)
        self.gmf_embuserid.weight.data.uniform_(-.01, .01)
        self.gmf_embgender = nn.Embedding(len(torch.unique(users_emb[:, 1])), size_emb)
        self.gmf_embgender.weight.data.uniform_(-.01, .01)
        self.gmf_embage = nn.Embedding(len(torch.unique(users_emb[:, 2])), size_emb)
        self.gmf_embage.weight.data.uniform_(-.01, .01)
        self.gmf_embocc = nn.Embedding(len(torch.unique(users_emb[:, 3])), size_emb)
        self.gmf_embocc.weight.data.uniform_(-.01, .01)
        # movie embeddings
        self.gmf_embmovieid = nn.Embedding(len(torch.unique(movies_emb[:, 0])), size_emb*4-len(train.movies_ohe_columns))
        self.gmf_embmovieid.weight.data.uniform_(-.01, .01)


        ### MLP part
        # user embeddings
        self.mlp_embuserid = nn.Embedding(len(torch.unique(users_emb[:, 0])), size_emb)
        self.mlp_embuserid.weight.data.uniform_(-.01, .01)
        self.mlp_embgender = nn.Embedding(len(torch.unique(users_emb[:, 1])), size_emb)
        self.mlp_embgender.weight.data.uniform_(-.01, .01)
        self.mlp_embage = nn.Embedding(len(torch.unique(users_emb[:, 2])), size_emb)
        self.mlp_embage.weight.data.uniform_(-.01, .01)
        self.mlp_embocc = nn.Embedding(len(torch.unique(users_emb[:, 3])), size_emb)
        self.mlp_embocc.weight.data.uniform_(-.01, .01)
        # movie embeddings
        self.mlp_embmovieid = nn.Embedding(len(torch.unique(movies_emb[:, 0])), size_emb)
        self.mlp_embmovieid.weight.data.uniform_(-.01, .01)
        # hidden layers
        self.h1 = nn.Linear(5*size_emb+len(train.movies_ohe_columns), linear_size)
        self.h2 = nn.Linear(linear_size, int(linear_size/2))
        #self.h3 = nn.Linear(linear_size, linear_size)
        # Dropout layers
        self.dropout1 = nn.Dropout(p=dropout)
        self.dropout2 = nn.Dropout(p=dropout)
        #self.dropout3 = nn.Dropout(p=dropout)

        # final dense layer
        self.last_layer = nn.Linear(size_emb*4+int(linear_size/2), 1)

    def forward(self, X):
        user_emb = X[0]
        user_ohe = X[1]
        movie_emb = X[2]
        movie_ohe = X[3]

        UserID = user_emb[:, 0]
        Gender = user_emb[:, 1]
        Age = user_emb[:, 2]
        Occupation = user_emb[:, 3]
        MovieID = movie_emb[:, 0]

        # GMF part
        gmf_embuserid = self.gmf_embuserid(UserID)
        gmf_embgender = self.gmf_embgender(Gender)
        gmf_embage = self.gmf_embage(Age)
        gmf_embocc = self.gmf_embocc(Occupation)
        gmf_embmovieid = self.gmf_embmovieid(MovieID)

        gmf_user_vector = torch.cat([gmf_embuserid,
                                    gmf_embgender,
                                    gmf_embage,
                                    gmf_embocc],
                                    dim=1)

        gmf_movie_vector = torch.cat([gmf_embmovieid, movie_ohe], 1)

        gmf_vector = (gmf_user_vector * gmf_movie_vector)


        # MLP part
        mlp_embuserid = self.mlp_embuserid(UserID)
        mlp_embgender = self.mlp_embgender(Gender)
        mlp_embage = self.mlp_embage(Age)
        mlp_embocc = self.mlp_embocc(Occupation)
        mlp_movieid = self.mlp_embmovieid(MovieID)

        mlp_vector = torch.cat([mlp_embuserid,
                                mlp_embgender,
                                mlp_embage,
                                mlp_embocc,
                                mlp_movieid,
                                movie_ohe],
                                dim=1)
        mlp_vector = F.relu(self.dropout1(self.h1(mlp_vector)))
        mlp_vector = F.relu(self.dropout2(self.h2(mlp_vector)))
        #mlp_vector = F.relu(self.dropout3(self.h3(mlp_vector)))

        # Fusion
        result = torch.cat([gmf_vector, mlp_vector], dim=1)
        result = self.last_layer(result)

        #return (torch.sigmoid(result) * (5-1) + 1).squeeze
        return (torch.sigmoid(result) * (self.y_range[1]-self.y_range[0]) + self.y_range[0]).squeeze()


model = ncf(train.users_emb, train.movies_emb, train.users_ohe, train.movies_ohe, train.interact, 60, 0.5, 200, train.y_range)
model.to(device)
print(model)

ncf(
  (gmf_embuserid): Embedding(943, 60)
  (gmf_embgender): Embedding(2, 60)
  (gmf_embage): Embedding(7, 60)
  (gmf_embocc): Embedding(21, 60)
  (gmf_embmovieid): Embedding(1682, 221)
  (mlp_embuserid): Embedding(943, 60)
  (mlp_embgender): Embedding(2, 60)
  (mlp_embage): Embedding(7, 60)
  (mlp_embocc): Embedding(21, 60)
  (mlp_embmovieid): Embedding(1682, 60)
  (h1): Linear(in_features=319, out_features=200, bias=True)
  (h2): Linear(in_features=200, out_features=100, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (dropout2): Dropout(p=0.5, inplace=False)
  (last_layer): Linear(in_features=340, out_features=1, bias=True)
)


### Skorch callbacks

In [None]:
# Earlystopping callback
earlystopping = EarlyStopping(monitor='valid_loss', patience=10, threshold=0.001)

In [None]:
# RMSE callback
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

def rmseloss(y_true, y_pred):
    #return f1_score(y_true, y_pred)
    return np.sqrt(mean_squared_error(y_true, y_pred))

def precision(y_true, y_pred):
    y_true_binary = (y_true >= 4).astype(int)
    y_pred_binary = (y_pred >= 4).astype(int)

    # Calculate Precision and Recall
    precision_scoree = precision_score(y_true_binary, y_pred_binary)
    return precision_scoree

def recall(y_true, y_pred):
    y_true_binary = (y_true >= 4).astype(int)
    y_pred_binary = (y_pred >= 4).astype(int)

    # Calculate Precision and Recall
    recall_scoree = recall_score(y_true_binary, y_pred_binary)
    return recall_scoree

def f1(y_true, y_pred):
    y_true_binary = (y_true >= 4).astype(int)
    y_pred_binary = (y_pred >= 4).astype(int)

    # Calculate F1 score
    f1_scoree = f1_score(y_true_binary, y_pred_binary)
    return f1_scoree

rmse_scorer = make_scorer(rmseloss)
precision_scorer = make_scorer(precision)
recall_scorer = make_scorer(recall)
f1_scorer = make_scorer(f1)

epoch_rmse = EpochScoring(rmse_scorer, name='rmse_score', lower_is_better=True)
epoch_precision = EpochScoring(precision_scorer, name='precision', lower_is_better= False)
epoch_recall = EpochScoring(recall_scorer, name='recall', lower_is_better= False)
epoch_f1 = EpochScoring(f1_scorer, name='f1', lower_is_better= False)


In [None]:
# Checkpoint callback
checkpoint = Checkpoint(monitor='rmse_score_best', f_params='params.pt', f_optimizer='optimizer.pt', f_history='history.json', f_pickle='model')

In [None]:
# Learning rate scheduler callback
lr_scheduler = LRScheduler(policy="StepLR", step_size=7, gamma=0.1)

### Neural Collaborative Filtering

#### Manually specify hyperparamers

In [None]:
#86.57
ncfnet = NeuralNet(
    ncf,
    module__users_emb=train.users_emb,
    module__movies_emb=train.movies_emb,
    module__users_ohe=train.users_ohe,
    module__movies_ohe=train.movies_ohe,
    module__interact=train.interact,
    module__size_emb=120,
    module__dropout=0.3,
    module__linear_size=400,
    module__y_range=train.y_range,#### Manually specify hyperparamers
    max_epochs=50,
    lr=0.01,
    optimizer=torch.optim.Adam,
    criterion=torch.nn.MSELoss,
    device=device,
    iterator_train__batch_size=1024,
    iterator_train__num_workers=0,
    iterator_train__shuffle=True,
    iterator_valid__batch_size=4096,
    train_split=predefined_split(valid_dataset),
    callbacks=[
               earlystopping,
               epoch_rmse,
               epoch_precision,
               epoch_recall,
               epoch_f1,
               checkpoint,
               lr_scheduler,

               ]
)

In [None]:
ncfnet.fit(train_dataset)

  epoch      f1    precision    recall    rmse_score    train_loss    valid_loss    cp      lr     dur
-------  ------  -----------  --------  ------------  ------------  ------------  ----  ------  ------
      1  [36m0.4456[0m       [32m0.8579[0m    [35m0.3010[0m        [31m0.9324[0m        [94m1.0012[0m        [36m0.8695[0m     +  0.0100  3.6711
      2  [36m0.5758[0m       0.7997    [35m0.4498[0m        0.9671        [94m0.6406[0m        0.9354        0.0100  2.9789
      3  0.5740       0.7998    0.4477        0.9821        [94m0.2568[0m        0.9646        0.0100  2.7147
      4  [36m0.6083[0m       0.7849    [35m0.4966[0m        0.9922        [94m0.1416[0m        0.9846        0.0100  2.7281
      5  0.5872       0.7791    0.4711        1.0081        [94m0.1021[0m        1.0162        0.0100  3.7918
      6  [36m0.6089[0m       0.7735    [35m0.5021[0m        1.0133        [94m0.0865[0m        1.0268        0.0100  2.7138
      7  0.6048       0

<class 'skorch.net.NeuralNet'>[initialized](
  module_=ncf(
    (gmf_embuserid): Embedding(943, 120)
    (gmf_embgender): Embedding(2, 120)
    (gmf_embage): Embedding(7, 120)
    (gmf_embocc): Embedding(21, 120)
    (gmf_embmovieid): Embedding(1682, 461)
    (mlp_embuserid): Embedding(943, 120)
    (mlp_embgender): Embedding(2, 120)
    (mlp_embage): Embedding(7, 120)
    (mlp_embocc): Embedding(21, 120)
    (mlp_embmovieid): Embedding(1682, 120)
    (h1): Linear(in_features=619, out_features=400, bias=True)
    (h2): Linear(in_features=400, out_features=200, bias=True)
    (dropout1): Dropout(p=0.3, inplace=False)
    (dropout2): Dropout(p=0.3, inplace=False)
    (last_layer): Linear(in_features=680, out_features=1, bias=True)
  ),
)

#### GridSearchCV

In [None]:
params = {
    'lr': [0.01],
    'module__size_emb': [120],
    'module__dropout': [0.3],
    'module__linear_size': [400]
}
gs = GridSearchCV(ncfnet,
                  params,
                  verbose=50,
                  refit=True,
                  pre_dispatch=2,
                  n_jobs=1,
                  cv=5,
                  scoring='neg_mean_squared_error')

X_ds = SliceDataset(train, idx=0)
y_ds = SliceDataset(train, idx=1)
gs.fit(X_ds, y_ds)

print(gs.best_score_, gs.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START lr=0.01, module__dropout=0.3, module__linear_size=400, module__size_emb=120
  epoch      f1    precision    recall    rmse_score    train_loss    valid_loss    cp      lr     dur
-------  ------  -----------  --------  ------------  ------------  ------------  ----  ------  ------
      1  [36m0.5496[0m       [32m0.8684[0m    [35m0.4020[0m        [31m0.8891[0m        [94m0.9943[0m        [36m0.7905[0m     +  0.0100  5.8627
      2  [36m0.7129[0m       [32m0.9620[0m    [35m0.5662[0m        [31m0.6510[0m        [94m0.6327[0m        [36m0.4238[0m     +  0.0100  4.8760
      3  [36m0.7997[0m       [32m0.9718[0m    [35m0.6794[0m        [31m0.5762[0m        [94m0.2678[0m        [36m0.3320[0m     +  0.0100  5.4818
      4  [36m0.8044[0m       [32m0.9770[0m    [35m0.6837[0m        [31m0.5479[0m        [94m0.1509[0m        [36m0.3002[0m     +  0.0100  6.7898
      5  0.

In [None]:
best_model = gs.best_estimator_
best_model.fit(train_dataset)

Re-initializing module because the following parameters were re-set: dropout, interact, linear_size, movies_emb, movies_ohe, size_emb, users_emb, users_ohe, y_range.
Re-initializing criterion.
Re-initializing optimizer.
  epoch      f1    precision    recall    rmse_score    train_loss    valid_loss    cp      lr     dur
-------  ------  -----------  --------  ------------  ------------  ------------  ----  ------  ------
      1  [36m0.4265[0m       [32m0.8679[0m    [35m0.2827[0m        [31m0.9295[0m        [94m0.9991[0m        [36m0.8639[0m     +  0.0100  3.8039
      2  [36m0.6672[0m       0.7917    [35m0.5765[0m        0.9653        [94m0.6396[0m        0.9319        0.0100  3.0519
      3  0.6050       0.7948    0.4884        0.9748        [94m0.2639[0m        0.9502        0.0100  2.9263
      4  0.6203       0.7937    0.5091        0.9864        [94m0.1437[0m        0.9730        0.0100  2.7757
      5  0.6333       0.7850    0.5308        0.9986        [9

<class 'skorch.net.NeuralNet'>[initialized](
  module_=ncf(
    (gmf_embuserid): Embedding(943, 120)
    (gmf_embgender): Embedding(2, 120)
    (gmf_embage): Embedding(7, 120)
    (gmf_embocc): Embedding(21, 120)
    (gmf_embmovieid): Embedding(1682, 461)
    (mlp_embuserid): Embedding(943, 120)
    (mlp_embgender): Embedding(2, 120)
    (mlp_embage): Embedding(7, 120)
    (mlp_embocc): Embedding(21, 120)
    (mlp_embmovieid): Embedding(1682, 120)
    (h1): Linear(in_features=619, out_features=400, bias=True)
    (h2): Linear(in_features=400, out_features=200, bias=True)
    (dropout1): Dropout(p=0.3, inplace=False)
    (dropout2): Dropout(p=0.3, inplace=False)
    (last_layer): Linear(in_features=680, out_features=1, bias=True)
  ),
)

In [None]:
#rating_predict = best_model.predict(valid_dataset)
rating_predict = ncfnet.predict(valid_dataset)

In [None]:
valid_users = valid_dataset[:][0][0][:,0].numpy()
valid_movie = valid_dataset[:][0][0][:,3].numpy()
valid_rating = valid_dataset[:][1].numpy()

In [None]:
dct= {}
for i in range(20000):
    if valid_users[i] not in dct.keys():
        dct[valid_users[i]]  = [(i, valid_movie[i], rating_predict[i], valid_rating[i])]
    else:
        dct[valid_users[i]].append((i, valid_movie[i], rating_predict[i], valid_rating[i]))

In [None]:
#calculate HR@10
from sklearn.metrics import ndcg_score
import math
cnt_user = 0
sum = 0
ndcg = 0
for key, value in dct.items():
    if len(value) >= 10:
        cnt_user += 1
        cnt_movie = 0
        value_rank_by_pred = sorted(value, reverse = True , key=lambda x: x[2]) # sort by pred
        value_rank_by_label = sorted(value, reverse = True , key=lambda x: x[3]) # sort by label
        ranked_label = [[item[3] for item in value_rank_by_label]]
        ranked_pred = [[item[3] for item in value_rank_by_pred]]
        dcg = 0
        idcg = 0
        for j in range(10):
            if value_rank_by_pred[j][3] == 5:
                sum += 1
                break
        for j in range(10):
            dcg += (2**ranked_pred[0][j]-1)/(math.log2(j+2))
            idcg += (2**ranked_label[0][j]-1)/(math.log2(j+2))
        ndcg += dcg/idcg
        #ndcg += ndcg_score(ranked_label, ranked_pred, k=10)
        #ndcg += ndcg_at_k(ranked_pred, ranked_label, 10)
hr = sum/cnt_user
ndcg = ndcg/cnt_user
print(hr, ndcg)

0.9092495636998255 0.7560052037481741


In [None]:
torch.save(best_model.module_.state_dict(), 'best_model.pth')

### Deep and Wide

#### Manually specify hyperparamers

In [None]:
deepnwidenet = NeuralNet(
    deepnwide,
    module__users_emb=train.users_emb,
    module__movies_emb=train.movies_emb,
    module__users_ohe=train.users_ohe,
    module__movies_ohe=train.movies_ohe,
    #### Manually specify hyperparamers e=train.movies_ohe,
    module__interact=train.interact,
    module__size_emb=120,
    module__y_range=train.y_range,
    module__dropout=0.2,
    max_epochs=30,
    lr=0.01,
    optimizer=torch.optim.Adam,
    criterion=torch.nn.MSELoss,
    device=device,
    iterator_train__batch_size=1024,
    iterator_train__num_workers=0,
    iterator_train__shuffle=True,
    iterator_valid__batch_size=4096,
    train_split=predefined_split(valid_dataset),
    callbacks=[
               earlystopping,
               epoch_rmse,
               epoch_precision,
               epoch_recall,
               epoch_f1,
               #checkpoint,
               lr_scheduler,
               #TensorBoard(writer),
               #progressbar
               ]
)

In [None]:
deepnwidenet.fit(train_dataset)

  epoch      f1    precision    recall    rmse_score    train_loss    valid_loss      lr     dur
-------  ------  -----------  --------  ------------  ------------  ------------  ------  ------
      1  [36m0.4080[0m       [32m0.8612[0m    [35m0.2673[0m        [31m0.9489[0m        [94m0.9904[0m        [36m0.9005[0m  0.0100  2.9244
      2  0.3391       [32m0.8828[0m    0.2099        [31m0.9444[0m        [94m0.8754[0m        [36m0.8919[0m  0.0100  2.7907
      3  [36m0.5046[0m       0.8392    [35m0.3608[0m        [31m0.9349[0m        [94m0.8563[0m        [36m0.8741[0m  0.0100  3.0387
      4  [36m0.5105[0m       0.8395    [35m0.3668[0m        0.9383        [94m0.8406[0m        0.8804  0.0100  3.3135
      5  0.4705       0.8484    0.3255        [31m0.9342[0m        [94m0.8244[0m        [36m0.8728[0m  0.0100  3.2297
      6  0.5033       0.8464    0.3582        [31m0.9335[0m        [94m0.8125[0m        [36m0.8713[0m  0.0100  2.9621
      7 

<class 'skorch.net.NeuralNet'>[initialized](
  module_=deepnwide(
    (emb_UserID): Embedding(943, 120)
    (emb_Gender): Embedding(2, 120)
    (emb_Age): Embedding(7, 120)
    (emb_Occupation): Embedding(21, 120)
    (emb_MovieID): Embedding(1682, 120)
    (h1): Linear(in_features=600, out_features=500, bias=True)
    (h2): Linear(in_features=500, out_features=500, bias=True)
    (h3): Linear(in_features=500, out_features=500, bias=True)
    (dropout1): Dropout(p=0.2, inplace=False)
    (dropout2): Dropout(p=0.2, inplace=False)
    (dropout3): Dropout(p=0.2, inplace=False)
    (last_layer): Linear(in_features=845, out_features=1, bias=True)
  ),
)

#### GridsearchCV

In [None]:
# params = {
#     'lr': [0.001, 0.01],
#     'module__size_emb': [30, 60, 120],
#     'module__dropout': [0.5],
#     'module__linear_size': [400, 500, 600]
# }
params = {
    'lr': [0.01],
    'module__size_emb': [60, 120],
    'module__dropout': [0.3],
    'module__linear_size': [500, 1000, 1500]
}
gs = GridSearchCV(deepnwidenet,
                  params,
                  verbose=50,
                  refit=True,
                  #pre_dispatch=8,
                  n_jobs=1,
                  cv=3,
                  scoring='neg_mean_squared_error')

X_ds = SliceDataset(train, idx=0)
y_ds = SliceDataset(train, idx=1)
gs.fit(X_ds, y_ds)

print(gs.best_score_, gs.best_params_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3; 1/6] START lr=0.01, module__dropout=0.3, module__linear_size=500, module__size_emb=60
  epoch      f1    precision    recall    rmse_score    train_loss    valid_loss      lr     dur
-------  ------  -----------  --------  ------------  ------------  ------------  ------  ------
      1  [36m0.3490[0m       [32m0.8462[0m    [35m0.2198[0m        [31m0.9818[0m        [94m1.0217[0m        [36m0.9640[0m  0.0100  5.5711
      2  [36m0.5170[0m       0.8149    [35m0.3786[0m        [31m0.9667[0m        [94m0.8876[0m        [36m0.9345[0m  0.0100  4.9588
      3  0.3983       0.8361    0.2614        [31m0.9621[0m        [94m0.8652[0m        [36m0.9257[0m  0.0100  4.4296
      4  0.3291       [32m0.8498[0m    0.2041        0.9646        [94m0.8502[0m        0.9305  0.0100  3.9035
      5  0.3477       0.8433    0.2190        0.9743        [94m0.8441[0m        0.9492  0.0100  3.7193
      6  0.45

In [None]:
best_model = gs.best_estimator_
best_model.fit(train_dataset)

Re-initializing module because the following parameters were re-set: dropout, interact, linear_size, movies_emb, movies_ohe, size_emb, users_emb, users_ohe, y_range.
Re-initializing criterion.
Re-initializing optimizer.
  epoch      f1    precision    recall    rmse_score    train_loss    valid_loss      lr     dur
-------  ------  -----------  --------  ------------  ------------  ------------  ------  ------
      1  [36m0.5195[0m       [32m0.8302[0m    [35m0.3780[0m        [31m0.9388[0m        [94m1.0714[0m        [36m0.8814[0m  0.0100  4.1835
      2  [36m0.6028[0m       0.8075    [35m0.4809[0m        0.9449        [94m0.8874[0m        0.8929  0.0100  4.2092
      3  0.5556       0.8149    0.4215        [31m0.9366[0m        [94m0.8700[0m        [36m0.8773[0m  0.0100  3.6634
      4  0.4168       [32m0.8626[0m    0.2748        [31m0.9334[0m        [94m0.8507[0m        [36m0.8712[0m  0.0100  3.8046
      5  0.5149       0.8460    0.3701        [31m0.9

<class 'skorch.net.NeuralNet'>[initialized](
  module_=deepnwide(
    (emb_UserID): Embedding(943, 120)
    (emb_Gender): Embedding(2, 120)
    (emb_Age): Embedding(7, 120)
    (emb_Occupation): Embedding(21, 120)
    (emb_MovieID): Embedding(1682, 120)
    (h1): Linear(in_features=600, out_features=1000, bias=True)
    (h2): Linear(in_features=1000, out_features=1000, bias=True)
    (h3): Linear(in_features=1000, out_features=1000, bias=True)
    (dropout1): Dropout(p=0.3, inplace=False)
    (dropout2): Dropout(p=0.3, inplace=False)
    (dropout3): Dropout(p=0.3, inplace=False)
    (last_layer): Linear(in_features=1345, out_features=1, bias=True)
  ),
)

In [None]:
#rating_predict = best_model.predict(valid_dataset)
rating_predict = deepnwidenet.predict(valid_dataset)

In [40]:
valid_users = valid_dataset[:][0][0][:,0].numpy()
valid_movie = valid_dataset[:][0][0][:,3].numpy()
valid_rating = valid_dataset[:][1].numpy()

In [None]:
valid_rating

In [41]:
dct= {}
for i in range(20000):
    if valid_users[i] not in dct.keys():
        dct[valid_users[i]]  = [(i, valid_movie[i], rating_predict[i], valid_rating[i])]
    else:
        dct[valid_users[i]].append((i, valid_movie[i], rating_predict[i], valid_rating[i]))

In [None]:
dct[725][1]

In [42]:
#calculate HR@10
from sklearn.metrics import ndcg_score
import math
cnt_user = 0
sum = 0
ndcg = 0
for key, value in dct.items():
    if len(value) >= 10:
        cnt_user += 1
        cnt_movie = 0
        value_rank_by_pred = sorted(value, reverse = True , key=lambda x: x[2]) # sort by pred
        value_rank_by_label = sorted(value, reverse = True , key=lambda x: x[3]) # sort by label
        ranked_label = [[item[3] for item in value_rank_by_label]]
        ranked_pred = [[item[3] for item in value_rank_by_pred]]
        dcg = 0
        idcg = 0
        for j in range(10):
            if value_rank_by_pred[j][3] == 5:
                sum += 1
                break
        for j in range(10):
            dcg += (2**ranked_pred[0][j]-1)/(math.log2(j+2))
            idcg += (2**ranked_label[0][j]-1)/(math.log2(j+2))
        ndcg += dcg/idcg
        #ndcg += ndcg_score(ranked_label, ranked_pred, k=10)
        #ndcg += ndcg_at_k(ranked_pred, ranked_label, 10)
hr = sum/cnt_user
ndcg = ndcg/cnt_user
print(hr, ndcg)

0.9144851657940664 0.7691176541141781


### Two embeddings - Basic matrix factorization

#### Manually specify hyperparamers

In [43]:
twoembedsnet = NeuralNet(
    twoembeds,
    module__size_emb=500,
    module__y_range=train.y_range,
    max_epochs=30,
    lr=0.01,
    optimizer=torch.optim.Adam,
    criterion=torch.nn.MSELoss,
    device=device,
    iterator_train__batch_size=4096,
    iterator_train__num_workers=0,
    iterator_train__shuffle=True,
    iterator_valid__batch_size=4096,
    train_split=predefined_split(valid_dataset),
    callbacks=[earlystopping,
               epoch_rmse,
               epoch_precision,
               epoch_recall,
               epoch_f1,
               #checkpoint,
               lr_scheduler]
)

In [44]:
twoembedsnet.fit(train_dataset)

  epoch      f1    precision    recall    rmse_score    train_loss    valid_loss      lr     dur
-------  ------  -----------  --------  ------------  ------------  ------------  ------  ------
      1  [36m0.6073[0m       [32m0.7948[0m    [35m0.4914[0m        [31m0.9753[0m        [94m1.2419[0m        [36m0.9512[0m  0.0100  4.1760
      2  0.4658       [32m0.8719[0m    0.3178        [31m0.9309[0m        [94m0.6232[0m        [36m0.8665[0m  0.0100  3.1107
      3  0.5314       0.8394    0.3887        0.9445        [94m0.2534[0m        0.8921  0.0100  3.1655
      4  0.5495       0.8286    0.4111        0.9556        [94m0.1141[0m        0.9131  0.0100  3.1317
      5  0.5524       0.8302    0.4139        0.9595        [94m0.0713[0m        0.9206  0.0100  3.9620
      6  0.5625       0.8242    0.4269        0.9623        [94m0.0561[0m        0.9261  0.0100  3.2080
      7  0.5644       0.8228    0.4295        0.9674        [94m0.0527[0m        0.9358  0.0100 

<class 'skorch.net.NeuralNet'>[initialized](
  module_=twoembeds(
    (emb_UserID): Embedding(943, 500)
    (emb_MovieID): Embedding(1682, 500)
    (emb_UserID_b): Embedding(943, 1)
    (emb_MovieID_b): Embedding(1682, 1)
  ),
)

In [None]:
torch.save(twoembedsnet.module_.state_dict(), 'GMF.pth')

#### GridSearchCV

In [None]:
params = {
    'lr': [0.01],
    'module__size_emb': [500]
}
gs = GridSearchCV(twoembedsnet,
                  params,
                  verbose=50,
                  refit=True,
                  #pre_dispatch=8,
                  #n_jobs=8,
                  cv=3,
                  scoring='neg_mean_squared_error')

X_ds = SliceDataset(train, idx=0)
y_ds = SliceDataset(train, idx=1)
gs.fit(X_ds, y_ds)
print(gs.best_score_, gs.best_params_)





Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3; 1/1] START lr=0.01, module__size_emb=500...............................
  epoch      f1    precision    recall    rmse_score    train_loss    valid_loss      lr     dur
-------  ------  -----------  --------  ------------  ------------  ------------  ------  ------
      1  [36m0.5349[0m       [32m0.8186[0m    [35m0.3972[0m        [31m0.9922[0m        [94m1.2497[0m        [36m0.9846[0m  0.0100  3.9989
      2  0.4981       [32m0.9545[0m    0.3370        [31m0.8469[0m        [94m0.6804[0m        [36m0.7173[0m  0.0100  4.1132
      3  [36m0.6667[0m       [32m0.9865[0m    [35m0.5035[0m        [31m0.7497[0m        [94m0.3272[0m        [36m0.5621[0m  0.0100  4.7773
      4  [36m0.7077[0m       [32m0.9964[0m    [35m0.5488[0m        [31m0.7136[0m        [94m0.1508[0m        [36m0.5092[0m  0.0100  4.2700
      5  0.7063       [32m0.9978[0m    0.5466        [31m0.6989[0m        [

In [None]:
best_model = gs.best_estimator_
best_model.fit(train_dataset)

Re-initializing module because the following parameters were re-set: size_emb, y_range.
Re-initializing criterion.
Re-initializing optimizer.
  epoch      f1    precision    recall    rmse_score    train_loss    valid_loss      lr     dur
-------  ------  -----------  --------  ------------  ------------  ------------  ------  ------
      1  [36m0.6109[0m       [32m0.7961[0m    [35m0.4956[0m        [31m0.9753[0m        [94m1.2464[0m        [36m0.9513[0m  0.0100  3.8133
      2  0.4767       [32m0.8682[0m    0.3285        [31m0.9236[0m        [94m0.6205[0m        [36m0.8530[0m  0.0100  3.5169
      3  0.5336       0.8474    0.3894        0.9374        [94m0.2516[0m        0.8787  0.0100  3.5038
      4  0.5525       0.8289    0.4144        0.9485        [94m0.1132[0m        0.8996  0.0100  4.3620
      5  0.5533       0.8289    0.4153        0.9534        [94m0.0728[0m        0.9090  0.0100  3.5139
      6  0.5630       0.8238    0.4276        0.9551        [

<class 'skorch.net.NeuralNet'>[initialized](
  module_=twoembeds(
    (emb_UserID): Embedding(943, 500)
    (emb_MovieID): Embedding(1682, 500)
    (emb_UserID_b): Embedding(943, 1)
    (emb_MovieID_b): Embedding(1682, 1)
  ),
)

In [45]:
#rating_predict = best_model.predict(valid_dataset)
rating_predict = twoembedsnet.predict(valid_dataset)

In [46]:
valid_users = valid_dataset[:][0][0][:,0].numpy()
valid_movie = valid_dataset[:][0][0][:,3].numpy()
valid_rating = valid_dataset[:][1].numpy()

In [47]:
dct= {}
for i in range(20000):
    if valid_users[i] not in dct.keys():
        dct[valid_users[i]]  = [(i, valid_movie[i], rating_predict[i], valid_rating[i])]
    else:
        dct[valid_users[i]].append((i, valid_movie[i], rating_predict[i], valid_rating[i]))

In [48]:
#calculate HR@10
from sklearn.metrics import ndcg_score
import math
cnt_user = 0
sum = 0
ndcg = 0
for key, value in dct.items():
    if len(value) >= 10:
        cnt_user += 1
        cnt_movie = 0
        value_rank_by_pred = sorted(value, reverse = True , key=lambda x: x[2]) # sort by pred
        value_rank_by_label = sorted(value, reverse = True , key=lambda x: x[3]) # sort by label
        ranked_label = [[item[3] for item in value_rank_by_label]]
        ranked_pred = [[item[3] for item in value_rank_by_pred]]
        dcg = 0
        idcg = 0
        for j in range(10):
            if value_rank_by_pred[j][3] == 5:
                sum += 1
                break
        for j in range(10):
            dcg += (2**ranked_pred[0][j]-1)/(math.log2(j+2))
            idcg += (2**ranked_label[0][j]-1)/(math.log2(j+2))
        ndcg += dcg/idcg
        #ndcg += ndcg_score(ranked_label, ranked_pred, k=10)
        #ndcg += ndcg_at_k(ranked_pred, ranked_label, 10)
hr = sum/cnt_user
ndcg = ndcg/cnt_user
print(hr, ndcg, cnt_user)

0.9057591623036649 0.7721473278004224 573


### Benchmark with scikit-surprise SVD algorithm

In [None]:
!pip install surprise

In [None]:
from surprise import NormalPredictor
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split, KFold

In [None]:
train[dataloaders['train'].dataset.indices][1]

In [None]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')

# Use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

#Predict

In [None]:
movies = pd.read_csv('/content/ml-100k/u.item', sep='|', names=['MovieID', 'Title', 'date', 'video_rl_date', 'link']+genre_cols, engine='python', encoding='latin-1')
users = pd.read_csv('/content/ml-100k/u.user', sep='|', names=['UserID', 'Age', 'Gender', 'Occupation', 'Zipcode'], engine='python', encoding='latin-1')
ratings = pd.read_csv('/content/ml-100k/u.data', sep='\t', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python', encoding='latin-1')


In [None]:
ratings

In [None]:
filtered_rating = ratings[(ratings['UserID'] == 0) & (ratings['MovieID'] == 168)]['Rating'].values[0]
filtered_rating

In [None]:
bins = [0, 18, 25, 35, 45, 50, 56, 100]
labels = [1, 18, 25, 35, 45, 50, 56]
users['Age'] = pd.cut(users['Age'], bins=bins, labels=labels, right=False)

In [None]:
movies

In [None]:
# Label Encode users
columns = ['UserID', 'Gender', 'Age', 'Occupation']
users[columns] = users[columns].apply(preprocessing.LabelEncoder().fit_transform)
#users_emb_columns = train.users_emb_columns + columns
users = users.drop(['Zipcode'], axis = 1)

In [None]:
# One Hot Encode users
users_ohe_columns = []
columns = ['Gender', 'Age', 'Occupation']
ohe = preprocessing.OneHotEncoder(categories='auto', sparse=False, dtype='uint8')
ohe.fit(train.ratings[columns])
users = pd.concat([users, pd.DataFrame(data=ohe.transform(train.ratings[columns]), columns=ohe.get_feature_names_out(columns))], axis=1)
users_ohe_columns = ohe.get_feature_names_out(columns)


In [None]:
# Label Encode movies
columns = ['MovieID']
movies[columns] = movies[columns].apply(preprocessing.LabelEncoder().fit_transform)

In [None]:
df2 = movies[genre_cols]
df2['Genre'] = df2.apply(lambda row: ', '.join(row.index[row == 1]), axis=1)
movies['Genre'] = df2['Genre']
movies = movies.drop(genre_cols, axis = 1)

In [None]:
# One Hot Encode movies
movies_ohe_columns = []
genres = ["genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "FilmNoir", "Horror",
    "Musical", "Mystery", "Romance", "SciFi", "Thriller", "War", "Western"]

for genre in genres:
    genre = genre.replace('-', '')
    column = str(genre)
    movies[column] = movies['Genre'].apply(lambda x: 1 if genre in x else 0)
    movies_ohe_columns.append(column)



In [None]:
users_emb = torch.from_numpy(users[['UserID', 'Gender', 'Age', 'Occupation']].values)

In [None]:
movies_emb = torch.from_numpy(movies['MovieID'].values)


In [None]:
movies_emb[0].unsqueeze(0).unsqueeze(0)[:, 0]

In [None]:
movies_ohe = torch.tensor(movies[movies_ohe_columns].values, dtype=torch.float)

In [None]:
movies_ohe[0]

In [None]:
users_emb[2].long()

In [None]:
users_ohe = torch.tensor(users[users_ohe_columns].values, dtype=torch.float)


In [None]:
def predict(i, model):
    lst_movie = []
    for j in range(len(movies)):
        X = (users_emb[i].long().unsqueeze(0), users_ohe[i].unsqueeze(0), movies_emb[j].long().unsqueeze(0).unsqueeze(0), movies_ohe[j].unsqueeze(0))

        y = model(X)
        if y > 4:
            lst_movie.append((movies_emb[j].long(), y.item()))
        lst_movie = sorted(lst_movie, key=lambda x: x[1], reverse = True)
    return lst_movie[:10]

In [None]:

model = ncf(train.users_emb, train.movies_emb, train.users_ohe, train.movies_ohe, train.interact, 120, 0.5, 400, train.y_range)
model.to(device)
model.load_state_dict(torch.load('/content/drive/MyDrive/best_model.pth'))

In [None]:
dictt = torch.load('/content/params.pt')


In [None]:
#input is a user, return top 5 movies
predict(0, model)

In [None]:
train_dataset[507][1]

In [None]:
cnt = 0
for i in range(943):
    predict_ = predict(i, model)
    print(predict_)
    for j in range(len(predict_)):
        if (ratings[(ratings['UserID'] == i) & (ratings['MovieID'] == predict_[j][0].item())]['Rating'].values[0] == 5):
            cnt+= 1
cnt

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
print(cnt/9430)