# Import

In [29]:
pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [30]:
from utils.dataprep_utils import MovieDataset,neg_sampling
from utils.models_utils import ImplicitDLCRS,ExplicitDLCRS

from os import listdir
from os.path import isfile, join

from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import pandas as pd
import numpy as np

from sklearn import model_selection, metrics, preprocessing
import matplotlib.pyplot as plt 
import implicit

# Data preparation

In [31]:
# The ratings file is tab-separated and has no header row, so the column
# names are supplied explicitly.
column_names = ['userId','movieId','rating','timestamp']
exp_df = pd.read_csv(
    './archive/ml-100k/u.data',
    sep='\t',
    header=None,
    names=column_names,
)
exp_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [32]:
# Derive imp_df from exp_df with the project's neg_sampling helper, then expose
# its "interact" column under the name "rating" so both frames share a schema.
imp_df = neg_sampling(exp_df, percent_print=50).rename(columns={"interact": "rating"})
imp_df.head()

(943, 1682)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nsamples['interact'] = nsamples.apply(lambda row: 1, axis=1)


processed ... 0.00% ...0.00secs
processed ... 49.95% ...0.07secs
processed ... 99.89% ...0.06secs


  nsamples=nsamples.append(pd.DataFrame(nTempData, columns=["userId","movieId", "interact"]),ignore_index=True)


Unnamed: 0,userId,movieId,rating
0,195,241,1
1,185,301,1
2,21,376,1
3,243,50,1
4,165,345,1


In [33]:
# Re-index user and movie ids to contiguous integers starting at 0 so they can
# be used as embedding indices without running out of bounds.
lbl_user = preprocessing.LabelEncoder()
lbl_movie = preprocessing.LabelEncoder()

# The same two encoders are refit on each frame in turn, so after this loop
# their classes_ describe imp_df (the last frame processed).
for ratings_frame in (exp_df, imp_df):
    ratings_frame["userId"] = lbl_user.fit_transform(ratings_frame["userId"].values)
    ratings_frame["movieId"] = lbl_movie.fit_transform(ratings_frame["movieId"].values)


def _split_80_20(ratings_frame):
    """Return a (train, valid) 80/20 split stratified on the rating column."""
    return model_selection.train_test_split(
        ratings_frame,
        test_size=0.2,
        random_state=42,
        stratify=ratings_frame["rating"].values,
    )


def _to_movie_dataset(ratings_frame):
    """Wrap the userId / movieId / rating columns of a frame in a MovieDataset."""
    return MovieDataset(
        users=ratings_frame["userId"].values,
        movies=ratings_frame["movieId"].values,
        ratings=ratings_frame["rating"].values,
    )


exp_train, exp_valid = _split_80_20(exp_df)
imp_train, imp_valid = _split_80_20(imp_df)

exp_train_dataset = _to_movie_dataset(exp_train)
imp_train_dataset = _to_movie_dataset(imp_train)
exp_valid_dataset = _to_movie_dataset(exp_valid)
imp_valid_dataset = _to_movie_dataset(imp_valid)

In [34]:
def _shuffled_loader(dataset):
    """Build a DataLoader over `dataset`: batches of 64, shuffled, 4 workers."""
    return DataLoader(dataset=dataset, batch_size=64, shuffle=True, num_workers=4)


exp_train_loader = _shuffled_loader(exp_train_dataset)
exp_valid_loader = _shuffled_loader(exp_valid_dataset)
imp_train_loader = _shuffled_loader(imp_train_dataset)
imp_valid_loader = _shuffled_loader(imp_valid_dataset)

# Pull one batch from each training loader to eyeball the batch layout.
exp_dataiter = iter(exp_train_loader)
exp_dataloader_data = next(exp_dataiter)
print(exp_dataloader_data)

imp_dataiter = iter(imp_train_loader)
imp_dataloader_data = next(imp_dataiter)
print(imp_dataloader_data)

{'users': tensor([313, 587, 278, 918, 425, 324,  10,  63, 711, 550,   9,   5, 890, 826,
         17, 641, 726, 296, 167, 343,  24, 777, 654, 635, 470, 906, 869, 220,
        485,  15, 285,  92, 795, 252, 469, 255, 235, 751, 921, 492, 623, 463,
        183, 711, 889, 263, 449, 795, 794, 649, 846, 796, 530, 275, 319, 888,
        765, 490, 449, 663, 483, 404,  80, 166]), 'movies': tensor([1149,  394,   26,  252,  493,  153,  119,    3,  414,  684,  662,  422,
         125,  346,  215,   34, 1046,   41,  741,  310,  207,  711,    3,  812,
         626,  224,  526,  720,  830,  384,  688,  411,  796,   96,  545,  549,
          57,  326,  809,  973,   24,  256,   81,  450,  167,  319,  283,  539,
         567,  750,  947,   49,  747,  191,  807,   97,  497,  257,  482,  430,
         251,  443,  283,  614]), 'ratings': tensor([4, 4, 5, 3, 3, 3, 2, 3, 4, 1, 3, 3, 5, 3, 4, 2, 2, 3, 5, 4, 4, 3, 2, 5,
        1, 5, 5, 5, 3, 5, 5, 2, 3, 4, 4, 5, 2, 5, 4, 3, 4, 4, 3, 5, 5, 4, 4, 2,
        3, 2,

# Modeling

## Call

In [35]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [36]:
# Instantiate both networks with the same hyper-parameters. The embedding table
# sizes come from the number of classes seen by the user / movie label encoders.
exp_model = ExplicitDLCRS(
    n_users=len(lbl_user.classes_),
    n_movies=len(lbl_movie.classes_),
    n_latents=5,
    n_layers=3,
    dropout=0.33,
).to(device)

imp_model = ImplicitDLCRS(
    n_users=len(lbl_user.classes_),
    n_movies=len(lbl_movie.classes_),
    n_latents=5,
    n_layers=3,
    dropout=0.33,
).to(device)

# Each optimizer must own the parameters of the model it trains.
# Bug fix: imp_optimizer was previously built from exp_model.parameters(), so
# imp_optimizer.step() never updated imp_model. Checkpoints written to the
# implicit model directory before this fix therefore hold untrained weights
# and need to be regenerated.
exp_optimizer = torch.optim.Adam(exp_model.parameters())
imp_optimizer = torch.optim.Adam(imp_model.parameters())

# Step-decay schedules: multiply the learning rate by 0.7 every 3 scheduler steps.
exp_sch = torch.optim.lr_scheduler.StepLR(exp_optimizer, step_size=3, gamma=0.7)
imp_sch = torch.optim.lr_scheduler.StepLR(imp_optimizer, step_size=3, gamma=0.7)

# Cross-entropy for the implicit model's class logits.
loss_ENTR = nn.CrossEntropyLoss()
# Despite the name, this is plain mean squared error (no square root is taken).
loss_RMSE = nn.MSELoss()

In [37]:
# Show the layer layout of both networks.
for network in (exp_model, imp_model):
    print(network)

ExplicitDLCRS(
  (user_embed): Embedding(943, 5)
  (movie_embed): Embedding(1682, 5)
  (fc_activation): ReLU()
  (fc_layers): ModuleList(
    (0): Linear(in_features=10, out_features=150, bias=True)
    (1): Linear(in_features=150, out_features=100, bias=True)
    (2): Linear(in_features=100, out_features=50, bias=True)
    (3): Linear(in_features=50, out_features=1, bias=True)
  )
  (do_layers): ModuleList(
    (0-2): 3 x Dropout(p=0.33, inplace=False)
  )
)
ImplicitDLCRS(
  (user_embed): Embedding(943, 5)
  (movie_embed): Embedding(1682, 5)
  (fc_activation): ReLU()
  (fc_layers): ModuleList(
    (0): Linear(in_features=10, out_features=150, bias=True)
    (1): Linear(in_features=150, out_features=100, bias=True)
    (2): Linear(in_features=100, out_features=50, bias=True)
    (3): Linear(in_features=50, out_features=2, bias=True)
  )
  (do_layers): ModuleList(
    (0-2): 3 x Dropout(p=0.33, inplace=False)
  )
)


## Training loop

In [38]:
def _list_checkpoints(directory):
    """Return the names of the '<epoch>.pt' files located directly in `directory`.

    Only regular files whose name is a decimal epoch number followed by '.pt'
    are kept, so stray files (editor backups, OS metadata, 'best.pt', ...) can
    neither break the epoch parsing done later nor be mistaken for an epoch.

    Raises FileNotFoundError (from listdir) when `directory` does not exist.
    """
    suffix = ".pt"
    return [
        name
        for name in listdir(directory)
        if isfile(join(directory, name))
        and name.endswith(suffix)
        and name[: -len(suffix)].isdecimal()
    ]


exp_model_path = "./models/explicit/"
exp_list_saves = _list_checkpoints(exp_model_path)

imp_model_path = "./models/implicit/"
imp_list_saves = _list_checkpoints(imp_model_path)

print(exp_list_saves)
print(imp_list_saves)

['0.pt', '1.pt', '2.pt', '3.pt', '4.pt', '5.pt', '6.pt', '7.pt', '8.pt', '9.pt', '10.pt', '11.pt', '12.pt', '13.pt', '14.pt', '15.pt', '16.pt', '17.pt', '18.pt', '19.pt', '20.pt', '21.pt', '22.pt', '23.pt', '24.pt', '25.pt', '26.pt', '27.pt', '28.pt', '29.pt', '30.pt', '31.pt', '32.pt', '33.pt', '34.pt', '35.pt', '36.pt', '37.pt', '38.pt', '39.pt', '40.pt', '41.pt', '42.pt', '43.pt', '44.pt', '45.pt', '46.pt', '47.pt', '48.pt', '49.pt', '50.pt', '51.pt', '52.pt', '53.pt', '54.pt', '55.pt', '56.pt', '57.pt', '58.pt', '59.pt', '60.pt', '61.pt', '62.pt', '63.pt', '64.pt', '65.pt', '66.pt', '67.pt', '68.pt', '69.pt', '70.pt', '71.pt', '72.pt', '73.pt', '74.pt', '75.pt', '76.pt', '77.pt', '78.pt', '79.pt', '80.pt', '81.pt', '82.pt', '83.pt', '84.pt', '85.pt', '86.pt', '87.pt', '88.pt', '89.pt', '90.pt', '91.pt', '92.pt', '93.pt', '94.pt', '95.pt', '96.pt', '97.pt', '98.pt', '99.pt']
['46.pt', '45.pt', '44.pt', '42.pt', '43.pt', '40.pt', '41.pt', '4.pt', '38.pt', '39.pt', '37.pt', '36.pt', '

In [39]:
# Highest epoch number found among the saved checkpoint names
# ("<epoch>.pt"), or 0 when nothing has been saved yet.
if not exp_list_saves:
    exp_most_recent_epoch = 0
else:
    exp_epochs_done = [int(fname.partition(".")[0]) for fname in exp_list_saves]
    exp_most_recent_epoch = np.max(exp_epochs_done)

if not imp_list_saves:
    imp_most_recent_epoch = 0
else:
    imp_epochs_done = [int(fname.partition(".")[0]) for fname in imp_list_saves]
    imp_most_recent_epoch = np.max(imp_epochs_done)

### ExplicitDLCRS

In [40]:
# Train the explicit-feedback model with per-epoch checkpoints.
# The loop starts at the most recent saved epoch: that epoch's weights are
# loaded from disk, and every later epoch is trained and saved as "<epoch>.pt".
epochs = 100
total_loss = []                          # per-batch losses of the current epoch
plot_steps, print_steps = 5000, 5000     # not used in this cell
all_losses_list = []                     # mean loss of each epoch trained in this run

# NOTE(review): exp_sch (StepLR) is never stepped in this loop, so the learning
# rate stays at the optimizer's initial value -- confirm whether that is intended.
exp_model.train()
for epoch_i in range(exp_most_recent_epoch, epochs):
    saving_path = exp_model_path + str(epoch_i) + '.pt'
    if saving_path.split("/")[-1] in exp_list_saves:
        print(f"epoch {epoch_i} already done !")
        # map_location lets a checkpoint saved on GPU be restored on whatever
        # device is in use now (including CPU-only machines).
        exp_model.load_state_dict(torch.load(saving_path, map_location=device))
    else:
        for train_data in exp_train_loader:
            # Move the batch to the selected device instead of hard-coding
            # .cuda(), which fails when no GPU is available.
            output = exp_model(train_data["users"].to(device),
                               train_data["movies"].to(device))

            # Give the targets the same leading (batch) dimension as the output.
            rating = train_data["ratings"].view(output.shape[0], -1).to(torch.float32).to(device)
            loss = loss_RMSE(output, rating)
            total_loss.append(loss.sum().item())
            exp_optimizer.zero_grad()
            loss.backward()
            exp_optimizer.step()

        torch.save(exp_model.state_dict(), saving_path)
        avg_loss = np.mean(total_loss)
        print(f"epoch {epoch_i} loss is : {avg_loss}")
        print(f"model is saved for epoch {epoch_i}")
        all_losses_list.append(avg_loss)
        total_loss = []

epoch 99 already done !


In [41]:
# Release unused cached GPU memory held by PyTorch after the explicit-model training loop.
torch.cuda.empty_cache()

### ImplicitDLCRS

In [42]:
# Train the implicit-feedback model with per-epoch checkpoints.
# The loop starts at the most recent saved epoch: that epoch's weights are
# loaded from disk, and every later epoch is trained and saved as "<epoch>.pt".
epochs = 100
total_loss = []                          # per-batch losses of the current epoch
plot_steps, print_steps = 5000, 5000     # not used in this cell
all_losses_list = []                     # mean loss of each epoch trained in this run

# NOTE(review): imp_sch (StepLR) is never stepped in this loop, so the learning
# rate stays at the optimizer's initial value -- confirm whether that is intended.
imp_model.train()
for epoch_i in range(imp_most_recent_epoch, epochs):
    saving_path = imp_model_path + str(epoch_i) + '.pt'
    if saving_path.split("/")[-1] in imp_list_saves:
        print(f"epoch {epoch_i} already done !")
        # map_location lets a checkpoint saved on GPU be restored on whatever
        # device is in use now (including CPU-only machines).
        imp_model.load_state_dict(torch.load(saving_path, map_location=device))
    else:
        for train_data in imp_train_loader:
            # Move the batch to the selected device instead of hard-coding
            # .cuda(), which fails when no GPU is available.
            output = imp_model(train_data["users"].to(device),
                               train_data["movies"].to(device))

            # CrossEntropyLoss expects int64 class indices of shape (batch,).
            rating = train_data["ratings"].view(output.shape[0], -1).to(torch.int64).to(device)
            loss = loss_ENTR(output, rating.reshape(output.shape[0]))
            total_loss.append(loss.sum().item())
            imp_optimizer.zero_grad()
            loss.backward()
            imp_optimizer.step()

        torch.save(imp_model.state_dict(), saving_path)
        avg_loss = np.mean(total_loss)
        print(f"epoch {epoch_i} loss is : {avg_loss}")
        print(f"model is saved for epoch {epoch_i}")
        all_losses_list.append(avg_loss)
        total_loss = []

epoch 99 already done !


In [43]:
# Release unused cached GPU memory held by PyTorch after the implicit-model training loop.
torch.cuda.empty_cache()

## Validation loop

### ExplicitDLCRS

In [44]:
# Evaluate the explicit-feedback model on the validation split and report RMSE.
exp_model_output_list = []
exp_target_rating_list = []

exp_model.eval()

with torch.no_grad():
    for batched_data in exp_valid_loader:
        # Use the selected device instead of hard-coding .cuda(), so the cell
        # also runs on CPU-only machines.
        model_output = exp_model(batched_data["users"].to(device),
                                 batched_data["movies"].to(device))

        # One entry per batch; the batches are concatenated below.
        exp_model_output_list.append(model_output.tolist())
        target_rating = batched_data["ratings"].tolist()
        exp_target_rating_list.append(target_rating)


def flatten(l):
    """Concatenate a list of lists into a single list (one nesting level only)."""
    return [item for sublist in l for item in sublist]


exp_target_rating_list = flatten(exp_target_rating_list)
exp_model_output_list = flatten(exp_model_output_list)

# RMSE is the square root of the MSE. It is computed explicitly because the
# `squared=False` argument of mean_squared_error is deprecated and has been
# removed from recent scikit-learn releases.
rms = np.sqrt(mean_squared_error(exp_target_rating_list,
                                 exp_model_output_list))
print(f"rms: {rms}")

rms: 0.9696753457664186


In [45]:
# Release unused cached GPU memory held by PyTorch after the explicit-model validation pass.
torch.cuda.empty_cache()

### ImplicitDLCRS

In [46]:
from sklearn.metrics import mean_squared_error

# Run the implicit-feedback model over the validation split, collecting the
# predicted probability of class 1 and the matching target label per sample.
imp_model_output_list = []
imp_target_rating_list = []

imp_model.eval()

with torch.no_grad():
    for batched_data in imp_valid_loader:
        # Use the selected device instead of hard-coding .cuda(), so the cell
        # also runs on CPU-only machines.
        model_output = imp_model(batched_data["users"].to(device),
                                 batched_data["movies"].to(device))

        # Softmax over the class logits; column 1 is the positive-class probability.
        # extend() keeps both lists flat, so no separate flattening step (and no
        # second definition of the flatten helper) is needed.
        imp_model_output_list.extend(model_output.softmax(dim=1)[:, 1].tolist())
        target_rating = batched_data["ratings"].tolist()
        imp_target_rating_list.extend(target_rating)

In [47]:
# Turn the positive-class probabilities into hard labels:
# values above 0.5 become 1, values at or below 0.5 become 0.
scores = np.array(imp_model_output_list)
is_positive = scores > 0.5
is_negative = scores <= 0.5
scores[is_positive] = 1
scores[is_negative] = 0

a = scores.tolist()

In [48]:
from sklearn.metrics import classification_report,roc_auc_score 

# Per-class precision / recall / F1 of the thresholded predictions.
print(classification_report(y_true=imp_target_rating_list, y_pred=a))

              precision    recall  f1-score   support

           0       0.50      0.62      0.55     20000
           1       0.49      0.37      0.43     20000

    accuracy                           0.50     40000
   macro avg       0.50      0.50      0.49     40000
weighted avg       0.50      0.50      0.49     40000



In [49]:
# ROC AUC is computed from the raw positive-class probabilities, not the thresholded labels.
print(roc_auc_score(y_true=imp_target_rating_list, y_score=imp_model_output_list))

0.492319155


In [50]:
# Release unused cached GPU memory held by PyTorch after the implicit-model evaluation.
torch.cuda.empty_cache()