In [2]:
import torch
import pandas as pd
import numpy as np

In [3]:
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split

In [4]:
from read_data import read_data, create_dataset, create_tabular_data

In [5]:
ratings, movies = read_data(Path('ml-1m'))

In [6]:
users = pd.read_csv("ml-1m/users.dat",sep = "::", header = None, engine = "python",encoding ="latin-1")

In [7]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [9]:
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [10]:
def set_random_seed(state=1):
    gens = (np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for set_state in gens:
        set_state(state)
RANDOM_STATE = 1
set_random_seed(RANDOM_STATE)

In [11]:
(n, m), (X, y), _ = create_dataset(ratings)

In [12]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
datasets = {'train': (X_train, y_train), 'val': (X_valid, y_valid)}
dataset_sizes = {'train': len(X_train), 'val': len(X_valid)}

In [13]:
################################################################################################################################################################################################################
################################################################################################################################################################################################################
#                                                                               Model1: Embedding Net                                                                                                          #
################################################################################################################################################################################################################
################################################################################################################################################################################################################

In [14]:
n,m = len(ratings.userId.unique()),len(ratings.movieId.unique()),

In [15]:
minmax = float(ratings.rating.min()), float(ratings.rating.max())

In [16]:
from models.embedding_net import EmbeddingNet
from models.embedding_net import predict_recommendations as emb_recommendations
from models.embedding_net import batches

In [17]:
net = EmbeddingNet(
    n_users=n, n_movies=m, 
    n_factors=150, hidden=[500, 500, 500], 
    embedding_dropout=0.05, dropouts=[0.5, 0.5, 0.25])

In [18]:
fh = open('best.weights','rb')
weights = pickle.load(fh)
net.load_state_dict(weights)
fh.close()


In [19]:
groud_truth, predictions = [], []
# device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
with torch.no_grad():
    for batch in batches(X_valid,y_valid, shuffle=False, bs=2000):
        x_batch, y_batch = [b for b in batch]
        # import pdb;pdb.set_trace()
        outputs = net(x_batch[:, 0], x_batch[:, 1], minmax)
        # break
        groud_truth.extend(y_batch.tolist())
        predictions.extend(outputs.tolist())



In [20]:
groud_truth = np.asarray(groud_truth).ravel()
predictions = np.asarray(predictions).ravel()

In [21]:
final_loss = np.sqrt(np.mean((predictions - groud_truth)**2))
print(f'Final RMSE: {final_loss:.4f}')

Final RMSE: 0.8854


In [22]:
# precision
#Calculate TP,FP,TN,FN at every threshold level (0.0 - 5.0)
final = []
for threshold in np.arange(0, 5.5, 0.5):
  tp=0
  fn=0
  fp=0
  tn=0
  temp = []
  for i in range(predictions.shape[0]):
    true_r = groud_truth[i]
    est = predictions[i]
    if(true_r>=threshold):
      if(est>=threshold):
        tp = tp+1
      else:
        fn = fn+1
    else:
      if(est>=threshold):
        fp = fp+1
      else:
        tn = tn+1   
    if tp == 0:
      precision = 0
      recall = 0
      f1 = 0
    else:
      precision = tp / (tp + fp)
      recall = tp / (tp + fn)
      f1 = 2 * (precision * recall) / (precision + recall)  
  temp = [threshold, tp,fp,tn ,fn, precision, recall, f1]
  final.append(temp)
results = pd.DataFrame(final)
results.rename(columns={0:'threshold', 1:'tp', 2: 'fp', 3: 'tn', 4:'fn', 5: 'Precision', 6:'Recall', 7:'F1'}, inplace=True)

**Definition of Relevant and Recommended**

Relevant: True Rating > = Thresh

Irrelevant: True Rating < Thresh

Recommended item: Predicted Rating > = Thresh

Not Recommended item: Predicted Rating > = Thresh

### Calculate Precision, Recall and F1 for every threshold for choosing optimal threshold

In [23]:
results

Unnamed: 0,threshold,tp,fp,tn,fn,Precision,Recall,F1
0,0.0,200000,0,0,0,1.0,1.0,1.0
1,0.5,200000,0,0,0,1.0,1.0,1.0
2,1.0,199707,0,0,293,1.0,0.998535,0.999267
3,1.5,188311,9759,1385,545,0.95073,0.997114,0.97337
4,2.0,185610,7704,3440,3246,0.960148,0.982812,0.971348
5,2.5,160815,20774,11942,6469,0.885599,0.961329,0.921911
6,3.0,146456,13347,19369,20828,0.916478,0.875493,0.895517
7,3.5,91124,29585,55478,23813,0.754906,0.792817,0.773397
8,4.0,51868,8616,76447,63069,0.857549,0.451273,0.591355
9,4.5,9713,4677,150053,35557,0.674983,0.214557,0.325612


#### We can choose either 2.5 as the threshold as it has optimal number of  true positives,true negatives, false positives and false negatives.

#### That is recommend movies for which predicted rating is greater than 2.5

**Precision** = 0.884

**Recall** = 0.963

**F1** = 0.922

**RMSE** = 0.885

In [24]:
emb_dict = {'precision':float(results.loc[5]['Precision']),'recall':float(results.loc[5]['Recall']),'F1':float(results.loc[5]['F1']),'RMSE':round(final_loss,4)}

In [25]:
emb_dict

{'precision': 0.8855987972839765,
 'recall': 0.9613292365079744,
 'F1': 0.9219114118891402,
 'RMSE': 0.8854}

In [26]:
################################################################################################################################################################################################################
################################################################################################################################################################################################################
#                                                                                      Model2: RBM                                                                                                             #
################################################################################################################################################################################################################
################################################################################################################################################################################################################

In [27]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [28]:
train_set = pd.read_csv('present_set.csv')
train_set = train_set[train_set.columns[2:]]
train_set = np.array(train_set,dtype = 'int')
test_set = pd.read_csv('future_set.csv')
test_set = test_set[test_set.columns[2:]]
test_set = np.array(test_set,dtype = 'int')


In [29]:

# Getting the number of movies and users
nb_users = train_set.shape[0]
nb_movies = train_set.shape[1]


In [30]:
train_set = torch.FloatTensor(train_set)
test_set = torch.FloatTensor(test_set)

In [31]:
from models.rbm import RBM

## TO-DO - After preparing a ucb eval dataset, edit the predict function to not recommend already watched movies

In [32]:
nb_users,nb_movies

(1208, 3883)

In [33]:
nv = nb_movies
nh = 100
rbm = RBM(nv,nh)

In [34]:
fh = open('RBM.weights','rb')
weights = pickle.load(fh)
rbm.load_state_dict(weights)
fh.close()

In [35]:
train_set[train_set==0] = -1
train_set[train_set==1] = 0
train_set[train_set==2] = 0
train_set[train_set>=3] = 1
test_set[test_set==0] = -1
test_set[test_set==1] = 0
test_set[test_set==2] = 0
test_set[test_set>=3] = 1

In [36]:
# rbm_rec=rbm_recommendations(model=rbm,unseen_set=test_set,user_id=164)

In [37]:
# for r in rbm_rec:
#     print('recommended movies: {}'.format(movies[movies['movieId']==r]['title']))

In [38]:
test_loss = 0
s = float(0)
nb_users = test_set.shape[0]
for id_user in range(nb_users):
    
    v = train_set[id_user:id_user+1]
    vt = test_set[id_user:id_user+1]
    if len(vt[vt>=0]) > 0:
        _, h = rbm.sample_h(v)   
        _, v = rbm.sample_v(h)
        test_loss += torch.mean(torch.abs(vt[vt>=0]-v[vt>=0]))
        s += 1.
            

In [39]:
rmse = (test_loss/s).cpu().numpy()
print(' Final RMSE loss: '+str(rmse))


 Final RMSE loss: 0.52463025


In [40]:
predictions = np.zeros(train_set.shape)
for id_user in range(nb_users):
    
    v = train_set[id_user:id_user+1]
    vt = test_set[id_user:id_user+1]
    if len(vt[vt>=0]) > 0:
        _, h = rbm.sample_h(v)   
        _, v = rbm.sample_v(h)
        predictions[id_user] = v.numpy()
        # break

In [41]:
# precision
#Calculate TP,FP,TN,FN at every threshold level (0.0 - 5.0)
final = []
# for threshold in np.arange(0, 5.5, 0.5):
tp=0
fn=0
fp=0
tn=0
temp = []
for i in range(train_set.shape[0]):
    for j in range(train_set.shape[1]):
        if train_set[i][j]>=0:
            continue
        # if predictions[]
        true_r = test_set[i][j]
        est = predictions[i][j]
        
        if(true_r==1):
            if(est==1):
                tp = tp+1
            else:
                fn = fn+1
        elif(true_r==0):
            if(est==1):
                fp = fp+1
            else:
                tn = tn+1   
        if tp == 0:
            precision = 0
            recall = 0
            f1 = 0
        else:
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            f1 = 2 * (precision * recall) / (precision + recall)  


#### В RBM модель выдает 1 или 0 вместо рейтинга пользователя.
#### 1 - фильм понравится, 0 - иначе.
#### В итоге рекомендуем пользователю фильмы с предсказанием 1

In [42]:
temp = [ tp,fp,tn ,fn, precision, recall, f1]
print("True positives : {}".format(tp))
print("False positives : {}".format(fp))
print("True negatives : {}".format(tn))
print("false negatives : {}".format(fn))
print("Precision : {}".format(precision))
print("Recall : {}".format(recall))
print("f1 : {}".format(f1))

True positives : 6303
False positives : 1112
True negatives : 1521
false negatives : 8090
Precision : 0.8500337154416723
Recall : 0.437921211700132
f1 : 0.5780447542186353


In [43]:
rbm_dict = {'F1': f1, 'RMSE': rmse, 'precision': precision, 'recall': recall}

In [44]:
################################################################################################################################################################################################################
################################################################################################################################################################################################################
#                                                                                        Model3: AutoEncoders                                                                                                  #
################################################################################################################################################################################################################
################################################################################################################################################################################################################

In [45]:
from models.sae import SAE
from torch.autograd import Variable
from torch import nn

In [46]:
sae = SAE(nb_users,nb_movies)

In [47]:
train_set = pd.read_csv('present_set.csv')
train_set = train_set[train_set.columns[2:]]
train_set = np.array(train_set,dtype = 'int')
test_set = pd.read_csv('future_set.csv')
test_set = test_set[test_set.columns[2:]]
test_set = np.array(test_set,dtype = 'int')

In [48]:
train_set = torch.FloatTensor(train_set)
test_set = torch.FloatTensor(test_set)

In [49]:
fh = open('sae_checkpoints/sae_1000.weights','rb')
weights = torch.load(fh)
fh.close()

In [50]:
sae.load_state_dict(weights)

<All keys matched successfully>

In [51]:
# sae_rec = sae_recommendations(test_set,sae,user_id=164)

In [52]:
# for r in sae_rec:
#     print('recommended movies: {}'.format(movies[movies['movieId']==r]['title']))

In [53]:
criterion = nn.MSELoss()
test_loss = 0
s = 0.
for id_user in range(nb_users):
  input = Variable(train_set[id_user]).unsqueeze(0)
  target = Variable(test_set[id_user]).unsqueeze(0)
  target = target[0,:]
  input = train_set[id_user]
  
  if torch.sum(target.data > 0) > 0:
    output = sae(input)
    target.require_grad = False
    
    output[target == 0] = 0
    loss = criterion(output, target)
    mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
    test_loss += np.sqrt(loss.data*mean_corrector)
    s += 1. 
print('test loss: '+str(test_loss/s))

test loss: tensor(1.0334)


In [54]:
sae_rmse = float((test_loss/s).numpy())
sae_rmse


1.033371925354004

In [55]:
test_set.shape

torch.Size([1208, 3883])

In [56]:
predictions = np.zeros(test_set.shape)
for i in range(train_set.shape[0]):
    input=Variable(train_set[i]).unsqueeze(0)
    output = sae(input)
    predictions[i] = output.detach().numpy()

In [57]:
# predictions

In [58]:
# precision
#Calculate TP,FP,TN,FN at every threshold level (0.0 - 5.0)
final = []
for threshold in np.arange(0, 5.5, 0.5):
    tp=0
    fn=0
    fp=0
    tn=0
    temp = []
    for i in range(train_set.shape[0]):
        for j in range(train_set.shape[1]):
            if train_set[i][j]>0:
                continue
            if test_set[i][j]==0:
                continue
            true_r = test_set[i][j]
            est = predictions[i][j]
            if(true_r>=threshold):
                if(est>=threshold):
                    tp = tp+1
                else:
                    fn = fn+1
            else:
                if(est>=threshold):
                    fp = fp+1
                else:
                    tn = tn+1     
            if tp == 0:
                precision = 0
                recall = 0
                f1 = 0
            else:
                precision = tp / (tp + fp)
                recall = tp / (tp + fn)
                f1 = 2 * (precision * recall) / (precision + recall)  
    temp = [threshold, tp,fp,tn ,fn, precision, recall, f1]
    final.append(temp)
results = pd.DataFrame(final)
results.rename(columns={0:'threshold', 1:'tp', 2: 'fp', 3: 'tn', 4:'fn', 5: 'Precision', 6:'Recall', 7:'F1'}, inplace=True)

In [59]:
results

Unnamed: 0,threshold,tp,fp,tn,fn,Precision,Recall,F1
0,0.0,17026,0,0,0,1.0,1.0,1.0
1,0.5,17025,0,0,1,1.0,0.999941,0.999971
2,1.0,17025,0,0,1,1.0,0.999941,0.999971
3,1.5,16137,857,15,17,0.94957,0.998948,0.973633
4,2.0,15961,779,93,193,0.953465,0.988052,0.970451
5,2.5,13942,2191,442,451,0.864191,0.968665,0.913451
6,3.0,12832,1758,875,1561,0.879507,0.891545,0.885485
7,3.5,7803,3889,3196,2138,0.667379,0.784931,0.721398
8,4.0,4720,1872,5213,5221,0.716019,0.474801,0.570979
9,4.5,753,1033,12060,3180,0.421613,0.191457,0.263333


In [60]:
sae_dict = {'F1':float(results[results.threshold==2.5]['F1']), 'RMSE': sae_rmse, 'precision': float(results[results.threshold==2.5]['Precision']), 'recall': float(results[results.threshold==2.5]['Recall'])}

In [61]:
################################################################################################################################################################################################################
################################################################################################################################################################################################################
#                                                                                        Model4: SVD                                                                                                  #
################################################################################################################################################################################################################
################################################################################################################################################################################################################

In [62]:
from models.svd import SVD
from read_data import preprocessing_for_mf

In [63]:
train_data, test_data, data = preprocessing_for_mf()

load data finished
total data  1000209


In [64]:
f = 20
svd = SVD(train_data, f)

In [65]:
svd.train()

train data size (800167, 3)


In [66]:
svd_rmse = svd.test(test_data)

test data size (200042, 3)
rmse of test data is 0.8792477689599741


In [67]:
predictions = svd.get_recommendations(test_data)

In [68]:
# precision
#Calculate TP,FP,TN,FN at every threshold level (0.0 - 5.0)
test_data = np.array(test_data)
final = []
for threshold in np.arange(0, 5.5, 0.5):
  tp=0
  fn=0
  fp=0
  tn=0
  temp = []
  for i in range(test_data.shape[0]):
    true_r = test_data[i, 2]
    est = predictions[i]
    if true_r>=threshold:
      if est>=threshold:
        tp = tp+1
      else:
        fn = fn+1
    else:
      if est>=threshold:
        fp = fp+1
      else:
        tn = tn+1
    if tp == 0:
      precision = 0
      recall = 0
      f1 = 0
    else:
      precision = tp / (tp + fp)
      recall = tp / (tp + fn)
      f1 = 2 * (precision * recall) / (precision + recall)
  temp = [threshold, tp,fp,tn ,fn, precision, recall, f1]
  final.append(temp)
results = pd.DataFrame(final)
results.rename(columns={0:'threshold', 1:'tp', 2: 'fp', 3: 'tn', 4:'fn', 5: 'Precision', 6:'Recall', 7:'F1'}, inplace=True)

In [69]:
results

Unnamed: 0,threshold,tp,fp,tn,fn,Precision,Recall,F1
0,0.0,200042,0,0,0,1.0,1.0,1.0
1,0.5,200042,0,0,0,1.0,1.0,1.0
2,1.0,200042,0,0,0,1.0,1.0,1.0
3,1.5,188634,10368,832,208,0.9479,0.998899,0.972731
4,2.0,186997,8538,2662,1845,0.956335,0.99023,0.972987
5,2.5,163074,22591,10044,4333,0.878324,0.974117,0.923744
6,3.0,148655,14039,18596,18752,0.913709,0.887986,0.900664
7,3.5,90095,28026,56888,25033,0.762735,0.782564,0.772522
8,4.0,50250,7857,77057,64878,0.864784,0.436471,0.580137
9,4.5,9542,4429,150382,35689,0.682986,0.210962,0.322354


In [70]:
svd_dict = {'F1':float(results[results.threshold==2.5]['F1']), 'RMSE': svd_rmse, 'precision': float(results[results.threshold==2.5]['Precision']), 'recall': float(results[results.threshold==2.5]['Recall'])}
svd_dict

{'F1': 0.9237435990392895,
 'RMSE': 0.8792477689599741,
 'precision': 0.8783238628713005,
 'recall': 0.9741169724085612}

In [71]:
################################################################################################################################################################################################################
################################################################################################################################################################################################################
#                                                                                        Model4: SVD                                                                                                  #
################################################################################################################################################################################################################
################################################################################################################################################################################################################

In [72]:
from models.svdpp import SVDpp
from read_data import preprocessing_for_mf

In [73]:
train_data, test_data, data = preprocessing_for_mf()

load data finished
total data  1000209


In [74]:
f = 20
svdpp = SVDpp(train_data, f)

In [None]:
svdpp.train()

train data size (800167, 3)
step 1 is running
rmse is 0.9490360914440855
step 2 is running
rmse is 0.9179856314806552
step 3 is running
rmse is 0.9137334319693238
step 4 is running
rmse is 0.9087190692056907
step 5 is running
rmse is 0.9026405190551372
step 6 is running
rmse is 0.8977446761192573
step 7 is running


In [None]:
svdpp_rmse = svd.test(test_data)

In [None]:
predictions = svdpp.get_recommendations(test_data)

In [None]:
# precision
#Calculate TP,FP,TN,FN at every threshold level (0.0 - 5.0)
test_data = np.array(test_data)
final = []
for threshold in np.arange(0, 5.5, 0.5):
  tp=0
  fn=0
  fp=0
  tn=0
  temp = []
  for i in range(test_data.shape[0]):
    true_r = test_data[i, 2]
    est = predictions[i]
    if true_r>=threshold:
      if est>=threshold:
        tp = tp+1
      else:
        fn = fn+1
    else:
      if est>=threshold:
        fp = fp+1
      else:
        tn = tn+1
    if tp == 0:
      precision = 0
      recall = 0
      f1 = 0
    else:
      precision = tp / (tp + fp)
      recall = tp / (tp + fn)
      f1 = 2 * (precision * recall) / (precision + recall)
  temp = [threshold, tp,fp,tn ,fn, precision, recall, f1]
  final.append(temp)
results = pd.DataFrame(final)
results.rename(columns={0:'threshold', 1:'tp', 2: 'fp', 3: 'tn', 4:'fn', 5: 'Precision', 6:'Recall', 7:'F1'}, inplace=True)

In [None]:
results

In [None]:
svdpp_dict = {'F1':float(results[results.threshold==2.5]['F1']), 'RMSE': svdpp_rmse, 'precision': float(results[results.threshold==2.5]['Precision']), 'recall': float(results[results.threshold==2.5]['Recall'])}

In [None]:
################################################################################################################################################################################################################
################################################################################################################################################################################################################
#                                                                                             Model Comparisions                                                                                               #
################################################################################################################################################################################################################
################################################################################################################################################################################################################

In [None]:
emb_dict

In [None]:
rbm_dict

In [None]:
sae_dict

In [None]:
svd_dict

In [None]:
svdpp_dict

In [None]:
df_comp = pd.DataFrame(columns = list(sae_dict.keys()))

In [None]:
df_comp.loc[0] = [emb_dict[x] for x in df_comp.columns]
df_comp.loc[1] = [sae_dict[x] for x in df_comp.columns]
df_comp.loc[2] = [rbm_dict[x] for x in df_comp.columns]
df_comp.loc[3] = [svd_dict[x] for x in df_comp.columns]
df_comp.loc[4] = [svdpp_dict[x] for x in df_comp.columns]

In [None]:
df_comp['Model'] = ['EmbeddingNet','Stacked AutoEncoder','Restricted Botlzmann Machine', 'SVD', 'SVD++']
df_comp = df_comp[['Model','F1','RMSE','precision','recall']]
df_comp

##### По результатам Embedding Net дает лучшие предсказания



