In [2]:
import torch
import pandas as pd
import numpy as np

In [3]:
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split

In [4]:
from read_data import read_data, create_dataset, create_tabular_data

In [5]:
ratings, movies = read_data(Path('ml-1m'))

In [6]:
users = pd.read_csv("ml-1m/users.dat",sep = "::", header = None, engine = "python",encoding ="latin-1")

In [7]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [9]:
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [10]:
def set_random_seed(state=1):
    gens = (np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for set_state in gens:
        set_state(state)
RANDOM_STATE = 1
set_random_seed(RANDOM_STATE)

In [11]:
(n, m), (X, y), _ = create_dataset(ratings)

In [12]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
datasets = {'train': (X_train, y_train), 'val': (X_valid, y_valid)}
dataset_sizes = {'train': len(X_train), 'val': len(X_valid)}

In [13]:
################################################################################################################################################################################################################
################################################################################################################################################################################################################
#                                                                               Model1: Embedding Net                                                                                                          #
################################################################################################################################################################################################################
################################################################################################################################################################################################################

In [14]:
n,m = len(ratings.userId.unique()),len(ratings.movieId.unique()),

In [15]:
minmax = float(ratings.rating.min()), float(ratings.rating.max())

In [16]:
from models.embedding_net import EmbeddingNet
from models.embedding_net import predict_recommendations as emb_recommendations
from models.embedding_net import batches

In [17]:
net = EmbeddingNet(
    n_users=n, n_movies=m, 
    n_factors=150, hidden=[500, 500, 500], 
    embedding_dropout=0.05, dropouts=[0.5, 0.5, 0.25])

In [18]:
fh = open('best.weights','rb')
weights = pickle.load(fh)
net.load_state_dict(weights)
fh.close()


In [19]:
groud_truth, predictions = [], []
# device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
with torch.no_grad():
    for batch in batches(X_valid,y_valid, shuffle=False, bs=2000):
        x_batch, y_batch = [b for b in batch]
        # import pdb;pdb.set_trace()
        outputs = net(x_batch[:, 0], x_batch[:, 1], minmax)
        # break
        groud_truth.extend(y_batch.tolist())
        predictions.extend(outputs.tolist())



In [20]:
groud_truth = np.asarray(groud_truth).ravel()
predictions = np.asarray(predictions).ravel()

In [21]:
final_loss = np.sqrt(np.mean((predictions - groud_truth)**2))
print(f'Final RMSE: {final_loss:.4f}')

Final RMSE: 0.8855


In [22]:
# precision
#Calculate TP,FP,TN,FN at every threshold level (0.0 - 5.0)
final = []
for threshold in np.arange(0, 5.5, 0.5):
  tp=0
  fn=0
  fp=0
  tn=0
  temp = []
  for i in range(predictions.shape[0]):
    true_r = groud_truth[i]
    est = predictions[i]
    if(true_r>=threshold):
      if(est>=threshold):
        tp = tp+1
      else:
        fn = fn+1
    else:
      if(est>=threshold):
        fp = fp+1
      else:
        tn = tn+1   
    if tp == 0:
      precision = 0
      recall = 0
      f1 = 0
    else:
      precision = tp / (tp + fp)
      recall = tp / (tp + fn)
      f1 = 2 * (precision * recall) / (precision + recall)  
  temp = [threshold, tp,fp,tn ,fn, precision, recall, f1]
  final.append(temp)
results = pd.DataFrame(final)
results.rename(columns={0:'threshold', 1:'tp', 2: 'fp', 3: 'tn', 4:'fn', 5: 'Precision', 6:'Recall', 7:'F1'}, inplace=True)

**Definition of Relevant and Recommended**

Relevant: True Rating > = Thresh

Irrelevant: True Rating < Thresh

Recommended item: Predicted Rating > = Thresh

Not Recommended item: Predicted Rating > = Thresh

### Calculate Precision, Recall and F1 for every threshold for choosing optimal threshold

In [23]:
results

Unnamed: 0,threshold,tp,fp,tn,fn,Precision,Recall,F1
0,0.0,200000,0,0,0,1.0,1.0,1.0
1,0.5,200000,0,0,0,1.0,1.0,1.0
2,1.0,199727,0,0,273,1.0,0.998635,0.999317
3,1.5,188318,9792,1352,538,0.950573,0.997151,0.973305
4,2.0,185754,7809,3335,3102,0.959657,0.983575,0.971468
5,2.5,161107,21007,11709,6177,0.884649,0.963075,0.922198
6,3.0,147046,13530,19186,20238,0.915741,0.87902,0.897005
7,3.5,91288,29723,55340,23649,0.754378,0.794244,0.773798
8,4.0,53002,9018,76045,61935,0.854595,0.46114,0.599038
9,4.5,10159,5019,149711,35111,0.669324,0.224409,0.336124


#### We can choose either 2.5 as the threshold as it has optimal number of  true positives,true negatives, false positives and false negatives.

#### That is recommend movies for which predicted rating is greater than 2.5

**Precision** = 0.884

**Recall** = 0.963

**F1** = 0.922

**RMSE** = 0.885

In [24]:
emb_dict = {'precision':float(results.loc[5]['Precision']),'recall':float(results.loc[5]['Recall']),'F1':float(results.loc[5]['F1']),'RMSE':round(final_loss,4)}

In [25]:
emb_dict

{'F1': 0.9221976084579763,
 'RMSE': 0.8855,
 'precision': 0.884649175790988,
 'recall': 0.9630747710480381}

In [None]:
################################################################################################################################################################################################################
################################################################################################################################################################################################################
#                                                                                      Model2: RBM                                                                                                             #
################################################################################################################################################################################################################
################################################################################################################################################################################################################

In [26]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [27]:
train_set = pd.read_csv('present_set.csv')
train_set = train_set[train_set.columns[2:]]
train_set = np.array(train_set,dtype = 'int')
test_set = pd.read_csv('future_set.csv')
test_set = test_set[test_set.columns[2:]]
test_set = np.array(test_set,dtype = 'int')


In [28]:

# Getting the number of movies and users
nb_users = train_set.shape[0]
nb_movies = train_set.shape[1]


In [29]:
train_set = torch.FloatTensor(train_set)
test_set = torch.FloatTensor(test_set)

In [30]:
from models.rbm import RBM

## TO-DO - After preparing a ucb eval dataset, edit the predict function to not recommend already watched movies

In [33]:
nb_users,nb_movies

(1208, 3883)

In [34]:
nv = nb_movies
nh = 100
rbm = RBM(nv,nh)

In [35]:
fh = open('RBM.weights','rb')
weights = pickle.load(fh)
rbm.load_state_dict(weights)
fh.close()

In [36]:
train_set[train_set==0] = -1
train_set[train_set==1] = 0
train_set[train_set==2] = 0
train_set[train_set>=3] = 1
test_set[test_set==0] = -1
test_set[test_set==1] = 0
test_set[test_set==2] = 0
test_set[test_set>=3] = 1

In [37]:
# rbm_rec=rbm_recommendations(model=rbm,unseen_set=test_set,user_id=164)

In [38]:
# for r in rbm_rec:
#     print('recommended movies: {}'.format(movies[movies['movieId']==r]['title']))

In [39]:
test_loss = 0
s = float(0)
nb_users = test_set.shape[0]
for id_user in range(nb_users):
    
    v = train_set[id_user:id_user+1]
    vt = test_set[id_user:id_user+1]
    if len(vt[vt>=0]) > 0:
        _, h = rbm.sample_h(v)   
        _, v = rbm.sample_v(h)
        test_loss += torch.mean(torch.abs(vt[vt>=0]-v[vt>=0]))
        s += 1.
            

In [40]:
rmse = (test_loss/s).cpu().numpy()
print(' Final RMSE loss: '+str(rmse))


 Final RMSE loss: 0.45991853


In [41]:
predictions = np.zeros(train_set.shape)
for id_user in range(nb_users):
    
    v = train_set[id_user:id_user+1]
    vt = test_set[id_user:id_user+1]
    if len(vt[vt>=0]) > 0:
        _, h = rbm.sample_h(v)   
        _, v = rbm.sample_v(h)
        predictions[id_user] = v.numpy()
        # break

In [42]:
# precision
#Calculate TP,FP,TN,FN at every threshold level (0.0 - 5.0)
final = []
# for threshold in np.arange(0, 5.5, 0.5):
tp=0
fn=0
fp=0
tn=0
temp = []
for i in range(train_set.shape[0]):
    for j in range(train_set.shape[1]):
        if train_set[i][j]>=0:
            continue
        # if predictions[]
        true_r = test_set[i][j]
        est = predictions[i][j]
        
        if(true_r==1):
            if(est==1):
                tp = tp+1
            else:
                fn = fn+1
        elif(true_r==0):
            if(est==1):
                fp = fp+1
            else:
                tn = tn+1   
        if tp == 0:
            precision = 0
            recall = 0
            f1 = 0
        else:
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            f1 = 2 * (precision * recall) / (precision + recall)  


#### В RBM модель выдает 1 или 0 вместо рейтинга пользователя.
#### 1 - фильм понравится, 0 - иначе.
#### В итоге рекомендуем пользователю фильмы с предсказанием 1

In [43]:
temp = [ tp,fp,tn ,fn, precision, recall, f1]
print("True positives : {}".format(tp))
print("False positives : {}".format(fp))
print("True negatives : {}".format(tn))
print("false negatives : {}".format(fn))
print("Precision : {}".format(precision))
print("Recall : {}".format(recall))
print("f1 : {}".format(f1))

True positives : 2584
False positives : 364
True negatives : 441
false negatives : 2373
Precision : 0.8765264586160109
Recall : 0.5212830340932015
f1 : 0.6537634408602151


In [45]:
rbm_dict = {'F1': f1, 'RMSE': rmse, 'precision': precision, 'recall': recall}

In [None]:
################################################################################################################################################################################################################
################################################################################################################################################################################################################
#                                                                                        Model3: AutoEncoders                                                                                                  #
################################################################################################################################################################################################################
################################################################################################################################################################################################################

In [46]:
from models.sae import SAE
from torch.autograd import Variable
from torch import nn

In [47]:
sae = SAE(nb_users,nb_movies)

In [48]:
train_set = pd.read_csv('present_set.csv')
train_set = train_set[train_set.columns[2:]]
train_set = np.array(train_set,dtype = 'int')
test_set = pd.read_csv('future_set.csv')
test_set = test_set[test_set.columns[2:]]
test_set = np.array(test_set,dtype = 'int')

In [49]:
train_set = torch.FloatTensor(train_set)
test_set = torch.FloatTensor(test_set)

In [50]:
fh = open('sae_checkpoints/sae_1000.weights','rb')
weights = torch.load(fh)
fh.close()

In [51]:
sae.load_state_dict(weights)

<All keys matched successfully>

In [52]:
# sae_rec = sae_recommendations(test_set,sae,user_id=164)

In [53]:
# for r in sae_rec:
#     print('recommended movies: {}'.format(movies[movies['movieId']==r]['title']))

In [54]:
criterion = nn.MSELoss()
test_loss = 0
s = 0.
for id_user in range(nb_users):
  input = Variable(train_set[id_user]).unsqueeze(0)
  target = Variable(test_set[id_user]).unsqueeze(0)
  target = target[0,:]
  input = train_set[id_user]
  
  if torch.sum(target.data > 0) > 0:
    output = sae(input)
    target.require_grad = False
    
    output[target == 0] = 0
    loss = criterion(output, target)
    mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
    test_loss += np.sqrt(loss.data*mean_corrector)
    s += 1. 
print('test loss: '+str(test_loss/s))

test loss: tensor(1.3656)


In [55]:
sae_rmse = float((test_loss/s).numpy())
sae_rmse


1.3655575513839722

In [56]:
test_set.shape

torch.Size([1208, 3883])

In [57]:
predictions = np.zeros(test_set.shape)
for i in range(train_set.shape[0]):
    input=Variable(train_set[i]).unsqueeze(0)
    output = sae(input)
    predictions[i] = output.detach().numpy()

In [58]:
# predictions

In [59]:
# precision
#Calculate TP,FP,TN,FN at every threshold level (0.0 - 5.0)
final = []
for threshold in np.arange(0, 5.5, 0.5):
    tp=0
    fn=0
    fp=0
    tn=0
    temp = []
    for i in range(train_set.shape[0]):
        for j in range(train_set.shape[1]):
            if train_set[i][j]>0:
                continue
            if test_set[i][j]==0:
                continue
            true_r = test_set[i][j]
            est = predictions[i][j]
            if(true_r>=threshold):
                if(est>=threshold):
                    tp = tp+1
                else:
                    fn = fn+1
            else:
                if(est>=threshold):
                    fp = fp+1
                else:
                    tn = tn+1     
            if tp == 0:
                precision = 0
                recall = 0
                f1 = 0
            else:
                precision = tp / (tp + fp)
                recall = tp / (tp + fn)
                f1 = 2 * (precision * recall) / (precision + recall)  
    temp = [threshold, tp,fp,tn ,fn, precision, recall, f1]
    final.append(temp)
results = pd.DataFrame(final)
results.rename(columns={0:'threshold', 1:'tp', 2: 'fp', 3: 'tn', 4:'fn', 5: 'Precision', 6:'Recall', 7:'F1'}, inplace=True)

In [61]:
results

Unnamed: 0,threshold,tp,fp,tn,fn,Precision,Recall,F1
0,0.0,5762,0,0,0,1.0,1.0,1.0
1,0.5,5757,0,0,5,1.0,0.999132,0.999566
2,1.0,5697,0,0,65,1.0,0.988719,0.994328
3,1.5,5199,232,40,291,0.957282,0.946995,0.952111
4,2.0,4781,184,88,709,0.962941,0.870856,0.914586
5,2.5,3739,427,378,1218,0.897504,0.754287,0.819687
6,3.0,2876,264,541,2081,0.915924,0.58019,0.710387
7,3.5,1496,514,1841,1911,0.744279,0.439096,0.552335
8,4.0,595,187,2168,2812,0.76087,0.17464,0.284077
9,4.5,39,64,4314,1345,0.378641,0.028179,0.052455


In [62]:
sae_dict = {'F1':float(results[results.threshold==2.5]['F1']), 'RMSE': sae_rmse, 'precision': float(results[results.threshold==2.5]['Precision']), 'recall': float(results[results.threshold==2.5]['Recall'])}

In [63]:
################################################################################################################################################################################################################
################################################################################################################################################################################################################
#                                                                                             Model Comparisions                                                                                               #
################################################################################################################################################################################################################
################################################################################################################################################################################################################

In [64]:
emb_dict

{'F1': 0.9221976084579763,
 'RMSE': 0.8855,
 'precision': 0.884649175790988,
 'recall': 0.9630747710480381}

In [65]:
rbm_dict

{'F1': 0.6537634408602151,
 'RMSE': array(0.45991853, dtype=float32),
 'precision': 0.8765264586160109,
 'recall': 0.5212830340932015}

In [66]:
sae_dict

{'F1': 0.8196865066315904,
 'RMSE': 1.3655575513839722,
 'precision': 0.8975036005760921,
 'recall': 0.7542868670566875}

In [67]:
df_comp = pd.DataFrame(columns = list(sae_dict.keys()))

In [68]:
df_comp.loc[0] = [emb_dict[x] for x in df_comp.columns]
df_comp.loc[1] = [sae_dict[x] for x in df_comp.columns]
df_comp.loc[2] = [rbm_dict[x] for x in df_comp.columns]

In [69]:
df_comp['Model'] = ['EmbeddingNet','Stacked AutoEncoder','Restricted Botlzmann Machine']
df_comp = df_comp[['Model','F1','RMSE','precision','recall']]
df_comp

Unnamed: 0,Model,F1,RMSE,precision,recall
0,EmbeddingNet,0.922198,0.8855,0.884649,0.963075
1,Stacked AutoEncoder,0.819687,1.36556,0.897504,0.754287
2,Restricted Botlzmann Machine,0.653763,0.45991853,0.876526,0.521283


##### По результатам Embedding Net дает лучшие предсказания



