## Import

In [1]:
import math
import pandas as pd
import numpy as np
import seaborn as sns

from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt

from surprise import Reader, Dataset, SVD
from surprise import KNNBasic
from surprise import accuracy
from surprise import accuracy

from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate,Dot

sns.set_style("darkgrid")

2024-03-24 11:13:19.473039: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-24 11:13:26.545728: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-03-24 11:13:26.546004: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


### Importare dataset

In [2]:
df1 = pd.read_csv('./combined_data_1.txt', header = None, names = ['Cust_Id', 'Rating'], usecols = [0,1])

df1['Rating'] = df1['Rating'].astype(float)

print('Dataset 1 shape:',df1.shape)

# Mi genera un nuovo indice da 1
df1 = df1.reset_index()
print('shape:',df1.shape)

print('Dataset')
print(df1[:5])


Dataset 1 shape: (24058263, 2)
shape: (24058263, 3)
Dataset
   index  Cust_Id  Rating
0      0       1:     NaN
1      1  1488844     3.0
2      2   822109     5.0
3      3   885013     4.0
4      4    30878     4.0


### Prendo i primi x , così che il calcolo del film non sia lunghissimo ma genero tante colonne di film

In [3]:
#df1 = df1.head(5000000)
df1 = df1.head(1000000)

### df_nan è un DataFrame che indica True per le righe in cui la colonna 'Rating' è nulla e False altrimenti. Quindi, ogni volta che c'è una transizione da False a True nella colonna 'Rating', rappresenta l'inizio di un nuovo "film"

In [4]:
df_nan = pd.DataFrame(pd.isnull(df1.Rating))
df_nan = df_nan[df_nan['Rating'] == True]
df_nan = df_nan.reset_index()

movie_np = []
movie_id = 1

for i,j in zip(df_nan['index'][1:],df_nan['index'][:-1]):
    temp = np.full((1,i-j-1), movie_id)
    movie_np = np.append(movie_np, temp)
    movie_id += 1


last_record = np.full((1,len(df1) - df_nan.iloc[-1, 0] - 1),movie_id)
movie_np = np.append(movie_np, last_record)

#print('Movie numpy:',movie_np)
#print('Length:',len(movie_np))

df = df1[pd.notnull(df1['Rating'])]

df['Movie_Id'] = movie_np.astype(int)
df['Cust_Id'] = df['Cust_Id'].astype(int)
print('-Dataset examples-')
print(df)


-Dataset examples-
         index  Cust_Id  Rating  Movie_Id
1            1  1488844     3.0         1
2            2   822109     5.0         1
3            3   885013     4.0         1
4            4    30878     4.0         1
5            5   823519     3.0         1
...        ...      ...     ...       ...
999995  999995  1196927     3.0       225
999996  999996   528854     5.0       225
999997  999997   962705     3.0       225
999998  999998  1299323     2.0       225
999999  999999  2026970     4.0       225

[999775 rows x 4 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Movie_Id'] = movie_np.astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Cust_Id'] = df['Cust_Id'].astype(int)


### Creo matrice di rating

In [5]:
ratings_matrix = df.pivot(index='Cust_Id', columns='Movie_Id', values='Rating')

# eventuali valori mancanti -> con uno zero
ratings_matrix = ratings_matrix.fillna(0)

print(ratings_matrix)

Movie_Id  1    2    3    4    5    6    7    8    9    10   ...  216  217  \
Cust_Id                                                     ...             
6         0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
7         0.0  0.0  0.0  0.0  0.0  0.0  0.0  5.0  0.0  0.0  ...  0.0  0.0   
10        0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
25        0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
33        0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
...       ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
2649401   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
2649404   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
2649409   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
2649426   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  4.0  0.0   
2649429   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   

### Prendo i primi y così che anche lo user based ha tanti film (un buon profilo ricco) ma riduzo il numero di utenti per non renderlo infinito

In [6]:
ratings_matrix = ratings_matrix.head(10000)

### rimuovo le righe nulle

In [7]:
print(ratings_matrix.shape)

(10000, 225)


# Creo Test set

### prendo il rating da colore che ne hanno almeno due e lo metto a 0

In [8]:
y_test = {}
count = 0
# scorro utenti
for i in ratings_matrix.iterrows():
    if count < 20:
        almenoUno = 0
        # scorro rating
        for j in range(1,len(i[1])):
            if i[1][j] != 0:
                # if solo se già ho incontrato un rating != 0
                if almenoUno:
                    y_test[(i[0],j)] = i[1][j]
                    i[1][j] = 0
                    count +=1
                    break
                # almeno un rating diverso da 0
                else:
                    almenoUno= True
            

In [9]:
y_test

{(6, 157): 3.0,
 (7, 28): 4.0,
 (10, 191): 4.0,
 (42, 191): 3.0,
 (59, 180): 2.0,
 (79, 84): 3.0,
 (87, 111): 2.0,
 (97, 167): 4.0,
 (134, 55): 5.0,
 (183, 197): 4.0,
 (188, 108): 3.0,
 (192, 213): 4.0,
 (195, 175): 5.0,
 (199, 111): 4.0,
 (201, 30): 5.0,
 (261, 191): 2.0,
 (265, 30): 5.0,
 (266, 175): 5.0,
 (268, 58): 5.0,
 (283, 148): 1.0}

#### Calcolo rating medi degli utenti

In [10]:
rating_medi = {}
# calcolo rating medio senza considerare gli 0
for i in ratings_matrix.iterrows():
        # somma elementi != 0
        count = 0
        # quanti elementi
        countn = 0
        for j in i[1].values:
            # se non è 0
            if j != 0:
                  count += j
                  countn += 1
        # se la media ha almeno un valore allora
        if countn != 0:
                rating_medi[i[0]] = count/countn
        # media 0
        else:
                rating_medi[i[0]] = 0

print(rating_medi)

{6: 3.4, 7: 4.666666666666667, 10: 3.5, 25: 3.0, 33: 4.0, 42: 4.0, 59: 3.5, 79: 2.5, 87: 3.5, 94: 2.0, 97: 3.5, 116: 4.0, 131: 2.0, 134: 4.857142857142857, 158: 3.0, 164: 5.0, 168: 4.0, 169: 4.0, 178: 3.0, 183: 4.0, 188: 3.1666666666666665, 192: 3.0, 195: 4.5, 199: 4.0, 201: 3.5, 242: 4.0, 247: 4.0, 261: 4.0, 265: 3.3333333333333335, 266: 4.0, 268: 5.0, 283: 4.0, 291: 3.0, 296: 3.6666666666666665, 298: 3.5, 301: 4.0, 302: 5.0, 304: 4.0, 305: 3.3333333333333335, 307: 3.3333333333333335, 330: 3.6666666666666665, 333: 5.0, 352: 5.0, 363: 5.0, 383: 3.3333333333333335, 384: 4.0, 416: 5.0, 424: 3.75, 437: 2.8333333333333335, 439: 2.5, 440: 4.0, 442: 3.5, 462: 2.6, 471: 3.0, 477: 3.5, 478: 3.0, 481: 4.25, 491: 3.0, 492: 4.0, 508: 4.0, 527: 3.857142857142857, 536: 4.333333333333333, 546: 3.5, 561: 3.75, 578: 3.0, 585: 3.0, 592: 3.6, 596: 4.0, 602: 4.2, 609: 4.25, 614: 3.3333333333333335, 624: 4.0, 660: 4.0, 663: 4.0, 664: 3.6666666666666665, 682: 4.0, 684: 4.1, 685: 3.0, 688: 5.0, 695: 2.5, 71

## Usare user based NN

### fun di similarità

In [11]:
# Definire la funzione per il calcolo della similarità tra utenti utilizzando la person coefficent
def user_similarity(ratings,user_target,id_user_target):
    similarità = {}
    # scorro UTENTI
    for i in ratings.iterrows():
        # se l'UTENTE è diverso dall'utente target
        if i[0] != id_user_target:
            # calcolo similarità
            numeratore = 0
            denominatoreP1 = 0
            denominatoreP2 = 0

            # scorro recensioni dell'utente
            for j in i[1].keys():

                diffUtenteNoTarg = i[1][j]-rating_medi[i[0]]
                diffUtenteTarg = user_target[j]-rating_medi[id_user_target]

                numeratore += ((diffUtenteNoTarg)*(diffUtenteTarg))
                denominatoreP1 += (diffUtenteNoTarg**2)
                denominatoreP2 += (diffUtenteTarg**2)
            
            sim = 0
            # check
            if denominatoreP1 !=0 and denominatoreP2 != 0:
                sim = numeratore/(math.sqrt(denominatoreP1)*math.sqrt(denominatoreP2))
            
            similarità[(i[0],id_user_target)] = sim
    return similarità

### pre-processing sulla similarità

In [12]:
similarita_coppie = {}
for i,_ in y_test:
    # Calcolare la similarità per utenti del tewt set
    similarita_coppie.update(user_similarity(ratings_matrix,ratings_matrix.loc[i],i))

In [13]:
similarita_coppie

{(7, 6): 0.9875805968075406,
 (10, 6): 0.993358800656939,
 (25, 6): 0.9855195059770339,
 (33, 6): 0.9894901728020767,
 (42, 6): 0.9911756710275614,
 (59, 6): 0.9884668290322656,
 (79, 6): 0.9858727020225875,
 (87, 6): 0.9895146596158011,
 (94, 6): 0.9855195059770283,
 (97, 6): 0.9888111485861761,
 (116, 6): 0.9894901728020767,
 (131, 6): 0.9921372840187758,
 (134, 6): 0.9743010929906664,
 (158, 6): 0.9855195059770339,
 (164, 6): 0.9921372840187797,
 (168, 6): 0.9894901728020767,
 (169, 6): 0.9881666171937272,
 (178, 6): 0.9894901728020822,
 (183, 6): 0.9894901728020767,
 (188, 6): 0.9796592643963675,
 (192, 6): 0.9855195059770339,
 (195, 6): 0.9861097823278384,
 (199, 6): 0.9933549330596739,
 (201, 6): 0.9807726925069574,
 (242, 6): 0.9855195059770283,
 (247, 6): 0.9881666171937272,
 (261, 6): 0.9894901728020767,
 (265, 6): 0.9807756725665019,
 (266, 6): 0.9898491511252226,
 (268, 6): 0.9855195059770322,
 (283, 6): 0.9894901728020767,
 (291, 6): 0.9881666171937329,
 (296, 6): 0.9851492

In [17]:
# FACCIO UNA SOGLIO K DI VICINATO SULLA BASE DELLA SIMILARITÀ
def calcoloSimilaritaConSoglia(similarita,id_user_target,soglia):
    sim_soglia = {}
    # scorro coppie di similarità (a,b)
    for i in similarita:
        # se la similarità rispetta la soglia
        if similarita[i] > soglia:
            # verifico che sia del mio utente target
            if i[1] == id_user_target:
                sim_soglia[i] = similarita[i]
    return sim_soglia

#### fun Prediction

In [18]:
# Definire una funzione per la predizione delle valutazioni utilizzando User-Based Collaborative Filtering
def predict_user_based(item,user_similarities,ratings_matrix,rating_medi,id_user_target):
    # Calcolare le valutazioni previste
    num = 0
    den = 0
    pred_ratings = 0
    
    for i in user_similarities:     
        
        num += (user_similarities[i]* (ratings_matrix.loc[i[0]][item]-rating_medi[i[0]]))
        den += user_similarities[i]

    if den != 0:
        pred_ratings = rating_medi[id_user_target] + (num / den)
    else:
        pred_ratings = rating_medi[id_user_target]
        
    #print(pred_ratings)
    return pred_ratings
    #else:
    #    return ratings_matrix.loc[id_user_target][item]

# Ottenere le previsioni per tutte le valutazioni
#predicted_ratings_user_based = predict_user_based(id_item_target, sim_soglia,ratings_matrix,rating_medi,id_user_target)
#print(f'Rating per l item {id_item_target} è di {predicted_ratings_user_based} per l utente {id_user_target}')


### testing model

In [19]:
# Ottenere le previsioni per tutte le valutazioni
y_pred = {}

for i in y_test:
    print(f'{i} starting...')

    # vettore rating utente target
    print(f'user target processing...')
    user_target = ratings_matrix.loc[i[0]]
    # SIMILARITÀ SOPRA CERTA SOGLIA
    print('sim_soglia processing...')
    sim_soglia = calcoloSimilaritaConSoglia(similarita_coppie,i[0],0.9)
    print(len(sim_soglia))
    # predico rating test
    print('predicting...')
    predicted_ratings_user_based = predict_user_based(i[1], sim_soglia, ratings_matrix,rating_medi,i[0])   
    if predicted_ratings_user_based < 0:
        predicted_ratings_user_based = 0
    # rating predetto
    print('Appending...')
    y_pred[i]=round(predicted_ratings_user_based,2)


(6, 157) starting...
user target processing...
sim_soglia processing...
9994
predicting...
Appending...
(7, 28) starting...
user target processing...
sim_soglia processing...
9994
predicting...
Appending...
(10, 191) starting...
user target processing...
sim_soglia processing...
9994
predicting...
Appending...
(42, 191) starting...
user target processing...
sim_soglia processing...
9994
predicting...
Appending...
(59, 180) starting...
user target processing...
sim_soglia processing...
9994
predicting...
Appending...
(79, 84) starting...
user target processing...
sim_soglia processing...
9994
predicting...
Appending...
(87, 111) starting...
user target processing...
sim_soglia processing...
9994
predicting...
Appending...
(97, 167) starting...
user target processing...
sim_soglia processing...
9994
predicting...
Appending...
(134, 55) starting...
user target processing...
sim_soglia processing...
9995
predicting...
Appending...
(183, 197) starting...
user target processing...
sim_soglia

In [20]:
print(f'y_pred:{y_pred}')
print(f'y_test:{y_test}')

y_pred:{(6, 157): 0, (7, 28): 1.45, (10, 191): 1.16, (42, 191): 1.66, (59, 180): 0, (79, 84): 0, (87, 111): 0.11, (97, 167): 0, (134, 55): 1.17, (183, 197): 1.29, (188, 108): 0, (192, 213): 0, (195, 175): 2.08, (199, 111): 0.62, (201, 30): 1.35, (261, 191): 1.66, (265, 30): 1.18, (266, 175): 1.58, (268, 58): 1.5, (283, 148): 0.53}
y_test:{(6, 157): 3.0, (7, 28): 4.0, (10, 191): 4.0, (42, 191): 3.0, (59, 180): 2.0, (79, 84): 3.0, (87, 111): 2.0, (97, 167): 4.0, (134, 55): 5.0, (183, 197): 4.0, (188, 108): 3.0, (192, 213): 4.0, (195, 175): 5.0, (199, 111): 4.0, (201, 30): 5.0, (261, 191): 2.0, (265, 30): 5.0, (266, 175): 5.0, (268, 58): 5.0, (283, 148): 1.0}


### MAE, NMAE, RMSE, Precision, Recall, F1

In [21]:

# Calcola le metriche aggiuntive

precision = precision_score([round(y_test[pred]) for pred in y_test], [round(y_pred[pred]) for pred in y_pred], average='micro')
recall = recall_score([round(y_test[pred]) for pred in y_test], [round(y_pred[pred]) for pred in y_pred], average='micro')
f1 = f1_score([round(y_test[pred]) for pred in y_test], [round(y_pred[pred]) for pred in y_pred], average='micro')
#nmae = accuracy.mae(predictions) / (max(y_true) - min(y_true))
#mae = accuracy.mae(y_pred)

def mae(true,pred):
    n = len(true)
    tot_val = 0
    for i in true:
        val = abs(pred[i]-true[i])
        tot_val += val
    print(f'MAE:{(tot_val/n)}')

def Nmae(true,pred):
    n = len(true)
    tot_val = 0
    max = 0
    min = 9999
    for i in true:
        if true[i] < min:
            min = true[i]
        if true[i] > max:
            max = true[i]
        val = abs(pred[i]-true[i])
        tot_val += val
    print(f'NMAE:{(tot_val/(n*(max-min)))}')

def rmse(true,pred):
    n = len(true)
    tot_val = 0
    for i in true:
        val = (pred[i]-true[i])**2
        tot_val += val
    rapp = tot_val/n
    print(f'RMSE:{math.sqrt(rapp)}')

mae(y_test,y_pred)
Nmae(y_test,y_pred)
rmse(y_test,y_pred)

# Stampa i risultati
print(f'Precision:',precision)
print(f'Recall:',recall)
print(f'F1 Score:',f1)
#print(f'NMAE:',nmae)
#print(f'MAE:',mae)

MAE:2.7830000000000004
NMAE:0.6957500000000001
RMSE:2.975713359851718
Precision: 0.1
Recall: 0.1
F1 Score: 0.10000000000000002


## Usare item based NN 

In [22]:
ratings_matrix_transpose = ratings_matrix.transpose()
#df.pivot(index='Movie_Id', columns='Cust_Id', values='Rating')

# eventuali valori mancanti con uno zero
ratings_matrix_transpose = ratings_matrix_transpose.fillna(0)


print(ratings_matrix_transpose)

Cust_Id   6      7      10     25     33     42     59     79     87     \
Movie_Id                                                                  
1           0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
2           0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
3           0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
4           0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
5           0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
...         ...    ...    ...    ...    ...    ...    ...    ...    ...   
221         0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
222         0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
223         0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
224         0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
225         0.0    0.0    0.0    0.0    0.0    0.0    0.0    2.0    0.0   

Cust_Id   94     ...  93

### fun adjusted cosine similarity

In [23]:
# Definire la funzione per il calcolo della similarità tra utenti utilizzando la person coefficent
def item_similarity(ratings,item_target,id_item_target):
    similarita_transpose = {}
    # scorro UTENTI
    for i in ratings.iterrows():
        # i riga film con rating utenti
        # se l'ITEM è diverso dall'item target
        if i[0] != id_item_target:
            # calcolo similarità
            numeratore = 0
            denominatoreP1 = 0
            denominatoreP2 = 0
            for j in i[1].keys():
                diffFilmNoTarg = i[1][j]-rating_medi[j]
                diffFilmTarg = item_target[j]-rating_medi[j]

                numeratore += ((diffFilmNoTarg)*(diffFilmTarg))
                denominatoreP1 += (diffFilmNoTarg**2)
                denominatoreP2 += (diffFilmTarg**2)
            sim = 0
            if denominatoreP1 != 0 and denominatoreP2 != 0:
                sim = numeratore/(math.sqrt(denominatoreP1)*math.sqrt(denominatoreP2))
            similarita_transpose[(i[0],id_item_target)] = sim

    return similarita_transpose


### Pre processing similarity

In [24]:
similarita_coppie_transpose = {}
for i,j in y_test:
    #print(i)
    # Calcolare la similarità tra utenti
    similarita_coppie_transpose.update(item_similarity(ratings_matrix_transpose,ratings_matrix_transpose.loc[j],j))

In [25]:
similarita_coppie_transpose

{(1, 157): 0.9976562218349205,
 (2, 157): 0.9987256808427822,
 (3, 157): 0.9950650177221603,
 (4, 157): 0.9987637364271761,
 (5, 157): 0.9964464630936106,
 (6, 157): 0.9978470676855263,
 (7, 157): 0.9989720243591168,
 (8, 157): 0.9775773367719883,
 (9, 157): 0.9989546572775541,
 (10, 157): 0.9987047068044203,
 (11, 157): 0.9987671797197006,
 (12, 157): 0.9983717847330804,
 (13, 157): 0.9985522646271777,
 (14, 157): 0.9987532770609118,
 (15, 157): 0.998420467685395,
 (16, 157): 0.996006589531489,
 (17, 157): 0.9900141420244415,
 (18, 157): 0.9788296835647652,
 (19, 157): 0.9980246434015899,
 (20, 157): 0.9987570298297289,
 (21, 157): 0.9986389376177321,
 (22, 157): 0.9989127957235977,
 (23, 157): 0.9982048799540402,
 (24, 157): 0.9974087564091452,
 (25, 157): 0.9969023400594204,
 (26, 157): 0.9925779378226651,
 (27, 157): 0.9989303138428107,
 (28, 157): 0.9258367523591499,
 (29, 157): 0.9976076318587068,
 (30, 157): 0.7534844900588512,
 (31, 157): 0.9985381931454357,
 (32, 157): 0.99517

### selecting item simili

In [26]:
# prendo solo gli item votati dall'utente target
def similaritaSoglia(similarita_transpose,id_item_target,soglia):
    sim_soglia_transpose = {}
    # scorro coppie similirarità
    for i in similarita_transpose:
        # controllo la soglia
        if similarita_transpose[i] > soglia:
            # controllo se è la similarità del mio target
            if i[1] == id_item_target:
                sim_soglia_transpose[i] = similarita_transpose[i]
    return sim_soglia_transpose

### fun predict

In [27]:
# Definire una funzione per la predizione delle valutazioni utilizzando User-Based Collaborative Filtering
def predict_item_based(user,sim_soglia_transpose,ratings_matrix_transpose,id_item_target):
    # Calcolare le valutazioni previste
    num = 0
    den = 0
    #scorro item simili a tutti i target
    for i in sim_soglia_transpose:
        # controllo che sia simili al target passato
        if i[1] == id_item_target:
            # se l'utente l'ha votato
            if ratings_matrix_transpose.loc[i[0]][user] != 0:
                
                num += (sim_soglia_transpose[i]* (ratings_matrix_transpose.loc[i[0]][user]))
                den += sim_soglia_transpose[i]

    pred_ratings = 0
    if den != 0:
        pred_ratings = (num / den)
    return pred_ratings
    


### Testing model

In [28]:
# Ottenere le previsioni per tutte le valutazioni
y_pred_transpose = {}

# scorro utenti target
for i in y_test:
    print(f'{i} starting...')
    # vettore rating utente target
    print(f'user target processing...')
    #item_target = ratings_matrix_transpose.loc[i[1]]
    
    print('sim_soglia processing...')
    # SIMILARITÀ SOPRA CERTA SOGLIA
    sim_soglia_transpose = similaritaSoglia(similarita_coppie_transpose,i[1],0.9)
    print(len(sim_soglia_transpose))
    # predico rating test
    print('predicting...')
    predicted_ratings_item_based = predict_item_based(i[0], sim_soglia_transpose, ratings_matrix_transpose,i[1])   
    
    if predicted_ratings_item_based < 0:
        predicted_ratings_item_based = 0
    # rating predetto
    print('Appending...')
    y_pred_transpose[i]=round(predicted_ratings_item_based,2)


(6, 157) starting...
user target processing...
sim_soglia processing...
220
predicting...
Appending...
(7, 28) starting...
user target processing...
sim_soglia processing...
217
predicting...
Appending...
(10, 191) starting...
user target processing...
sim_soglia processing...
0
predicting...
Appending...
(42, 191) starting...
user target processing...
sim_soglia processing...
0
predicting...
Appending...
(59, 180) starting...
user target processing...
sim_soglia processing...
220
predicting...
Appending...
(79, 84) starting...
user target processing...
sim_soglia processing...
220
predicting...
Appending...
(87, 111) starting...
user target processing...
sim_soglia processing...
218
predicting...
Appending...
(97, 167) starting...
user target processing...
sim_soglia processing...
220
predicting...
Appending...
(134, 55) starting...
user target processing...
sim_soglia processing...
220
predicting...
Appending...
(183, 197) starting...
user target processing...
sim_soglia processing..

In [29]:
print(f'y_pred:{y_pred_transpose}')
print(f'y_true:{y_test}')

y_pred:{(6, 157): 4.0, (7, 28): 4.66, (10, 191): 0, (42, 191): 0, (59, 180): 0, (79, 84): 1.51, (87, 111): 2.49, (97, 167): 4.0, (134, 55): 4.83, (183, 197): 0, (188, 108): 3.25, (192, 213): 3.0, (195, 175): 0, (199, 111): 0, (201, 30): 0, (261, 191): 0, (265, 30): 0, (266, 175): 0, (268, 58): 5.0, (283, 148): 0}
y_true:{(6, 157): 3.0, (7, 28): 4.0, (10, 191): 4.0, (42, 191): 3.0, (59, 180): 2.0, (79, 84): 3.0, (87, 111): 2.0, (97, 167): 4.0, (134, 55): 5.0, (183, 197): 4.0, (188, 108): 3.0, (192, 213): 4.0, (195, 175): 5.0, (199, 111): 4.0, (201, 30): 5.0, (261, 191): 2.0, (265, 30): 5.0, (266, 175): 5.0, (268, 58): 5.0, (283, 148): 1.0}


### MAE, NMAE, RMSE, Precision, Recall, F1

In [30]:
# i'm deleting the rate predicted for evaluationg sim_soglia
""" for i in range(len(y_true)):
    if y_true[i] == 0:
        y_pred[i] = 0 """


precision = precision_score([round(y_test[pred]) for pred in y_test], [round(y_pred_transpose[pred]) for pred in y_pred_transpose], average='micro')
recall = recall_score([round(y_test[pred]) for pred in y_test], [round(y_pred_transpose[pred]) for pred in y_pred_transpose], average='micro')
f1 = f1_score([round(y_test[pred]) for pred in y_test], [round(y_pred_transpose[pred]) for pred in y_pred_transpose], average='micro')
#nmae = accuracy.mae(predictions) / (max(y_true) - min(y_true))
#mae = accuracy.mae(y_pred)

def mae(true,pred):
    n = len(true)
    tot_val = 0
    for i in pred:
        val = abs(pred[i]-true[i])
        tot_val += val
    print(f'MAE:{(tot_val/n)}')

def Nmae(true,pred):
    n = len(true)
    tot_val = 0
    max = 0
    min = 9999
    for i in pred:
        if true[i] < min:
            min = true[i]
        if true[i] > max:
            max = true[i]
        val = abs(pred[i]-true[i])
        tot_val += val
    print(f'NMAE:{(tot_val/(n*(max-min)))}')

def rmse(true,pred):
    n = len(true)
    tot_val = 0
    for i in pred:
        val = (pred[i]-true[i])**2
        tot_val += val
    rapp = tot_val/n
    print(f'RMSE:{math.sqrt(rapp)}')

mae(y_test,y_pred_transpose)
Nmae(y_test,y_pred_transpose)
rmse(y_test,y_pred_transpose)

# Stampa i risultati
print(f'Precision:',precision)
print(f'Recall:',recall)
print(f'F1 Score:',f1)
#print(f'NMAE:',nmae)
#print(f'MAE:',mae)

MAE:2.253
NMAE:0.56325
RMSE:2.923928863703767
Precision: 0.25
Recall: 0.25
F1 Score: 0.25


## Paradigma : User based vs item base

## Matrix factorization: SVD 

In [31]:
reader = Reader()

data = Dataset.load_from_df(df[['Cust_Id', 'Movie_Id', 'Rating']][:], reader)
#data.split(n_folds=3)
trainsetData, testsetData = train_test_split(data, test_size=0.2)

In [32]:
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'],verbose=True,return_train_measures=True,n_jobs=100)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9868  0.9904  0.9853  0.9886  0.9866  0.9876  0.0018  
MAE (testset)     0.7731  0.7749  0.7703  0.7724  0.7734  0.7728  0.0015  
RMSE (trainset)   0.7474  0.7493  0.7491  0.7440  0.7417  0.7463  0.0030  
MAE (trainset)    0.5820  0.5835  0.5823  0.5786  0.5765  0.5806  0.0026  
Fit time          11.60   13.11   14.86   12.33   17.28   13.84   2.04    
Test time         1.95    1.97    2.66    2.01    2.18    2.16    0.26    


{'test_rmse': array([0.98678503, 0.99044346, 0.98534082, 0.98857584, 0.98661147]),
 'train_rmse': array([0.74735853, 0.74927202, 0.74910242, 0.74397587, 0.74167531]),
 'test_mae': array([0.77311319, 0.77493634, 0.77032153, 0.77242574, 0.77341355]),
 'train_mae': array([0.58201321, 0.58346412, 0.5822937 , 0.57857634, 0.57654971]),
 'fit_time': (11.60438346862793,
  13.10857343673706,
  14.856320142745972,
  12.328916311264038,
  17.284837245941162),
 'test_time': (1.9532074928283691,
  1.9704313278198242,
  2.657219409942627,
  2.012779951095581,
  2.1848864555358887)}

## NN

In [33]:
xTrain = pd.DataFrame({'Movie_Id': df['Movie_Id'], 'Cust_Id': df['Cust_Id']})
yTrain = pd.DataFrame({'Rating': df['Rating']})

print(xTrain.head())
print(yTrain.head())

   Movie_Id  Cust_Id
1         1  1488844
2         1   822109
3         1   885013
4         1    30878
5         1   823519
   Rating
1     3.0
2     5.0
3     4.0
4     4.0
5     3.0


In [34]:
trainset = trainsetData
testset = testsetData
# creo dataset per nn
xyTrain = pd.DataFrame(trainset.all_ratings(), columns=['Cust_Id', 'Movie_Id', 'Rating'])
# dataset
xTrain = pd.DataFrame({'Movie_Id': xyTrain['Movie_Id'], 'Cust_Id': xyTrain['Cust_Id']})
# valori target
yTrain = pd.DataFrame({'Rating': xyTrain['Rating']})

In [35]:
xyTest = pd.DataFrame(testset,columns=['Cust_Id', 'Movie_Id', 'Rating'])
xTest = pd.DataFrame({'Movie_Id': xyTest['Movie_Id'], 'Cust_Id': xyTest['Cust_Id']})
yTest = pd.DataFrame({'Rating': xyTest['Rating']})
# Convert DataFrame to numpy arrays
xTest_numpy = {'Cust_Id': np.array(xTest['Cust_Id']), 'Movie_Id': np.array(xTest['Movie_Id'])}
yTest_numpy = np.array(yTest['Rating'])

In [36]:

user_input = Input(shape=(1,), name='Cust_Id')
item_input = Input(shape=(1,), name='Movie_Id')

# embedding per utenti e items
user_embedding = Embedding(input_dim=3000000, output_dim=16, input_length=1)(user_input)
item_embedding = Embedding(input_dim=226, output_dim=16, input_length=1)(item_input)

# from multi to mono
user_embedding = Flatten()(user_embedding)
item_embedding = Flatten()(item_embedding)

# ricompatta i due vettori
dot_product = Dot(axes=1, name='dot_product')([user_embedding, item_embedding])

# Dense layers
dense0 = Dense(8, activation='relu')(dot_product)
#dense2 = Dense(32, activation='relu')(dense0)

# Output layer 
output = Dense(1, activation='linear')(dense0)

# Model
model = Model(inputs=[user_input, item_input], outputs=output)

model.compile(optimizer='adam', loss='mean_squared_error')

# Convert DataFrame to numpy arrays
xTrain_numpy = {'Cust_Id': np.array(xTrain['Cust_Id']), 'Movie_Id': np.array(xTrain['Movie_Id'])}
yTrain_numpy = np.array(yTrain['Rating'])

# training
history = model.fit(x=xTrain_numpy, y=yTrain_numpy, epochs=1, batch_size=256,validation_split=0.2)


2024-02-12 16:00:46.094988: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-12 16:00:48.012244: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2024-02-12 16:00:48.019113: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-02-12 16:00:48.021672: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neur



In [37]:
# Print MSE for training and validation
train_mse = history.history['loss'][-1]
val_mse = history.history['val_loss'][-1]

print(f'Training MSE: {train_mse}')
print(f'Validation MSE: {val_mse}')

Training MSE: 2.2136332988739014
Validation MSE: 1.172098159790039


In [38]:
y_pred = model.predict(xTest_numpy)



In [39]:
y_pred
y_pred_custom = []
for i in y_pred:
    y_pred_custom.append(i[0])

In [40]:
#precision = precision_score(yTrain.values, y_pred_custom, average='micro')
#recall = recall_score(yTrain.values, y_pred_custom, average='micro')
#f1 = f1_score(yTrain.values, y_pred_custom, average='micro')


def mae(true,pred):
    n = len(true)
    tot_val = 0
    for i in range(n):
        val = abs(pred[i]-true[i])
        tot_val += val
    print(f'MAE:{(tot_val/n)}')

def Nmae(true,pred):
    n = len(true)
    tot_val = 0
    max = 0
    min = 9999
    for i in range(n):
        if true[i] < min:
            min = true[i]
        if true[i] > max:
            max = true[i]
        val = abs(pred[i]-true[i])
        tot_val += val
    print(f'NMAE:{(tot_val/(n*(max-min)))}')

def rmse(true,pred):
    n = len(true)
    tot_val = 0
    for i in range(n):
        val = (pred[i]-true[i])**2
        tot_val += val
    rapp = tot_val/n
    print(f'RMSE:{math.sqrt(rapp)}')

mae(yTest_numpy,y_pred_custom)
Nmae(yTest_numpy,y_pred_custom)
rmse(yTest_numpy,y_pred_custom)

# Stampa i risultati
print(f'Precision:',precision)
print(f'Recall:',recall)
print(f'F1 Score:',f1)
#print(f'NMAE:',nmae)
#print(f'MAE:',mae)

MAE:0.9084952061706484
NMAE:0.2271238015426621
RMSE:1.0827678479328302
Precision: 0.25
Recall: 0.25
F1 Score: 0.25
