## Import

In [32]:
import math
import pandas as pd
import numpy as np
import seaborn as sns

from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt

from surprise import Reader, Dataset, SVD
from surprise import KNNBasic
from surprise import accuracy
from surprise import accuracy

from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate

sns.set_style("darkgrid")

### Importare dataset

In [2]:
df1 = pd.read_csv('./combined_data_1.txt', header = None, names = ['Cust_Id', 'Rating'], usecols = [0,1])

df1['Rating'] = df1['Rating'].astype(float)

print('Dataset 1 shape:',df1.shape)

# Mi genera un nuovo indice da 1
df1 = df1.reset_index()
print('Dataset 1 shape without rating null:',df1.shape)

print('-Dataset examples-')
print(df1[:5])


Dataset 1 shape: (24058263, 2)
Dataset 1 shape without rating null: (24058263, 3)
-Dataset examples-
   index  Cust_Id  Rating
0      0       1:     NaN
1      1  1488844     3.0
2      2   822109     5.0
3      3   885013     4.0
4      4    30878     4.0


### Prendo i primi x

In [3]:
df1 = df1.head(1000)

### df_nan è un DataFrame che indica True per le righe in cui la colonna 'Rating' è nulla e False altrimenti. Quindi, ogni volta che c'è una transizione da False a True nella colonna 'Rating', rappresenta l'inizio di un nuovo "film"

In [4]:
df_nan = pd.DataFrame(pd.isnull(df1.Rating))
df_nan = df_nan[df_nan['Rating'] == True]
df_nan = df_nan.reset_index()

movie_np = []
movie_id = 1

for i,j in zip(df_nan['index'][1:],df_nan['index'][:-1]):
    # numpy approach
    temp = np.full((1,i-j-1), movie_id)
    movie_np = np.append(movie_np, temp)
    movie_id += 1

# Account for last record and corresponding length
# numpy approach
last_record = np.full((1,len(df1) - df_nan.iloc[-1, 0] - 1),movie_id)
movie_np = np.append(movie_np, last_record)

#print('Movie numpy:',movie_np)
#print('Length:',len(movie_np))

df = df1[pd.notnull(df1['Rating'])]

df['Movie_Id'] = movie_np.astype(int)
df['Cust_Id'] = df['Cust_Id'].astype(int)
print('-Dataset examples-')
print(df)


-Dataset examples-
     index  Cust_Id  Rating  Movie_Id
1        1  1488844     3.0         1
2        2   822109     5.0         1
3        3   885013     4.0         1
4        4    30878     4.0         1
5        5   823519     3.0         1
..     ...      ...     ...       ...
995    995   369761     3.0         3
996    996  1065126     4.0         3
997    997  1101467     4.0         3
998    998   393413     3.0         3
999    999   478176     4.0         3

[997 rows x 4 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Movie_Id'] = movie_np.astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Cust_Id'] = df['Cust_Id'].astype(int)


### Creo matrice di rating

In [5]:
ratings_matrix = df.pivot(index='Cust_Id', columns='Movie_Id', values='Rating')

# Se vuoi gestire eventuali valori mancanti con uno zero, puoi utilizzare il metodo fillna
ratings_matrix = ratings_matrix.fillna(0)

# Se preferisci avere la matrice come array numpy, puoi utilizzare values
#ratings_matrix_array = ratings_matrix.values

print(ratings_matrix)

Movie_Id    1    2    3
Cust_Id                
915       5.0  0.0  0.0
1333      0.0  0.0  4.0
2442      3.0  0.0  0.0
3321      3.0  0.0  0.0
4326      4.0  0.0  0.0
...       ...  ...  ...
2640085   0.0  5.0  0.0
2646060   0.0  0.0  3.0
2646115   0.0  0.0  3.0
2647871   4.0  0.0  0.0
2648861   0.0  3.0  0.0

[985 rows x 3 columns]


### rimuovo le righe nulle

In [517]:
print(ratings_matrix.shape)

(985, 3)


In [518]:
ratings_matrix_support = pd.DataFrame()
for i in ratings_matrix.iterrows():
    val = [not(j) for j in i[1]]
    #print(i[1])
    #print(all(val))
    if not(all(val)):
        ratings_matrix_support = ratings_matrix_support.append(i[1])



  ratings_matrix_support = ratings_matrix_support.append(i[1])
  ratings_matrix_support = ratings_matrix_support.append(i[1])
  ratings_matrix_support = ratings_matrix_support.append(i[1])
  ratings_matrix_support = ratings_matrix_support.append(i[1])
  ratings_matrix_support = ratings_matrix_support.append(i[1])
  ratings_matrix_support = ratings_matrix_support.append(i[1])
  ratings_matrix_support = ratings_matrix_support.append(i[1])
  ratings_matrix_support = ratings_matrix_support.append(i[1])
  ratings_matrix_support = ratings_matrix_support.append(i[1])
  ratings_matrix_support = ratings_matrix_support.append(i[1])
  ratings_matrix_support = ratings_matrix_support.append(i[1])
  ratings_matrix_support = ratings_matrix_support.append(i[1])
  ratings_matrix_support = ratings_matrix_support.append(i[1])
  ratings_matrix_support = ratings_matrix_support.append(i[1])
  ratings_matrix_support = ratings_matrix_support.append(i[1])
  ratings_matrix_support = ratings_matrix_support.appen

In [519]:
#ratings_matrix = ratings_matrix_support.copy()
print(ratings_matrix_support.shape)

(985, 3)


# Creo Test set

### prendo il rating da colore che ne hanno almeno due e lo metto a 0

In [6]:
y_test = {}
count = 0
for i in ratings_matrix.iterrows():
    if count < 10:
        almenoUno = 0
        for j in range(1,len(i[1])):
            if i[1][j] != 0:
                if almenoUno:
                    y_test[(i[0],j)] = i[1][j]
                    i[1][j] = 0
                    count +=1
                    break
                else:
                    almenoUno= True
            

In [7]:
y_test

{(305344, 2): 1.0,
 (387418, 2): 1.0,
 (515436, 2): 1.0,
 (636262, 2): 1.0,
 (1374216, 2): 1.0,
 (1398626, 2): 3.0,
 (1664010, 2): 4.0,
 (1806515, 2): 3.0,
 (2118461, 2): 4.0,
 (2439493, 2): 1.0}

## Usare user based NN

#### Calcolo rating medi degli utenti

In [8]:
rating_medi = {}
# calcolo rating medio senza considerare gli 0
#for i in ratings_matrix.iterrows():
for i in ratings_matrix.iterrows():
        #print(i[0])
        count = 0
        countn = 0
        for j in i[1].values:
            if j != 0:
                  count += j
                  countn += 1
        if countn != 0:
                rating_medi[i[0]] = count/countn
        else:
                rating_medi[i[0]] = 0
#print(rating_medi[1333])
print(rating_medi)

{915: 5.0, 1333: 4.0, 2442: 3.0, 3321: 3.0, 4326: 4.0, 6689: 4.0, 11409: 5.0, 11589: 3.0, 13651: 3.0, 14756: 4.0, 14924: 5.0, 16272: 4.0, 21722: 4.0, 21983: 4.0, 24344: 4.0, 30245: 5.0, 30878: 4.0, 31913: 4.0, 34907: 3.0, 38052: 3.0, 41371: 5.0, 41422: 4.0, 42921: 3.0, 42930: 3.0, 44783: 3.0, 44937: 5.0, 45117: 5.0, 51230: 4.0, 51334: 4.0, 52540: 1.0, 54774: 4.0, 55016: 3.0, 57961: 4.0, 59052: 2.0, 60343: 5.0, 65932: 3.0, 66414: 5.0, 67315: 4.0, 68033: 4.0, 68959: 3.0, 69809: 5.0, 77266: 2.0, 78931: 5.0, 79160: 4.0, 87113: 2.0, 93986: 5.0, 94565: 4.0, 99400: 5.0, 101597: 5.0, 104768: 5.0, 105086: 5.0, 108052: 4.0, 109089: 5.0, 115498: 3.0, 120491: 5.0, 121073: 5.0, 121318: 4.0, 121456: 4.0, 122197: 1.0, 124105: 4.0, 134001: 4.0, 136106: 3.0, 143274: 3.0, 145873: 3.0, 147386: 5.0, 151004: 5.0, 153249: 4.0, 155164: 4.0, 156078: 5.0, 162854: 4.0, 166041: 4.0, 166100: 4.0, 172264: 4.0, 173930: 4.0, 175763: 4.0, 181592: 4.0, 183215: 4.0, 183903: 5.0, 188416: 3.0, 188613: 4.0, 190418: 3.0, 1

### fun di similarità

In [9]:
# Definire la funzione per il calcolo della similarità tra utenti utilizzando la person coefficent

def user_similarity(ratings,user_target,id_user_target):
    similarità = {}
    # scorro UTENTI
    for i in ratings.iterrows():
        # se l'UTENTE è diverso dall'utente target
        if i[0] != id_user_target:
            # calcolo similarità
            
            numeratore = 0
            denominatoreP1 = 0
            denominatoreP2 = 0
            for j in i[1].keys():
                #if user_target[j] != 0:
                numeratore += (i[1][j]-rating_medi[i[0]])*(user_target[j]-rating_medi[id_user_target])
                
                denominatoreP1 += (i[1][j]-rating_medi[i[0]])**2
                
                denominatoreP2 +=(user_target[j]-rating_medi[id_user_target])**2
            
            sim = 0
            if denominatoreP1 !=0 and denominatoreP2 != 0:
                sim = numeratore/(math.sqrt(denominatoreP1)*math.sqrt(denominatoreP2))
            #print(i[0],id_user_target)
            similarità[(i[0],id_user_target)] = sim
    return similarità

### pre-processing sulla similarità

In [10]:
similarita_coppie = {}
for i in ratings_matrix.iterrows():
    # Calcolare la similarità tra utenti
    similarita_coppie.update(user_similarity(ratings_matrix,i[1],i[0]))

In [84]:
utenteSimileUno = ratings_matrix.loc[261]
utenteSimileDue = ratings_matrix.loc[2044720]
print(utenteSimileUno)
print(utenteSimileDue)
print(user_target)


Movie_Id
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    0.0
18    0.0
19    0.0
20    0.0
21    0.0
22    0.0
23    0.0
24    0.0
25    0.0
26    0.0
27    0.0
28    0.0
29    0.0
30    4.0
Name: 261, dtype: float64
Movie_Id
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     1.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    1.0
18    0.0
19    0.0
20    0.0
21    0.0
22    0.0
23    0.0
24    0.0
25    0.0
26    0.0
27    0.0
28    0.0
29    0.0
30    4.0
Name: 2044720, dtype: float64
Movie_Id
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    0.0
18    0.0
19    0.0
20    0.0
21    0.0
22    0.0
23    0.0
24    0.0
25    0.0
26    0.0
27    0.0
28    0.0
29    0.0
30    3.0
Name: 2442, dtype

In [12]:
# FACCIO UNA SOGLIO K DI VICINATO SULLA BASE DELLA SIMILARITÀ
def calcoloSimilaritaConSoglia(similarita):
    sim_soglia = {}
    for i in similarita:
        if similarita[i] > 0.95:
            sim_soglia[i] = similarita[i]
    return sim_soglia

#### fun Prediction

In [14]:
# Definire una funzione per la predizione delle valutazioni utilizzando User-Based Collaborative Filtering
def predict_user_based(item,user_similarities,ratings_matrix,rating_medi,id_user_target):
    # Calcolare le valutazioni previste
    num = 0
    den = 0
    pred_ratings = 0
    # verifico che per quell'item il rating sia 0
    #if ratings_matrix.loc[id_user_target][item] == 0:
    for i in user_similarities:
        #if ratings_matrix.loc[i[0]].values[item] != 0:
        num += (user_similarities[i]* (ratings_matrix.loc[i[0]][item]-rating_medi[i[0]]))
        den += user_similarities[i]
    if den != 0:
        pred_ratings = rating_medi[id_user_target] + (num / den)
    else:
        pred_ratings = rating_medi[id_user_target]
    print(pred_ratings)
    return pred_ratings
    #else:
    #    return ratings_matrix.loc[id_user_target][item]

# Ottenere le previsioni per tutte le valutazioni
#predicted_ratings_user_based = predict_user_based(id_item_target, sim_soglia,ratings_matrix,rating_medi,id_user_target)
#print(f'Rating per l item {id_item_target} è di {predicted_ratings_user_based} per l utente {id_user_target}')


### testing model

In [15]:
# Ottenere le previsioni per tutte le valutazioni
y_pred = {}

for i in y_test:
    print(f'{i} starting...')
    # vettore rating utente target
    print(f'user target processing...')
    user_target = ratings_matrix.loc[i[0]]
    
    print('sim_soglia processing...')
    # SIMILARITÀ SOPRA CERTA SOGLIA
    sim_soglia = calcoloSimilaritaConSoglia(similarita_coppie)
    # predico rating test
    print('predicting...')
    predicted_ratings_user_based = predict_user_based(i[1], sim_soglia, ratings_matrix,rating_medi,i[0])   
    if predicted_ratings_user_based < 0:
        predicted_ratings_user_based = 0
    # rating predetto
    print('Appending...')
    y_pred[i]=round(predicted_ratings_user_based,2)


(305344, 2) starting...
user target processing...
sim_soglia processing...
predicting...
-2.559294319289985
Appending...
(387418, 2) starting...
user target processing...
sim_soglia processing...
predicting...
-2.559294319289985
Appending...
(515436, 2) starting...
user target processing...
sim_soglia processing...
predicting...
-2.559294319289985
Appending...
(636262, 2) starting...
user target processing...
sim_soglia processing...
predicting...
-2.559294319289985
Appending...
(1374216, 2) starting...
user target processing...
sim_soglia processing...
predicting...
-1.559294319289985
Appending...
(1398626, 2) starting...
user target processing...
sim_soglia processing...
predicting...
-1.559294319289985
Appending...
(1664010, 2) starting...
user target processing...
sim_soglia processing...
predicting...
1.440705680710015
Appending...
(1806515, 2) starting...
user target processing...
sim_soglia processing...
predicting...
-0.5592943192899851
Appending...
(2118461, 2) starting...
use

In [16]:
print(f'y_pred:{y_pred}')
print(f'y_test:{y_test}')

y_pred:{(305344, 2): 0, (387418, 2): 0, (515436, 2): 0, (636262, 2): 0, (1374216, 2): 0, (1398626, 2): 0, (1664010, 2): 1.44, (1806515, 2): 0, (2118461, 2): 1.44, (2439493, 2): 0}
y_test:{(305344, 2): 1.0, (387418, 2): 1.0, (515436, 2): 1.0, (636262, 2): 1.0, (1374216, 2): 1.0, (1398626, 2): 3.0, (1664010, 2): 4.0, (1806515, 2): 3.0, (2118461, 2): 4.0, (2439493, 2): 1.0}


### MAE, NMAE, RMSE, Precision, Recall, F1

In [18]:
# i'm deleting the rate predicted for evaluationg sim_soglia
""" for i in range(len(y_true)):rating_medi[id_user_target]
    if y_true[i] == 0:
        y_pred[i] = 0 """

# Calcola le metriche aggiuntive

precision = precision_score([round(y_test[pred]) for pred in y_test], [round(y_pred[pred]) for pred in y_pred], average='micro')
recall = recall_score([round(y_test[pred]) for pred in y_test], [round(y_pred[pred]) for pred in y_pred], average='micro')
f1 = f1_score([round(y_test[pred]) for pred in y_test], [round(y_pred[pred]) for pred in y_pred], average='micro')
#nmae = accuracy.mae(predictions) / (max(y_true) - min(y_true))
#mae = accuracy.mae(y_pred)

def mae(true,pred):
    n = len(true)
    tot_val = 0
    for i in true:
        val = abs(pred[i]-true[i])
        tot_val += val
    print(f'MAE:{(tot_val/n)}')

def Nmae(true,pred):
    n = len(true)
    tot_val = 0
    max = 0
    min = 9999
    for i in true:
        if true[i] < min:
            min = true[i]
        if true[i] > max:
            max = true[i]
        val = abs(pred[i]-true[i])
        tot_val += val
    print(f'NMAE:{(tot_val/n)}')

def rmse(true,pred):
    n = len(true)
    tot_val = 0
    for i in true:
        val = (pred[i]-true[i])**2
        tot_val += val
    rapp = tot_val/n
    print(f'RMSE:{math.sqrt(rapp)}')

mae(y_test,y_pred)
Nmae(y_test,y_pred)
rmse(y_test,y_pred)

# Stampa i risultati
print(f'Precision:',precision)
print(f'Recall:',recall)
print(f'F1 Score:',f1)
#print(f'NMAE:',nmae)
#print(f'MAE:',mae)

MAE:1.7120000000000002
NMAE:1.7120000000000002
RMSE:1.9263229220460416
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


## Usare item based NN 

In [19]:
ratings_matrix_transpose = df.pivot(index='Movie_Id', columns='Cust_Id', values='Rating')

# Se vuoi gestire eventuali valori mancanti con uno zero, puoi utilizzare il metodo fillna
ratings_matrix_transpose = ratings_matrix_transpose.fillna(0)

# Se preferisci avere la matrice come array numpy, puoi utilizzare values
#ratings_matrix_array = ratings_matrix.values

print(ratings_matrix_transpose)

Cust_Id   915      1333     2442     3321     4326     6689     11409    \
Movie_Id                                                                  
1             5.0      0.0      3.0      3.0      4.0      0.0      0.0   
2             0.0      0.0      0.0      0.0      0.0      0.0      5.0   
3             0.0      4.0      0.0      0.0      0.0      4.0      0.0   

Cust_Id   11589    13651    14756    ...  2630337  2630797  2631796  2632461  \
Movie_Id                             ...                                       
1             3.0      3.0      4.0  ...      5.0      5.0      4.0      0.0   
2             0.0      0.0      0.0  ...      0.0      0.0      0.0      0.0   
3             0.0      0.0      0.0  ...      0.0      0.0      0.0      3.0   

Cust_Id   2635437  2640085  2646060  2646115  2647871  2648861  
Movie_Id                                                        
1             4.0      0.0      0.0      0.0      4.0      0.0  
2             0.0      5.0  

### fun adjusted cosine similarity

In [20]:
# Definire la funzione per il calcolo della similarità tra utenti utilizzando la person coefficent
def item_similarity(ratings,item_target,id_item_target):
    similarita_transpose = {}
    # scorro UTENTI
    for i in ratings.iterrows():
        # i riga film con rating utenti
        # se l'ITEM è diverso dall'item target
        if i[0] != id_item_target:
            # calcolo similarità
            #print(i[0])
            numeratore = 0
            denominatoreP1 = 0
            denominatoreP2 = 0
            for j in i[1].keys():
                #print(rating_medi[j])
                # j utente
                #if i[1][j] != 0 and item_target[j] != 0:
                numeratore += (i[1][j]-rating_medi[j])*(item_target[j]-rating_medi[j])
                denominatoreP1 += (i[1][j]-rating_medi[j])**2
                denominatoreP2 +=(item_target[j]-rating_medi[j])**2
            sim = 0
            if denominatoreP1 != 0 and denominatoreP2 != 0:
                sim = numeratore/(math.sqrt(denominatoreP1)*math.sqrt(denominatoreP2))
            similarita_transpose[(i[0],id_item_target)] = sim

    return similarita_transpose
# Calcolare la similarità tra utenti
#item_similarity(ratings_matrix_transpose,item_target)
#print(similarita_transpose)

### Pre processing similarity

In [21]:
similarita_coppie_transpose = {}
for i in ratings_matrix_transpose.iterrows():
    # print(i)
    # Calcolare la similarità tra utenti
    similarita_coppie_transpose.update(item_similarity(ratings_matrix_transpose,i[1],i[0]))

In [22]:
similarita_coppie_transpose

{(2, 1): 0.48504790748652427,
 (3, 1): 0.24872424635819407,
 (1, 2): 0.48504790748652427,
 (3, 2): 0.7228421339246336,
 (1, 3): 0.24872424635819407,
 (2, 3): 0.7228421339246336}

In [118]:
# FACCIO UNA SOGLIO K DI VICINATO SULLA BASE DELLA SIMILARITÀ
sim_soglia_transpose = {}
for i in similarita_transpose:
    if similarita_transpose[i] > 0.0:
        sim_soglia_transpose[i] = similarita_transpose[i]
print(sim_soglia_transpose)

{(2, 1): 0.045871176736922314}


### selecting item simili

In [23]:
# prendo solo gli item votati dall'utente target
def similaritaSoglia(similarita_transpose):
    sim_soglia_transpose = {}
    for i in similarita_transpose:
        if user_target[i[0]] > 0.5:
            sim_soglia_transpose[i] = similarita_transpose[i]
    return sim_soglia_transpose

### fun predict

In [24]:
# Definire una funzione per la predizione delle valutazioni utilizzando User-Based Collaborative Filtering
def predict_item_based(user,item_similarities,ratings_matrix_transpose,id_item_target):
    # Calcolare le valutazioni previste
    num = 0
    den = 0
    #if ratings_matrix_transpose.loc[id_item_target][user] == 0:
    for i in item_similarities:
        #print(i)
        if ratings_matrix_transpose.loc[i[0]][user] != 0:
            num += (item_similarities[i]* (ratings_matrix_transpose.loc[i[0]][user]))
            den += item_similarities[i]
    pred_ratings = 0
    if den != 0:
        pred_ratings = (num / den)
    return pred_ratings
    #else:
        #return ratings_matrix_transpose.loc[id_item_target][user]

# Ottenere le previsioni per tutte le valutazioni
#predicted_ratings_user_based = predict_item_based(id_item_target, similarita_transpose,ratings_matrix_transpose,id_item_target)
#print(f'Rating per l item {id_item_target} è di {predicted_ratings_user_based} per l utente {id_user_target}')


### Testing model

In [25]:
# Ottenere le previsioni per tutte le valutazioni
y_pred_transpose = {}

for i in y_test:
    print(f'{i} starting...')
    # vettore rating utente target
    print(f'user target processing...')
    item_target = ratings_matrix_transpose.loc[i[1]]
    
    print('sim_soglia processing...')
    # SIMILARITÀ SOPRA CERTA SOGLIA
    sim_soglia_transpose = similaritaSoglia(similarita_coppie_transpose)
    # predico rating test
    print('predicting...')
    predicted_ratings_item_based = predict_item_based(i[0], sim_soglia_transpose, ratings_matrix_transpose,i[1])   
    
    if predicted_ratings_item_based < 0:
        predicted_ratings_item_based = 0
    # rating predetto
    print('Appending...')
    y_pred_transpose[i]=round(predicted_ratings_item_based,2)


(305344, 2) starting...
user target processing...
sim_soglia processing...
predicting...
Appending...
(387418, 2) starting...
user target processing...
sim_soglia processing...
predicting...
Appending...
(515436, 2) starting...
user target processing...
sim_soglia processing...
predicting...
Appending...
(636262, 2) starting...
user target processing...
sim_soglia processing...
predicting...
Appending...
(1374216, 2) starting...
user target processing...
sim_soglia processing...
predicting...
Appending...
(1398626, 2) starting...
user target processing...
sim_soglia processing...
predicting...
Appending...
(1664010, 2) starting...
user target processing...
sim_soglia processing...
predicting...
Appending...
(1806515, 2) starting...
user target processing...
sim_soglia processing...
predicting...
Appending...
(2118461, 2) starting...
user target processing...
sim_soglia processing...
predicting...
Appending...
(2439493, 2) starting...
user target processing...
sim_soglia processing...
p

In [26]:
print(f'y_pred:{y_pred_transpose}')
print(f'y_true:{y_test}')

y_pred:{(305344, 2): 1.0, (387418, 2): 1.0, (515436, 2): 1.0, (636262, 2): 1.0, (1374216, 2): 2.0, (1398626, 2): 2.0, (1664010, 2): 5.0, (1806515, 2): 3.0, (2118461, 2): 5.0, (2439493, 2): 1.0}
y_true:{(305344, 2): 1.0, (387418, 2): 1.0, (515436, 2): 1.0, (636262, 2): 1.0, (1374216, 2): 1.0, (1398626, 2): 3.0, (1664010, 2): 4.0, (1806515, 2): 3.0, (2118461, 2): 4.0, (2439493, 2): 1.0}


### MAE, NMAE, RMSE, Precision, Recall, F1

In [27]:
# i'm deleting the rate predicted for evaluationg sim_soglia
""" for i in range(len(y_true)):
    if y_true[i] == 0:
        y_pred[i] = 0 """


precision = precision_score([round(y_test[pred]) for pred in y_test], [round(y_pred_transpose[pred]) for pred in y_pred_transpose], average='micro')
recall = recall_score([round(y_test[pred]) for pred in y_test], [round(y_pred_transpose[pred]) for pred in y_pred_transpose], average='micro')
f1 = f1_score([round(y_test[pred]) for pred in y_test], [round(y_pred_transpose[pred]) for pred in y_pred_transpose], average='micro')
#nmae = accuracy.mae(predictions) / (max(y_true) - min(y_true))
#mae = accuracy.mae(y_pred)

def mae(true,pred):
    n = len(true)
    tot_val = 0
    for i in pred:
        val = abs(pred[i]-true[i])
        tot_val += val
    print(f'MAE:{1-(tot_val/n)}')

def Nmae(true,pred):
    n = len(true)
    tot_val = 0
    max = 0
    min = 9999
    for i in pred:
        if true[i] < min:
            min = true[i]
        if true[i] > max:
            max = true[i]
        val = abs(pred[i]-true[i])
        tot_val += val
    print(f'NMAE:{1-(tot_val/n)}')

def rmse(true,pred):
    n = len(true)
    tot_val = 0
    for i in pred:
        val = (pred[i]-true[i])**2
        tot_val += val
    rapp = tot_val/n
    print(f'RMSE:{1-math.sqrt(rapp)}')

mae(y_test,y_pred_transpose)
Nmae(y_test,y_pred_transpose)
rmse(y_test,y_pred_transpose)

# Stampa i risultati
print(f'Precision:',precision)
print(f'Recall:',recall)
print(f'F1 Score:',f1)
#print(f'NMAE:',nmae)
#print(f'MAE:',mae)

MAE:0.6
NMAE:0.6
RMSE:0.3675444679663241
Precision: 0.6
Recall: 0.6
F1 Score: 0.6


## Paradigma : User based vs item base

## Matrix factorization: SVD 

In [28]:
reader = Reader()

# get just top 100K rows for faster run time
data = Dataset.load_from_df(df[['Cust_Id', 'Movie_Id', 'Rating']][:], reader)
#data.split(n_folds=3)
trainsetData, testsetData = train_test_split(data, test_size=0.2)

In [29]:
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'],verbose=True,return_train_measures=True,n_jobs=100)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0288  1.1406  1.0539  1.1039  1.1003  1.0855  0.0395  
MAE (testset)     0.8150  0.9319  0.8590  0.8994  0.8902  0.8791  0.0396  
RMSE (trainset)   0.7032  0.6807  0.7440  0.7037  0.6914  0.7046  0.0215  
MAE (trainset)    0.5630  0.5437  0.5911  0.5624  0.5548  0.5630  0.0157  
Fit time          0.02    0.01    0.01    0.01    0.01    0.01    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([1.0287584 , 1.14058036, 1.05386591, 1.10385707, 1.10030391]),
 'train_rmse': array([0.70317169, 0.68065717, 0.7440008 , 0.70374151, 0.69138451]),
 'test_mae': array([0.81496071, 0.93190821, 0.85901677, 0.89935797, 0.89016553]),
 'train_mae': array([0.56303226, 0.54372737, 0.59106961, 0.56243586, 0.55479555]),
 'fit_time': (0.01925039291381836,
  0.009661674499511719,
  0.010292768478393555,
  0.01019906997680664,
  0.010210752487182617),
 'test_time': (0.001659393310546875,
  0.0008640289306640625,
  0.0009436607360839844,
  0.000835418701171875,
  0.0008630752563476562)}

## NN

In [44]:
xTrain = pd.DataFrame({'Movie_Id': df['Movie_Id'], 'Cust_Id': df['Cust_Id']})
yTrain = pd.DataFrame({'Rating': df['Rating']})

print(xTrain.head())
print(yTrain.head())

AttributeError: 'DataFrame' object has no attribute 'raw_ratings'

In [33]:
# Input layer
ratings_input = Input(shape=(None,2))

# Embedding layers
dense0 = Dense(32, activation='relu')(ratings_input)

# Dense layers
#dense1 = Dense(128, activation='relu')(dense0)
dense2 = Dense(64, activation='relu')(dense0)

# Output layer
output = Dense(6,activation='softmax')(dense2)

# Model
model = Model(inputs=ratings_input, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

2024-01-23 09:31:11.666152: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-01-23 09:31:13.735163: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2024-01-23 09:31:13.735190: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-01-23 09:31:13.740028: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neur

In [34]:
model.fit(xTrain,yTrain, epochs=50, batch_size=128)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f67ee29dc30>

In [35]:
y_pred = model.predict(xTrain)



In [40]:
print(y_pred)
y_pred_custom = []
for i in y_pred:
    val = -1
    index = -1
    for j in range(len(i)):
        if i[j] > val:
            val = i[j]
            index = j
    y_pred_custom.append(index)
y_pred_custom = pd.DataFrame(y_pred_custom)
print(y_pred_custom)
print(yTrain)


[[0.         0.         0.         0.         0.         0.99999994]
 [0.         0.         0.         0.         0.         0.99999994]
 [0.         0.         0.         0.         0.         0.99999994]
 ...
 [0.         0.         0.         0.         0.         1.        ]
 [0.         0.         0.         0.         0.         1.        ]
 [0.         0.         0.         0.         0.         1.        ]]
     0
0    5
1    5
2    5
3    5
4    5
..  ..
992  5
993  5
994  5
995  5
996  5

[997 rows x 1 columns]
     Rating
1       3.0
2       5.0
3       4.0
4       4.0
5       3.0
..      ...
995     3.0
996     4.0
997     4.0
998     3.0
999     4.0

[997 rows x 1 columns]


In [39]:
mae = np.mean(y_pred_custom - yTrain)
print(f'MAE: {mae}')

MAE: 0        NaN
Rating   NaN
dtype: float64


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [159]:
mae = np.mean(np.abs(y_pred - yTrain))
print(f'MAE: {mae}')
loss = model.evaluate(xTrain, yTrain)
print(f'Loss on test set: {loss}')


ValueError: Unable to coerce to DataFrame, shape must be (99970, 1): given (99970, 6)