## Import

In [1]:
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd
from surprise import Dataset, Reader
from surprise import KNNBasic
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import train_test_split

sns.set_style("darkgrid")

### Importare dataset

In [121]:
df1 = pd.read_csv('./combined_data_1.txt', header = None, names = ['Cust_Id', 'Rating'], usecols = [0,1])

df1['Rating'] = df1['Rating'].astype(float)

print('Dataset 1 shape:',df1.shape)

# Mi genera un nuovo indice da 1
df1 = df1.reset_index()
print('Dataset 1 shape without rating null:',df1.shape)

print('-Dataset examples-')
print(df1[:5])


Dataset 1 shape: (24058263, 2)
Dataset 1 shape without rating null: (24058263, 3)
-Dataset examples-
   index  Cust_Id  Rating
0      0       1:     NaN
1      1  1488844     3.0
2      2   822109     5.0
3      3   885013     4.0
4      4    30878     4.0


### Prendo i primi 1000

In [122]:
df1 = df1.head(100000)

### df_nan è un DataFrame che indica True per le righe in cui la colonna 'Rating' è nulla e False altrimenti. Quindi, ogni volta che c'è una transizione da False a True nella colonna 'Rating', rappresenta l'inizio di un nuovo "film"

In [123]:
df_nan = pd.DataFrame(pd.isnull(df1.Rating))
df_nan = df_nan[df_nan['Rating'] == True]
df_nan = df_nan.reset_index()

movie_np = []
movie_id = 1

for i,j in zip(df_nan['index'][1:],df_nan['index'][:-1]):
    # numpy approach
    temp = np.full((1,i-j-1), movie_id)
    movie_np = np.append(movie_np, temp)
    movie_id += 1

# Account for last record and corresponding length
# numpy approach
last_record = np.full((1,len(df1) - df_nan.iloc[-1, 0] - 1),movie_id)
movie_np = np.append(movie_np, last_record)

#print('Movie numpy:',movie_np)
#print('Length:',len(movie_np))

df = df1[pd.notnull(df1['Rating'])]

df['Movie_Id'] = movie_np.astype(int)
df['Cust_Id'] = df['Cust_Id'].astype(int)
print('-Dataset examples-')
print(df)


-Dataset examples-
       index  Cust_Id  Rating  Movie_Id
1          1  1488844     3.0         1
2          2   822109     5.0         1
3          3   885013     4.0         1
4          4    30878     4.0         1
5          5   823519     3.0         1
...      ...      ...     ...       ...
99995  99995   735848     4.0        30
99996  99996   254710     4.0        30
99997  99997   865725     4.0        30
99998  99998   568153     3.0        30
99999  99999  2502775     4.0        30

[99970 rows x 4 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Movie_Id'] = movie_np.astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Cust_Id'] = df['Cust_Id'].astype(int)


### Creo matrice di rating

In [124]:
ratings_matrix = df.pivot(index='Cust_Id', columns='Movie_Id', values='Rating')

# Se vuoi gestire eventuali valori mancanti con uno zero, puoi utilizzare il metodo fillna
ratings_matrix = ratings_matrix.fillna(0)

# Se preferisci avere la matrice come array numpy, puoi utilizzare values
#ratings_matrix_array = ratings_matrix.values

print(ratings_matrix)

Movie_Id   1    2    3    4    5    6    7    8    9    10  ...   21   22  \
Cust_Id                                                     ...             
7         0.0  0.0  0.0  0.0  0.0  0.0  0.0  5.0  0.0  0.0  ...  0.0  0.0   
134       0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
201       0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
261       0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
265       0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
...       ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
2649331   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
2649336   0.0  0.0  0.0  0.0  0.0  0.0  0.0  4.0  0.0  0.0  ...  0.0  0.0   
2649375   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
2649378   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
2649426   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   

## Usare user based NN

#### Calcolo similarità

In [125]:
id_user_target = 2442

In [126]:
user_target = ratings_matrix.loc[id_user_target]
print(user_target)

Movie_Id
1     3.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    0.0
18    0.0
19    0.0
20    0.0
21    0.0
22    0.0
23    0.0
24    0.0
25    0.0
26    0.0
27    0.0
28    0.0
29    0.0
30    3.0
Name: 2442, dtype: float64


In [127]:
rating_medi = {}
# calcolo rating medio senza considerare gli 0
#for i in ratings_matrix.iterrows():
for i in ratings_matrix.iterrows():
        #print(i[0])
        count = 0
        countn = 0
        for j in i[1].values:
            if j != 0:
                  count += j
                  countn += 1
        rating_medi[i[0]] = count/countn
#print(rating_medi[1333])
print(rating_medi)

{7: 4.5, 134: 5.0, 201: 4.0, 261: 4.0, 265: 3.0, 307: 3.0, 383: 3.0, 384: 4.0, 424: 4.0, 462: 2.0, 478: 4.0, 491: 3.0, 527: 5.0, 592: 4.0, 685: 3.0, 695: 1.0, 734: 1.0, 742: 4.0, 815: 4.0, 834: 3.0, 840: 2.0, 906: 3.0, 911: 4.0, 915: 5.0, 933: 4.0, 967: 1.0, 979: 3.0, 1034: 5.0, 1086: 3.0, 1146: 3.0, 1186: 3.0, 1243: 4.0, 1310: 3.0, 1333: 3.1666666666666665, 1353: 3.0, 1409: 3.0, 1427: 3.0, 1442: 4.0, 1500: 1.0, 1527: 5.0, 1584: 3.0, 1611: 4.0, 1636: 4.0, 1650: 5.0, 1658: 3.0, 1664: 5.0, 1810: 3.0, 1811: 2.5, 1830: 5.0, 1856: 5.0, 1871: 4.0, 1878: 5.0, 1894: 4.0, 1900: 4.0, 1913: 3.0, 1918: 3.0, 1983: 3.0, 2000: 3.5, 2040: 2.0, 2133: 2.3333333333333335, 2213: 4.0, 2225: 3.0, 2264: 3.0, 2273: 3.0, 2276: 3.0, 2307: 3.0, 2363: 4.0, 2442: 3.0, 2455: 3.0, 2469: 2.0, 2555: 5.0, 2614: 3.0, 2630: 2.0, 2678: 2.0, 2757: 2.0, 2775: 5.0, 2787: 2.0, 2878: 5.0, 2905: 5.0, 2976: 3.5, 3039: 3.0, 3104: 3.0, 3168: 4.0, 3184: 2.0, 3210: 5.0, 3285: 1.0, 3292: 4.0, 3321: 2.5714285714285716, 3323: 5.0, 3328

In [128]:
# Definire la funzione per il calcolo della similarità tra utenti utilizzando la person coefficent
similarità = {}
def user_similarity(ratings,user_target):
    # scorro UTENTI
    for i in ratings.iterrows():
        # se l'UTENTE è diverso dall'utente target
        if i[0] != id_user_target:
            # calcolo similarità
            #print(i[1].values[0])
            numeratore = 0
            denominatoreP1 = 0
            denominatoreP2 = 0
            for j in i[1].keys():
                numeratore += (i[1][j]-rating_medi[i[0]])*(user_target[j]-rating_medi[id_user_target])
                
                denominatoreP1 += (i[1][j]-rating_medi[i[0]])**2
                
                denominatoreP2 +=(user_target[j]-rating_medi[id_user_target])**2
                
            sim = numeratore/(math.sqrt(denominatoreP1)*math.sqrt(denominatoreP2))
            similarità[(i[0],id_user_target)] = sim

# Calcolare la similarità tra utenti
user_similarity(ratings_matrix,user_target)
#user_similarity(matrice,user_target)
print(similarità)

{(7, 2442): 0.9281622761265498, (134, 2442): 0.9475142485638551, (201, 2442): 0.9475142485638552, (261, 2442): 0.9826073688810351, (265, 2442): 0.9475142485638551, (307, 2442): 0.9475142485638551, (383, 2442): 0.9475142485638551, (384, 2442): 0.9826073688810351, (424, 2442): 0.9475142485638552, (462, 2442): 0.9475142485638552, (478, 2442): 0.9826073688810351, (491, 2442): 0.9475142485638551, (527, 2442): 0.9475142485638551, (592, 2442): 0.9475142485638552, (685, 2442): 0.9475142485638551, (695, 2442): 0.9475142485638552, (734, 2442): 0.9475142485638552, (742, 2442): 0.9475142485638552, (815, 2442): 0.9475142485638552, (834, 2442): 0.982607368881035, (840, 2442): 0.9475142485638552, (906, 2442): 0.9475142485638551, (911, 2442): 0.9475142485638552, (915, 2442): 0.982607368881035, (933, 2442): 0.9475142485638552, (967, 2442): 0.9475142485638552, (979, 2442): 0.982607368881035, (1034, 2442): 0.9475142485638551, (1086, 2442): 0.9475142485638551, (1146, 2442): 0.9475142485638551, (1186, 2442

In [129]:
utenteSimileUno = ratings_matrix.loc[4326]
utenteSimileDue = ratings_matrix.loc[915]
print(utenteSimileUno)
print(utenteSimileDue)
print(user_target)


Movie_Id
1     4.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    0.0
18    4.0
19    0.0
20    0.0
21    0.0
22    0.0
23    0.0
24    0.0
25    0.0
26    0.0
27    0.0
28    0.0
29    0.0
30    0.0
Name: 4326, dtype: float64
Movie_Id
1     5.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    0.0
18    0.0
19    0.0
20    0.0
21    0.0
22    0.0
23    0.0
24    0.0
25    0.0
26    0.0
27    0.0
28    0.0
29    0.0
30    0.0
Name: 915, dtype: float64
Movie_Id
1     3.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    0.0
18    0.0
19    0.0
20    0.0
21    0.0
22    0.0
23    0.0
24    0.0
25    0.0
26    0.0
27    0.0
28    0.0
29    0.0
30    3.0
Name: 2442, dtype: f

In [130]:
# FACCIO UNA SOGLIO K DI VICINATO SULLA BASE DELLA SIMILARITÀ
sim_soglia = {}
for i in similarità:
    if similarità[i] > 0.95:
        sim_soglia[i] = similarità[i]
print(sim_soglia)

{(261, 2442): 0.9826073688810351, (384, 2442): 0.9826073688810351, (478, 2442): 0.9826073688810351, (834, 2442): 0.982607368881035, (915, 2442): 0.982607368881035, (979, 2442): 0.982607368881035, (1353, 2442): 0.982607368881035, (1427, 2442): 0.982607368881035, (1810, 2442): 0.982607368881035, (2040, 2442): 0.9826073688810351, (2363, 2442): 0.9710491789484752, (2555, 2442): 0.982607368881035, (2775, 2442): 0.9642857142857143, (3039, 2442): 0.982607368881035, (3493, 2442): 0.9826073688810351, (3549, 2442): 0.982607368881035, (3777, 2442): 0.982607368881035, (4258, 2442): 0.982607368881035, (4326, 2442): 0.9642857142857141, (5202, 2442): 0.9826073688810351, (6228, 2442): 0.9826073688810351, (6689, 2442): 0.9584853234143261, (6736, 2442): 0.982607368881035, (6835, 2442): 0.9826073688810351, (6890, 2442): 0.982607368881035, (7602, 2442): 0.9642857142857141, (7797, 2442): 0.982607368881035, (8191, 2442): 0.9532317628209802, (8221, 2442): 0.982607368881035, (8807, 2442): 0.982607368881035, (

#### Prediction

In [131]:
# Definire una funzione per la predizione delle valutazioni utilizzando User-Based Collaborative Filtering
def predict_user_based(item,user_similarities,ratings_matrix,rating_medi,id_user_target):
    # Calcolare le valutazioni previste
    num = 0
    den = 0
    pred_ratings = 0
    # verifico che per quell'item il rating sia 0
    #if ratings_matrix.loc[id_user_target][item] == 0:
    for i in user_similarities:
        #if ratings_matrix.loc[i[0]].values[item] != 0:
        num += (user_similarities[i]* (ratings_matrix.loc[i[0]][item]-rating_medi[i[0]]))
        den += abs(user_similarities[i])
    pred_ratings = rating_medi[id_user_target] + (num / den)
    return pred_ratings
    #else:
    #    return ratings_matrix.loc[id_user_target][item]

# Ottenere le previsioni per tutte le valutazioni
#predicted_ratings_user_based = predict_user_based(id_item_target, sim_soglia,ratings_matrix,rating_medi,id_user_target)
#print(f'Rating per l item {id_item_target} è di {predicted_ratings_user_based} per l utente {id_user_target}')


### Ranking

In [132]:
# Ottenere le previsioni per tutte le valutazioni
y_pred = []
y_true = []

for item in ratings_matrix.columns:
    predicted_ratings_user_based = predict_user_based(item, sim_soglia, ratings_matrix,rating_medi,id_user_target)   
    #print(f'Rating per l item {item} è di {round(predicted_ratings_user_based,2)} per l utente {id_user_target}')
    if predicted_ratings_user_based < 0:
        predicted_ratings_user_based = 0
    y_pred.append(round(predicted_ratings_user_based,2))
for m in range(len(user_target)):
    #if user_target.values[m] > 0:
    #print(f'Rating ESPLICITI per l item {m} è di {user_target.values[m]} per l utente {id_user_target}')
    y_true.append(user_target.values[m])


In [133]:
print(f'y_pred:{y_pred}')
print(f'y_true:{y_true}')

y_pred:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.84]
y_true:[3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0]


In [134]:
# i'm deleting the rate predicted for evaluationg sim_soglia
for i in range(len(y_true)):
    if y_true[i] == 0:
        y_pred[i] = 0

# Calcola le metriche aggiuntive
print(y_true)
print(y_pred)
precision = precision_score(y_true, [round(pred) for pred in y_pred], average='micro')
recall = recall_score(y_true, [round(pred) for pred in y_pred], average='micro')
f1 = f1_score(y_true, [round(pred) for pred in y_pred], average='micro')
#nmae = accuracy.mae(predictions) / (max(y_true) - min(y_true))
#mae = accuracy.mae(y_pred)

def mae(true,pred):
    n = len(true)
    tot_val = 0
    for i in range(n):
        val = abs(pred[i]-true[i])
        tot_val += val
    print(f'MAE:{tot_val/n}')

def Nmae(true,pred):
    n = len(true)
    tot_val = 0
    max = 0
    min = 9999
    for i in range(n):
        if true[i] < min:
            min = true[i]
        if true[i] > max:
            max = true[i]
        val = abs(pred[i]-true[i])
        tot_val += val
    print(f'NMAE:{tot_val/n}')

def rmse(true,pred):
    n = len(true)
    tot_val = 0
    for i in range(n):
        val = (pred[i]-true[i])**2
        tot_val += val
    rapp = tot_val/n
    print(f'RMSE:{math.sqrt(rapp)}')

mae(y_true,y_pred)
Nmae(y_true,y_pred)
rmse(y_true,y_pred)

# Stampa i risultati
print(f'Precision:',precision)
print(f'Recall:',recall)
print(f'F1 Score:',f1)
#print(f'NMAE:',nmae)
#print(f'MAE:',mae)

[3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.84]
MAE:0.10533333333333333
NMAE:0.10533333333333333
RMSE:0.5485009875408916
Precision: 0.9666666666666667
Recall: 0.9666666666666667
F1 Score: 0.9666666666666667


In [16]:
# Load the MovieLens dataset
reader = Reader()
data = Dataset.load_from_df(df[['Cust_Id', 'Movie_Id', 'Rating']][:],reader)

# Create a user-based collaborative filtering model
sim_options = {
    'name': 'cosine',
    'user_based': True  # Compute user similarity
}

# Initialize the KNNBasic algorithm
model = KNNBasic(sim_options=sim_options)

# Perform cross-validation
cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the cosine similarity matrix...


MemoryError: Unable to allocate 275. GiB for an array with shape (192011, 192011) and data type float64

In [17]:
# Load the dataset
#data = Dataset.load_builtin('ml-100k')
# Split the data into a training set and a test set
trainset, testset = train_test_split(data, test_size=0.25)
# Initialize and fit the model
model = KNNBasic(sim_options=sim_options)
model.fit(trainset)
# Make predictions
predictions = model.test(testset)
# Calculate RMSE
rmse = accuracy.rmse(predictions)
print(f'RMSE: {rmse}')

Computing the cosine similarity matrix...


MemoryError: Unable to allocate 257. GiB for an array with shape (185662, 185662) and data type float64

## Usare item based NN 

In [73]:
# Load the MovieLens dataset
reader = Reader()
data = Dataset.load_from_df(df[['Cust_Id', 'Movie_Id', 'Rating']][:],reader)

# Create a user-based collaborative filtering model
sim_options = {
    'name': 'cosine',
    'item_based': True  # Compute user similarity
}

# Initialize the KNNBasic algorithm
model = KNNBasic(sim_options=sim_options)

# Perform cross-validation
cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the cosine similarity matrix...


MemoryError: Unable to allocate 34.0 GiB for an array with shape (67513, 67513) and data type float64

In [51]:
# Load the dataset
data = Dataset.load_builtin('ml-100k')
# Split the data into a training set and a test set
trainset, testset = train_test_split(data, test_size=0.25)
# Initialize and fit the model
model = KNNBasic(sim_options=sim_options)
model.fit(trainset)
# Make predictions
predictions = model.test(testset)
# Calculate RMSE
rmse = accuracy.rmse(predictions)
print(f'RMSE: {rmse}')

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0166
RMSE: 1.0166491829442186


In [135]:
ratings_matrix_transpose = df.pivot(index='Movie_Id', columns='Cust_Id', values='Rating')

# Se vuoi gestire eventuali valori mancanti con uno zero, puoi utilizzare il metodo fillna
ratings_matrix_transpose = ratings_matrix_transpose.fillna(0)

# Se preferisci avere la matrice come array numpy, puoi utilizzare values
#ratings_matrix_array = ratings_matrix.values

print(ratings_matrix_transpose)

Cust_Id   7        134      201      261      265      307      383      \
Movie_Id                                                                  
1             0.0      0.0      0.0      0.0      0.0      0.0      0.0   
2             0.0      0.0      0.0      0.0      0.0      0.0      0.0   
3             0.0      0.0      0.0      0.0      0.0      0.0      0.0   
4             0.0      0.0      0.0      0.0      0.0      0.0      0.0   
5             0.0      0.0      0.0      0.0      0.0      0.0      0.0   
6             0.0      0.0      0.0      0.0      0.0      0.0      0.0   
7             0.0      0.0      0.0      0.0      0.0      0.0      0.0   
8             5.0      0.0      0.0      0.0      0.0      3.0      0.0   
9             0.0      0.0      0.0      0.0      0.0      0.0      0.0   
10            0.0      0.0      0.0      0.0      0.0      0.0      0.0   
11            0.0      0.0      0.0      0.0      0.0      0.0      0.0   
12            0.0      0.

In [136]:
id_item_target = 1

In [137]:
item_target = ratings_matrix_transpose.loc[id_item_target]
print(item_target)

Cust_Id
7          0.0
134        0.0
201        0.0
261        0.0
265        0.0
          ... 
2649331    0.0
2649336    0.0
2649375    0.0
2649378    0.0
2649426    0.0
Name: 1, Length: 81472, dtype: float64


In [138]:
# Definire la funzione per il calcolo della similarità tra utenti utilizzando la person coefficent
similarita_transpose = {}
def item_similarity(ratings,item_target):
    # scorro UTENTI
    for i in ratings.iterrows():
        # i riga film con rating utenti
        # se l'ITEM è diverso dall'item target
        if i[0] != id_item_target:
            # calcolo similarità
            #print(i[1].values[0])
            numeratore = 0
            denominatoreP1 = 0
            denominatoreP2 = 0
            for j in i[1].keys():
                # j utente
                if i[1][j] != 0 and item_target[j] != 0:
                    numeratore += (i[1][j]-rating_medi[j])*(item_target[j]-rating_medi[j])
                    denominatoreP1 += (i[1][j]-rating_medi[j])**2
                    denominatoreP2 +=(item_target[j]-rating_medi[j])**2
            sim = numeratore/(math.sqrt(denominatoreP1)*math.sqrt(denominatoreP2))
            similarita_transpose[(i[0],id_item_target)] = sim

# Calcolare la similarità tra utenti
item_similarity(ratings_matrix_transpose,item_target)
print(similarita_transpose)

{(2, 1): 0.5050879363944402, (3, 1): 0.19327884898469114, (4, 1): -0.34647173354814115, (5, 1): -0.4122182757582631, (6, 1): -0.5365830749120059, (7, 1): -0.043698053588665596, (8, 1): -0.4123811953268984, (9, 1): -0.26597300190705236, (10, 1): -0.32591559175866175, (11, 1): 0.22292913905910003, (12, 1): -0.03134076609451101, (13, 1): 0.9165202125024452, (14, 1): -0.3729887931737906, (15, 1): -0.1532998654479313, (16, 1): -0.5287456201505534, (17, 1): -0.5226348437114082, (18, 1): -0.23524062217785016, (19, 1): -0.4698494368505132, (20, 1): 0.035937095583284805, (21, 1): -0.43042583662050155, (22, 1): -0.5822464292667661, (23, 1): 0.18225685998255134, (24, 1): -0.49605241819528206, (25, 1): -0.1105078770912336, (26, 1): -0.3420810159525499, (27, 1): -0.03253625171211137, (28, 1): -0.4454359646320336, (29, 1): 0.41117503103888825, (30, 1): -0.4083219594009721}


In [116]:
itemSimileUno = ratings_matrix_transpose.loc[2]
itemSimileDue = ratings_matrix_transpose.loc[5]
print(itemSimileUno)
print(itemSimileDue)
print(item_target)

Cust_Id
685        0.0
695        0.0
915        0.0
967        0.0
1333       0.0
          ... 
2647871    0.0
2648122    0.0
2648204    0.0
2648650    0.0
2648861    3.0
Name: 2, Length: 9619, dtype: float64
Cust_Id
685        3.0
695        0.0
915        0.0
967        0.0
1333       0.0
          ... 
2647871    5.0
2648122    4.0
2648204    0.0
2648650    3.0
2648861    0.0
Name: 5, Length: 9619, dtype: float64
Cust_Id
685        0.0
695        0.0
915        5.0
967        0.0
1333       0.0
          ... 
2647871    4.0
2648122    0.0
2648204    0.0
2648650    0.0
2648861    0.0
Name: 1, Length: 9619, dtype: float64


In [118]:
# FACCIO UNA SOGLIO K DI VICINATO SULLA BASE DELLA SIMILARITÀ
sim_soglia_transpose = {}
for i in similarita_transpose:
    if similarita_transpose[i] > 0.0:
        sim_soglia_transpose[i] = similarita_transpose[i]
print(sim_soglia_transpose)

{(2, 1): 0.045871176736922314}


In [139]:
# prendo solo gli item votati dall'utente target
sim_soglia_transpose = {}
for i in similarita_transpose:
    if user_target[i[0]] != 0:
        sim_soglia_transpose[i] = similarita_transpose[i]
print(sim_soglia_transpose)

{(30, 1): -0.4083219594009721}


In [140]:
# Definire una funzione per la predizione delle valutazioni utilizzando User-Based Collaborative Filtering
def predict_item_based(user,item_similarities,ratings_matrix_transpose,id_item_target):
    # Calcolare le valutazioni previste
    num = 0
    den = 0
    #if ratings_matrix_transpose.loc[id_item_target][user] == 0:
    for i in item_similarities:
        if ratings_matrix_transpose.loc[i[0]][user] != 0:
            num += (item_similarities[i]* (ratings_matrix_transpose.loc[i[0]][user]))
            den += item_similarities[i]
    pred_ratings = 0
    if den != 0:
        pred_ratings = (num / den)
    return pred_ratings
    #else:
        #return ratings_matrix_transpose.loc[id_item_target][user]

# Ottenere le previsioni per tutte le valutazioni
#predicted_ratings_user_based = predict_item_based(id_item_target, similarita_transpose,ratings_matrix_transpose,id_item_target)
#print(f'Rating per l item {id_item_target} è di {predicted_ratings_user_based} per l utente {id_user_target}')


In [141]:
# Ottenere le previsioni per tutte le valutazioni
y_pred = []
y_true = []
for item in ratings_matrix.keys():
    predicted_ratings_user_based = predict_item_based(id_user_target, sim_soglia_transpose, ratings_matrix_transpose,item)   
    #print(f'Rating per l item {item} è di {predicted_ratings_user_based} per l utente {id_user_target}')
    if predicted_ratings_user_based < 0:
        predicted_ratings_user_based = 0
    y_pred.append(predicted_ratings_user_based)
for m in range(len(user_target)):
    #if user_target.values[m] > 0:
    #print(f'Rating ESPLICITI per l item {m} è di {user_target.values[m]} per l utente {id_user_target}')
    y_true.append(user_target.values[m])


In [142]:
print(f'y_pred:{y_pred}')
print(f'y_true:{y_true}')

y_pred:[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0]
y_true:[3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0]


In [143]:
# i'm deleting the rate predicted for evaluationg sim_soglia
""" for i in range(len(y_true)):
    if y_true[i] == 0:
        y_pred[i] = 0 """

print(y_true)
print(y_pred)
precision = precision_score(y_true, [round(pred) for pred in y_pred], average='micro')
recall = recall_score(y_true, [round(pred) for pred in y_pred], average='micro')
f1 = f1_score(y_true, [round(pred) for pred in y_pred], average='micro')
#nmae = accuracy.mae(predictions) / (max(y_true) - min(y_true))
#mae = accuracy.mae(y_pred)

def mae(true,pred):
    n = len(true)
    tot_val = 0
    for i in range(n):
        val = abs(pred[i]-true[i])
        tot_val += val
    print(f'MAE:{tot_val/n}')

def Nmae(true,pred):
    n = len(true)
    tot_val = 0
    max = 0
    min = 9999
    for i in range(n):
        if true[i] < min:
            min = true[i]
        if true[i] > max:
            max = true[i]
        val = abs(pred[i]-true[i])
        tot_val += val
    print(f'NMAE:{tot_val/n}')

def rmse(true,pred):
    n = len(true)
    tot_val = 0
    for i in range(n):
        val = (pred[i]-true[i])**2
        tot_val += val
    rapp = tot_val/n
    print(f'RMSE:{math.sqrt(rapp)}')

mae(y_true,y_pred)
Nmae(y_true,y_pred)
rmse(y_true,y_pred)

# Stampa i risultati
print(f'Precision:',precision)
print(f'Recall:',recall)
print(f'F1 Score:',f1)
#print(f'NMAE:',nmae)
#print(f'MAE:',mae)

[3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0]
[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0]
MAE:2.8
NMAE:2.8
RMSE:2.898275349237888
Precision: 0.06666666666666667
Recall: 0.06666666666666667
F1 Score: 0.06666666666666667


## Paradigma : User based vs item base

## Matrix factorization: SVD 

In [144]:
reader = Reader()

# get just top 100K rows for faster run time
data = Dataset.load_from_df(df[['Cust_Id', 'Movie_Id', 'Rating']][:], reader)
#data.split(n_folds=3)
trainsetData, testsetData = train_test_split(data, test_size=0.2)

In [145]:
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'],verbose=True,return_train_measures=True,n_jobs=100)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0513  1.0468  1.0528  1.0523  1.0566  1.0519  0.0032  
MAE (testset)     0.8407  0.8276  0.8320  0.8361  0.8389  0.8351  0.0047  
RMSE (trainset)   0.6781  0.6847  0.6833  0.6803  0.6853  0.6824  0.0027  
MAE (trainset)    0.5345  0.5403  0.5399  0.5370  0.5407  0.5385  0.0024  
Fit time          1.36    1.19    1.68    1.76    1.21    1.44    0.24    
Test time         0.11    0.11    0.16    0.10    0.10    0.11    0.02    


{'test_rmse': array([1.05126162, 1.0467691 , 1.0528272 , 1.05228992, 1.05657739]),
 'train_rmse': array([0.67812596, 0.68473119, 0.68330916, 0.68033043, 0.6853053 ]),
 'test_mae': array([0.84070953, 0.8276445 , 0.83197029, 0.83610932, 0.83891785]),
 'train_mae': array([0.53453883, 0.54030508, 0.53991321, 0.53695977, 0.54066108]),
 'fit_time': (1.3605248928070068,
  1.1910455226898193,
  1.6816437244415283,
  1.759619951248169,
  1.2132368087768555),
 'test_time': (0.1089625358581543,
  0.11092805862426758,
  0.15633296966552734,
  0.09517264366149902,
  0.1035466194152832)}

## Metriche : Precision vs Recal vs F1 vs MAE vs RMSE vs NMAE

In [49]:
def mae(true,pred):
    n = len(true)
    tot_val = 0
    for i in range(n):
        val = abs(pred[i]-true[i])
        tot_val += val
    print(f'MAE:{tot_val/n}')

def Nmae(true,pred):
    n = len(true)
    tot_val = 0
    max = 0
    min = 9999
    for i in range(n):
        if true[i] < min:
            min = true[i]
        if true[i] > max:
            max = true[i]
        val = abs(pred[i]-true[i])
        tot_val += val
    print(f'NMAE:{tot_val/n}')

def rmse(true,pred):
    n = len(true)
    tot_val = 0
    for i in range(n):
        val = (pred[i]-true[i])**2
        tot_val += val
    rapp = tot_val/n
    print(f'RMSE:{math.sqrt(rapp)}')

mae(y_true,y_pred)
Nmae(y_true,y_pred)
rmse(y_true,y_pred)

MAE:0.3826562388494217
NMAE:0.3826562388494217
RMSE:1.0734467184045648


In [44]:
df.head()

Unnamed: 0,index,Cust_Id,Rating,Movie_Id
1,1,1488844,3.0,1
2,2,822109,5.0,1
3,3,885013,4.0,1
4,4,30878,4.0,1
5,5,823519,3.0,1


In [146]:
xTrain = pd.DataFrame({'Movie_Id': df['Movie_Id'], 'Cust_Id': df['Cust_Id']})
yTrain = pd.DataFrame({'Rating': df['Rating']})
print(xTrain.head())
print(yTrain.head())

   Movie_Id  Cust_Id
1         1  1488844
2         1   822109
3         1   885013
4         1    30878
5         1   823519
   Rating
1     3.0
2     5.0
3     4.0
4     4.0
5     3.0


In [151]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate


# Input layer
ratings_input = Input(shape=(None,2))

# Embedding layers
dense0 = Dense(32, activation='relu')(ratings_input)

# Dense layers
#dense1 = Dense(128, activation='relu')(dense0)
dense2 = Dense(64, activation='relu')(dense0)

# Output layer
output = Dense(6,activation='softmax')(dense2)

# Model
model = Model(inputs=ratings_input, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')



In [153]:
model.fit(xTrain,yTrain, epochs=50, batch_size=128)

Epoch 1/50
 18/782 [..............................] - ETA: 2s - loss: 12.6623

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f1b794b7940>

In [156]:
y_pred = model.predict(xTrain)




" mae = np.mean(np.abs(y_pred - yTrain))\nprint(f'MAE: {mae}') "

In [163]:
print(y_pred)
y_pred_custom = []
for i in y_pred:
    val = -1
    index = -1
    for j in range(len(i)):
        if i[j] > val:
            val = i[j]
            index = j
    y_pred_custom.append(index)
y_pred_custom = pd.DataFrame(y_pred_custom)
print(y_pred_custom)


[[0.         0.         0.         0.         0.99999994 0.        ]
 [0.         0.         0.         0.         0.99999994 0.        ]
 [0.         0.         0.         0.         0.99999994 0.        ]
 ...
 [0.         0.         0.         0.         0.99999994 0.        ]
 [0.         0.         0.         0.         1.         0.        ]
 [0.         0.         0.         0.         1.         0.        ]]
[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 

In [161]:
mae = np.mean(np.abs(y_pred_custom - yTrain))
print(f'MAE: {mae}')


ValueError: Unable to coerce to Series, length must be 1: given 99970

In [159]:
mae = np.mean(np.abs(y_pred - yTrain))
print(f'MAE: {mae}')
loss = model.evaluate(xTrain, yTrain)
print(f'Loss on test set: {loss}')


ValueError: Unable to coerce to DataFrame, shape must be (99970, 1): given (99970, 6)