In [None]:
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
import pandas as pd
import numpy as np
import gzip

In [None]:
from surprise import Reader, Dataset, KNNBasic
from surprise.model_selection import cross_validate

In [None]:
ModCloth = pd.read_csv('data/ModCloth_clear.csv')

# user_name is used as the user identifier because it has fewer distinct
# values in the dataset than user_id.
ModCloth['user_id'].unique().size    # --> 47176
ModCloth['user_name'].unique().size  # --> 31883
df = ModCloth[['user_name', 'item_id', 'quality']].rename(
    columns={'user_name': 'user_id', 'quality': 'rating'}
)


# Keep only the users with more than 100 reviews.
reviews_per_user = df.groupby('user_id')['item_id'].count()
best_users = reviews_per_user[reviews_per_user > 100].index
df_sample = df[df['user_id'].isin(best_users)]


# Wrap the sample in a Surprise Dataset so it can be used to train the K-NN model.
# NOTE(review): the lower bound comes from df_sample while the upper bound comes
# from the full df -- confirm this asymmetry is intentional.
rating_bounds = (df_sample['rating'].min(), df['rating'].max())
reader = Reader(rating_scale=rating_bounds)
data = Dataset.load_from_df(df_sample, reader)


# Hyperparameter values to evaluate.
similarity = ['msd', 'cosine', 'pearson', 'pearson_baseline']
k_grid = np.arange(1,30)
user_based = [False, True]

res = {}

# Best RMSE seen so far across item-based and user-based runs. Starting at +inf
# means the first evaluated configuration is always recorded, regardless of the
# order of `user_based`.
best_rmse_knn = np.inf

for based in user_based:
    res = {}
    for s in tqdm(similarity):
        RMSE = []
        MSE = []
        for k in k_grid:
            # sim_options only configures the similarity measure. The number of
            # neighbours is a KNNBasic constructor argument: a "k" key placed in
            # sim_options is ignored and the default k would be used for every run.
            sim_options = {
                "name": s,
                "user_based": based,
            }

            # Evaluate the model for this combination with 3-fold cross-validation.
            model = KNNBasic(k=int(k), sim_options=sim_options)
            results = cross_validate(model, data, measures=['RMSE', 'MSE'], cv=3, verbose=0, n_jobs=-1)
            RMSE.append(np.mean(results['test_rmse']))
            MSE.append(np.mean(results['test_mse']))

        # Report k, MSE and RMSE of the *same* configuration (the one minimising RMSE).
        best_idx = int(np.argmin(RMSE))
        res[s] = (k_grid[best_idx], MSE[best_idx], RMSE[best_idx])

    results = pd.DataFrame(res, index=['k', 'MSE', 'RMSE'])
    display(results.style.set_caption(f"user_based = {based}"))

    # Pick the similarity measure with the lowest RMSE for this neighbourhood type.
    # Label-based lookups avoid positional indexing on a string-labelled Series.
    new_metric = results.loc['RMSE'].idxmin()
    new_k = int(results.loc['k', new_metric])
    new_based = based
    new_best_rmse = results.loc['RMSE', new_metric]
    new_best_mse = results.loc['MSE', new_metric]

    # Keep the overall best configuration.
    if new_best_rmse < best_rmse_knn:
        best_k = new_k
        best_metric = new_metric
        best_based = new_based
        best_rmse_knn = new_best_rmse
        best_mse_knn = new_best_mse




# Sorted ids of every user and every item in the sample.
unique_users = df_sample.user_id.unique()
unique_users.sort()
unique_items = df_sample.item_id.unique()
unique_items.sort()

# Train the model on the full sample with the best hyperparameters found above.
trainset = data.build_full_trainset()
sim_options = {
    "name": best_metric,
    "user_based": best_based,
}
# The neighbourhood size must be passed to the constructor; a "k" key inside
# sim_options is ignored by Surprise, so best_k would never be applied.
model = KNNBasic(k=best_k, sim_options=sim_options)
model.fit(trainset)

# Ask the model for the predicted rating of every (user, item) pair.
# The third element (3) is a placeholder rating and is not used by predict().
test_set = [[user, item, 3] for user in unique_users for item in unique_items]
pred = [model.predict(user, item).est for user, item, _ in tqdm(test_set)]

# Fill the rating matrix with the predictions: one row per user, one column per item.
rating_matrix = np.reshape(pred, (len(unique_users), len(unique_items)))
rating_dataframe = pd.DataFrame(rating_matrix, columns=unique_items, index=unique_users)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
# Pairwise cosine similarity between the rows (users) of the rating matrix.
df_cosine=pd.DataFrame(cosine_similarity(rating_dataframe, dense_output=True))

# Elbow method: record the KMeans inertia for 1..6 clusters.
cs = []
range_k = list(range(1,7))
for i in range_k:
    kmeans = KMeans(n_clusters = i, max_iter = 300, n_init = 10, random_state = 0)
    kmeans.fit(df_cosine)
    cs.append(kmeans.inertia_)


# First 2 principal components.
# NOTE(review): this rebinds `data`, which earlier held the Surprise Dataset.
data = df_cosine
pca = PCA(2, random_state = 0)
transform = pca.fit_transform(data)

# KMeans with the chosen number of clusters (hard-coded here; presumably read
# off the elbow curve). Seeded and with an explicit n_init, like the elbow loop,
# so the cluster labels are reproducible across runs.
k = 2
kmeans = KMeans(n_clusters = k, n_init = 10, random_state = 0)
label = kmeans.fit_predict(transform)
u_labels = np.unique(label)

In [None]:
def top_n_recommendation(user_id, ratings_df, n_items, all_users = False, algo = None):
    """Recommend the unrated items with the highest predicted rating for one user.

    Parameters
    ----------
    user_id
        Identifier looked up in ``ratings_df["user_id"]``.
    ratings_df : pandas.DataFrame
        Ratings table with at least the columns ``user_id`` and ``item_id``.
    n_items : int
        Maximum number of items to recommend.
    all_users : bool, default False
        When False the recommendations are printed and nothing is returned;
        when True the recommended item ids are returned and nothing is printed.
    algo : optional
        Fitted predictor exposing ``test(rows)`` where each row is
        ``[user, item, rating]`` and each result has an ``est`` attribute.
        Defaults to the notebook-level ``model``.

    Returns
    -------
    list, None or str
        The recommended item ids (best first) when ``all_users`` is True,
        ``None`` when the result is printed, or the message
        ``'This users do not exist'`` when ``user_id`` is not in ``ratings_df``.
    """
    # Check that the user exists in the ratings table.
    user_mask = ratings_df["user_id"] == user_id
    if not user_mask.any():
        return('This users do not exist')

    # Only consider items the user has not reviewed yet. The mask is built from
    # ratings_df itself rather than from an unrelated global frame.
    items_id = ratings_df["item_id"].unique()
    item_ids_user = ratings_df.loc[user_mask, "item_id"]
    item_ids_to_pred = np.setdiff1d(items_id, item_ids_user)

    if algo is None:
        algo = model

    # Predicted rating for each candidate item (3 is a placeholder true rating).
    test_set = [[user_id, item, 3] for item in item_ids_to_pred]
    predictions = algo.test(test_set)
    pred_ratings = np.array([pred.est for pred in predictions])

    # Positions of the n best predictions; a stable sort keeps ties deterministic.
    index_max = np.argsort(-pred_ratings, kind="stable")[:n_items]

    # pred_ratings is aligned with item_ids_to_pred, so the positions must be
    # resolved against that array (not against the full items_id list).
    if not all_users:
        print("Top {0} item recommendations for user {1}:".format(n_items, user_id))
        for i in index_max:
            item_id = item_ids_to_pred[i]
            print(f'Item: {item_id} rating: {pred_ratings[i]}')
    else:
        return [item_ids_to_pred[i] for i in index_max]



n_items = 5

# Top-n recommended item ids for every user in the sample, one row per user.
list_n_item = [
    top_n_recommendation(user, df_sample, n_items, all_users=True)
    for user in tqdm(unique_users)
]
recommended_matrix = np.asarray(list_n_item).reshape(len(unique_users), n_items)
# NOTE(review): this rebinds `rating_dataframe`, which previously held the
# predicted-rating matrix.
rating_dataframe = pd.DataFrame(
    recommended_matrix,
    index=unique_users,
    columns=list(range(n_items)),
)

In [None]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV

In [None]:
# Wrap the sample in a Surprise Dataset so it can be used by the SVD model.
# NOTE(review): lower bound is taken from df_sample, upper bound from the full df.
rating_bounds = (df_sample['rating'].min(), df['rating'].max())
reader = Reader(rating_scale=rating_bounds)
data = Dataset.load_from_df(df_sample, reader)


# Hyperparameter grid explored with 3-fold cross-validation.
grid = {
    'n_factors': [50, 100, 150],
    'n_epochs': [5, 10, 20, 30],
    'lr_all': [.0025, .005, .001, .01],
    'reg_all': [0.02, 0.1],
}

gs = GridSearchCV(SVD, grid, measures=['RMSE', 'MSE'], cv=3)
gs.fit(data)

# Parameters of the combination that achieved the best RMSE.
rmse_params = gs.best_params['rmse']
best_n_factors = rmse_params['n_factors']
best_n_epochs = rmse_params['n_epochs']
best_lr = rmse_params['lr_all']
best_reg = rmse_params['reg_all']

# Best scores reported by the grid search for each measure.
best_rmse_mf, best_mse_mf = gs.best_score['rmse'], gs.best_score['mse']



# Sorted ids of every user and every item in the sample.
unique_users = np.sort(df_sample['user_id'].unique())
unique_items = np.sort(df_sample['item_id'].unique())

# Train the model on the full sample with the selected hyperparameters.
trainset = data.build_full_trainset()

model = SVD(
    n_factors=best_n_factors,
    n_epochs=best_n_epochs,
    lr_all=best_lr,
    reg_all=best_reg,
)
model.fit(trainset)

# Predicted rating for every (user, item) pair, users varying slowest.
test_set = [(user, item) for user in unique_users for item in unique_items]
pred = [model.predict(user, item).est for user, item in test_set]

# One row per user, one column per item.
rating_matrix = np.asarray(pred).reshape(len(unique_users), len(unique_items))
rating_dataframe = pd.DataFrame(rating_matrix, index=unique_users, columns=unique_items)