# ALGOS DE RECO #

In [38]:
# on importe les differentes librairies
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix
from scipy import stats

# on importe les différentes librairies surprise de scikit
from surprise import SVD
from surprise import dataset
from surprise import Reader
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms.knns import KNNBasic, KNNWithMeans

# on importe notre fonction de tri
from tri_threshold import filter_reviews

# on importe notre bdd
avis = pd.read_csv("BDD/avis.csv", index_col="Unnamed: 0")
avis_norm = pd.read_csv("BDD/avis_norm.csv")

In [3]:
avis.head(1)

Unnamed: 0,author,date_published,title_review,note,title,url,comment
0,Monsieur Guillaume,2021-01,Voyages sur les ailes des papillons,8,Mariposas,https://www.trictrac.net/jeu-de-societe/maripo...,"Lorsque le jeu est jeu, bon, réflexif, joli po..."


Il faut mettre en place la matrice sparse (calculer la sparsité + plot cette sparsite)

In [79]:
num_users  = avis["author"].nunique()
num_items  = avis["title"].nunique()

print(f"Il y'a {num_users} auteurs et {num_items} jeux")

sparsity = (len(avis) / (num_users*num_items))*100
print(f"Notre matrice a une sparsité de {sparsity:.2f}% ")

matrice_sparse = csr_matrix(avis['note'])
matrice_sparse



Il y'a 2459 auteurs et 3337 jeux
Notre matrice a une sparsité de 1.44% 


<1x118107 sparse matrix of type '<class 'numpy.int64'>'
	with 117772 stored elements in Compressed Sparse Row format>

On test maintenant les differents algos (SVD + KNN pour commencer)

## SVD ##

In [34]:
algo = SVD(n_factors=50,n_epochs=20,lr_all=0.005,reg_all=0.05)

reader = Reader(line_format='user item rating', rating_scale=(0,10))

class MyDataset(dataset.DatasetAutoFolds):
    def __init__(self, df, reader):
        self.raw_ratings = [(uid, iid, r, None) for (uid, iid, r) in
                            zip(df['author'], df['title'], df['note'])]
        self.reader = reader


data_test = MyDataset(avis, reader)
cross_validate(algo, data_test, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.8357  1.8436  1.8322  1.8225  1.8354  1.8339  0.0068  
MAE (testset)     1.4094  1.4119  1.4052  1.4052  1.4117  1.4087  0.0030  
Fit time          0.80    0.76    0.73    0.72    0.74    0.75    0.03    
Test time         0.14    0.12    0.24    0.12    0.12    0.15    0.05    


{'test_rmse': array([1.83565491, 1.84359888, 1.83219208, 1.82252249, 1.83537106]),
 'test_mae': array([1.40940921, 1.41192308, 1.40521954, 1.40516334, 1.41170408]),
 'fit_time': (0.7972726821899414,
  0.7590610980987549,
  0.7279651165008545,
  0.7210249900817871,
  0.741337776184082),
 'test_time': (0.13532304763793945,
  0.1205759048461914,
  0.2422800064086914,
  0.1186528205871582,
  0.11806201934814453)}

In [5]:
# Define the Reader object specifying the rating scale
reader = Reader(line_format='user item rating', rating_scale=(0,1))

# Custom Dataset class to load our dataset
class MyDataset(dataset.DatasetAutoFolds):
    def __init__(self, df, reader):
        self.raw_ratings = [(uid, iid, r, None) for (uid, iid, r) in
                            zip(df['author'], df['title'], df['note'])]
        self.reader = reader

# Instantiate the dataset with your data
data = MyDataset(avis_norm, reader)

# Initialize the SVD algorithm
algo = SVD()

# Perform cross-validation
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.1924  0.1936  0.1936  0.1940  0.1936  0.1934  0.0005  
MAE (testset)     0.1489  0.1494  0.1501  0.1499  0.1499  0.1496  0.0004  
Fit time          1.29    1.18    1.28    1.13    1.13    1.20    0.07    
Test time         0.15    0.13    0.23    0.12    0.22    0.17    0.05    


{'test_rmse': array([0.19243127, 0.19357872, 0.19358855, 0.19400189, 0.19355009]),
 'test_mae': array([0.14887866, 0.14936135, 0.15006491, 0.14994892, 0.14985967]),
 'fit_time': (1.287646770477295,
  1.1836957931518555,
  1.282294750213623,
  1.1305840015411377,
  1.1334259510040283),
 'test_time': (0.14888310432434082,
  0.13055419921875,
  0.23337507247924805,
  0.12401986122131348,
  0.22162699699401855)}

In [23]:
# Assuming 'avis' is your DataFrame and you're analyzing the 'note' column for outliers
z_scores = stats.zscore(avis['note'])
abs_z_scores = abs(z_scores)
filtered_entries = (abs_z_scores < 2)  # Adjust the threshold as necessary
cleaned_data = avis[filtered_entries]

In [36]:
# Define the Reader object specifying the rating scale
reader = Reader(line_format='user item rating', rating_scale=(0,10))

# Custom Dataset class to load our dataset
class MyDataset(dataset.DatasetAutoFolds):
    def __init__(self, df, reader):
        self.raw_ratings = [(uid, iid, r, None) for (uid, iid, r) in
                            zip(df['author'], df['title'], df['note'])]
        self.reader = reader

# Instantiate the dataset with your data
data_clean = MyDataset(cleaned_data, reader)

# Initialize the SVD algorithm
algo = SVD(n_factors=20,n_epochs=30,lr_all=0.005,reg_all=0.2)

# Perform cross-validation
cross_validate(algo, data_clean, measures=['RMSE', 'MAE'], cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5358  1.5511  1.5466  1.5423  1.5585  1.5469  0.0077  
MAE (testset)     1.2186  1.2389  1.2309  1.2282  1.2420  1.2317  0.0083  
Fit time          0.79    0.78    0.85    0.76    0.77    0.79    0.03    
Test time         0.12    0.23    0.12    0.22    0.12    0.16    0.05    


{'test_rmse': array([1.53583739, 1.55111713, 1.54655354, 1.54229144, 1.55852338]),
 'test_mae': array([1.21862773, 1.23891788, 1.23088559, 1.22815876, 1.24196016]),
 'fit_time': (0.7898938655853271,
  0.7755639553070068,
  0.8460330963134766,
  0.7645361423492432,
  0.7690260410308838),
 'test_time': (0.12297272682189941,
  0.22511911392211914,
  0.12236738204956055,
  0.22308707237243652,
  0.11952781677246094)}

In [33]:
# Define the parameter grid
param_grid = {
    'n_factors': [50, 100, 150],  # Number of factors
    'n_epochs': [20, 30],         # Number of iterations
    'lr_all': [0.005, 0.01],      # Learning rate
    'reg_all': [0.02, 0.05]       # Regularization term
}

# Setup GridSearchCV
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

# Fit GridSearchCV
gs.fit(data_test)

# Best RMSE score
print(gs.best_score['rmse'])

# Combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])


1.8351725410783521
{'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.05}


In [35]:
param_grid = {
    'n_factors': [20, 50, 100, 150, 200],  # More options for the number of factors
    'n_epochs': [5, 10, 20, 30, 50],       # Broader range of iterations
    'lr_all': [0.002, 0.005, 0.01, 0.02],  # Wider range of learning rates
    'reg_all': [0.02, 0.05, 0.1, 0.2],     # Wider range of regularization terms
    'biased': [True, False],               # Whether to use baseline factors or not
}

# Setup GridSearchCV
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

# Fit GridSearchCV
gs.fit(data_clean)

# Best RMSE score
print(gs.best_score['rmse'])

# Combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

1.5532953164878684
{'n_factors': 20, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.2, 'biased': True}


In [41]:
cleaned_data_processed = filter_reviews(cleaned_data)

In [42]:
cleaned_data_processed.describe()

Unnamed: 0,note
count,117680.0
mean,7.587483
std,1.792083
min,4.0
25%,6.0
50%,8.0
75%,9.0
max,10.0


In [43]:
# Define the Reader object specifying the rating scale
reader = Reader(line_format='user item rating', rating_scale=(0,10))

# Custom Dataset class to load our dataset
class MyDataset(dataset.DatasetAutoFolds):
    def __init__(self, df, reader):
        self.raw_ratings = [(uid, iid, r, None) for (uid, iid, r) in
                            zip(df['author'], df['title'], df['note'])]
        self.reader = reader

# Instantiate the dataset with your data
data_clean_proc = MyDataset(cleaned_data_processed, reader)

# Initialize the SVD algorithm
algo = SVD(n_factors=20,n_epochs=30,lr_all=0.005,reg_all=0.2)

# Perform cross-validation
cross_validate(algo, data_clean_proc, measures=['RMSE', 'MAE'], cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5379  1.5478  1.5563  1.5368  1.5405  1.5439  0.0073  
MAE (testset)     1.2238  1.2321  1.2406  1.2248  1.2233  1.2289  0.0067  
Fit time          0.81    0.79    0.81    0.80    0.82    0.81    0.01    
Test time         0.11    0.41    0.11    0.11    0.11    0.17    0.12    


{'test_rmse': array([1.53794386, 1.54783974, 1.55625567, 1.53678063, 1.54054518]),
 'test_mae': array([1.22382164, 1.23211835, 1.240613  , 1.22479521, 1.22327498]),
 'fit_time': (0.806251049041748,
  0.7937178611755371,
  0.8136401176452637,
  0.80082106590271,
  0.8179540634155273),
 'test_time': (0.11278700828552246,
  0.41478991508483887,
  0.10965609550476074,
  0.10850787162780762,
  0.10822772979736328)}

## k-NN ##

In [81]:
algo = KNNBasic()

reader = Reader(line_format='user item rating', rating_scale=(0,10))

class MyDataset(dataset.DatasetAutoFolds):

    def __init__(self, jeux, reader):

        self.raw_ratings = [(uid, iid, r, None) for (uid, iid, r) in
                            zip(avis['author'], avis['title'], avis['note'])]
        self.reader=reader

data = MyDataset(jeux, reader)
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.9243  1.8922  1.9211  1.9242  1.8988  1.9121  0.0138  
MAE (testset)     1.4846  1.4692  1.4867  1.4951  1.4724  1.4816  0.0095  
Fit time          0.39    0.41    0.41    0.42    0.40    0.41    0.01    
Test time         0.94    0.95    0.91    0.96    0.90    0.93    0.02    


{'test_rmse': array([1.92429276, 1.89222614, 1.92111249, 1.92417751, 1.89876231]),
 'test_mae': array([1.48456234, 1.46921646, 1.48668215, 1.49510341, 1.47237861]),
 'fit_time': (0.38838887214660645,
  0.41173720359802246,
  0.4122331142425537,
  0.41510462760925293,
  0.40060949325561523),
 'test_time': (0.9362239837646484,
  0.945655107498169,
  0.9128246307373047,
  0.9616246223449707,
  0.9030351638793945)}