## MICROSOFT RECOMMENDER, SAR ALGORITHM  

# Load Packages

In [None]:
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import datetime as dt
import scrapbook as sb
import matplotlib.pyplot as plt
import scipy, logging, sys, warnings, joblib
from sklearn.preprocessing import minmax_scale
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

import recommenders
from recommenders.models.sar import SAR 
from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.utils.python_utils import binarize
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import (precision_at_k, mae, rsquared)

%load_ext autoreload
%autoreload 2
# Show up to 100 rows and 100 columns when a DataFrame is displayed.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 100)

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Record the interpreter and pandas versions used for this run.
print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")

# Define a list of top_k items that will be recommended to users

In [None]:
# Values of k for which top-k recommendation lists are generated and evaluated.
list_top_k=[3, 5, 10]

# Load the data
	MovieTweetings dataset
    
https://www.kaggle.com/datasets/tunguz/movietweetings 

https://github.com/sidooms/MovieTweetings

In [None]:
# Load the MovieTweetings ratings file and make sure its columns are named
# userID, itemID, rating, timestamp (the names the rest of the notebook uses).
# '::' is a multi-character separator, so the Python parser engine is
# requested explicitly instead of relying on pandas' implicit fallback.
rating_columns = ['userID', 'itemID', 'rating', 'timestamp']
data = pd.read_csv('MT_ratings.dat', sep='::', engine='python')
if list(data.columns) != rating_columns:
    # The first line was not the expected header. Presumably the file is the
    # raw, header-less MovieTweetings dump (TODO confirm), so it is re-read
    # treating every line as data and assigning the column names here.
    data = pd.read_csv(
        'MT_ratings.dat',
        sep='::',
        engine='python',
        header=None,
        names=rating_columns,
    )
data.head()

In [None]:
# Display the (rows, columns) dimensions of the loaded ratings table.
data.shape

In [None]:
# Keep only the first 50,000 ratings. An explicit copy is taken so that the
# column assignments made later operate on an independent frame rather than
# on a slice of the full table.
data = data.iloc[:50000].copy()
# Alternative (disabled): draw a reproducible random subset instead.
#data=data.sample(n=50000, random_state=0)
#data=data.reset_index(drop=True)

In [None]:
# Report the current dimensions of the ratings table.
print(f'shape of the dataset: {data.shape}')

In [None]:
# Store the ratings as 32-bit floats to lower the memory footprint, then
# preview the first rows.
data = data.astype({'rating': np.float32})
data.head()

In [None]:
# Count the missing values in each column
data.isna().sum()

# Split the data into train and test sets
	75% train set and 25% test set

In [None]:
# Split the ratings into train and test partitions with a 0.75 ratio and a
# fixed seed so the split is reproducible; then show both shapes.
split_options = dict(
    ratio=0.75,
    col_user='userID',
    col_item='itemID',
    seed=0,
)
train, test = python_stratified_split(data, **split_options)
(train.shape, test.shape)

In [None]:
# Print, for each partition, the number of ratings and the number of
# distinct users and items it contains.
for partition_name, partition in (("Train", train), ("Test", test)):
    user_count = len(partition['userID'].unique())
    item_count = len(partition['itemID'].unique())
    print(
        f"{partition_name}: \n"
        f"Total Ratings: {len(partition)}\n"
        f"Unique Users:  {user_count}\n"
        f"Unique Items:  {item_count}\n"
    )

# Define the list of the similarities that will be investigated
The similarity type must be one of the available similarity metrics:
	
- "cooccurrence", "jaccard", "lift", "HD_JACCARD", "DICE", "JACCARD_3W", "SOKAL_SNEATH_I", "COSINE", "SORGENFREI", "MOUNTFORD", "MCCONNAUGHEY", "KULCZYNSKI_II", "DRIVER_KROEBER", "JOHNSON", "SIMPSON", "BRAUN_BANQUET", "FAGER_MCGOWAN", "EUCLID", "MINKOWSKI", "LANCE_WILLIAMS", "HELLINGER", "CHORD",
 
 
- "SOKAL_MICHENER", "SOKAL_SNEATH_II", "SOKAL_SNEATH_IV", "SOKAL_SNEATH_V",  "PEARSON_I", "PEARSON_II", "PEARSON_III", "PEARSON_HERON_I", "PEARSON_HERON_II", "BARONI_URBANI_BUSER_I", "BARONI_URBANI_BUSER_II", "FORBES_I", "FORBES_II", "YULEQ", "YULEQ_W", "TARANTULA", "AMPLE", "ROGERS_TANIMOTO", "FAITH", "GOWER_LEGENDRE", "INNERPRODUCT", "RUSSELL_RAO", "TARWID", "DENNIS", "GOWER", "STILES", "FOSSUM", "DISPERSON", "HAMANN", "MICHAEL", "PEIRCE", "EYRAUD", "YULEQ_D", "MEAN_MANHATTAN", "VARI", "SHAPEDIFFERENCE", "PATTERNDIFFERENCE"

In [None]:
# Similarity metrics that the SAR algorithm already implements.
list_already_exist = ["jaccard", "lift"]
print(f"{len(list_already_exist)} similarity metrics already implemented")

In [None]:
# Metrics that do not use negative co-occurrences, listed as similarity
# measures first and distance measures second.
list_metrics = (
    # similarities
    [
        "DICE",
        "JACCARD_3W",
        "SOKAL_SNEATH_I",
        "COSINE",
        "SORGENFREI",
        "MOUNTFORD",
        "KULCZYNSKI_II",
        "JOHNSON",
        "SIMPSON",
        "BRAUN_BANQUET",
        "FAGER_MCGOWAN",
    ]
    # distances
    + [
        "EUCLID",
        "MINKOWSKI",
        "LANCE_WILLIAMS",
        "HELLINGER",
        "CHORD",
    ]
)
print(f"{len(list_metrics)} similarity metrics without d")
# Left out on purpose -- "MCCONNAUGHEY", "DRIVER_KROEBER": 0 samples in the
# predictions, so MAE, R², and P@K cannot be computed.

In [None]:
# Metrics that use negative co-occurrences, listed as similarity measures
# first and distance measures second.
list_metrics_d = (
    # similarities with negative co-occurrences
    [
        "SOKAL_MICHENER",
        "SOKAL_SNEATH_II",
        "SOKAL_SNEATH_IV",
        "SOKAL_SNEATH_V",
        "PEARSON_I",
        "PEARSON_II",
        "PEARSON_III",
        "PEARSON_HERON_I",
        "PEARSON_HERON_II",
        "BARONI_URBANI_BUSER_I",
        "BARONI_URBANI_BUSER_II",
        "FORBES_I",
        "FORBES_II",
        "YULEQ",
        "YULEQ_W",
        "ROGERS_TANIMOTO",
        "FAITH",
        "GOWER_LEGENDRE",
        "INNERPRODUCT",
        "RUSSELL_RAO",
        "TARWID",
        "DENNIS",
        "GOWER",
        "STILES",
        "FOSSUM",
        "DISPERSON",
        "HAMANN",
        "MICHAEL",
        "PEIRCE",
        "EYRAUD",
    ]
    # distances with negative co-occurrences
    + [
        "YULEQ_D",
        "MEAN_MANHATTAN",
        "VARI",
        "PATTERNDIFFERENCE",
    ]
)

print(f"{len(list_metrics_d)} similarity metrics with d")
# Left out on purpose -- "TARANTULA", "AMPLE", "FOSSUM": 0 samples in the
# predictions, so MAE, R², and P@K cannot be computed.
# NOTE(review): "FOSSUM" is named above as failing but is still present in
# the list -- confirm which of the two is intended.

In [None]:
# Full list of metrics to train with SAR: the already implemented ones
# first, then the metrics without and with negative co-occurrences.
list_all_metrics = [*list_already_exist, *list_metrics, *list_metrics_d]
print('Total N° of SM: ', len(list_all_metrics))

# Initialization  of models

In [None]:
# Settings shared by every SAR instance; only the similarity metric differs
# from one model to the next.
shared_sar_options = dict(
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    normalize=True,
    # If the dataset has no timestamp column, remove the three entries below.
    time_decay_coefficient=30,
    timedecay_formula=True,
    col_timestamp="timestamp",
)

# One untrained SAR model per metric, in the order of list_all_metrics.
list_models = [
    SAR(similarity_type=metric_name, **shared_sar_options)
    for metric_name in list_all_metrics
]
print('Initiated models : ', len(list_models))

# Train models

In [None]:
# Fit every model on the training partition and write each fitted model to
# disk right away as a checkpoint named model_<n>_<metric>.sav.
with Timer() as train_time:
    paired = zip(list_models, list_all_metrics)
    for position, (model, metric_name) in enumerate(paired, start=1):
        checkpoint_tag = f"model_{position}_{metric_name}"
        model.fit(train)
        print(f"{checkpoint_tag} trained.")
        joblib.dump(model, f"{checkpoint_tag}.sav")
        print(f"{checkpoint_tag} saved.")

print(f"Took {train_time.interval} seconds for training.")

In [None]:
# Confirmation message for the training and checkpointing step.
print('All models are trained and saved into disk')

In [None]:
# Rebuild the model list from the checkpoint files model_<n>_<metric>.sav,
# one per entry of list_all_metrics and in the same order.
# NOTE: joblib.load unpickles the file content; only load trusted files.
list_models = [
    joblib.load(f'model_{position}_{metric_name}.sav')
    for position, metric_name in enumerate(list_all_metrics, start=1)
]

print('list_loaded_model:', len(list_models))

# Make recommendations (predictions) : top_k=3,5,10

In [None]:
# For every k, ask each model for its top-k items for the test users,
# excluding items already seen. The result is a list (one entry per k) of
# lists (one prediction table per model).
list_of_list_k_items = []
with Timer() as test_time:
    for k in list_top_k:
        print(f'Recommending Top_{k}')
        predictions_for_k = [
            model.recommend_k_items(test, k, remove_seen=True)
            for model in list_models
        ]
        list_of_list_k_items.append(predictions_for_k)
        print(f'Top_k_{k} is done')
print(f"Took {test_time.interval} seconds for prediction.")

In [None]:
# Pickle the nested list of prediction tables into the file "prediction".
with open("prediction", "wb") as prediction_file:
    pickle.dump(list_of_list_k_items, prediction_file)
print('All predictions are saved into disck')

In [None]:
# Read the pickled prediction tables back from the file "prediction".
# NOTE: pickle.load executes whatever the file encodes; only load files
# produced by this notebook.
with open("prediction", "rb") as prediction_file:
    list_of_list_k_items = pickle.load(prediction_file)
print('All predictions are loaded from disck')

# Evaluation
	compute the MAE, P@K, and R² for each Top_K

In [None]:
# Column names passed to every evaluation call.
eval_columns = dict(col_user='userID', col_item='itemID', col_rating='rating')

# For each k, score every model's prediction table against the test
# partition with MAE, R² and precision@k. Each outer list holds one inner
# list per k, with one score per model.
list_of_list_PRECISION, list_of_list_MAE, list_of_list_R_SQUARED = [], [], []
for k, predictions_for_k in zip(list_top_k, list_of_list_k_items):
    mae_scores, r2_scores, precision_scores = [], [], []
    for predictions in predictions_for_k:
        mae_scores.append(mae(test, predictions, **eval_columns))
        r2_scores.append(rsquared(test, predictions, **eval_columns))
        precision_scores.append(
            precision_at_k(test, predictions, k=k, **eval_columns)
        )
    print(f'list for Top_k={k} done')
    list_of_list_MAE.append(mae_scores)
    list_of_list_PRECISION.append(precision_scores)
    list_of_list_R_SQUARED.append(r2_scores)

# Copy of the test partition whose ratings are passed through `binarize`
# with a threshold of 2.
# NOTE(review): test_bin is not referenced by any other cell shown here.
positivity_threshold = 2
test_bin = test.copy()
test_bin['rating'] = binarize(test_bin['rating'], positivity_threshold)

# Save the results

In [None]:
# Row labels for the metrics with negative co-occurrences: each metric name
# gets a "D_" prefix so these rows can be told apart from the metrics
# without negative co-occurrences in the result tables.
#
# The labels are derived from list_metrics_d so they always follow the order
# in which the models were trained and evaluated. The earlier hand-written
# copy had drifted from that list (two names swapped, "FOSSUM" missing,
# "SHAPEDIFFERENCE" added); since both lists had the same length, the
# result rows were silently given the wrong names.
list_metrics_dd = [f"D_{metric_name}" for metric_name in list_metrics_d]

print(len(list_metrics_dd),'similarity metrics with d and a prefix D')

In [None]:
# One dictionary per k gathering the k value and the per-model score lists.
list_dic = [
    {
        "Top K": k,
        "MAE": mae_scores,
        "Precision@K": precision_scores,
        "R2": r2_scores,
    }
    for k, mae_scores, precision_scores, r2_scores in zip(
        list_top_k,
        list_of_list_MAE,
        list_of_list_PRECISION,
        list_of_list_R_SQUARED,
    )
]

In [None]:
# Turn each per-k score dictionary into a table indexed by metric name and
# export it to its own Excel file.
row_labels = list_already_exist + list_metrics + list_metrics_dd
list_dataframe = []
for k, scores in zip(list_top_k, list_dic):
    score_table = pd.DataFrame.from_dict(scores)
    score_table.index = row_labels
    score_table.to_excel(f"Evaluation_Matrix_Top{k}.xlsx")
    list_dataframe.append(score_table)

In [None]:
# Stack the per-k score tables vertically into one table and preview it.
DATA = pd.concat(list_dataframe, axis=0)
DATA.head(10)

In [None]:
# Save the combined table in Excel format
DATA.to_excel("Evaluation Metrics Top_3_5_10.xlsx")

In [None]:
# Row position separating the already implemented metrics and the metrics
# without negative co-occurrences (before it) from the remaining rows.
split = len(list_already_exist) + len(list_metrics)
split

# Plot the MAE, P@K, and R² for the similarity metric without negative co-occurrences

In [None]:
 #list_dataframe[0][:split].sort_values("MAE",ascending=False).index

In [None]:
# Bar charts of MAE for the first `split` rows of each result table, one
# panel per top-k value, bars sorted by ascending MAE.
fig, axes = plt.subplots(nrows=1, ncols=3 , figsize=(20,8))
col=['r','g','b']
for i, top_k_value in enumerate(list_top_k):
    # Kept under its own name so the ratings table `data` is not overwritten.
    mae_values = list_dataframe[i][:split]['MAE'].sort_values(ascending=True)
    # legend expects a boolean; the x= keyword was dropped because it has no
    # effect when plotting a Series.
    mae_values.plot(ax=axes[i], kind='bar', color=col[i], legend=True)
    axes[i].set_title(f'Top {top_k_value}')
# Adjust the layout and export once, after all panels are drawn; saving
# inside the loop rewrote the same high-resolution file on every iteration.
fig.subplots_adjust(wspace=0.1, hspace=0)
fig.savefig("1M_MAE.jpg", bbox_inches='tight', dpi=1000)

In [None]:
# Bar charts of R2 for the first `split` rows of each result table, one
# panel per top-k value, bars sorted by ascending R2.
fig, axes = plt.subplots(nrows=1, ncols=3 , figsize=(20,8))
col=['r','g','b']
for i, top_k_value in enumerate(list_top_k):
    # Kept under its own name so the ratings table `data` is not overwritten.
    r2_values = list_dataframe[i][:split]['R2'].sort_values(ascending=True)
    # The x=/y= keywords were dropped: they have no effect when plotting a
    # Series.
    r2_values.plot(ax=axes[i], kind='bar', color=col[i])
    axes[i].set_title(f'Top {top_k_value}')
# Adjust the layout and export once, after all panels are drawn; saving
# inside the loop rewrote the same high-resolution file on every iteration.
fig.subplots_adjust(wspace=0.1, hspace=0)
fig.savefig("1M_R2.jpg", bbox_inches='tight', dpi=1000)

In [None]:
# Bar charts of Precision@K for the first `split` rows of each result table,
# one panel per top-k value, bars sorted by ascending precision.
fig, axes = plt.subplots(nrows=1, ncols=3 , figsize=(20,8))
col=['r','g','b']
for i, top_k_value in enumerate(list_top_k):
    # Kept under its own name so the ratings table `data` is not overwritten.
    precision_values = (
        list_dataframe[i][:split]['Precision@K'].sort_values(ascending=True)
    )
    # The x=/y= keywords were dropped: they have no effect when plotting a
    # Series.
    precision_values.plot(ax=axes[i], kind='bar', color=col[i])
    axes[i].set_title(f'Top {top_k_value}')
# Adjust the layout and export once, after all panels are drawn; saving
# inside the loop rewrote the same high-resolution file on every iteration.
fig.subplots_adjust(wspace=0.1, hspace=0)
fig.savefig("1M_P@K.jpg", bbox_inches='tight', dpi=1000)

# Plot the MAE, P@K, and R² for the similarity metrics with negative co-occurrences

In [None]:
 #list_dataframe[0][split:].index

In [None]:
# Bar charts of MAE for the rows from `split` onward of each result table,
# one panel per top-k value, bars sorted by ascending MAE.
fig, axes = plt.subplots(nrows=1, ncols=3 , figsize=(20,8))
col=['r','g','b']
for i, top_k_value in enumerate(list_top_k):
    # Kept under its own name so the ratings table `data` is not overwritten.
    mae_values = list_dataframe[i][split:]['MAE'].sort_values(ascending=True)
    # legend expects a boolean; the x= keyword was dropped because it has no
    # effect when plotting a Series.
    mae_values.plot(ax=axes[i], kind='bar', color=col[i], legend=True)
    axes[i].set_title(f'Top {top_k_value}')
# Adjust the layout and export once, after all panels are drawn; saving
# inside the loop rewrote the same high-resolution file on every iteration.
fig.subplots_adjust(wspace=0.1, hspace=0)
fig.savefig("1M_D_MAE.jpg", bbox_inches='tight', dpi=1000)

In [None]:
# Bar charts of R2 for the rows from `split` onward of each result table,
# one panel per top-k value, bars sorted by ascending R2.
fig, axes = plt.subplots(nrows=1, ncols=3 , figsize=(20,8))
col=['r','g','b']
for i, top_k_value in enumerate(list_top_k):
    # Kept under its own name so the ratings table `data` is not overwritten.
    r2_values = list_dataframe[i][split:]['R2'].sort_values(ascending=True)
    # The x=/y= keywords were dropped: they have no effect when plotting a
    # Series.
    r2_values.plot(ax=axes[i], kind='bar', color=col[i])
    axes[i].set_title(f'Top {top_k_value}')
# Adjust the layout and export once, after all panels are drawn; saving
# inside the loop rewrote the same high-resolution file on every iteration.
fig.subplots_adjust(wspace=0.1, hspace=0)
fig.savefig("1M_D_R2.jpg", bbox_inches='tight', dpi=1000)

In [None]:
# Bar charts of Precision@K for the rows from `split` onward of each result
# table, one panel per top-k value, bars sorted by ascending precision.
fig, axes = plt.subplots(nrows=1, ncols=3 , figsize=(20,8))
col=['r','g','b']
for i, top_k_value in enumerate(list_top_k):
    # Kept under its own name so the ratings table `data` is not overwritten.
    precision_values = (
        list_dataframe[i][split:]['Precision@K'].sort_values(ascending=True)
    )
    # The x=/y= keywords were dropped: they have no effect when plotting a
    # Series.
    precision_values.plot(ax=axes[i], kind='bar', color=col[i])
    axes[i].set_title(f'Top {top_k_value}')
# Adjust the layout and export once, after all panels are drawn; saving
# inside the loop rewrote the same high-resolution file on every iteration.
fig.subplots_adjust(wspace=0.1, hspace=0)
fig.savefig("1M_D_P@K.jpg", bbox_inches='tight', dpi=1000)

# Plot the clustering of the similarity metrics with CAH 

In [None]:
# For each top-k value, cluster the metrics hierarchically on their score
# columns (everything except R2) and save the dendrogram as a PNG file.
for i, top_k_value in enumerate(list_top_k):
    score_table = list_dataframe[i]
    plt.figure(figsize=(10,5))
    # Ward linkage on Euclidean distances; other linkage methods to try:
    # complete, average, single.
    Z = linkage(score_table.drop('R2',axis=1),method='ward',metric='euclidean')
    plt.title(f"CAH_METRICS :Top_{top_k_value}")
    # Labels are passed as a plain list: some SciPy versions test the truth
    # value of `labels`, which raises for a pandas Index.
    dendrogram(Z,labels=score_table.index.tolist(),orientation='top',color_threshold=1.5)
    plt.savefig(f'Top_{top_k_value} clustring.png',dpi=1000)

In [None]:
# Save the cluster into local
#classter.to_excel("classterS.xlsx")

# Plot the correlation between the similarity metrics 

In [None]:
# For each top-k value, draw a heatmap of the pairwise correlation between
# metrics (rows of the score table, correlated across its columns) and save
# it as a PNG file.
for k, score_table in zip(list_top_k, list_dataframe):
    plt.figure(figsize=(15,15))
    metric_correlation = score_table.T.corr()
    sns.heatmap(data=metric_correlation)
    plt.title(f"Correlation :Top_{k}")
    plt.savefig(f'correlation ML_1M Top_{k}.png',dpi=1000)

	The end