## MICROSOFT RECOMMENDER, SAR ALGORITHM  

# Load Packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy, logging, sys, warnings, joblib
from sklearn.preprocessing import minmax_scale
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

from recommenders.models.sar import SAR 
from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.utils.python_utils import binarize
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import (precision_at_k, mae, rsquared)

%load_ext autoreload
%autoreload 2
# Notebook setup: let pandas display up to 100 rows and 100 columns,
# silence all warnings, and print the interpreter and pandas versions.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 100)
warnings.filterwarnings('ignore')
print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")

### Define the list of top-k cut-off values (the number of items recommended to each user)

In [None]:
# Cut-off values k: recommendations are generated and evaluated once per value.
list_top_k = [1, 3, 5, 7, 10]

# Load the data

In [None]:
# MovieLens variant to load; passed as `size` to the dataset loader.
MOVIELENS_DATA_SIZE = "100k"
# Load the ratings into a pandas DataFrame.
# NOTE(review): later cells expect the columns userID, itemID, rating and
# timestamp — confirm these match the loader's default header.
data = movielens.load_pandas_df(size=MOVIELENS_DATA_SIZE)
# Preview the first rows.
data.head()

In [None]:
# Alternative: load the ratings from a local tab-separated file instead of
# using the MovieLens loader (kept commented out; uncomment to use).
# data=pd.read_csv('ML_100K.csv', sep='\t' )
# data.head()

In [None]:
# Check the shape of the dataset: (number of rows, number of columns)
data.shape

In [None]:
# Downcast the rating column to 32-bit floats to reduce memory usage,
# then preview the first rows of the updated frame.
data['rating'] = data['rating'].astype('float32')
data.head(5)

# 4-fold cross-validation
Split the data into train and test sets for each fold.

In [None]:
# Number of cross-validation folds.
n_folds = 4
# K-fold splitter over the rating rows, without shuffling.
kf = KFold(shuffle=False, n_splits=n_folds)
# One-shot generator of (train_index, test_index) pairs for the dataset.
folds = kf.split(data)

# Define the list of the similarities that will be investigated
The similarity type must be one of the available similarity metrics:

    "SOKAL_MICHENER", "SOKAL_SNEATH_II", "SOKAL_SNEATH_IV", "SOKAL_SNEATH_V",  "PEARSON_I", "PEARSON_II", "PEARSON_III", "PEARSON_HERON_I", "PEARSON_HERON_II", "BARONI_URBANI_BUSER_I", "BARONI_URBANI_BUSER_II", "FORBES_I", "FORBES_II", "YULEQ", "YULEQ_W", "TARANTULA", "AMPLE", "ROGERS_TANIMOTO", "FAITH", "GOWER_LEGENDRE", "INNERPRODUCT", "RUSSELL_RAO", "TARWID", "DENNIS", "GOWER", "STILES", "FOSSUM", "DISPERSON", "HAMANN", "MICHAEL", "PEIRCE", "EYRAUD", "YULEQ_D", "MEAN_MANHATTAN", "VARI", "SHAPEDIFFERENCE", "PATTERNDIFFERENCE"

In [None]:
# Metrics under investigation, all of which use negative co-occurrences.
# The list is assembled from two groups: similarities first, then distances.
list_metrics_d = [
    # similarities with negative co-occurrences
    "SOKAL_MICHENER",
    "SOKAL_SNEATH_II",
    "SOKAL_SNEATH_IV",
    "SOKAL_SNEATH_V",
    "PEARSON_I",
    "PEARSON_II",
    "PEARSON_III",
    "PEARSON_HERON_I",
    "PEARSON_HERON_II",
    "BARONI_URBANI_BUSER_I",
    "BARONI_URBANI_BUSER_II",
    "FORBES_I",
    "FORBES_II",
    "YULEQ",
    "YULEQ_W",
    "TARANTULA",
    "AMPLE",
    "ROGERS_TANIMOTO",
    "FAITH",
    "GOWER_LEGENDRE",
    "INNERPRODUCT",
    "RUSSELL_RAO",
    "TARWID",
    "DENNIS",
    "GOWER",
    "STILES",
    "FOSSUM",
    "DISPERSON",
    "HAMANN",
    "MICHAEL",
    "PEIRCE",
    "EYRAUD",
] + [
    # distances with negative co-occurrences
    "YULEQ_D",
    "MEAN_MANHATTAN",
    "VARI",
    "SHAPEDIFFERENCE",
    "PATTERNDIFFERENCE",
]

# Report how many metrics will be evaluated.
print(len(list_metrics_d), 'similarity metrics')

# Initialization of models
For each metric in `list_metrics_d`, a new SAR model is initialized with the following parameters:

- col_user, col_item, col_rating, and col_timestamp specify the column names in the dataset.
- similarity_type is set to the current metric from the loop.
- time_decay_coefficient is set to 30 (in the upstream SAR implementation this is the half-life of a rating, in days).
- normalize is set to True so that predicted scores are normalized to the scale of the original ratings.
- timedecay_formula is set to True to apply the time decay formula.

In [None]:
# Constructor arguments shared by every SAR model; only the similarity
# metric differs from one model to the next.
shared_sar_options = {
    "col_user": "userID",
    "col_item": "itemID",
    "col_rating": "rating",
    "col_timestamp": "timestamp",
    "time_decay_coefficient": 30,
    "normalize": True,
    "timedecay_formula": True,
}

# Build one (unfitted) SAR model per metric, in the order of list_metrics_d.
list_models = []
for metric in list_metrics_d:
    model = SAR(similarity_type=metric, **shared_sar_options)
    list_models.append(model)
print('Initiated models : ', len(list_models))

# Train models

In [None]:
# Standard-library helpers needed by this cell (imported here because the
# notebook only imports `os` in a later cell).
import copy
import os

# Base directory for model checkpoints. It is created up front because
# joblib.dump does not create missing directories.
base_dir = 'Models_ML_100K_cv/'
os.makedirs(base_dir, exist_ok=True)

# Results across models: one entry per metric (same order as list_metrics_d),
# each holding the fold-averaged score for every cut-off in list_top_k.
list_list_PRECISION, list_list_MAE, list_list_R_SQUARED = [], [], []

# Column names shared by all evaluation calls.
eval_columns = dict(col_user='userID', col_item='itemID', col_rating='rating')

# Time the whole training + evaluation run.
with Timer() as train_time:

    # Iterate over each unfitted model together with its metric name.
    for model_index, (metric, model_template) in enumerate(zip(list_metrics_d, list_models)):
        print(f"Starting evaluation for model {metric}")

        # Per-fold results for the current model: one list per fold, each with
        # one score per cut-off in list_top_k.
        fold_list_MAE, fold_list_PRECISION, fold_list_R_SQUARED = [], [], []

        # Perform k-fold cross-validation.
        for fold_index, (train_index, test_index) in enumerate(kf.split(data)):
            # Split data into training and test sets for this fold.
            train, test = data.iloc[train_index], data.iloc[test_index]

            # Fit a fresh copy of the unfitted template on every fold, so that
            # no state from a previous fold can leak into this one.
            # NOTE(review): upstream SAR builds its user/item index only on
            # the first fit() call, so re-fitting the same instance on another
            # fold would keep the first fold's index — confirm for this fork.
            model = copy.deepcopy(model_template)
            model.fit(train)
            print(f"Model {model_index+1} ({metric}) on Fold {fold_index+1} trained.")

            # Save the fitted model to disk as a checkpoint.
            filename = f'model_{model_index+1}_{metric}_fold_{fold_index+1}.sav'
            full_path = os.path.join(base_dir, filename)
            joblib.dump(model, full_path)
            print(f"Model {model_index+1} ({metric}) on Fold {fold_index+1} saved.")

            # Reload the checkpoint that was just written and use the reloaded
            # model for the recommendations below.
            model = joblib.load(full_path)
            print(f"Model {metric} has been loaded")

            # Generate the top-k recommendations for every cut-off.
            model_K_items = []
            for k in list_top_k:
                print(f'Recommending Top_{k} for fold {fold_index+1}')
                model_K_items.append(model.recommend_k_items(test, k, remove_seen=True))
                print(f'Top_k_{k} is done')

            # Evaluate each set of recommendations against the fold's test set.
            list_MAE_1, list_PRECISION_1, list_R_SQUARED_1 = [], [], []
            for k, top_k in zip(list_top_k, model_K_items):
                list_MAE_1.append(mae(test, top_k, **eval_columns))
                list_R_SQUARED_1.append(rsquared(test, top_k, **eval_columns))
                list_PRECISION_1.append(precision_at_k(test, top_k, k=k, **eval_columns))

            # Record this fold's scores.
            fold_list_MAE.append(list_MAE_1)
            fold_list_PRECISION.append(list_PRECISION_1)
            fold_list_R_SQUARED.append(list_R_SQUARED_1)

        # Average over folds (axis 0), leaving one value per cut-off.
        mean_MAE = np.mean(fold_list_MAE, axis=0)
        mean_PRECISION = np.mean(fold_list_PRECISION, axis=0)
        mean_R_SQUARED = np.mean(fold_list_R_SQUARED, axis=0)

        # Store the fold-averaged scores of the current model.
        list_list_MAE.append(mean_MAE)
        list_list_PRECISION.append(mean_PRECISION)
        list_list_R_SQUARED.append(mean_R_SQUARED)
        print(f"Model {metric} is done")

# Print the total time taken for training and evaluation
print(f"Took {train_time.interval} seconds for training and evaluating.")

In [None]:
# Threshold used to turn ratings into binary labels
positivity_threshold = 2

# Build a binarized version of the test set without modifying `test` itself:
# `assign` returns a new DataFrame whose 'rating' column holds the output of
# `binarize` applied to the original ratings.
# NOTE(review): the upstream `binarize` helper maps values strictly greater
# than the threshold to 1 and all others to 0 — confirm for the installed
# version.
test_bin = test.assign(rating=binarize(test['rating'], positivity_threshold))

In [None]:
import os

# Save results

In [None]:
# Output directory for the result tables. It is created up front so that
# writing the first file does not fail on a fresh checkout.
output_directory = 'ResultsTables_ML_100K_cv/'
os.makedirs(output_directory, exist_ok=True)

# Write one Excel table per cut-off value: one row per similarity metric.
for k_index, top_k_value in enumerate(list_top_k):
    # Collect, for every model, the fold-averaged scores at this cut-off.
    # A separate name is used so the ratings DataFrame `data` is not clobbered.
    results_for_k = {
        "Top K": [top_k_value] * len(list_metrics_d),
        "MAE": [model_scores[k_index] for model_scores in list_list_MAE],
        "Precision@K": [model_scores[k_index] for model_scores in list_list_PRECISION],
        "R2": [model_scores[k_index] for model_scores in list_list_R_SQUARED],
    }
    # Rows are labelled with the metric names.
    dataframe = pd.DataFrame(results_for_k, index=list_metrics_d)
    # Define the output file path for saving the dataframe as an Excel file
    output_file = os.path.join(output_directory, f"Evaluation_Matrix_{top_k_value}.xlsx")
    # Keep the index in the file: it carries the metric names, without which
    # the rows of the saved table cannot be attributed to a similarity.
    dataframe.to_excel(output_file, index=True, index_label="Similarity")

In [None]:
dataframe

# Graphs

In [None]:
# Output directory for the figures. It is created up front so that savefig
# does not fail on a fresh checkout.
output_directory = 'ResultsPlots_ML_100K_cv/'
os.makedirs(output_directory, exist_ok=True)

# One panel per evaluation measure: (column, sort ascending?, bar colour).
# MAE is sorted in ascending order, Precision@K and R2 in descending order.
panels = [('MAE', True, 'r'), ('Precision@K', False, 'g'), ('R2', False, 'b')]

# Draw one figure per cut-off value.
for k_index, top_k_value in enumerate(list_top_k):
    # Build the score table for THIS cut-off. Reusing the `dataframe` left
    # over from the saving cell would plot the last cut-off's scores in every
    # figure, whatever the title says.
    scores_for_k = pd.DataFrame(
        {
            'MAE': [model_scores[k_index] for model_scores in list_list_MAE],
            'Precision@K': [model_scores[k_index] for model_scores in list_list_PRECISION],
            'R2': [model_scores[k_index] for model_scores in list_list_R_SQUARED],
        },
        index=list_metrics_d,
    )

    # One row of three subplots, one per evaluation measure.
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 8))
    for ax, (column, ascending, colour) in zip(axes, panels):
        # Sort each measure independently and draw it as a bar chart; the
        # bars are labelled with the metric names taken from the index.
        ordered = scores_for_k[column].sort_values(ascending=ascending)
        ordered.plot(ax=ax, kind='bar', color=colour, legend=True)

    # Title the figure with the cut-off being shown.
    fig.suptitle(f'Top K = {top_k_value}', fontsize=15)
    # Adjust the spacing between subplots
    fig.subplots_adjust(wspace=0.1, hspace=0)

    # Save the figure
    output_file = os.path.join(output_directory, f"100K_MAE_Precision@K_R²_Top_{top_k_value}_D.jpg")
    fig.savefig(output_file, bbox_inches='tight', dpi=1000)


	The end