In [1]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML 

import re
import os
from tqdm import tqdm
from surprise import accuracy
from surprise import Dataset
from surprise import Reader
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from surprise import NormalPredictor, KNNBasic, SVD, SVDpp

In [2]:
from recommender_metrics import RecommenderMetrics
from movie_lens_data import MovieLensData
from evaluator import Evaluator

In [3]:
# Folder holding the MovieLens 100k files used throughout this notebook.
path = "./data/ml-100k"

# MovieLensData keyword argument -> file name inside `path`.
ml_file_names = {
    "users_path": "u.user",
    "ratings_path": "u.data_new0",
    "movies_path": "u.item",
    "genre_path": "u.genre",
}
movie_lens_data = MovieLensData(
    **{kwarg: os.path.join(path, file_name) for kwarg, file_name in ml_file_names.items()}
)

# The loader calls are kept in their original order in case they depend on
# state built by an earlier call.
evaluation_data = movie_lens_data.read_ratings_data()
movie_data = movie_lens_data.read_movies_data()
popularity_rankings = movie_lens_data.get_popularity_ranks()
ratings = movie_lens_data.get_ratings()

In [4]:
# Build the evaluation harness from the ratings data and popularity ranks loaded
# above. Per the cell output below, construction reports the train/test set sizes
# and computes a cosine similarity matrix, so this cell is not instantaneous.
evaluator = Evaluator(evaluation_data, popularity_rankings)

Number of full trainset users: 935
Number of full trainset items: 1645
Number of trainset users: 935
Number of trainset items: 1645
Size of testset: 79275
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


# Add item-based collaborative filtering RecSys to evaluator


Using `sim_options`, we specify the similarity measure and whether the collaborative filtering is user-based (here it is not: `user_based` is `False`, so similarities are computed between items).

In [5]:
# Item-based KNN: cosine similarity, computed between items (user_based=False).
item_sim_options = {'name': 'cosine', 'user_based': False}
item_KNN = KNNBasic(sim_options=item_sim_options)
evaluator.add_algorithm(item_KNN, "Item KNN")

ItemKnn

In [6]:
# User-based KNN: Pearson similarity, computed between users (user_based=True).
user_sim_options = {'name': 'pearson', 'user_based': True}
user_KNN = KNNBasic(sim_options=user_sim_options)
evaluator.add_algorithm(user_KNN, "User KNN")

In [7]:
# Bind the model to a lowercase name. The previous `SVD = SVD()` replaced the
# imported `surprise.SVD` class with an instance, so re-running this cell (or any
# later `SVD()` call) failed with "'SVD' object is not callable".
svd = SVD()
evaluator.add_algorithm(svd, "SVD")

In [8]:
# SVD++ (surprise.SVDpp) created with no arguments, i.e. library defaults, and
# registered with the evaluator under the label "SVD++".
SVD_plus_plus = SVDpp()
evaluator.add_algorithm(SVD_plus_plus, "SVD++")

In [9]:
# Add surprise's NormalPredictor to the evaluator under the label "Random".
algo_np = NormalPredictor()
evaluator.add_algorithm(algo_np, "Random")

In [10]:
# evaluator.evaluate(do_top_n=False)

In [11]:
# Time consuming: evaluates every registered algorithm with the top-N analysis
# enabled. Per the log below this adds leave-one-out hit-rate/rank metrics and
# coverage, diversity and novelty analysis on top of the accuracy evaluation.
evaluator.evaluate(do_top_n=True)

Evaluating  Item KNN ...
Evaluating accuracy...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating top-N with leave-one-out...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing hit-rate and rank metrics...
Computing recommendations with full data set...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Analyzing coverage, diversity, and novelty...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Analysis complete.
Evaluating  User KNN ...
Evaluating accuracy...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating top-N with leave-one-out...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing hit-rate and rank metrics...
Computing recommendations with full data set...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Analyzing coverage, diversity, and novelty...
Comput

# Evaluate topN recommendations

In [12]:
# evaluator.sample_top_n_recs(movie_lens_data, test_subject=85, k=10)

In [13]:
# evaluator.sample_top_n_recs(movie_lens_data, test_subject=314, k=5)

In [14]:
# print(movie_lens_data.get_movie_name(44))
# evaluator.sample_top_n_recs(movie_lens_data, test_subject=44, k=10)

In [15]:
# list_of_recommendations=evaluator.sample_top_n_recs(movie_lens_data, test_subject=1, k=10)
# print(list_of_recommendations)

# Evaluation

In [17]:
import numpy as np

def ndcg_score(y_true, y_pred, k=None):
    """Compute the Normalized Discounted Cumulative Gain (NDCG) of one ranking.

    Items are ranked by ``y_pred`` in descending order and scored with the
    exponential gain ``2**relevance - 1`` discounted by ``log2(rank + 1)``.

    Parameters:
    y_true (array-like): True relevance (e.g. rating) of each item.
    y_pred (array-like): Predicted score of each item, same shape as ``y_true``.
    k (int, optional): Only the top ``k`` positions are scored. ``None`` scores
        the whole list.

    Returns:
    float: DCG@k divided by the ideal DCG@k, or 0 when the ideal DCG is 0.

    Raises:
    ValueError: If ``y_true`` and ``y_pred`` have different shapes.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same shape.")

    def _dcg(gains):
        # Position i (0-based) is discounted by log2(i + 2), i.e. log2(rank + 1).
        discounts = np.log2(np.arange(2, len(gains) + 2))
        return np.sum((2 ** gains - 1) / discounts)

    predicted_order = np.argsort(y_pred)[::-1]
    ideal_order = np.argsort(y_true)[::-1]
    if k is not None:
        predicted_order = predicted_order[:k]
        # The ideal ranking must be cut at the same depth; normalising DCG@k by
        # the ideal DCG of the full list makes a perfect top-k score below 1.
        ideal_order = ideal_order[:k]

    dcg = _dcg(y_true[predicted_order])
    ideal_dcg = _dcg(y_true[ideal_order])

    return dcg / ideal_dcg if ideal_dcg > 0 else 0

In [18]:
def average_precision(y_true, y_pred, k=None):
    """
    Calculate the Average Precision (AP) score for a single query.

    Items are ranked by ``y_pred`` in descending order (ties keep their input
    order), optionally cut to the top ``k``, and precision is averaged over the
    ranks at which a relevant item appears.

    Parameters:
    y_true (sequence): Relevance of each item; an item counts as relevant when
        its value is truthy. Graded ratings such as 1-5 are therefore ALL
        relevant and give AP = 1.0, so binarise them before calling if a
        relevance cut-off is intended.
    y_pred (sequence): Predicted score of each item, aligned with ``y_true``.
    k (int, optional): Number of top-ranked items to consider.

    Returns:
    float: The AP score, or 0.0 when no considered item is relevant.
    """
    y_true = list(y_true)
    y_pred = list(y_pred)

    # Rank by predicted score. `y_pred` used to be ignored, so both the top-k
    # cut and the precision depended on whatever order the caller passed in.
    # Input already sorted by descending prediction yields the same ranking.
    n_items = min(len(y_true), len(y_pred))
    ranking = sorted(range(n_items), key=lambda idx: y_pred[idx], reverse=True)
    if k is not None:
        ranking = ranking[:k]

    precision_sum = 0.0
    true_positives = 0

    # Accumulate precision@rank at every rank holding a relevant item.
    for rank, idx in enumerate(ranking, 1):
        if y_true[idx]:
            true_positives += 1
            precision_sum += true_positives / rank

    if true_positives == 0:
        return 0.0
    return precision_sum / true_positives

In [19]:

def mean_average_precision(y_true_list, y_pred_list, k=None):
    """
    Calculate the Mean Average Precision (MAP) score for a list of queries.

    Parameters:
    y_true_list (list of arrays): List of true relevance scores for each query.
    y_pred_list (list of arrays): List of predicted relevance scores for each query.
    k (int, optional): Cut-off forwarded to ``average_precision`` for every query.

    Returns:
    float: The Mean Average Precision (MAP) score; 0.0 when there are no queries.
    """
    total_queries = len(y_true_list)
    # Guard the division below: an empty query list used to raise ZeroDivisionError.
    if total_queries == 0:
        return 0.0

    per_query_ap = [
        average_precision(y_true, y_pred, k)
        for y_true, y_pred in zip(y_true_list, y_pred_list)
    ]

    # The denominator is the number of true-score lists, as before, even if
    # `y_pred_list` is shorter and zip() stops early.
    return sum(per_query_ap, 0.0) / total_queries



In [20]:
def mean_rating_deviation(y_true, y_pred, k=None):
    """Return mean(y_pred) - mean(y_true), optionally over the first ``k`` items.

    The result is signed: positive when the predictions are higher on average
    than the true ratings, negative when they are lower.

    Parameters:
    y_true (sequence): True ratings.
    y_pred (sequence): Predicted ratings, aligned with ``y_true``.
    k (int, optional): Keep only the first ``k`` entries of both inputs
        (in the order given) before averaging.

    Raises:
    ValueError: If the (possibly truncated) inputs differ in shape.
    """
    if k is not None:
        y_true, y_pred = y_true[:k], y_pred[:k]

    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    if actual.shape != predicted.shape:
        raise ValueError("y_true and y_pred must have the same shape.")

    return np.mean(predicted) - np.mean(actual)

In [56]:
def mean_reciprocal_rank(y_true_list, y_pred_list, k=None):
    """Calculate the Mean Reciprocal Rank (MRR) over a list of users.

    For each user the items are ranked by predicted score (descending), and the
    reciprocal of the rank of the first relevant item is taken. A user with no
    relevant item in the considered ranking contributes 0.

    Parameters:
    y_true_list (list of sequences): Per-user relevance values; an item counts
        as relevant when its value is truthy. Graded ratings such as 1-5 are
        therefore ALL relevant and give a reciprocal rank of 1.
    y_pred_list (list of sequences): Per-user predicted scores, aligned with
        the matching entry of ``y_true_list``.
    k (int, optional): Only the top ``k`` ranked items of each user are searched.

    Returns:
    float: The MRR score; 0.0 when there are no users.
    """
    total_users = len(y_true_list)
    if total_users == 0:
        return 0.0

    sum_rr = 0.0
    for y_true, y_pred in zip(y_true_list, y_pred_list):
        ranking = sorted(range(len(y_pred)), key=lambda i: y_pred[i], reverse=True)

        # Apply the cut-off per user, after ranking. It used to be applied
        # before the loop to names that were not yet bound, so any non-None k
        # raised UnboundLocalError.
        if k is not None:
            ranking = ranking[:k]

        first_relevant_rank = next(
            (rank for rank, idx in enumerate(ranking, 1) if y_true[idx]), None
        )
        if first_relevant_rank is not None:
            sum_rr += 1 / first_relevant_rank

    return sum_rr / total_users


In [22]:
def calc_score(algo_dict, userid, y_true, y_pred, k=None):
    """Score one user's predictions and record the result in ``algo_dict``.

    Computes ``[ndcg, ap, mrd]`` with ``ndcg_score``, ``average_precision`` and
    ``mean_rating_deviation`` (all called with the same ``k``) and stores it
    under ``userid``.

    Parameters:
    algo_dict (dict): Per-user score table of one algorithm; mutated in place.
    userid: Key under which the scores are stored.
    y_true, y_pred: True and predicted ratings forwarded to the metrics.
    k (int, optional): Cut-off forwarded to the metrics.

    Returns:
    dict: The same ``algo_dict`` object.
    """
    metrics = [
        ndcg_score(y_true, y_pred, k),
        average_precision(y_true, y_pred, k),
        mean_rating_deviation(y_true, y_pred, k),
    ]

    if userid not in algo_dict:
        # First score for this user: stored as a flat [ndcg, ap, mrd] list.
        algo_dict[userid] = metrics
    else:
        # NOTE(review): a repeat call appends the new triple as a nested list to
        # the flat one, giving [ndcg, ap, mrd, [ndcg, ap, mrd]]. This happens
        # when the scoring cell is re-run without resetting the dictionaries.
        algo_dict[userid].append(metrics)
    return algo_dict

In [33]:
def dataGen(userid):
    """Merge one user's five per-algorithm prediction files into a single CSV.

    Reads ``./data/selected{userid}_{i}.csv`` for ``i`` in 0..4 (written by the
    scoring loop in this notebook with columns ``itemid``, ``act``, ``est``) and
    writes ``./data/RecSys_{userid}.csv`` with columns ``itemid``, ``act``,
    ``ItemKnn``, ``UserKNN``, ``SVD``, ``SVD++``, ``Random``. The five input
    files are deleted once the merged file has been written.

    Rows follow the order of file 0. The other estimates are matched on
    ``itemid``, and an item missing from a file gets NaN in that column.

    Parameters:
    userid: User id used in the input and output file names.
    """
    # Same folder the scoring loop writes to; the old "./Data" spelling only
    # resolved on case-insensitive file systems.
    path = "./data"
    # Output column for each algorithm's `est`, in file-index order.
    est_columns = ['ItemKnn', 'UserKNN', 'SVD', 'SVD++', 'Random']
    file_paths = [
        os.path.join(path, f'selected{userid}_{algo}.csv')
        for algo in range(len(est_columns))
    ]

    # Read everything before deleting anything, so a missing file does not
    # leave the folder with part of the inputs already removed.
    frames = [pd.read_csv(file_path) for file_path in file_paths]

    df = frames[0].rename(columns={'est': est_columns[0]})
    for column, frame in zip(est_columns[1:], frames[1:]):
        # Each file is sorted by its own `est`, so row i is a different item in
        # every file. Match on itemid, not on row position as before.
        estimates = frame.drop_duplicates(subset='itemid').set_index('itemid')['est']
        df[column] = df['itemid'].map(estimates)

    df.to_csv(f'.//data//RecSys_{userid}.csv', index=False)

    for file_path in file_paths:
        os.remove(file_path)

In [23]:
# Test split of the ratings, stored next to the other MovieLens files. This was
# an absolute path on one machine's D: drive, which broke the notebook elsewhere.
file_path = os.path.join(".", "data", "ml-100k", "u.data.test")
# NOTE(review): read_csv treats the first line as a header here, and the column
# names are then overwritten below. If u.data.test has no header row, the first
# rating is lost; confirm, and pass header=None if so.
test_data = pd.read_csv(file_path, sep='\t')

order=['userid', 'itemid', 'rating', 'timestamp']
test_data.columns= order

def sort_column_within_group(group):
    """Return one user's rows ordered from highest to lowest rating."""
    group=group.sort_values(by='rating', ascending=False)
    return group

# Sort each user's ratings in descending order, then flatten back to a plain
# 0..n-1 index.
test_data = test_data.groupby('userid').apply(sort_column_within_group)
test_data = test_data.reset_index(drop=True)
print(test_data)

       userid  itemid  rating  timestamp
0           1      15       5  875071608
1           1      93       5  875071484
2           1     253       5  874965970
3           1     223       5  876892918
4           1     216       5  876892701
...       ...     ...     ...        ...
20277     943     419       2  888638920
20278     943      97       2  888639445
20279     943     720       1  888640048
20280     943     941       1  888639725
20281     943     399       1  888639886

[20282 rows x 4 columns]


In [None]:
# Per-algorithm score tables: {algorithm label: {userid: [ndcg, ap, mrd]}}.
# `score_dict` holds scores over each user's full list, `score_dict_5` at k=5.
score_dict={
    'ItemKNN':{},
    'UserKNN':{},
    'SVD':{},
    'SVD++':{},
    'Random':{},
}
score_dict_5={
    'ItemKNN':{},
    'UserKNN':{},
    'SVD':{},
    'SVD++':{},
    'Random':{},
    # Never filled. Kept because the export cell drops a 'userid' column from
    # the DataFrame built from this dict.
    'userid':{},
}

# Label for index i of the list returned by sample_top_n_recs.
# NOTE(review): assumes that list follows the order in which the algorithms
# were registered with the evaluator; confirm against Evaluator.
algo_names = ('ItemKNN', 'UserKNN', 'SVD', 'SVD++', 'Random')

userids = set(test_data['userid'])

for userid in tqdm(userids):

    list_of_recommendations=evaluator.sample_top_n_recs(movie_lens_data, test_subject=userid, k=10)

    # This user's held-out ratings, ordered by itemid so they line up with the
    # itemid-sorted predictions below.
    test_i_user = (
        test_data[test_data['userid'].isin([userid])]
        .drop(columns=['userid', 'timestamp'])
        .sort_values(by='itemid', ascending=True)
        .reset_index(drop=True)
    )
    print(f'\n{userid}\n')

    for i in range(5):
        order=['itemid', 'act', 'est']
        prediction_df = (
            pd.DataFrame(list_of_recommendations[i], columns=order)
            .sort_values(by='itemid', ascending=True)
            .reset_index(drop=True)
        )

        # Copy the filtered rows: assigning a column on the bare slice triggers
        # pandas' SettingWithCopyWarning.
        selected_items = prediction_df[prediction_df['itemid'].isin(test_i_user['itemid'])].copy()
        # Both frames are sorted by itemid, so the ratings align by position.
        selected_items['act'] = test_i_user.loc[test_i_user['itemid'].isin(selected_items['itemid']), 'rating'].values
        selected_items=selected_items.sort_values(by='est', ascending=False)
        # Intermediate file consumed (and deleted) by dataGen below.
        selected_items.to_csv(f'.//data//selected{userid}_{i}.csv', index=False)

        k=None

        y_true = selected_items['act'].values
        y_pred = selected_items['est'].values

        algo = algo_names[i]
        algo_dict=calc_score(score_dict[algo], userid, y_true, y_pred, k)
        algo_dict_5=calc_score(score_dict_5[algo], userid, y_true, y_pred, k=5)
    dataGen(userid)

In [52]:
# generate list of (list of act and est) for each user
def read_csv_and_append_lists(csv_file, y_true_list, y_pred_list, algo):
    """Append one user's true and predicted ratings to the running lists.

    Parameters:
    csv_file (str): CSV whose column 1 holds the true ratings and whose
        columns from index 2 on hold one estimate column per algorithm.
    y_true_list (list): Extended in place with the list of true ratings.
    y_pred_list (list): Extended in place with the list of estimates.
    algo (int): 0-based algorithm index; column ``algo + 2`` is read.

    Returns:
    tuple: ``(y_true_list, y_pred_list)``, the same objects that were passed in.
    """
    df = pd.read_csv(csv_file)

    # Take the two columns directly by position rather than walking the frame
    # row by row with iterrows().
    y_true_list.append(df.iloc[:, 1].tolist())
    y_pred_list.append(df.iloc[:, algo + 2].tolist())
    return y_true_list, y_pred_list

In [137]:
def map_mrr_eva(df, k=None):
    """Print the overall MAP and MRR of each of the five algorithms.

    For every algorithm index 0..4 the per-user ``RecSys_{userid}.csv`` files
    are read for all users in the notebook-level ``userids`` set, and
    ``mean_average_precision`` / ``mean_reciprocal_rank`` are computed over them.
    The two result lists, prefixed with 'Total_map' / 'Total_mrr', are printed.

    Parameters:
    df (DataFrame): Score table; not modified.
    k (int, optional): Cut-off forwarded to both metrics.

    Returns:
    DataFrame: ``df`` unchanged. It is returned so that the notebook's
        ``score_df = map_mrr_eva(score_df, ...)`` keeps the table instead of
        replacing it with None.
    """
    map_list=['Total_map']
    mrr_list=['Total_mrr']
    for algo in range(5):
        y_true_list=[]
        y_pred_list=[]
        for userid in tqdm(userids):
            # Same './/data//' folder the RecSys files are written to; the old
            # 'Data' spelling only resolved on case-insensitive file systems.
            read_csv_and_append_lists(f'.//data//RecSys_{userid}.csv', y_true_list, y_pred_list, algo)
        map_list.append(mean_average_precision(y_true_list,y_pred_list,k))
        mrr_list.append(mean_reciprocal_rank(y_true_list,y_pred_list,k))

    print(map_list)
    print(mrr_list)
    return df

In [138]:
# map_list=['tat',1,1,1,1,1]
# map_df=pd.DataFrame([[1,1,1,1,1,1]],columns=['userid', 'ItemKNN', 'UserKNN', 'SVD', 'SVD++', 'Random'])
# map_df.loc[len(df.index)] = map_list
# NOTE(review): `score_df` is only created in a later cell
# (`score_df = pd.DataFrame(score_dict)`), so on a fresh kernel this cell raises
# NameError unless that cell has been run first.
score_df = map_mrr_eva(score_df, k=None)
display(score_df)

100%|██████████| 935/935 [00:01<00:00, 564.75it/s]
100%|██████████| 935/935 [00:01<00:00, 566.98it/s]
100%|██████████| 935/935 [00:01<00:00, 548.36it/s]
100%|██████████| 935/935 [00:01<00:00, 521.00it/s]
100%|██████████| 935/935 [00:01<00:00, 554.59it/s]

['Total_map', 1.0, 1.0, 1.0, 1.0, 1.0]
['Total_mrr', 1.0, 1.0, 1.0, 1.0, 1.0]





Unnamed: 0,userid,ItemKNN,UserKNN,SVD,SVD++,Random
1,1,"[0.913833377284391, 1.0, 0.028218400445717773]","[0.903913221476052, 1.0, -0.03201403323678775]","[0.883227526388415, 1.0, 0.057193770345039585]","[0.8839403603743479, 1.0, -0.03217240607797667]","[0.8211583945321769, 1.0, -0.19606327524910938]"
2,2,"[0.7876114186959077, 1.0, 0.03886367782789213]","[0.9290354062872561, 1.0, -0.028514534166727667]","[0.9304093393052109, 1.0, -0.058486899471716924]","[0.968565215805775, 1.0, 0.12118752850663128]","[0.9261133219164371, 1.0, -0.08584599087559619]"
3,3,"[0.6301432645196707, 1.0, 0.12180420158433947]","[0.7507555555602629, 1.0, 0.6103075219295526]","[0.7175993560461386, 1.0, 0.10086121570028883]","[0.7390208704910455, 1.0, 0.038999739348986484]","[0.6379201487463476, 1.0, 0.6755159597811504]"
4,4,"[0.9953395684454479, 1.0, 0.0348836678265565]","[0.921310321485301, 1.0, -1.039002271660347]","[0.9639021258532504, 1.0, -0.3675108050550273]","[0.9953395684454479, 1.0, -0.2129825046617153]","[0.9073866124623576, 1.0, -0.758871402672388]"
5,5,"[0.9018010021736702, 1.0, -0.07362574691777901]","[0.9263369142592415, 1.0, 0.4140071713037332]","[0.8391843772352486, 1.0, -0.016783820969987318]","[0.9547372212458504, 1.0, 0.0029755187678688344]","[0.7313587319307674, 1.0, 0.41029879335633]"
...,...,...,...,...,...,...
939,939,"[0.8486582655649633, 1.0, -0.02914841451013217]","[0.9838939725958612, 1.0, -1.1690702644453963]","[0.9995843660616693, 1.0, -0.12424925976173462]","[0.980813297963661, 1.0, -0.12377103385161004]","[0.8880534438167289, 1.0, -0.4563167135369177]"
940,940,"[0.7775411711141598, 1.0, -0.015473788107752462]","[0.9029190085899952, 1.0, 0.2811384844641647]","[0.8575570345510818, 1.0, -0.011677010067402183]","[0.925158263698964, 1.0, 0.11236576964102918]","[0.853855202652721, 1.0, -0.12068238036529255]"
941,941,"[0.8862372091245206, 1.0, -0.4539089251043582]","[0.8862372091245206, 1.0, -0.5036857033870805]","[0.9544372751968143, 1.0, -0.3450241277501176]","[0.8862372091245206, 1.0, -0.4772939950143096]","[0.8862372091245206, 1.0, -0.5750858255225881]"
942,942,"[0.907470106580751, 1.0, 0.2039601319047808]","[0.9059593591032901, 1.0, -0.28666691624051666]","[0.8316012109081095, 1.0, 0.16361012648468343]","[0.8468808116865268, 1.0, 0.1717625692413689]","[0.8934390074553678, 1.0, -0.5055084170179232]"


In [None]:
# Raw per-user scores, printed in full.
print(score_dict)

# Scores over the full list: one row per user, one column per algorithm, each
# cell holding that user's [ndcg, ap, mrd] list. The index (user id) is copied
# into an explicit first column because the CSV is written without the index.
score_df = pd.DataFrame(score_dict)
score_df.insert(0, 'userid', score_df.index)
score_df.to_csv('.//data//score.csv', index=False)

# Same table for the k=5 scores. The never-filled 'userid' key of score_dict_5
# produces a column that is dropped before the real user ids are inserted.
score_df_5 = pd.DataFrame(score_dict_5).drop(columns=['userid'])
score_df_5.insert(0, 'userid', score_df_5.index)
score_df_5.to_csv('.//data//score_5.csv', index=False)