In [None]:
import os
import numpy as np
import pandas as pd
import kagglehub

from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.metrics import root_mean_squared_error, mean_absolute_error



from cf import *
from utils import *

## Preparing dataset

In [None]:
# Download latest version
dataset_path = kagglehub.dataset_download("prajitdatta/movielens-100k-dataset")
dataset_path = os.path.join(dataset_path, 'ml-100k')
print("Path to dataset files:", dataset_path)

In [3]:
splits = [
    (f'u{i}.base', f'u{i}.test')
    for i in range(1, 6)
]


In [4]:
items_header = [
    'movie id',
    'movie title',
    'release date',
    'video release date',
    'IMDb URL',
    'unknown',
    'Action',
    'Adventure',
    'Animation',
    "Children's",
    'Comedy',
    'Crime',
    'Documentary',
    'Drama',
    'Fantasy',
    'Film-Noir',
    'Horror',
    'Musical',
    'Mystery',
    'Romance',
    'Sci-Fi',
    'Thriller',
    'War',
    'Western'
]

items_df = pd.read_csv(
    os.path.join(dataset_path, 'u.item'),
    sep='|',
    names=items_header,
    encoding='latin-1'
)

In [5]:
items_df

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
def names_by_movie_id(movies_ids: list[int]) -> list[str]:
    return items_df[items_df['movie id'].isin(movies_ids)]['movie title'].to_list()


def prepare_rating_matrix(train_data: pd.DataFrame) -> pd.DataFrame:
    rating_matrix = train_data.pivot(index='user_id', columns='item_id', values='rating')
    rating_matrix.reindex(columns=items_df['movie id'], fill_value=np.nan)
    return rating_matrix.copy()

In [7]:
train_file = 'ua.base'
train_df = pd.read_csv(os.path.join(dataset_path, train_file), sep='\t',
                       names=['user_id', 'item_id', 'rating', 'timestamp'])
test_file = 'ua.test'
test_df = pd.read_csv(os.path.join(dataset_path, test_file), sep='\t',
                      names=['user_id', 'item_id', 'rating', 'timestamp'])

In [8]:
train_matrix = prepare_rating_matrix(train_df)
train_matrix

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


## Prediction examples

In [9]:
cf_instance = ItemBasedCF(
    similarity_threshold=-0.33,
    similarity_computer=JaccardMeasureComputer()
)
cf_instance.fit(train_matrix, verbose=True)

Fitting: 100%|██████████| 1680/1680 [00:32<00:00, 51.15it/s]


In [10]:
recommendations = cf_instance.recommend_top_k(user_id=10, k=5)
names_by_movie_id(recommendations)

['Great Day in Harlem, A (1994)',
 'They Made Me a Criminal (1939)',
 'Prefontaine (1997)',
 'Marlene Dietrich: Shadow and Light (1996) ',
 'Santa with Muscles (1996)']

In [11]:
cf_instance.predict_rating(23, 43)

np.float64(2.794843650295526)

## Evaluation

In [12]:

metrics_func = {
    'Pearson': PearsonSimilarity(),
    'Jaccard': JaccardMeasureComputer(),
    'CosineWithZeros': CosineSimilarityWithZeros()
}

best_threshold = {
    'Pearson': None,
    'Jaccard': None,
    'CosineWithZeros': None
}

metrics = {
    'Pearson': [],
    'Jaccard': [],
    'CosineWithZeros': []
}

algorithms = {
    'user-based': UserBasedCF,
    'item-based': ItemBasedCF,
}

train_matrix = prepare_rating_matrix(train_df)
rmse_estimator = RatingEvaluator(metric=root_mean_squared_error)
top_recommender = TopRecommenderEvaluator(good_rating_threshold=4)
mae_estimator = RatingEvaluator(metric=mean_absolute_error)


for similarity_name, similarity_metric in metrics_func.items():

    similarity_matrix = similarity_metric.compute(train_matrix)
    min_similarity = similarity_matrix.min().min()
    max_similarity = similarity_matrix.max().max()
    for algorithm_name, algorithm in algorithms.items():
        for threshold in np.linspace(min_similarity, max_similarity, 10):
            cf_model = algorithm(similarity_computer=similarity_metric, similarity_threshold=threshold)
            cf_model.fit(train_matrix.copy(), verbose=False)
            rmse = rmse_estimator.evaluate(cf_model, test_df, threshold=threshold, verbose=False)
            precision = top_recommender.evaluate(cf_model, test_df, threshold=threshold, verbose=False)
            recall = top_recommender.evaluate(cf_model, test_df, threshold=threshold, mode='recall', verbose=False)
            mae = mae_estimator.evaluate(cf_model, test_df, threshold=threshold, verbose=False)
            metrics[similarity_name].append({
                'algorithm': algorithm_name,
                'threshold': threshold,
                'precision': precision,
                'recall': recall,
                'rmse': rmse,
                'mae': mae,
            })


user_based_cf_data = []
baseline_model = BaselineCF()
baseline_model.fit(train_matrix, verbose=False)

base_line_mae = mae_estimator.evaluate(baseline_model, test_df, verbose=False)
base_line_precision = top_recommender.evaluate(baseline_model, test_df, verbose=False)
base_line_recall = top_recommender.evaluate(baseline_model, test_df, verbose=False)
base_line_rmse = rmse_estimator.evaluate(baseline_model, test_df, verbose=False)


user_based_cf_data.append({
    'algorithm': 'baseline',
    'threshold': np.nan,
    'precision':  base_line_precision,
    'recall': base_line_recall,
    'rmse': base_line_rmse,
    'similarity metric': 'N/A',
    'mae': base_line_mae,
})

for metric_type, metrics_list in metrics.items():
    for metric_data in metrics_list:
        row = {
            'algorithm':  metric_data['algorithm'],
            'similarity metric': metric_type,
            'threshold': round(metric_data['threshold'], 2),
            'precision': round(metric_data['precision'], 3),
            'recall': round(metric_data['recall'], 3),
            'rmse': round(metric_data['rmse'], 2),
            'mae': round(metric_data['mae'], 2),
        }
        user_based_cf_data.append(row)

user_based_metrics_df = pd.DataFrame(user_based_cf_data)
user_based_metrics_df

Unnamed: 0,algorithm,threshold,precision,recall,rmse,similarity metric,mae
0,baseline,,0.020236,0.020236,1.043095,,0.832565
1,user-based,-1.00,0.000000,0.000000,0.970000,Pearson,0.770000
2,user-based,-0.78,0.000000,0.000000,0.970000,Pearson,0.760000
3,user-based,-0.56,0.000000,0.000000,0.970000,Pearson,0.760000
4,user-based,-0.33,0.000000,0.000000,0.960000,Pearson,0.760000
...,...,...,...,...,...,...,...
56,item-based,0.56,0.009000,0.013000,1.040000,CosineWithZeros,0.820000
57,item-based,0.67,0.006000,0.009000,1.040000,CosineWithZeros,0.830000
58,item-based,0.78,0.001000,0.002000,1.040000,CosineWithZeros,0.840000
59,item-based,0.89,0.000000,0.000000,1.040000,CosineWithZeros,0.840000


In [13]:
user_based_cf_data = []
baseline_model = BaselineCF()
baseline_model.fit(train_matrix, verbose=False)

base_line_mae = mae_estimator.evaluate(baseline_model, test_df, verbose=False)
base_line_precision = top_recommender.evaluate(baseline_model, test_df, verbose=False)
base_line_recall = top_recommender.evaluate(baseline_model, test_df, verbose=False)
base_line_rmse = rmse_estimator.evaluate(baseline_model, test_df, verbose=False)


user_based_cf_data.append({
    'algorithm': 'baseline',
    'threshold': np.nan,
    'precision':  base_line_precision,
    'recall': base_line_recall,
    'rmse': base_line_rmse,
    'similarity metric': 'N/A',
    'mae': base_line_mae,
})

for metric_type, metrics_list in metrics.items():
    for metric_data in metrics_list:
        row = {
            'algorithm':  metric_data['algorithm'],
            'similarity metric': metric_type,
            'threshold': round(metric_data['threshold'], 2),
            'precision': round(metric_data['precision'], 3),
            'recall': round(metric_data['recall'], 3),
            'rmse': round(metric_data['rmse'], 2),
            'mae': round(metric_data['mae'], 2),
        }
        user_based_cf_data.append(row)

user_based_metrics_df = pd.DataFrame(user_based_cf_data)
user_based_metrics_df

Unnamed: 0,algorithm,threshold,precision,recall,rmse,similarity metric,mae
0,baseline,,0.020236,0.020236,1.043095,,0.832565
1,user-based,-1.00,0.000000,0.000000,0.970000,Pearson,0.770000
2,user-based,-0.78,0.000000,0.000000,0.970000,Pearson,0.760000
3,user-based,-0.56,0.000000,0.000000,0.970000,Pearson,0.760000
4,user-based,-0.33,0.000000,0.000000,0.960000,Pearson,0.760000
...,...,...,...,...,...,...,...
56,item-based,0.56,0.009000,0.013000,1.040000,CosineWithZeros,0.820000
57,item-based,0.67,0.006000,0.009000,1.040000,CosineWithZeros,0.830000
58,item-based,0.78,0.001000,0.002000,1.040000,CosineWithZeros,0.840000
59,item-based,0.89,0.000000,0.000000,1.040000,CosineWithZeros,0.840000


In [14]:
user_based_metrics_df[user_based_metrics_df['algorithm'] == 'item-based']

Unnamed: 0,algorithm,threshold,precision,recall,rmse,similarity metric,mae
11,item-based,-1.0,0.002,0.002,0.96,Pearson,0.76
12,item-based,-0.78,0.001,0.002,0.96,Pearson,0.76
13,item-based,-0.56,0.001,0.002,0.96,Pearson,0.75
14,item-based,-0.33,0.001,0.002,0.95,Pearson,0.75
15,item-based,-0.11,0.002,0.002,0.95,Pearson,0.75
16,item-based,0.11,0.002,0.003,0.96,Pearson,0.75
17,item-based,0.33,0.006,0.009,1.01,Pearson,0.79
18,item-based,0.56,0.011,0.017,1.08,Pearson,0.85
19,item-based,0.78,0.008,0.014,1.11,Pearson,0.87
20,item-based,1.0,0.0,0.0,1.04,Pearson,0.84


In [15]:
def generate_summary_tables(metrics_df: pd.DataFrame):

    idx_best_rmse = metrics_df.groupby(['algorithm', 'similarity metric'])['rmse'].idxmin()
    best_rmse_data = metrics_df.loc[idx_best_rmse].copy()
    best_rmse_data = best_rmse_data[['algorithm', 'similarity metric', 'rmse', 'mae', 'threshold']]
    best_rmse_data.rename(columns={
        'rmse': 'best_rmse',
        'mae': 'mae_at_best_rmse',
        'threshold': 'threshold_for_best_rmse'
    }, inplace=True)


    idx_best_mae = metrics_df.groupby(['algorithm', 'similarity metric'])['mae'].idxmin()
    best_mae_data = metrics_df.loc[idx_best_mae].copy()
    best_mae_data = best_mae_data[['algorithm', 'similarity metric', 'mae', 'rmse', 'threshold']]
    best_mae_data.rename(columns={
        'mae': 'best_mae',
        'rmse': 'rmse_at_best_mae',
        'threshold': 'threshold_for_best_mae'
    }, inplace=True)


    summary_rmse_mae_df = pd.merge(
        best_rmse_data,
        best_mae_data,
        on=['algorithm', 'similarity metric']
    )

    summary_rmse_mae_df = summary_rmse_mae_df[[
        'algorithm', 'similarity metric',
        'best_rmse', 'mae_at_best_rmse', 'threshold_for_best_rmse',
        'best_mae', 'rmse_at_best_mae', 'threshold_for_best_mae'
    ]]


    idx_best_precision = metrics_df.groupby(['algorithm', 'similarity metric'])['precision'].idxmax()
    best_precision_data = metrics_df.loc[idx_best_precision].copy()
    best_precision_data = best_precision_data[['algorithm', 'similarity metric', 'precision', 'recall', 'threshold']]
    best_precision_data.rename(columns={
        'precision': 'best_precision',
        'recall': 'recall_at_best_precision',
        'threshold': 'threshold_for_best_precision'
    }, inplace=True)


    idx_best_recall = metrics_df.groupby(['algorithm', 'similarity metric'])['recall'].idxmax()
    best_recall_data = metrics_df.loc[idx_best_recall].copy()
    best_recall_data = best_recall_data[['algorithm', 'similarity metric', 'recall', 'precision', 'threshold']]
    best_recall_data.rename(columns={
        'recall': 'best_recall',
        'precision': 'precision_at_best_recall',
        'threshold': 'threshold_for_best_recall'
    }, inplace=True)


    summary_topk_df = pd.merge(
        best_precision_data,
        best_recall_data,
        on=['algorithm', 'similarity metric']
    )

    summary_topk_df = summary_topk_df[[
        'algorithm', 'similarity metric',
        'best_precision', 'recall_at_best_precision', 'threshold_for_best_precision',
        'best_recall', 'precision_at_best_recall', 'threshold_for_best_recall'
    ]]

    return summary_rmse_mae_df, summary_topk_df

In [16]:
rating_metrics_best, top_k_metrics_best = generate_summary_tables(user_based_metrics_df)

In [17]:
rating_metrics_best

Unnamed: 0,algorithm,similarity metric,best_rmse,mae_at_best_rmse,threshold_for_best_rmse,best_mae,rmse_at_best_mae,threshold_for_best_mae
0,baseline,,1.043095,0.832565,,0.832565,1.043095,
1,item-based,CosineWithZeros,0.95,0.75,0.0,0.75,0.95,0.0
2,item-based,Jaccard,0.95,0.74,0.0,0.74,0.95,0.0
3,item-based,Pearson,0.95,0.75,-0.33,0.75,0.96,-0.56
4,user-based,CosineWithZeros,0.97,0.76,0.0,0.76,0.97,0.0
5,user-based,Jaccard,0.97,0.76,0.0,0.76,0.97,0.0
6,user-based,Pearson,0.96,0.76,-0.33,0.76,0.97,-0.78


In [18]:
top_k_metrics_best

Unnamed: 0,algorithm,similarity metric,best_precision,recall_at_best_precision,threshold_for_best_precision,best_recall,precision_at_best_recall,threshold_for_best_recall
0,baseline,,0.020236,0.020236,,0.020236,0.020236,
1,item-based,CosineWithZeros,0.01,0.015,0.44,0.015,0.01,0.44
2,item-based,Jaccard,0.013,0.02,0.23,0.02,0.013,0.23
3,item-based,Pearson,0.011,0.017,0.56,0.017,0.011,0.56
4,user-based,CosineWithZeros,0.033,0.056,0.44,0.056,0.033,0.44
5,user-based,Jaccard,0.038,0.074,0.23,0.074,0.038,0.23
6,user-based,Pearson,0.02,0.036,1.0,0.036,0.02,1.0
