# Section 2 and 3
## Fine-tuning and evaluation of the KNN Baseline and SVD CF systems.

In [2]:
from surprise.prediction_algorithms.predictions import Prediction
from typing import Dict, List
import numpy as np
import pandas as pd

In [38]:
test = pd.read_table('test_new.tsv',sep='\s+',header=0).sort_values(['user_id', 'item_id', 'rating', 'timestamp'])
train = pd.read_table('train_new.tsv',sep='\s+',header=0).sort_values(['user_id', 'item_id', 'rating', 'timestamp'])

In [11]:
from surprise import Reader, Dataset
import random

reader = Reader(rating_scale=(1, 5))
training_matrix = Dataset.load_from_df(train[['user_id', 'item_id', 'rating']], reader)

def set_seeds(seed=0):
    random.seed(seed)
    np.random.seed(seed)
set_seeds()

## TopPop recommender system

In [None]:
def top_pop(train_data: pd.DataFrame, k: int) -> Dict[str, List[str]]:

    freq = train_data[train_data['rating'] >= 3].groupby('item_id').size().reset_index(name='freq')
    freq = freq.sort_values(['freq', 'item_id'], ascending=[False, True], kind='stable')

    top_k_items = freq['item_id'].head(k).tolist()
    user_ids = train_data['user_id'].unique()
    recommendations = {uid: top_k_items for uid in user_ids}
    
    return recommendations

# KNNBaseline

## Initalization of parameters for fine-tuning of hyperparameters


In [13]:
from surprise.model_selection import KFold
kf = KFold(n_splits=5)
mae_result = {}

list_neighbour = [3, 5, 8, 10, 15, 20] 
settings = ['user_based', 'item_based']
similarity_measures = ["cosine", "msd", "pearson", "pearson_baseline"]

## Fine-tuning

In [14]:
from surprise import KNNBaseline
from sklearn.metrics import mean_absolute_error as mae
from surprise.model_selection import KFold

set_seeds()

precision_at_k_results = {}

for neighbour in list_neighbour:
    mae_result[neighbour] = {}
    for setting in settings:
        mae_result[neighbour][setting] = {}
        for sim in similarity_measures:
            mae_result[neighbour][setting][sim] = {}
            knn_algo = KNNBaseline(k=neighbour,
                                sim_options={"name": sim,
                                             "user_based": True if setting == 'user_based' else False if setting == 'item_based' else ValueError  
                                             },
                                bsl_options = {"method": "als", "n_epochs": 10, "reg_u": 15, "reg_i": 10},
                                verbose=False,
                                random_state=0)
            for fold, (trainset, testset) in enumerate(kf.split(training_matrix)):
                knn_algo.fit(trainset)
                predictions_KNN = knn_algo.test(testset)
                df_pred_KNN = pd.DataFrame(predictions_KNN)
                mae_result[neighbour][setting][sim][fold] = mae(y_true=df_pred_KNN['r_ui'],
                                                                  y_pred=df_pred_KNN['est']) 

knn_final_results = []

for neighbour, setting_dict in mae_result.items():
    #final_results[neighbour] = {}
    for user_based_val, sim_dict in setting_dict.items():
        #final_results[neighbour][user_based_val] = {}
        for sim, fold_dict in sim_dict.items():
            fold_scores = list(fold_dict.values())
            avg_mae = np.mean(fold_scores)
            std_mae = np.std(fold_scores)
            knn_final_results.append((neighbour, user_based_val, sim, avg_mae))
        #    final_results[neighbour][user_based_val][sim] = avg_rmse

## Find the best parameters from the fine-tuning

In [16]:
k_values = [3, 5, 8, 10, 15, 20]
similarity_measures = ['cosine', 'msd', 'pearson', 'pearson_baseline']
user_df = pd.DataFrame(index=k_values, columns=similarity_measures)
user_df.index.name = 'k'
item_df = user_df.copy()

user_based_results = [result for result in knn_final_results if result[1] == 'user_based']
item_based_results = [result for result in knn_final_results if result[1] == 'item_based']

for result in user_based_results:
    k = round(result[0], 3)
    similarity = result[2]
    mae = round(result[3], 3)
    user_df.at[k, similarity] = round(mae, 3)

for result in item_based_results:
    k = round(result[0], 3)
    similarity = result[2]
    mae = round(result[3], 3)
    item_df.at[k, similarity] = round(mae, 3)

print(user_df)
print(user_df.to_latex())

   cosine    msd pearson pearson_baseline
k                                        
3   0.629  0.615   0.626            0.616
5   0.622  0.608   0.621            0.616
8   0.616  0.605   0.619            0.609
10  0.613  0.601   0.621             0.62
15  0.614  0.603   0.619            0.611
20   0.61  0.604   0.619            0.618
\begin{tabular}{lllll}
\toprule
 & cosine & msd & pearson & pearson_baseline \\
k &  &  &  &  \\
\midrule
3 & 0.629000 & 0.615000 & 0.626000 & 0.616000 \\
5 & 0.622000 & 0.608000 & 0.621000 & 0.616000 \\
8 & 0.616000 & 0.605000 & 0.619000 & 0.609000 \\
10 & 0.613000 & 0.601000 & 0.621000 & 0.620000 \\
15 & 0.614000 & 0.603000 & 0.619000 & 0.611000 \\
20 & 0.610000 & 0.604000 & 0.619000 & 0.618000 \\
\bottomrule
\end{tabular}



In [None]:
knn_best_setting = min(knn_final_results, key=lambda x: x[3])
print(f"Best MAE: {knn_best_setting[3]:.4f}")

Best MAE: 0.5688


## SVD Model

## Initalization of parameters for fine-tuning of hyperparameters


In [18]:
kf = KFold(n_splits=5)
n_factors = [5, 10, 20, 30, 40]
n_epochs = [10, 20, 30, 40, 50, 60]
mae_result = {}

In [19]:
from surprise import SVD, Reader, Dataset
from sklearn.metrics import mean_absolute_error as mae
from surprise.model_selection import KFold

set_seeds()

for latent_factor in n_factors:
    mae_result[latent_factor] = {}
    for epochs in n_epochs:
        mae_result[latent_factor][epochs] = {}
        svd_algo = SVD(n_epochs=epochs, n_factors=latent_factor, random_state=0, lr_all=0.005, reg_all=0.1, biased=True)
        for fold, (trainset, testset) in enumerate(kf.split(training_matrix)):
            svd_algo.fit(trainset)
            predictions_KNN = svd_algo.test(testset)
            df_pred_KNN = pd.DataFrame(predictions_KNN)
            mae_result[latent_factor][epochs][fold] = mae(y_true=df_pred_KNN['r_ui'],
                                                                  y_pred=df_pred_KNN['est']) 
            final_results = []

for latent_factor, epochs_dict in mae_result.items():
        for sim, fold_dict in sim_dict.items():
            fold_scores = list(fold_dict.values())
            avg_rmse = np.mean(fold_scores)
            std_rmse = np.std(fold_scores)
            final_results.append((latent_factor, epochs_dict, avg_rmse))

In [20]:
avg_mae = {}

for latent_factor in n_factors:
    avg_mae[latent_factor] = {}
    for epochs in n_epochs:
        # Get all fold MSEs for this combination
        fold_mses = list(mae_result[latent_factor][epochs].values())
        
        # Convert MSE to RMSE and average
        fold_rmses = [np.sqrt(mse) for mse in fold_mses]
        avg_rmse = np.mean(fold_rmses)
        avg_mae[latent_factor][epochs] = avg_rmse

best_mae = float('inf')

for latent_factor in n_factors:
    for epochs in n_epochs:
        current_mae = avg_mae[latent_factor][epochs]
        if current_mae < best_mae:
            best_mae = current_mae
            best_params = {'n_factors': latent_factor, 'n_epochs': epochs}

print("\nAverage MAE for each combination:")
for latent_factor in n_factors:
    print(f"\nLatent Factors: {latent_factor}")
    for epochs in n_epochs:
        print(f"  Epochs: {epochs}, RMSE: {avg_mae[latent_factor][epochs]:.4f}")

print("\nBest Parameters:")
print(f"Latent Factors: {best_params['n_factors']}")
print(f"Epochs: {best_params['n_epochs']}")
print(f"Best MAE: {best_mae:.4f}")



Average MAE for each combination:

Latent Factors: 5
  Epochs: 10, RMSE: 0.7752
  Epochs: 20, RMSE: 0.7606
  Epochs: 30, RMSE: 0.7546
  Epochs: 40, RMSE: 0.7535
  Epochs: 50, RMSE: 0.7528
  Epochs: 60, RMSE: 0.7517

Latent Factors: 10
  Epochs: 10, RMSE: 0.7759
  Epochs: 20, RMSE: 0.7607
  Epochs: 30, RMSE: 0.7542
  Epochs: 40, RMSE: 0.7541
  Epochs: 50, RMSE: 0.7531
  Epochs: 60, RMSE: 0.7528

Latent Factors: 20
  Epochs: 10, RMSE: 0.7750
  Epochs: 20, RMSE: 0.7610
  Epochs: 30, RMSE: 0.7558
  Epochs: 40, RMSE: 0.7527
  Epochs: 50, RMSE: 0.7505
  Epochs: 60, RMSE: 0.7525

Latent Factors: 30
  Epochs: 10, RMSE: 0.7762
  Epochs: 20, RMSE: 0.7625
  Epochs: 30, RMSE: 0.7556
  Epochs: 40, RMSE: 0.7531
  Epochs: 50, RMSE: 0.7520
  Epochs: 60, RMSE: 0.7532

Latent Factors: 40
  Epochs: 10, RMSE: 0.7764
  Epochs: 20, RMSE: 0.7622
  Epochs: 30, RMSE: 0.7572
  Epochs: 40, RMSE: 0.7538
  Epochs: 50, RMSE: 0.7521
  Epochs: 60, RMSE: 0.7513

Best Parameters:
Latent Factors: 20
Epochs: 50
Best MAE

In [None]:
df = pd.DataFrame(avg_mae)
df.columns.name = 'Epochs'
df.index.name = 'Latent Factors'

df = df.round(3)  
df
print(df.to_latex())

\begin{tabular}{lrrrrr}
\toprule
Epochs & 5 & 10 & 20 & 30 & 40 \\
Latent Factors &  &  &  &  &  \\
\midrule
10 & 0.775000 & 0.776000 & 0.775000 & 0.776000 & 0.776000 \\
20 & 0.761000 & 0.761000 & 0.761000 & 0.763000 & 0.762000 \\
30 & 0.755000 & 0.754000 & 0.756000 & 0.756000 & 0.757000 \\
40 & 0.753000 & 0.754000 & 0.753000 & 0.753000 & 0.754000 \\
50 & 0.753000 & 0.753000 & 0.751000 & 0.752000 & 0.752000 \\
60 & 0.752000 & 0.753000 & 0.752000 & 0.753000 & 0.751000 \\
\bottomrule
\end{tabular}



# Running the models with the optimal hyperparameters to the whole training set

In [22]:
full_train_set = training_matrix.build_full_trainset()


In [23]:
set_seeds()

best_knn = KNNBaseline(k=10,
                                sim_options={"name": 'MSD',
                                             "user_based": False
                                             },
                                bsl_options = {"method": "als", "n_epochs": 10, "reg_u": 15, "reg_i": 10},
                                verbose=False,
                                random_state=0)
best_knn = best_knn.fit(full_train_set)

In [24]:
set_seeds()

best_svd = SVD(n_epochs=50, n_factors=20, random_state=0, lr_all=0.005, reg_all=0.02, biased=True)
best_svd = best_svd.fit(full_train_set)

# Predicting non-rated items for each user

In [None]:
# Get all unique item IDs in the dataset
all_item_ids = set(train['item_id'].unique())

# For each user, find items they haven't rated
user_non_rated = {}
for user_id in train['user_id'].unique():
    rated_items = train[train['user_id'] == user_id]['item_id'].unique()
    non_rated = list(all_item_ids - set(rated_items))
    user_non_rated[user_id] = non_rated


In [26]:
knn_user_rankings = {}

for user_id, non_rated_items in user_non_rated.items():
    predictions = [best_knn.predict(user_id, item_id).est for item_id in non_rated_items]
    
    item_ratings = list(zip(non_rated_items, predictions))
    item_ratings.sort(key=lambda x: x[1], reverse=True) 
    
    knn_user_rankings[user_id] = [item_id for item_id, _ in item_ratings]

In [27]:
import json
with open('knn_user_rankings.json', 'w') as fp:
    json.dump(knn_user_rankings, fp)

In [28]:
svd_user_rankings = {}

for user_id, non_rated_items in user_non_rated.items():
    predictions = [best_svd.predict(user_id, item_id).est for item_id in non_rated_items]
    
    item_ratings = list(zip(non_rated_items, predictions))
    item_ratings.sort(key=lambda x: x[1], reverse=True)
    
    svd_user_rankings[user_id] = [item_id for item_id, _ in item_ratings]

# Evaluation (week 8)

In [29]:
from sklearn.metrics import mean_squared_error as mse

In [30]:
preds = []
for row in test[['user_id', 'item_id']].itertuples(index=False):
    user_id = row.user_id
    item_id = row.item_id
    
    pred = best_knn.predict(user_id, item_id)
    preds.append(pred.est) 
rmse = np.sqrt(mse(y_true=test['rating'], y_pred=preds))
print(f"RMSE: {rmse:.4f}")


RMSE: 1.0810


In [31]:
preds = []
# Iterate over rows using .itertuples()
for row in test[['user_id', 'item_id']].itertuples(index=False):
    user_id = row.user_id
    item_id = row.item_id
    
    pred = best_svd.predict(user_id, item_id)
    preds.append(pred.est) 
rmse = np.sqrt(mse(y_true=test['rating'], y_pred=preds))
print(f"RMSE: {rmse:.4f}")

RMSE: 0.9922


# Computing utility-based measures

In [None]:
from collections import defaultdict
test_ground_truth = defaultdict(set)
test['relevant'] = (test['rating'] >= 3).astype(int)
for uid, iid, rel in zip(test['user_id'], test['item_id'], test['relevant']):
    if rel == 1:  
        test_ground_truth[uid].add(iid)

def compute_precision_at_k(top_recs, ground_truth, k=10):
    top_recs = top_recs[:k]
    ground_truth = ground_truth
    hits = len(set(top_recs) & set(ground_truth))
    return hits / k

def compute_ap(top_recs, ground_truth):
    ground_truth_set = set(ground_truth)    
    G = len(ground_truth_set)
    if G == 0:
        return 0.0
    cumulative_precision = 0.0
    relevant_count = 0
    for rank, item in enumerate(top_recs, 1):
        if item in ground_truth_set:
            relevant_count += 1
            precision_at_k = relevant_count / rank  # P@k
            cumulative_precision += precision_at_k  # Sum of P@k for relevant ranks
    ap = cumulative_precision / G
    return ap
def compute_metrics(user_ids, top_recommendations, k=10, catalog = None):
    hit_rates = []
    precision_scores = []
    aps = []
    rrs = []
    if catalog is None:
        total_items_in_catalog = len(set(train['item_id']).union(set(test['item_id'])))
    else:
        total_items_in_catalog = catalog
    recommended_items = set()
    for user_id in user_ids:
        ground_truth : set = test_ground_truth.get(user_id)
        top_recs : set = top_recommendations.get(user_id)[:k]
        recommended_items.update(list(top_recs))

        if ground_truth is None: # :(
            #continue
            ground_truth = set()
        #else:         
        hit = False
        hits = len(set(top_recs) & ground_truth)
        hit = (hits > 0).real

        precision_at_k = compute_precision_at_k(top_recs, ground_truth, k=10)    
        precision_scores.append(precision_at_k)
        
        ap = compute_ap(top_recs, ground_truth)

        for rank, item in enumerate(top_recs, 1):
            if item in ground_truth:
                rr = 1 / rank
                break
            else:
                rr = 0
        hit_rates.append(hit)            
        aps.append(ap)
        rrs.append(rr)
    coverage =  len(recommended_items) / total_items_in_catalog
    return {'PRECISION@k:': round(np.mean(precision_scores), 3), 'MAP@k:': round(np.mean(aps), 3), 'MRR@k:': round(np.mean(rrs), 3), 'Hit rate': round(np.mean(hit_rates), 3), 'Coverage': round(coverage, 3)}

## Computing utilty-based measures for KNN Baseline

In [34]:
metrics = compute_metrics(test['user_id'].unique(), knn_user_rankings)
metrics

{'PRECISION@k:': 0.01,
 'MAP@k:': 0.01,
 'MRR@k:': 0.035,
 'Hit rate': 0.092,
 'Coverage': 0.639}

## Computing utilty-based measures for SVD

In [35]:
compute_metrics(test['user_id'].unique(), svd_user_rankings)

{'PRECISION@k:': 0.01,
 'MAP@k:': 0.009,
 'MRR@k:': 0.027,
 'Hit rate': 0.092,
 'Coverage': 0.284}

## Computing utilty-based measures for TopPop

In [36]:
compute_metrics(test['user_id'].unique(), top_pop(train, 10))

{'PRECISION@k:': 0.032,
 'MAP@k:': 0.034,
 'MRR@k:': 0.116,
 'Hit rate': 0.254,
 'Coverage': 0.019}

# Analysis on the effect of the long tail

## Top 20 and bottom 20 users

In [39]:
from collections import defaultdict
# top 20 & bottom 20
z = train['user_id'].value_counts(sort=True).reset_index()
top_20_users = z[:int((len(z)/5))]
last_20_users = z[int(len(z)/(4/3)):]
test['relevant'] = (test['rating'] >= 3).astype(int)

### TopPop

In [40]:
def compute_hit_rate(users, user_rankings: Dict[str, List[str]],
             ground_truth_vector: Dict[str, set[str]]) -> float:
    hit_rates = []
    for uid in users:
        recommended = set(user_rankings)
        relevant = ground_truth_vector[uid]
        hit = len(recommended.intersection(relevant)) > 0
        hit_rates.append(1 if hit else 0)
    return np.mean(hit_rates)
    

In [41]:
top_pop(train, 10).get('AF7CC34DK36SQJS7WXI44DREGWJA')[:10]

['B0BPJ4Q6FJ',
 'B0BSGM6CQ9',
 'B09857JRP2',
 'B0BCK6L7S5',
 'B0BTC9YJ2W',
 'B08R5GM6YB',
 'B0B95V41NR',
 'B004XNK7AI',
 'B08SJY4T7K',
 'B09V91H5XM']

In [53]:
top_hit_rate = compute_metrics(top_20_users['user_id'], top_pop(train, 10), 10).get('Hit rate')
print(f'TopPop hit rate for the top 20: {top_hit_rate}')
bottom_hit_rate = compute_metrics(last_20_users['user_id'], top_pop(train, 10), 10).get('Hit rate')
print(f'TopPop hit rate for the bottom 20: {bottom_hit_rate}')

knn_top_hit_rate = compute_metrics(top_20_users['user_id'], knn_user_rankings, 10).get('Hit rate')
print(f'KNN Baseline hit rate for the top 20: {knn_top_hit_rate}')
knn_bottom_hit_rate = compute_metrics(last_20_users['user_id'], knn_user_rankings, 10).get('Hit rate')
print(f'KNN Baseline hit rate for the bottom 20: {knn_bottom_hit_rate}')


svd_top_hit_rate = compute_metrics(top_20_users['user_id'], svd_user_rankings, 10).get('Hit rate')
print(f'SVD hit rate for the top 20: {svd_top_hit_rate}')


svd_bottom_hit_rate = compute_metrics(last_20_users['user_id'], svd_user_rankings, 10).get('Hit rate')
print(f'SVD hit rate for the top 20: {svd_bottom_hit_rate}')

TopPop hit rate for the top 20: 0.05
TopPop hit rate for the bottom 20: 0.35
KNN Baseline hit rate for the top 20: 0.025
KNN Baseline hit rate for the bottom 20: 0.08
SVD hit rate for the top 20: 0.031
SVD hit rate for the top 20: 0.13


## Top 20 and bottom 20 items

### TopPop

In [59]:
item_count = train['item_id'].value_counts(sort=True).reset_index()
top_20_items = item_count[:round(len(item_count)/5)]
last_20_items = item_count[(len(item_count) - round(len(item_count)/5)):]

In [60]:
top_pop_recommendations = set(top_pop(train, k=10).get(next(iter(top_pop(train, k=10)))))

In [61]:
top_20_items_set = set(top_20_items['item_id'])
num = top_pop_recommendations.intersection(top_20_items_set)
coverage_top_20 = len(num) / len(top_20_items_set)
print(f'Coverage top 20, TopPop: {round(coverage_top_20, 3)}')

Coverage top 20, TopPop: 0.098


In [62]:
bottom_20_items_set = set(last_20_items['item_id'])
num = top_pop_recommendations.intersection(bottom_20_items_set)
coverage_bottom_20 = len(num) / len(bottom_20_items_set)
print(f'Coverage bottom 20, TopPop: {coverage_bottom_20}')


Coverage bottom 20, TopPop: 0.0


In [63]:
k = 10
knn_recommendations = set()
for uid in train['user_id']:
    for item in knn_user_rankings[uid][:k]:
        knn_recommendations.add(item)
num = knn_recommendations.intersection(top_20_items_set)
coverage_top_20 = len(num) / len(top_20_items_set)
num = knn_recommendations.intersection(bottom_20_items_set)

coverage_bottom_20 = len(num) / len(bottom_20_items_set)
print(f'Coverage top 20, KNN: {round(coverage_top_20, 3)}')
print(f'Coverage bottom 20, KNN: {round(coverage_bottom_20, 3)}')

Coverage top 20, KNN: 0.461
Coverage bottom 20, KNN: 0.961


In [64]:
k = 10
svd_recommendations = set()
for uid in train['user_id']:
    for item in svd_user_rankings[uid][:k]:
        svd_recommendations.add(item)
num = svd_recommendations.intersection(top_20_items_set)
coverage_top_20 = len(num) / len(top_20_items_set)
num = svd_recommendations.intersection(bottom_20_items_set)
coverage_bottom_20 = len(num) / len(bottom_20_items_set)
print(f'Coverage top 20, SVD: {round(coverage_top_20, 3)}')
print(f'Coverage bottom 20, SVD: {round(coverage_bottom_20, 3)}')

Coverage top 20, SVD: 0.451
Coverage bottom 20, SVD: 0.284


# Error Analysis, KNN Baseline

In [65]:
rrs = []
for uid in test['user_id'].unique():
    relevant = test_ground_truth[uid]

    recommended = knn_user_rankings[uid]
    # MRR@k
    for rank, item in enumerate(recommended, 1):
        if item in relevant:
            rr = 1 / rank
            rrs.append(rr)
            break
    else:
        rrs.append(0)

In [66]:
rr_res = pd.DataFrame(test['user_id'].unique())
rr_res['rr'] = rrs
rr_res.sort_values(by='rr', ascending=False)[300:]

Unnamed: 0,0,rr
423,AHUXXNY2JWBI6E5TXSK6TABTFZFA,0.005291
229,AG4ZLTRHVAZRU33BPY5Y643IZXPQ,0.005263
43,AEFCFIMIWI4KROMC3NDEKIBZQQ5A,0.005208
427,AHVTPXABLBUBOHIQYQ3QQG6WR6LA,0.005181
151,AFDS5HV3F3ONORE7U7PMXMVMZAUA,0.005128
...,...,...
225,AG44UYFDZHI6FRBQSLDWOGIEOY4A,0.000000
205,AFWETPPC5GPGN3ITLARC2OHTB5HA,0.000000
383,AHJ6S3TB7HAXJQ4OFAFHDO5VFNQA,0.000000
165,AFI3PUWOT5VQ2RUUVOCEFT3B74BQ,0.000000


In [74]:
rr_res.sort_values(by='rr', ascending=False)

Unnamed: 0,0,rr
204,AFWEOWK32WTIBX4MC7D3WFDLCKJQ,1.0
10,AE4HL6GTIQJ7HRI53XDKIGVJSPRQ,1.0
348,AH7I57UVCR75DY4O5MDVVGWT2CJQ,1.0
290,AGOJCISATDQUG3HCXSIFAPT7JMOA,1.0
375,AHGSLXHSI2QM36AEUPEGTU25V2QA,1.0
...,...,...
225,AG44UYFDZHI6FRBQSLDWOGIEOY4A,0.0
205,AFWETPPC5GPGN3ITLARC2OHTB5HA,0.0
383,AHJ6S3TB7HAXJQ4OFAFHDO5VFNQA,0.0
165,AFI3PUWOT5VQ2RUUVOCEFT3B74BQ,0.0


In [76]:
rr_res[rr_res[rr_res.columns[0]] == 'AFWEOWK32WTIBX4MC7D3WFDLCKJQ']

Unnamed: 0,0,rr
204,AFWEOWK32WTIBX4MC7D3WFDLCKJQ,1.0


In [79]:
high_rr_user = 'AFWEOWK32WTIBX4MC7D3WFDLCKJQ'
low_rr_user = 'AHL4BCH2O33HFN7KSBTIJUHCJYDA'

In [80]:
high_rr_history = train[train['user_id'] == high_rr_user]
low_rr_history = train[train['user_id'] == low_rr_user]
high_rr_mean = round(np.mean(high_rr_history['rating']), 2)
high_rr_std = round(np.std(high_rr_history['rating']), 2)
print(f"The high RR user has rated {len(high_rr_history)} items. Its rating distribution is N({high_rr_mean}, {high_rr_std})")
low_rr_mean = round(np.mean(low_rr_history['rating']), 2)
low_rr_std = round(np.std(low_rr_history['rating']), 2)
print(f"The low RR user has rated {len(low_rr_history)} items. Its rating distribution is N({low_rr_mean}, {low_rr_std})")

The high RR user has rated 33 items. Its rating distribution is N(4.48, 0.74)
The low RR user has rated 6 items. Its rating distribution is N(5.0, 0.0)


In [None]:
from scipy.sparse import csr_matrix

user_item_matrix = train.pivot_table(index='user_id', columns='item_id', values='rating', fill_value=0)

user_ids = user_item_matrix.index.tolist() 
item_ids = user_item_matrix.columns.tolist() 

user_item_matrix_sparse = csr_matrix(user_item_matrix.values)


In [83]:
from sklearn.metrics.pairwise import cosine_similarity

user_similarity = cosine_similarity(user_item_matrix_sparse)


In [85]:
high_rr_index = user_ids.index(high_rr_user)
low_rr_index = user_ids.index(low_rr_user)  

high_rr_neighbors = user_similarity[high_rr_index].argsort()[-11:-1][::-1]
low_rr_neighbors = user_similarity[low_rr_index].argsort()[-11:-1][::-1]  

high_rr_neighbor_ids = [user_ids[i] for i in high_rr_neighbors]
low_rr_neighbor_ids = [user_ids[i] for i in low_rr_neighbors]

print("High RR User Neighbors:", high_rr_neighbor_ids)
print("Low RR User Neighbors:", low_rr_neighbor_ids)

High RR User Neighbors: ['AEMISRDYQCHSNOXKV6TQ67X2TIOA', 'AGZANHWCCITIE2FQA5NKROW5ZGHQ', 'AEXLRZ34GTMM35NQRDZ4CDKKXLIA', 'AGODDVRWGAQPWOIVFD4V76XIST3A', 'AHEIWAU3PKILH7VPQZGXPBS5BKEQ', 'AG7AKOBJWYLRDRZ4U4CSDM2L3BVQ', 'AHO4IOGAHKIQPKGVMKTDFCKGLJ7A', 'AHWXEOF3AV66R44YXMV6OL3GTLWQ', 'AFA2DVBHFESJRRZOEFN7GAEHV4BQ', 'AE6R5VSHXSCOZJBEJY7BCRM7A6OQ']
Low RR User Neighbors: ['AEYIGWWXTT5AD4ZONPSHMK53WR2Q', 'AEEJO5MWV3OVEQ2RNF6ZBGGHCGGA', 'AETS7HNSQIFRSMRCWQMHIWZ2IBOQ', 'AEDKV2KYMJZD43MAADUEKE7Z3PPA', 'AHXN24SQSFO4TJAXQSERXXGLO5FA', 'AH7VVPGAXBTVXWJOKQS4FFLNLFWQ', 'AF2O4GMP7Z6ZV7WZGJM5MGT27GAA', 'AHNMR3M7H4JDTEO5LXIKZC4DJUUA', 'AEFPGSOOIM5GMCPX7FOCCH3UF6ZA', 'AFN6HSXIFGJROQZ2NN3FSUNNM6QA']


In [86]:
high_rr_neighbor_history = train[train['user_id'].isin(high_rr_neighbor_ids)]
low_rr_neighbor_history = train[train['user_id'].isin(low_rr_neighbor_ids)]

print("High RR User Neighbors Rating History:")
print(high_rr_neighbor_history)
print("\nLow RR User Neighbors Rating History:")
print(low_rr_neighbor_history)

High RR User Neighbors Rating History:
         item_id                       user_id  rating      timestamp
431   B004XNK7AI  AE6R5VSHXSCOZJBEJY7BCRM7A6OQ     5.0  1567019697687
432   B00USQY0CK  AE6R5VSHXSCOZJBEJY7BCRM7A6OQ     5.0  1574456020905
433   B00UTD0YG8  AE6R5VSHXSCOZJBEJY7BCRM7A6OQ     5.0  1572994270881
434   B01DPCONFM  AE6R5VSHXSCOZJBEJY7BCRM7A6OQ     5.0  1566827507188
435   B078KTV8L4  AE6R5VSHXSCOZJBEJY7BCRM7A6OQ     5.0  1567881451837
...          ...                           ...     ...            ...
9715  B0BFKQ9QXD  AHWXEOF3AV66R44YXMV6OL3GTLWQ     4.0  1568243217003
9716  B0BPJ4Q6FJ  AHWXEOF3AV66R44YXMV6OL3GTLWQ     5.0  1560437273975
9717  B0BSGM6CQ9  AHWXEOF3AV66R44YXMV6OL3GTLWQ     4.0  1356981373000
9718  B0BT2ZRCY2  AHWXEOF3AV66R44YXMV6OL3GTLWQ     5.0  1480049029000
9719  B0BTC9YJ2W  AHWXEOF3AV66R44YXMV6OL3GTLWQ     4.0  1569245265543

[108 rows x 4 columns]

Low RR User Neighbors Rating History:
         item_id                       user_id  rating    

In [None]:
high_rr_items = set(high_rr_history['item_id'])

low_rr_items = set(low_rr_history['item_id'])

high_rr_neighbor_items = set(high_rr_neighbor_history['item_id'])

low_rr_neighbor_items = set(low_rr_neighbor_history['item_id'])

# Calculate overlap
high_rr_overlap = len(high_rr_items.intersection(high_rr_neighbor_items))
low_rr_overlap = len(low_rr_items.intersection(low_rr_neighbor_items))

print(f"High RR User Overlap with Neighbors: {high_rr_overlap} items")
print(f"Low RR User Overlap with Neighbors: {low_rr_overlap} items")

High RR User Overlap with Neighbors: 18 items
Low RR User Overlap with Neighbors: 6 items


In [None]:
knn_user_rankings_with_ratings = {}

for user_id, non_rated_items in user_non_rated.items():
    predictions = [best_knn.predict(user_id, item_id).est for item_id in non_rated_items]
    
    item_ratings = list(zip(non_rated_items, predictions))
    item_ratings.sort(key=lambda x: x[1], reverse=True) 
    
    knn_user_rankings_with_ratings[user_id] = [(item_id, rating) for item_id, rating in item_ratings]

In [None]:
with open('knn_user_rankings_with_ratings.json', 'w') as fp:
    json.dump(knn_user_rankings_with_ratings, fp)

In [None]:
rr_res.sort_values(by='rr', ascending=False)[0:20]

Unnamed: 0,0,rr
422,AHUPJSMPURGM4QXXHW7VZLHZHITQ,1.0
415,AHSCA5GKZMVKAYKINVSXB6E3XMTQ,1.0
85,AERWQO4KG65FDWZTSAU44MKDCWFA,1.0
120,AF2MGAQBAT3E4XQC7NNAQFYG4MIQ,1.0
119,AF2GPIP26H7E5EHDWC6P7RKZ3OUQ,1.0
372,AHGDGGMCSMMALHTX6WGLJFFUVJXA,0.5
290,AGOJCISATDQUG3HCXSIFAPT7JMOA,0.5
28,AEBD7F5ITOHM4VSB4V7L2G35YBSQ,0.333333
27,AEATZO22KZMV7ECXWBK5KSURDZAA,0.333333
262,AGFP4L46THLV4ZT2OKLNYP7FZ3LQ,0.333333


In [None]:
low_rrs = rr_res[rr_res['rr'] <= 0.05]
mid_rrs = rr_res[(rr_res['rr'] > 0.05) & (rr_res['rr'] <= 1.00001/3)]
high_rrs = rr_res[rr_res['rr'] >= 0.5]

In [None]:
means = train[['user_id', 'rating']].groupby('user_id').mean()
stds = train[['user_id', 'rating']].groupby('user_id').std()
count = train[['user_id', 'item_id']].groupby('user_id').count()

In [None]:
low_rrs[low_rrs.columns[0]]

0      AE23LDQTB7L76AP6E6WPBFVYL5DA
1      AE2BV2H57ERXAPW7SOAXFLWA2S2Q
2      AE2NWSTL7JJJWOCBKZCZF6KDQIZQ
3      AE2OQ55HLV5XO54DWLE4PB5XUNPA
4      AE37RAW77LNOTEDKMDKGXSGQHD5Q
                   ...             
432    AHXI5OYTSU227DFONJML5ZLE4MZQ
433    AHXN24SQSFO4TJAXQSERXXGLO5FA
434    AHYAO24VSRJF47XSA3XNLHHDE4VA
435    AHYQKMLIXHKFDHC7K2Y4JXM4YLOQ
436    AHZER36Y6IL7USHGBCELVMN56VOQ
Name: 0, Length: 378, dtype: object

In [None]:
def compute_stats(rrs, name : str, k: int = 20):
    users = rrs[rrs.columns[0]]
    rrs_means = means[means.index.isin(rrs[rrs.columns[0]])]
    rrs_std = stds[stds.index.isin(rrs[rrs.columns[0]])]
    rrs_count = count[count.index.isin(rrs[rrs.columns[0]])]
    print(f'{name} Stats:')
    print(f'Amount of users: {len(users)}')
    print(f'Distribution: N({round(rrs_means.mean()[0], 2)}, {round(rrs_std.mean()[0], 2)})')
    print(f'Average number of items rated: {round(rrs_count.mean(), 2)}')
    arbitrarity = []
    for user in users:
        items_n_ratings = knn_user_rankings_with_ratings[user][:k]
        z = list(zip(*items_n_ratings))[1]
        if z[0] == z[-1]:
            arbitrarity.append(1)
            continue
        arbitrarity.append(0)
    rrs_arbitrarity = sum(arbitrarity) / len(arbitrarity)
    print(f'Average Arbitrarity: {round(rrs_arbitrarity, 2)}')
    return [rrs_means, rrs_std, rrs_count]


low_rr_stats = compute_stats(low_rrs, 'Low RRs')
mid_rr_stats = compute_stats(mid_rrs, 'Mid RRs')
high_rr_stats = compute_stats(high_rrs, 'High RRs')

Low RRs Stats:
Amount of users: 378
Distribution: N(4.5, 0.72)
Average number of items rated: item_id    10.71
dtype: float64
Average Arbitrarity: 0.68
Mid RRs Stats:
Amount of users: 52
Distribution: N(4.59, 0.65)
Average number of items rated: item_id    11.25
dtype: float64
Average Arbitrarity: 0.71
High RRs Stats:
Amount of users: 7
Distribution: N(4.37, 0.84)
Average number of items rated: item_id    10.57
dtype: float64
Average Arbitrarity: 0.29


  print(f'Distribution: N({round(rrs_means.mean()[0], 2)}, {round(rrs_std.mean()[0], 2)})')
