In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report

In [2]:
# Number of most-similar users passed to predict_rating_by_top_k for each
# test-set prediction.
k = 5
# Predictions whose absolute error is strictly below this threshold are
# counted as correct when the accuracy figure is computed.
max_err = 1.5

In [3]:
# Load the cached user-item rating matrix if a usable pickle exists; otherwise
# rebuild it from the raw training split and write the cache.
# NOTE(security): pickle.load can execute arbitrary code, so this cache file
# must only ever be one produced by this notebook, never an untrusted file.
try:
    with open('./Models/ui_matrix_ua.pkl', 'rb') as cache_file:
        ui_matrix = pickle.load(cache_file)
except Exception:
    # Missing or unreadable cache: rebuild from the ratings file. Catching
    # `Exception` instead of using a bare `except` keeps the best-effort
    # fallback but no longer swallows KeyboardInterrupt / SystemExit.
    df = pd.read_csv('./ml-100k/ua.base', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
    # One row per user_id, one column per item_id; missing ratings become 0.
    ui_matrix = df.pivot(index='user_id', columns='item_id', values='rating').fillna(0)
    # Write the cache only when it was actually rebuilt, and close the file
    # deterministically so the pickle is fully flushed to disk.
    with open('./Models/ui_matrix_ua.pkl', 'wb') as cache_file:
        pickle.dump(ui_matrix, cache_file)

In [4]:
# Load the cached user-user cosine similarity matrix if a usable pickle
# exists; otherwise compute it from ui_matrix and write the cache.
# NOTE(security): pickle.load can execute arbitrary code, so this cache file
# must only ever be one produced by this notebook, never an untrusted file.
# NOTE(review): a cached matrix is not checked against the current ui_matrix;
# delete the pickle after changing the training data.
try:
    with open('./Models/user_similarity_ua.pkl', 'rb') as cache_file:
        user_similarity = pickle.load(cache_file)
except Exception:
    # Missing or unreadable cache: recompute. Catching `Exception` instead of
    # using a bare `except` no longer swallows KeyboardInterrupt / SystemExit.
    user_similarity = pd.DataFrame(
        cosine_similarity(ui_matrix),
        index=ui_matrix.index,
        columns=ui_matrix.index,
    )
    # Write the cache only when it was actually recomputed, and close the
    # file deterministically so the pickle is fully flushed to disk.
    with open('./Models/user_similarity_ua.pkl', 'wb') as cache_file:
        pickle.dump(user_similarity, cache_file)

In [5]:
def users_who_rated_for_item(ui_matrix: pd.DataFrame, i):
    """Return the users that have a non-zero rating for item ``i``.

    Args:
        ui_matrix: User-item matrix whose row labels are user ids and whose
            columns are items; a value of 0 is treated as "not rated".
        i: Column label of the item.

    Returns:
        DataFrame with a ``user_id`` column (the matching row labels of
        ``ui_matrix``) and a ``rating`` column, indexed by those row labels.
    """
    item_column = ui_matrix[i]
    given = item_column[item_column != 0]
    return pd.DataFrame({'user_id': given.index, 'rating': given})

def similarity_list(user_similarity, ui_matrix, u, i):
    """Look up how similar user ``u`` is to each user who rated item ``i``.

    Returns the entries of row ``u`` of ``user_similarity`` selected by the
    ids of the users that have a non-zero rating for ``i`` in ``ui_matrix``.
    """
    rater_ids = users_who_rated_for_item(ui_matrix, i)['user_id']
    similarity_to_raters = user_similarity.loc[u, rater_ids]
    return similarity_to_raters

def predict_rating_by_top_k(user_similarity, ui_matrix, u, i, k) -> float:
    """Predict the rating of user ``u`` for item ``i`` from the top ``k`` similar raters.

    Args:
        user_similarity: User-by-user similarity table, addressed by user id
            on both axes.
        ui_matrix: User-item matrix (row labels are user ids, columns are
            items) in which 0 means "not rated".
        u: User id.
        i: Item id.
        k: Number of most-similar raters of ``i`` to combine.

    Returns:
        * ``-1`` when ``u`` or ``i`` is not present in ``ui_matrix``, or when
          no prediction can be formed (nobody rated ``i``, or the selected
          neighbours all have zero similarity to ``u``);
        * the stored rating when ``u`` has already rated ``i``;
        * otherwise the similarity-weighted mean of the neighbours' ratings.
    """
    if u not in ui_matrix.index or i not in ui_matrix.columns:
        return -1
    known_rating = ui_matrix.loc[u, i]
    if known_rating != 0:
        return known_rating

    neighbours = users_who_rated_for_item(ui_matrix, i)
    if neighbours.empty:
        return -1
    # Reuse the rater list already computed above instead of filtering the
    # matrix a second time. Column assignment aligns on the index (user ids).
    neighbours['similarity'] = user_similarity.loc[u, neighbours['user_id']]
    top = neighbours.sort_values(by='similarity', ascending=False).iloc[:k, :]

    weight_total = top['similarity'].sum()
    if weight_total == 0:
        # Without this guard the weighted mean would be 0/0 = NaN, which
        # breaks callers that round the prediction; use the same "cannot
        # predict" sentinel as for unknown users/items.
        return -1
    return (top['similarity'] * top['rating']).sum() / weight_total

def recommend_items(user_similarity, ui_matrix, u, k):
    """Recommend up to ``k`` items that user ``u`` has not rated yet.

    ``k`` is used twice: as the number of similar users behind each predicted
    rating and as the maximum number of items returned.

    Args:
        user_similarity: User-by-user similarity table.
        ui_matrix: User-item matrix in which 0 means "not rated".
        u: User id; must be a row label of ``ui_matrix``.
        k: Integer with ``1 <= k < ui_matrix.shape[0]``.

    Returns:
        DataFrame with ``item_id`` and ``rating`` (predicted) columns, sorted
        by predicted rating in descending order, at most ``k`` rows.

    Raises:
        AssertionError: if ``u`` is unknown or ``k`` is out of range.
    """
    assert u in ui_matrix.index and k in range(1, ui_matrix.shape[0])
    pred_rate = [predict_rating_by_top_k(user_similarity, ui_matrix, u, i, k) for i in ui_matrix.columns]
    ans = pd.DataFrame({'item_id': ui_matrix.columns, 'rating': pred_rate})
    # predict_rating_by_top_k returns the stored rating for items u has
    # already rated, so without this filter the result would mostly be items
    # the user has already seen rather than new recommendations.
    not_yet_rated = (ui_matrix.loc[u] == 0).to_numpy()
    ans = ans[not_yet_rated]
    ans = ans.sort_values(by='rating', ascending=False)
    return ans.iloc[:k, :]

In [6]:
# Score the held-out split: predict every (user, item) pair in the test file
# and record both the raw and the rounded prediction next to the true rating.
test_df = pd.read_csv(
    './ml-100k/ua.test',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)
test_df['pred_rating'] = [
    predict_rating_by_top_k(user_similarity, ui_matrix, user_id, item_id, k)
    for user_id, item_id in zip(test_df['user_id'], test_df['item_id'])
]
test_df['pred_rating_rounded'] = test_df['pred_rating'].map(round)
# Signed error: positive means the model over-predicted.
test_df['err'] = test_df['pred_rating'] - test_df['rating']

In [7]:
# "Accuracy" here is the share of test predictions whose absolute error is
# strictly below max_err.
errs = test_df['err'].to_numpy()
good_err = errs[np.abs(errs) < max_err]
acc = good_err.size / test_df.shape[0]
print(f'Accuracy: {acc}')

Accuracy: 0.8444326617179215
