In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Locations of the two rating tables, relative to the notebook.
book_path = '../data/book_score.csv'
movie_path = '../data/movie_score.csv'

# Read both CSV files as UTF-8 text.
book_df, movie_df = (
    pd.read_csv(csv_path, encoding='utf-8') for csv_path in (book_path, movie_path)
)

# Not the cell's last expression, so this preview is evaluated but not displayed.
book_df.head()

# Number of distinct values per column: books first, then movies.
print(book_df.nunique(), movie_df.nunique(), sep='\n')

User      4419
Book      1200
Rate         6
Time    634776
Tag      59960
dtype: int64
User       1023
Movie      1200
Rate          6
Time     712198
Tag      110708
dtype: int64


In [3]:
# Disabled check, kept for reference:
# print(movie_df[movie_df["Rate"] == 0].nunique())

# Number of rows in the book table for each distinct Rate value.
book_rate_counts = book_df["Rate"].value_counts()
print(book_rate_counts)

Rate
0    233447
4    165052
5    131574
3     92447
2     11296
1      3438
Name: count, dtype: int64


In [4]:
# Apply the same two clean-up steps to each table, in place.
for frame, item_column in ((book_df, 'Book'), (movie_df, 'Movie')):
    # Rename the table-specific item column to the shared name 'Item'.
    frame.rename(columns={item_column: 'Item'}, inplace=True)
    # Remove every row whose Rate is 0.
    zero_rate_rows = frame.loc[frame["Rate"] == 0].index
    frame.drop(zero_rate_rows, inplace=True)

# Remaining distribution of movie rates after the zero-rate rows are gone.
print(movie_df["Rate"].value_counts())

Rate
4    203661
3    162926
5    115492
2     32510
1      9059
Name: count, dtype: int64


In [5]:
from sklearn.model_selection import train_test_split
# Work on the movie table; half of its rows are held out as the test set.
# The fixed random_state makes the split repeatable across runs.
data = movie_df
train_data, test_data = train_test_split(
    data,
    test_size=0.5,
    random_state=19260817,
)

In [6]:
# User x Item table of training rates (pivot_table averages duplicate pairs).
matrix = train_data.pivot_table(values='Rate', index='User', columns='Item')
# Each user's mean rate over the items they rated (NaN cells are skipped).
user_mean = matrix.mean(axis=1)
# Centre every row on its own user's mean.
matrix = matrix.sub(user_mean, axis=0)
user_mean.head()

User
1000068    3.157895
1000174    3.324324
1000263    2.000000
1000905    4.206751
1001093    3.709571
dtype: float64

In [7]:
# Transposing puts users in the columns, so corr() yields a User x User table of
# pairwise Pearson correlations.
user_similarity = matrix.transpose().corr(method="pearson")
user_similarity.head()

User,1000068,1000174,1000263,1000905,1001093,1002057,1002987,1003080,1004613,1005928,...,36855984,37829070,39742649,42444500,43828497,44605628,45987769,46788722,48190738,48360271
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000068,1.0,0.801784,,0.586302,0.229416,0.353553,0.942809,0.811666,0.976187,0.528694,...,0.443203,,,0.21693,,0.800132,0.343522,,,
1000174,0.801784,1.0,,0.251527,0.45986,0.365693,0.122703,0.427518,0.46867,0.488585,...,0.610025,0.517855,0.052414,0.540931,0.152075,0.442632,0.224569,,,
1000263,,,,,,,,,,,...,,,,,,,,,,
1000905,0.586302,0.251527,,1.0,0.297081,0.108776,-0.034857,0.484901,0.358721,0.339228,...,0.234768,0.015103,0.208514,0.201591,0.368928,0.195698,0.31348,,,
1001093,0.229416,0.45986,,0.297081,1.0,0.396934,0.259053,0.28464,0.385643,0.23578,...,0.500564,0.140963,-0.345485,0.11863,0.200944,0.366021,0.468643,,,0.0


In [51]:
# Use the similarity matrix to predict a user's rating for an item.

# Cache of each user's sorted neighbour Series, keyed by (user, similarity_threshold)
# so that calls with different thresholds never share a stale neighbour list.
similar_users_storage = dict()


def predict(user, item, similarity_threshold = 0.5, similar_users_num = 30):
    """Predict the rating `user` would give `item` with user-based collaborative filtering.

    Reads the module-level `matrix` (mean-centred User x Item ratings),
    `user_mean` (per-user mean rating) and `user_similarity` (User x User
    similarity), and memoises neighbour lists in `similar_users_storage`.

    Parameters
    ----------
    user : label looked up in `matrix.index`
    item : label looked up in `matrix.columns`
    similarity_threshold : float, default 0.5
        Only users whose similarity to `user` is strictly greater are neighbours.
    similar_users_num : int, default 30
        Maximum number of neighbours (most similar first) that are averaged.

    Returns
    -------
    float
        A value on the original rating scale:
        * the mean of all user means when `user` is not in `matrix`;
        * the user's own mean when `item` is not in `matrix` or when no
          neighbour has rated it;
        * the user's stored rating (re-centred) when one exists;
        * otherwise the user's mean plus the unweighted mean of the
          neighbours' centred ratings for `item`.
    """
    # Unknown user: fall back to the global average of user means.
    if user not in matrix.index:
        return user_mean.mean()

    baseline = user_mean[user]

    # Unknown item: matrix.loc would raise KeyError, so fall back to the user's mean.
    if item not in matrix.columns:
        return baseline

    user_rating = matrix.loc[user, item]
    if not np.isnan(user_rating):
        # `matrix` stores mean-centred values; shift back to the rating scale so
        # this branch agrees with every other return value.
        return user_rating + baseline

    cache_key = (user, similarity_threshold)
    similar_users = similar_users_storage.get(cache_key)
    if similar_users is None:
        similar_users = user_similarity[user].dropna()
        similar_users = similar_users[similar_users > similarity_threshold]
        similar_users = similar_users.sort_values(ascending=False)
        similar_users_storage[cache_key] = similar_users

    # Centred ratings of everyone who rated this item.
    item_rating = matrix[item].dropna()

    # Keep only neighbours who rated the item. `similar_users` is already sorted
    # by descending similarity and the boolean filter preserves that order.
    rated_mask = similar_users.index.isin(item_rating.index)
    neighbours = similar_users.index[rated_mask][:similar_users_num]

    if len(neighbours) == 0:
        return baseline
    return item_rating.loc[neighbours].mean() + baseline

In [9]:
from sklearn.metrics import ndcg_score
def compute_ndcg(group, k = 50):
    """Return NDCG@k of one group's predicted ratings against its true ratings.

    Parameters
    ----------
    group : table-like with 'true_rating' and 'predict_rating' columns
        Typically one user's rows, as passed by `groupby(...).apply`.
    k : int, default 50
        Rank cut-off forwarded to `ndcg_score`.

    Returns
    -------
    float
        The NDCG@k score, or NaN when the group has fewer than two rows.
        `ndcg_score` rejects single-document input, and NaN lets a later
        `.mean()` skip such groups instead of aborting the whole apply.
    """
    true_ratings = group['true_rating'].tolist()
    pred_ratings = group['predict_rating'].tolist()
    if len(true_ratings) < 2:
        return float("nan")
    return ndcg_score([true_ratings], [pred_ratings], k = k)

In [54]:
from tqdm.notebook import tqdm
# Predict a rating for every (User, Item) pair in the held-out test set.
# Iterating over the two needed columns avoids unpacking whole rows (which
# breaks if the column count changes), and `total` gives tqdm a real progress bar.
predict_rating = [
    predict(user, item, similarity_threshold=0.4, similar_users_num=50)
    for user, item in tqdm(
        zip(test_data['User'], test_data['Item']),
        total=len(test_data),
        desc="predicting",
    )
]

# One row per test rating: who rated, the true rate and the predicted rate.
results_df = pd.DataFrame({'user': test_data['User'], 'true_rating': test_data['Rate'], 'predict_rating': predict_rating})

print(results_df.head())


0it [00:00, ?it/s]

KeyboardInterrupt: 

In [53]:
# The teaching assistant wrote the line below, which is a little surprising:
# computing NDCG for a user with only one item raises an error.
# ndcg_scores = results_df.groupby('user').apply(compute_ndcg)

# Number of test rows per user, broadcast back onto every row.
rows_per_user = results_df.groupby('user')['user'].transform('count')
# Keep only users with more than one row, then score each user separately.
multi_row_results = results_df[rows_per_user > 1]
ndcg_scores = multi_row_results.groupby('user').apply(compute_ndcg)

avg_ndcg = ndcg_scores.mean()
print(f"Average NDCG score: {avg_ndcg}")

Average NDCG score: 0.9148088464886232
