In [12]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [13]:
book_path = '../data/book_score.csv'
movie_path = '../data/movie_score.csv'
book_df = pd.read_csv(book_path, encoding='utf-8')
movie_df = pd.read_csv(movie_path, encoding='utf-8')
book_df.head()
print(book_df.nunique())
print(movie_df.nunique())

User      4419
Book      1200
Rate         6
Time    634776
Tag      59960
dtype: int64
User       1023
Movie      1200
Rate          6
Time     712198
Tag      110708
dtype: int64


In [14]:
# print(movie_df[movie_df["Rate"] == 0].nunique())
print(book_df["Rate"].value_counts())

Rate
0    233447
4    165052
5    131574
3     92447
2     11296
1      3438
Name: count, dtype: int64


In [15]:
book_df.rename(columns={'Book': 'Item'}, inplace=True)
movie_df.rename(columns={'Movie': 'Item'}, inplace=True)

book_df.drop(book_df[book_df["Rate"] == 0].index, inplace=True)
movie_df.drop(movie_df[movie_df["Rate"] == 0].index, inplace=True)

print(movie_df["Rate"].value_counts())

Rate
4    203661
3    162926
5    115492
2     32510
1      9059
Name: count, dtype: int64


In [16]:
from sklearn.model_selection import train_test_split
data = book_df
train_data, test_data = train_test_split(data, test_size=0.5, random_state=42)

In [17]:
matrix = train_data.pivot_table(index='User', columns='Item', values='Rate')
user_mean = matrix.mean(axis=1)
matrix = matrix.subtract(matrix.mean(axis=1), axis = "rows")
user_mean.head()

User
1000068    2.600000
1000147    4.129032
1000152    3.922078
1000153    4.200000
1000166    3.654545
dtype: float64

In [18]:
user_similarity = matrix.T.corr(method="pearson")
user_similarity.head()

User,1000068,1000147,1000152,1000153,1000166,1000182,1000288,1000375,1000430,1000553,...,45337884,46178728,46419900,46876853,47047302,47271690,47489193,47513621,48486558,48943819
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000068,1.0,,,,,,,,,,...,,,,,,,,,,
1000147,,1.0,,,0.218218,0.333333,,,0.5,,...,,,0.0,,,,,1.0,,-0.5
1000152,,,1.0,0.3,-0.189389,0.449089,0.0,1.0,-1.0,1.0,...,0.408248,1.0,0.75,,,,,,0.678834,0.1066
1000153,,,0.3,1.0,,0.06742,-0.447214,,,,...,0.316228,0.755929,,,,,,,,0.186551
1000166,,0.218218,-0.189389,,1.0,0.49099,0.166667,1.0,,,...,0.0,-0.231455,-0.25,,-0.707107,,,1.0,,0.868243


In [19]:
# use the similarity matrix to predict the user's rating for an item
similar_users_storage = dict()
def predict(user, item, similarity_threshold = 0.5, similar_users_num = 30):
    if(user not in matrix.index):
        return user_mean.mean()
        
    user_rating = matrix.loc[user, item]
    if np.isnan(user_rating):

        item_rating = matrix[item]
        item_rating = item_rating.dropna()

        if(similar_users_storage.get(user) is None):
            similar_users = user_similarity[user].dropna()
            similar_users = similar_users[similar_users > similarity_threshold]
            similar_users = similar_users.sort_values(ascending=False)
            similar_users_storage[user] = similar_users
        else:
            similar_users = similar_users_storage[user]
        
        # Only consider similar users who have rated the item
        common_users = list(set(similar_users.index).intersection(set(item_rating.index)))
        common_users.sort(key = lambda x: similar_users[x], reverse=True)
        
        common_users = common_users[:similar_users_num if len(common_users) > similar_users_num else len(common_users)]
        item_rating = item_rating.loc[common_users]

        if len(item_rating) == 0:
            return user_mean[user]
        else:
            return item_rating.mean() + user_mean[user]
    else:
        return user_rating

In [20]:
from sklearn.metrics import ndcg_score
def compute_ndcg(group):
    true_ratings = group['true_rating'].tolist()
    pred_ratings = group['predict_rating'].tolist()
    return ndcg_score([true_ratings], [pred_ratings], k = 50)

In [23]:
from tqdm.notebook import tqdm
predict_rating = []
for idx, (user, item, true_rating, time, tag) in tqdm(test_data.iterrows()):
    # print(user, item, true_rating, time, tag)
    predict_rating.append(predict(user, item, similarity_threshold=0.4, similar_users_num=50))
    # print(true_rating, predict_rating)

results_df = pd.DataFrame({'user': test_data['User'], 'true_rating': test_data['Rate'], 'predict_rating': predict_rating})

print(results_df.head())


0it [00:00, ?it/s]

ValueError: array length 201903 does not match index length 201904

In [22]:
# 助教是这么写的，就很神奇, 但是这样对只有一个项目求ndcg会报错
# ndcg_scores = results_df.groupby('user').apply(compute_ndcg)
ndcg_scores = results_df[results_df.groupby('user')['user'].transform('count') > 1].groupby('user').apply(compute_ndcg)

avg_ndcg = ndcg_scores.mean()
print(f"Average NDCG score: {avg_ndcg}")

Average NDCG score: 0.9595920774270024
