In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Locations of the raw rating tables, relative to the notebook's directory.
book_path = '../data/book_score.csv'
movie_path = '../data/movie_score.csv'
book_df = pd.read_csv(book_path, encoding='utf-8')
movie_df = pd.read_csv(movie_path, encoding='utf-8')
# Number of distinct values per column, as a quick sanity check of both tables.
# (The former mid-cell `book_df.head()` was removed: a bare expression that is
# not the last statement of a cell is evaluated and discarded, so it showed
# nothing.)
print(book_df.nunique())
print(movie_df.nunique())

User      4419
Book      1200
Rate         6
Time    634776
Tag      59960
dtype: int64
User       1023
Movie      1200
Rate          6
Time     712198
Tag      110708
dtype: int64


In [3]:
# How often each rating value occurs among the book ratings.
# Optional check of the zero-rated movie rows:
#   print(movie_df[movie_df["Rate"] == 0].nunique())
book_rate_counts = book_df["Rate"].value_counts()
print(book_rate_counts)

Rate
0    233447
4    165052
5    131574
3     92447
2     11296
1      3438
Name: count, dtype: int64


In [4]:
# Give both tables the same schema so the rest of the notebook can treat books
# and movies uniformly. The frames are rebound rather than mutated with
# inplace=True, so the objects produced by the loading cell are never changed
# behind the reader's back.
book_df = book_df.rename(columns={'Book': 'Item'})
movie_df = movie_df.rename(columns={'Movie': 'Item'})

# Discard rows whose Rate is 0; only ratings 1-5 remain (see the counts printed
# below). The original row index is kept, exactly as with drop().
book_df = book_df[book_df["Rate"] != 0]
movie_df = movie_df[movie_df["Rate"] != 0]

print(movie_df["Rate"].value_counts())

Rate
4    203661
3    162926
5    115492
2     32510
1      9059
Name: count, dtype: int64


In [5]:
from sklearn.model_selection import train_test_split
# Evaluate on the book ratings: half of the rows are held out for testing, and
# the fixed random_state makes the split repeatable.
data = book_df
train_data, test_data = train_test_split(
    data,
    test_size=0.5,
    random_state=42,
)

In [13]:
# Item x user table of training ratings; cells without a rating are NaN
# (pivot_table averages duplicate (Item, User) pairs by default).
matrix = train_data.pivot_table(index='Item', columns='User', values='Rate')
# Average rating of every item, kept so predictions can be shifted back onto
# the rating scale later.
item_mean = matrix.mean(axis=1)
# Centre every item's row on that item's own mean rating.
matrix = matrix.sub(item_mean, axis="index")
matrix.head()
# To inspect the per-item means instead: item_mean.head()

User,1000068,1000147,1000152,1000153,1000166,1000182,1000288,1000375,1000430,1000553,...,45337884,46178728,46419900,46876853,47047302,47271690,47489193,47513621,48486558,48943819
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000280,,,0.895349,0.895349,,,,,,,...,,,,,,,,,,
1000323,,,,,,,,,,,...,,,,,,,,,,
1000445,,,,0.588235,,,,,,,...,,,,,,,,,,
1000482,,,,,,,,,,,...,,,,,,,,,,
1000517,,,,,,,,,,,...,,,,,,,,,,


In [32]:
# Pearson correlation between every pair of items. After transposing, each
# column is one item and each row one user; corr() only uses the users who
# rated both items of a pair.
# NOTE(review): no min_periods is passed, so a pair with just two co-rating
# users still gets a correlation (necessarily +1 or -1) -- confirm that such
# low-support similarities are acceptable.
item_user_centered = matrix.transpose()
item_similarity = item_user_centered.corr(method="pearson")
item_similarity.head()

Item,1000280,1000323,1000445,1000482,1000517,1000534,1000594,1000856,1001136,1001193,...,4872671,4885241,4886245,5275059,5289756,5292912,5317075,5363767,5366275,5385852
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000280,1.0,-0.428571,,-0.452911,0.158114,,-1.0,0.4082483,,,...,-0.645497,-3.3821380000000002e-18,0.401423,-0.408248,-0.333333,,-0.852803,0.320256,,0.904534
1000323,-0.428571,1.0,0.685994,0.447214,-0.408248,,-0.114551,-0.5676135,0.557086,0.883452,...,,0.1829813,0.210649,0.347908,,,,0.0,-0.904534,0.342084
1000445,,0.685994,1.0,0.136931,0.091287,-0.29277,0.720577,0.4225771,0.058747,0.375,...,0.342997,0.2581989,0.064889,0.1,,,0.57735,0.34641,-0.395285,0.408248
1000482,-0.452911,0.447214,0.136931,1.0,0.09759,1.0,-0.522233,-2.266233e-17,0.162459,0.800641,...,0.218218,0.6859943,-0.013284,0.694136,0.656532,,0.051988,-0.248452,,-0.148522
1000517,0.158114,-0.408248,0.091287,0.09759,1.0,0.333333,-0.333333,-0.1849001,-0.158507,0.534522,...,0.707107,0.1587768,-0.049922,0.3,0.5,,0.258199,,,0.0


In [75]:
# use the similarity matrix to predict the item's rating
# Cache of filtered, sorted neighbour lists. The key is
# (item, similarity_threshold): the stored list depends on the threshold, so
# keying by item alone would hand back a stale list after the threshold changes.
similar_items_storage = dict()


def predict(user, item, similarity_threshold = 0.5, similar_items_num = 30):
    """Predict the rating `user` gives `item` with item-based collaborative filtering.

    Reads the module-level `matrix` (item x user ratings, centred on each
    item's mean), `item_mean` (per-item mean rating) and `item_similarity`
    (item x item similarity), and fills `similar_items_storage` as a cache.

    Args:
        user: Column label of the user in `matrix`.
        item: Row label of the item in `matrix`.
        similarity_threshold: Only items whose similarity to `item` is
            strictly greater than this value are used as neighbours.
        similar_items_num: Maximum number of neighbours (most similar first)
            that the user has rated and that enter the prediction.

    Returns:
        The predicted rating on the original rating scale:
        * the mean of all item means when the item or the user is unknown;
        * the stored rating (centred value plus item mean) when `matrix`
          already holds a rating for this pair;
        * the item's mean when the user rated none of the neighbours;
        * otherwise the item's mean plus the unweighted average of the user's
          centred ratings on the selected neighbours.
    """
    # Cold start: nothing is known about the item or about the user.
    if item not in matrix.index or user not in matrix.columns:
        return item_mean.mean()

    centered_rating = matrix.loc[item, user]
    if not np.isnan(centered_rating):
        # `matrix` stores deviations from the item mean, so the mean has to be
        # added back to obtain a value on the rating scale.
        return centered_rating + item_mean[item]

    # Centred ratings of everything this user rated.
    user_rating = matrix[user].dropna()

    cache_key = (item, similarity_threshold)
    similar_items = similar_items_storage.get(cache_key)
    if similar_items is None:
        similar_items = item_similarity[item].dropna()
        similar_items = similar_items[similar_items > similarity_threshold]
        similar_items = similar_items.sort_values(ascending=False)
        similar_items_storage[cache_key] = similar_items

    # Only consider similar items which have been rated by this user.
    # `similar_items` is already sorted by decreasing similarity, so filtering
    # its index keeps that order and the slice takes the best neighbours.
    rated_mask = similar_items.index.isin(user_rating.index)
    common_items = similar_items.index[rated_mask][:similar_items_num]

    if len(common_items) == 0:
        return item_mean[item]
    return user_rating.loc[common_items].mean() + item_mean[item]

In [44]:
from sklearn.metrics import ndcg_score
def compute_ndcg(group):
    """Return NDCG@50 for one group of rows.

    `group` must provide the columns 'true_rating' (used as the relevance
    values) and 'predict_rating' (used as the ranking scores). Both are passed
    to `ndcg_score` as a single query, i.e. wrapped in a one-element list.
    """
    relevance = [group['true_rating'].tolist()]
    scores = [group['predict_rating'].tolist()]
    return ndcg_score(y_true=relevance, y_score=scores, k=50)

In [76]:
from tqdm.notebook import tqdm
# Predict a rating for every held-out (user, item) pair. Only the User and Item
# columns are needed, so they are iterated directly instead of unpacking whole
# rows from iterrows(), which breaks as soon as test_data has a different
# number or order of columns. Passing `total` lets tqdm show real progress.
predict_rating = [
    predict(user, item, similarity_threshold=0.1, similar_items_num=200)
    for user, item in tqdm(
        zip(test_data['User'], test_data['Item']),
        total=len(test_data),
    )
]

# One row per test rating; the Series keep test_data's index and the list of
# predictions is aligned with it by position.
results_df = pd.DataFrame({'user': test_data['User'], 'true_rating': test_data['Rate'], 'predict_rating': predict_rating})


0it [00:00, ?it/s]

In [77]:
# This is how the TA wrote it, which is rather curious, because computing NDCG
# for a user with only one item raises an error:
# ndcg_scores = results_df.groupby('user').apply(compute_ndcg)
# So keep only the users with more than one test rating before scoring.
ratings_per_user = results_df.groupby('user')['user'].transform('count')
rankable_results = results_df[ratings_per_user > 1]
ndcg_scores = rankable_results.groupby('user').apply(compute_ndcg)

avg_ndcg = ndcg_scores.mean()
print(f"Average NDCG score: {avg_ndcg}")

Average NDCG score: 0.9594938836309863
