In [65]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [66]:
# Load the book and movie rating tables from CSV, then print the number of
# distinct values found in every column of each table.
book_path, movie_path = '../data/book_score.csv', '../data/movie_score.csv'
book_df, movie_df = (
    pd.read_csv(csv_path, encoding='utf-8') for csv_path in (book_path, movie_path)
)
print(book_df.nunique(), movie_df.nunique(), sep='\n')

User      4419
Book      1200
Rate         6
Time    634776
Tag      59960
dtype: int64
User       1023
Movie      1200
Rate          6
Time     712198
Tag      110708
dtype: int64


In [67]:
# Show how many book rows carry each distinct Rate value.
print(book_df.loc[:, "Rate"].value_counts())

Rate
0    233447
4    165052
5    131574
3     92447
2     11296
1      3438
Name: count, dtype: int64


In [68]:
# Give both tables a shared 'Item' column name so later cells can treat them
# uniformly, then keep only the rows whose Rate is not 0.
book_df = book_df.rename(columns={'Book': 'Item'})
movie_df = movie_df.rename(columns={'Movie': 'Item'})

book_df = book_df.loc[book_df["Rate"] != 0]
movie_df = movie_df.loc[movie_df["Rate"] != 0]

# Rate distribution of the movie table after the filter.
print(movie_df["Rate"].value_counts())

Rate
4    203661
3    162926
5    115492
2     32510
1      9059
Name: count, dtype: int64


In [69]:
from sklearn.model_selection import train_test_split
# Work on the movie ratings; hold out half of the rows as the test set.
# The fixed random_state keeps the split reproducible across runs.
data = movie_df
train_data, test_data = train_test_split(
    data,
    random_state=42,
    test_size=0.5,
)

In [70]:
# Item x User table of training ratings (NaN where a user has no rating for
# an item), followed by per-item mean-centring of every row.
matrix = train_data.pivot_table(values='Rate', index='Item', columns='User')
item_mean = matrix.mean(axis='columns')
matrix = matrix.sub(item_mean, axis='index')
matrix.head()

User,1000068,1000174,1000263,1000905,1001093,1002057,1002987,1003080,1004613,1005928,...,36855984,37829070,39742649,42444500,43828497,44605628,45987769,46788722,48190738,48360271
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1291543,,,,0.247678,-0.752322,,,,-0.752322,,...,,,,,,,-2.752322,,,
1291544,,,,,,,,,0.365462,,...,,,,,,,-1.634538,,,
1291545,,,,,-0.395095,,,0.604905,,,...,,,,0.604905,,,,,,
1291546,,,,,,-0.648294,-0.648294,,0.351706,0.351706,...,0.351706,,,0.351706,,,,,,
1291548,,,,,-0.295455,-0.295455,,-0.295455,-0.295455,-0.295455,...,,,,0.704545,,,-0.295455,,,


In [71]:
# Pairwise Pearson correlation between items: transpose so that items become
# the columns, because DataFrame.corr correlates columns with each other.
item_similarity = matrix.transpose().corr(method="pearson")
item_similarity.head()

Item,1291543,1291544,1291545,1291546,1291548,1291549,1291550,1291552,1291554,1291555,...,4312232,4323732,4707230,4709904,4739952,4822848,4888039,4896263,5250583,5344178
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1291543,1.0,0.342478,0.190855,0.331996,0.10994,0.186855,0.259444,0.2707,-0.116035,0.453386,...,0.199938,-0.158963,-0.302364,2.137725e-17,0.004846,0.105963,0.074092,0.100066,0.197477,0.209272
1291544,0.342478,1.0,0.131949,0.154845,0.030472,0.267641,0.219454,0.3889,-0.168655,0.291039,...,-0.016912,-0.207324,0.033488,0.286972,0.336263,0.169495,0.153911,-0.140784,0.01309,-0.026704
1291545,0.190855,0.131949,1.0,0.264158,0.148068,0.333573,0.255336,0.301223,0.391298,0.192722,...,0.404994,0.039954,0.085026,-0.06352708,0.10737,0.092967,-0.083654,0.105282,0.119083,0.050235
1291546,0.331996,0.154845,0.264158,1.0,0.27393,0.265229,0.304796,0.1994,0.054591,0.133667,...,0.369875,0.114708,-0.003904,0.2178819,0.221861,0.186354,-0.102062,0.339295,-0.015287,0.158824
1291548,0.10994,0.030472,0.148068,0.27393,1.0,0.326312,0.154428,0.21773,0.156771,-0.007806,...,0.169435,0.141527,0.131188,0.06154575,0.112199,-0.191954,0.177174,0.440699,-0.167748,0.249803


In [72]:
# Use the item-item similarity matrix to predict a user's rating of an item.
# Cache of similar-item Series keyed by (item, similarity_threshold), so calls
# with different thresholds never reuse each other's neighbour lists.
similar_items_storage = dict()
def predict(user, item, similarity_threshold = 0.5, similar_items_num = 30):
    """Predict the rating `user` would give `item`, on the original rating scale.

    Reads the module-level `matrix` (mean-centred Item x User ratings),
    `item_mean` (per-item mean rating) and `item_similarity` (Item x Item
    correlations).

    Args:
        user: Column label of the user in `matrix`.
        item: Index label of the item in `matrix`.
        similarity_threshold: Only items whose similarity to `item` is strictly
            greater than this value are used as neighbours.
        similar_items_num: Maximum number of neighbours (most similar first)
            that contribute to the prediction.

    Returns:
        A float rating:
        * the mean of all item means when `item` is unknown;
        * the item's mean when `user` is unknown or has rated no neighbour;
        * the stored rating (de-centred) when the user already rated the item;
        * otherwise the item's mean plus the average centred rating the user
          gave to the selected neighbours.
    """
    if item not in matrix.index:
        # Nothing is known about the item: fall back to the global average.
        return item_mean.mean()

    if user not in matrix.columns:
        # Unknown user but known item: the item's own mean is the best guess,
        # matching the fallback used below when no neighbour was rated.
        return item_mean.loc[item]

    item_rating = matrix.loc[item, user]
    if not np.isnan(item_rating):
        # `matrix` holds mean-centred values, so add the item mean back to
        # return a rating on the same scale as every other branch.
        return item_rating + item_mean.loc[item]

    cache_key = (item, similarity_threshold)
    similar_items = similar_items_storage.get(cache_key)
    if similar_items is None:
        similar_items = item_similarity[item].dropna()
        similar_items = similar_items[similar_items > similarity_threshold]
        similar_items = similar_items.sort_values(ascending=False)
        similar_items_storage[cache_key] = similar_items

    user_rating = matrix[user].dropna()

    # Only neighbours the user has rated can contribute. `similar_items` is
    # already sorted by descending similarity, so filtering it keeps that order.
    neighbours = similar_items.index[similar_items.index.isin(user_rating.index)]
    neighbours = neighbours[:similar_items_num]

    if len(neighbours) == 0:
        return item_mean.loc[item]
    return user_rating.loc[neighbours].mean() + item_mean.loc[item]

In [73]:
from sklearn.metrics import ndcg_score
def compute_ndcg(group, k = 50):
    """Compute NDCG@k for one user's rows of true and predicted ratings.

    Args:
        group: DataFrame with 'true_rating' and 'predict_rating' columns,
            typically one group of a `groupby('user')`.
        k: Rank cutoff forwarded to `ndcg_score`.

    Returns:
        The NDCG score as a float, or NaN when the group has fewer than two
        rows. `ndcg_score` raises for a single document, and NaN lets
        `Series.mean()` skip such users instead of aborting the whole apply.
    """
    true_ratings = group['true_rating'].tolist()
    pred_ratings = group['predict_rating'].tolist()
    if len(true_ratings) < 2:
        return float('nan')
    return ndcg_score([true_ratings], [pred_ratings], k = k)

In [74]:
from tqdm.notebook import tqdm
# Predict a rating for every held-out (user, item) pair.
# Iterating over the two needed columns by name (instead of unpacking whole
# rows from iterrows) does not depend on the frame's column count or order,
# and passing `total` lets tqdm render real progress instead of "0it".
predict_rating = [
    predict(user, item, similarity_threshold=0.1, similar_items_num=200)
    for user, item in tqdm(zip(test_data['User'], test_data['Item']), total=len(test_data))
]

# `predict_rating` is a plain list, so it is aligned positionally with the
# test rows, in the same order they were iterated above.
results_df = pd.DataFrame({'user': test_data['User'], 'true_rating': test_data['Rate'], 'predict_rating': predict_rating})


0it [00:00, ?it/s]

In [75]:
# The TA's version was simply
#     results_df.groupby('user').apply(compute_ndcg)
# but computing NDCG for a user with only one row raises an error, so users
# with a single test row are filtered out before scoring.
ndcg_scores = (
    results_df
    .loc[lambda df: df.groupby('user')['user'].transform('count') > 1]
    .groupby('user')
    .apply(compute_ndcg)
)

avg_ndcg = ndcg_scores.mean()
print(f"Average NDCG score: {avg_ndcg}")

# Order each user's rows from highest to lowest predicted rating, then save.
results_df = results_df.sort_values(by=['user', 'predict_rating'], ascending=[True, False])
results_df.to_csv('../data/movie_item_based.csv', index=False, encoding='utf-8')

Average NDCG score: 0.9194036798029388
