In [2]:
import os
import pandas as pd
import numpy as np
from math import sqrt
from tqdm import tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
path = "./movielens/movielens100k/"
ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), index_col = 'userId', encoding='utf-8')

In [4]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=123)

print(train_df.shape)
print(test_df.shape)

(80003, 3)
(20001, 3)


In [6]:
train_df.head()

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
480,6870,4.0,1272667994
434,590,1.0,886376254
668,1089,3.0,993613415
157,47,3.5,1292893066
152,54997,4.0,1335901908


In [7]:
train_df.groupby('movieId').apply(lambda x: pd.Series(x['rating'].values, index=x.index))

movieId  userId
1        574       4.0
         193       4.0
         72        3.5
         125       4.0
         119       2.0
                  ... 
161944   287       5.0
162376   73        4.5
162542   611       5.0
162672   611       3.0
163949   547       5.0
Length: 80003, dtype: float64

In [10]:
# sparse matrix > pandas grouped index > unstack
sparse_matrix = train_df.groupby('movieId').apply(lambda x: pd.Series(x['rating'].values, index=x.index)).unstack()
sparse_matrix.index.name = 'movieId'
sparse_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,3.0,,4.0,,...,,4.0,3.5,,,,,,4.0,5.0
2,,,,,,,,,,,...,5.0,,,3.0,,,,,,
3,,,,,4.0,,,,,,...,,,,3.0,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,3.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161944,,,,,,,,,,,...,,,,,,,,,,
162376,,,,,,,,,,,...,,,,,,,,,,
162542,,,,,,,,,,,...,,,,,,,,,,
162672,,,,,,,,,,,...,,,,,,,,,,


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_matrix(a,b):
    sim_values = cosine_similarity(a.values, b.values)
    sim_df = pd.DataFrame(data =sim_values, columns= a.index.values, index = a.index)
    return sim_df

### Item-based CF

In [12]:
item_sparse_matrix = sparse_matrix.fillna(0)
item_sparse_matrix.shape

(8410, 671)

In [13]:
item_sparse_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,...,0.0,4.0,3.5,0.0,0.0,0.0,0.0,0.0,4.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
item_cos_sim_df = cos_sim_matrix(item_sparse_matrix, item_sparse_matrix)
item_cos_sim_df.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.314267,0.226255,0.065995,0.190867,0.270079,0.252205,0.0,0.114362,0.227361,...,0.0,0.0,0.0,0.088337,0.088337,0.088337,0.0,0.0,0.0,0.0
2,0.314267,1.0,0.173391,0.091808,0.223967,0.1316,0.183426,0.118513,0.086464,0.311323,...,0.0,0.061444,0.076805,0.092166,0.092166,0.15361,0.076805,0.0,0.0,0.0
3,0.226255,0.173391,1.0,0.128747,0.25448,0.104642,0.234486,0.013296,0.239474,0.148638,...,0.0,0.0,0.0,0.129249,0.129249,0.0,0.0,0.0,0.0,0.0
4,0.065995,0.091808,0.128747,1.0,0.082023,0.032992,0.144593,0.288195,0.262824,0.057517,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.190867,0.223967,0.25448,0.082023,1.0,0.173251,0.262717,0.0,0.281184,0.194304,...,0.0,0.197623,0.0,0.131749,0.131749,0.0,0.0,0.0,0.0,0.0


In [17]:
userid_grouped = train_df.groupby('userId')
userid_grouped.indices.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 22

* Rating prediction in item-based CF 
$$\hat{r_{u,i}} = \frac{\sum_{j \in N(u)}{w_{ij}r_{u,j}}}{\sum_{j \in N(u)}{w_{ij}}}$$
* $\hat{r_{u,i}}$ : 사용자 u 가 평가하지 않은 영화 i에 대한 예측 평점 
* $r_{u,j}$ : 사용자 u가 이미 평가한 영화 j의 평점
* $w_{ij}$ : 영화 i와 j 간의 유사도
* $N(u)$ : 사용자 u 가 이미 평가한 영화들의 집합

In [18]:
item_prediction_result_df = pd.DataFrame(index=list(userid_grouped.indices.keys()), columns=item_sparse_matrix.index)
for userId, group in tqdm(userid_grouped):
    user_sim = item_cos_sim_df.loc[group['movieId']]
    user_rating = group['rating']
    sim_sum = user_sim.sum(axis=0)

    # user movie similarity * user movie rating / (sum(similarity) + 1)
    pred_ratings = np.matmul(user_sim.T.to_numpy(), user_rating) / (sim_sum+1)
    item_prediction_result_df.loc[userId] = pred_ratings

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for userId, group in tqdm(userid_grouped):


  0%|          | 0/671 [00:00<?, ?it/s]

In [19]:
item_prediction_result_df.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
1,1.774265,1.605894,1.456806,1.040275,1.443237,1.753157,1.591257,0.299854,0.972465,1.599577,...,1.697117,1.012484,1.510504,1.305886,1.305886,0.0,1.510504,0.235963,0.235963,1.697117
2,3.277016,3.243874,3.071935,2.951677,3.147534,3.20978,3.165981,2.610137,2.989077,3.297936,...,2.550841,2.465265,2.488719,2.3599,2.3599,0.863266,2.488719,0.274543,0.274543,2.550841
3,3.304641,3.188663,2.922312,2.349455,3.050054,3.239119,2.835745,1.904987,2.192478,3.154871,...,2.350028,2.568041,2.865978,2.633155,2.633155,1.543666,2.865978,0.573818,0.573818,2.350028
4,4.373381,4.288574,4.142773,3.994503,4.202269,4.308976,4.238524,3.439305,3.934052,4.232964,...,4.152524,3.93737,4.018514,3.969519,3.969519,3.390452,4.018514,0.457572,0.457572,4.152524
5,3.767676,3.766881,3.630271,3.079929,3.689588,3.66795,3.637033,2.991935,3.042444,3.714585,...,3.302587,3.259449,3.414095,3.291313,3.291313,2.509057,3.414095,1.155095,1.155095,3.302587
6,3.014853,2.833353,2.410679,2.371273,2.633687,2.933697,2.520089,0.667283,1.616689,2.870783,...,1.999981,2.50795,2.671347,2.239636,2.239636,1.737215,2.671347,0.067561,0.067561,1.999981
7,3.468593,3.289244,3.149949,3.11674,3.121669,3.338067,3.211731,2.170935,2.78416,3.298705,...,2.81325,2.900815,2.883829,3.003913,3.003913,1.796631,2.883829,0.366058,0.366058,2.81325
8,3.828857,3.763175,3.566179,3.167394,3.649,3.753037,3.503639,2.388325,3.000992,3.749295,...,3.239087,3.405767,3.390763,3.292496,3.292496,2.45385,3.390763,0.675223,0.675223,3.239087
9,3.580886,3.370461,3.04515,2.487645,3.247901,3.409057,3.195675,1.559257,2.339545,3.237664,...,2.782988,2.53637,2.378734,2.340404,2.340404,1.608122,2.378734,0.337804,0.337804,2.782988
10,3.454884,3.253317,2.943096,2.306234,3.091387,3.416378,3.028691,1.25308,2.103217,3.187669,...,2.898395,2.799912,2.571264,1.941486,1.941486,1.467945,2.571264,0.337804,0.337804,2.898395


### User-based CF

In [20]:
user_sparse_matrix = sparse_matrix.fillna(0).transpose()
user_sparse_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
user_cos_sim_df = cos_sim_matrix(user_sparse_matrix, user_sparse_matrix)
user_cos_sim_df.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.060029,0.021497,0.0,0.105222,0.0,0.0,0.0,...,0.0,0.0,0.003646,0.027521,0.0,0.0,0.0,0.078556,0.0,0.021933
2,0.0,1.0,0.119698,0.091269,0.095829,0.0,0.190992,0.039032,0.062569,0.022426,...,0.370309,0.046561,0.052544,0.149731,0.310915,0.381603,0.071845,0.007258,0.055991,0.098162
3,0.0,0.119698,1.0,0.059486,0.1299,0.069601,0.155641,0.198341,0.113777,0.09998,...,0.15674,0.072832,0.144362,0.121306,0.133746,0.104041,0.046827,0.072373,0.11099,0.187535
4,0.060029,0.091269,0.059486,1.0,0.106521,0.047956,0.223622,0.103946,0.039112,0.085982,...,0.103122,0.036381,0.09904,0.202593,0.093289,0.064506,0.053227,0.096178,0.074345,0.105383
5,0.021497,0.095829,0.1299,0.106521,1.0,0.056352,0.071238,0.1472,0.04167,0.0123,...,0.189445,0.02432,0.111278,0.192354,0.164027,0.057903,0.026061,0.045491,0.048408,0.192983


* Rating prediction in user-based CF 
$$\hat{r_{i,u}} = \frac{\sum_{j \in N(i)}{w_{u,j}r_{j,i}}}{\sum_{j \in N(i)}{w_{u,j}}}$$
* $\hat{r_{i,u}}$ : 사용자 u의 영화 i에 대한 예측 평점
* $r_{j,i}$ : 영화 i에 대한 사용자 j 의 평점
* $w_{u,j}$ : 사용자 u 와 사용자 j 간 유사도
* $N(i)$ : 영화 i를 평가한 사용자 집합

In [24]:
movieId_grouped = train_df.reset_index().groupby('movieId')
user_prediction_result_df = pd.DataFrame(index=list(movieId_grouped.indices.keys()), columns= user_sparse_matrix.index)
for movieId, group in tqdm(movieId_grouped):
    user_sim = user_cos_sim_df.loc[group['userId']]
    user_rating = group['rating']
    sim_sum = user_sim.sum(axis=0)

    pred_ratings = np.matmul(user_sim.T.to_numpy(), user_rating) / (sim_sum+1)
    user_prediction_result_df.loc[movieId] = pred_ratings

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for movieId, group in tqdm(movieId_grouped):


  0%|          | 0/8410 [00:00<?, ?it/s]

In [33]:
user_prediction_result_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
1,3.067798,3.721064,3.788379,3.799945,3.763593,3.667811,3.811052,3.809471,3.793516,3.697778,...,3.763444,3.721168,3.785827,3.808001,3.718505,3.702272,3.56374,3.634019,3.845928,3.882597
2,2.08863,3.307468,3.121399,3.093966,3.14229,2.739304,3.188095,3.090049,2.894015,2.890438,...,3.433652,2.881498,3.062172,3.156245,3.338706,3.208241,2.5674,2.755839,3.077246,3.115051
3,1.480251,2.524605,2.369424,2.560736,2.812561,1.837934,2.622094,2.527707,2.400379,2.192776,...,2.567968,2.086535,2.375761,2.66732,2.553674,2.475664,2.207521,2.300801,2.63401,2.495774
4,0.49421,1.462797,1.086608,1.406723,1.099382,0.892039,1.552258,1.244669,0.992792,0.93324,...,1.504581,0.559401,1.101299,1.481492,1.413832,1.452417,0.884884,0.653479,1.168308,1.435528
5,1.380974,2.760309,2.606356,2.637553,2.781519,2.162574,2.660509,2.717922,2.558434,2.231516,...,2.839322,2.176904,2.589868,2.853667,2.773441,2.672331,2.176147,2.089528,2.603328,2.741735


In [26]:
# Evaluation
test_df.head()

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
23,1625,4.5,1148671498
564,2801,3.0,974716060
665,1541,3.0,995232789
574,49530,4.0,1232810927
472,841,4.0,1006929182


In [51]:
def evaluate(test_df, prediction_result_df):
    test_df = test_df.reset_index()
    groups_with_movie_ids = test_df.groupby(by='movieId')
    groups_with_user_ids = test_df.groupby(by='userId')
    intersection_movie_ids = sorted(list(set(list(prediction_result_df.columns)).intersection(set(list(groups_with_movie_ids.indices.keys())))))
    intersection_user_ids = sorted(list(set(list(prediction_result_df.index)).intersection(set(groups_with_user_ids.indices.keys()))))

    print(len(intersection_movie_ids))
    print(len(intersection_user_ids))

    compressed_prediction_df = prediction_result_df.loc[intersection_user_ids, intersection_movie_ids]
    # compressed_prediction_df

    # test_df에 대해서 RMSE 계산
    grouped = test_df.groupby(by='userId')
    for userId, group in tqdm(grouped):
        if userId in intersection_user_ids:
            pred_ratings = compressed_prediction_df.loc[userId][compressed_prediction_df.loc[userId].index.intersection(list(group['movieId'].values))]
            pred_ratings = pred_ratings.to_frame(name='rating').reset_index().rename(columns={'index':'movieId','rating':'pred_rating'})
            actual_ratings = group[['rating', 'movieId']].rename(columns={'rating':'actual_rating'})
            
            final_df = pd.merge(actual_ratings, pred_ratings, how='inner', on=['movieId'])
            final_df = final_df.round(4) 

    return final_df

In [55]:
result_df = evaluate(test_df, item_prediction_result_df) # item-based collaborative filtering
print(result_df)
print(f"RMSE: {sqrt(mean_squared_error(result_df['actual_rating'].values, result_df['pred_rating'].values))}")

4247
671


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for userId, group in tqdm(grouped):


  0%|          | 0/671 [00:00<?, ?it/s]

    actual_rating  movieId pred_rating
0             4.5     4963    3.860285
1             4.0     1387    3.834673
2             4.0     1291    3.892904
3             4.0     5989    3.840243
4             5.0      589    3.884863
5             3.5     6565    3.655618
6             3.0     3160      3.7409
7             4.0     2797    3.835279
8             5.0     2804    3.787254
9             1.0     3052    3.829332
10            4.0     6269    3.539992
11            4.0     1240    3.879108
12            4.0     1266    3.798226
13            4.0     5377    3.798474
14            4.0     1225    3.835014
15            4.0      608    3.851064
16            4.0     1265    3.877838
RMSE: 0.8459285113860532


In [53]:
user_prediction_result_df = user_prediction_result_df.transpose()

In [56]:
result_df = evaluate(test_df, user_prediction_result_df) # item-based collaborative filtering
print(result_df)
print(f"RMSE: {sqrt(mean_squared_error(result_df['actual_rating'].values, result_df['pred_rating'].values))}")

4247
671


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for userId, group in tqdm(grouped):


  0%|          | 0/671 [00:00<?, ?it/s]

    actual_rating  movieId pred_rating
0             4.5     4963     3.74079
1             4.0     1387    3.586239
2             4.0     1291    3.892131
3             4.0     5989    3.662411
4             5.0      589    3.900381
5             3.5     6565    3.055356
6             3.0     3160    3.325371
7             4.0     2797    3.566423
8             5.0     2804    3.551683
9             1.0     3052    3.102688
10            4.0     6269    1.987301
11            4.0     1240    3.807503
12            4.0     1266    3.588251
13            4.0     5377    3.340871
14            4.0     1225    3.804256
15            4.0      608    4.094953
16            4.0     1265     3.71065
RMSE: 0.9047554620236182
