## Simple recommender

In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

import warnings
warnings.filterwarnings("ignore")

In [2]:
path = "./movielens/movielens100k/"
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')
ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), index_col = 'userId', encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), index_col = 'movieId', encoding='utf-8')

In [4]:
# pivot ratings into movie features
user_movie_matrix = ratings_df.reset_index().pivot(
    index ='movieId',
    columns ='userId',
    values = 'rating'
).fillna(0)

sparse_mat = csr_matrix(user_movie_matrix.values)

In [5]:
user_movie_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,...,0.0,4.0,3.5,0.0,0.0,0.0,0.0,0.0,4.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
print(sparse_mat[:10])

  (0, 6)	3.0
  (0, 8)	4.0
  (0, 12)	5.0
  (0, 14)	2.0
  (0, 18)	3.0
  (0, 19)	3.5
  (0, 22)	3.0
  (0, 25)	5.0
  (0, 29)	4.0
  (0, 36)	4.0
  (0, 42)	4.0
  (0, 43)	4.0
  (0, 46)	5.0
  (0, 47)	4.0
  (0, 54)	3.0
  (0, 55)	4.0
  (0, 62)	5.0
  (0, 66)	3.0
  (0, 67)	4.0
  (0, 68)	5.0
  (0, 69)	5.0
  (0, 71)	3.5
  (0, 72)	5.0
  (0, 74)	3.0
  (0, 76)	4.0
  :	:
  (9, 549)	4.0
  (9, 551)	3.0
  (9, 554)	3.0
  (9, 559)	4.0
  (9, 560)	3.5
  (9, 563)	3.0
  (9, 567)	4.0
  (9, 573)	3.0
  (9, 576)	3.5
  (9, 587)	3.0
  (9, 588)	4.0
  (9, 589)	3.0
  (9, 591)	4.0
  (9, 592)	2.0
  (9, 595)	3.5
  (9, 601)	3.0
  (9, 606)	3.5
  (9, 607)	3.0
  (9, 623)	4.0
  (9, 640)	3.0
  (9, 648)	5.0
  (9, 649)	4.0
  (9, 653)	4.0
  (9, 661)	3.0
  (9, 665)	3.0


In [12]:
user_info_df = pd.DataFrame(data = [sum(list(user_movie_matrix[int(x)].value_counts())[1:]) for x in user_movie_matrix.columns],
                           index = user_movie_matrix.columns, columns=['movies_rated']).reset_index()
user_info_df.head()

Unnamed: 0,userId,movies_rated
0,1,20
1,2,76
2,3,51
3,4,204
4,5,100


In [13]:
movie_info_df = pd.DataFrame(data = [sum(list(user_movie_matrix.loc[int(x)].value_counts())[1:]) for x in user_movie_matrix.index],
                           index = user_movie_matrix.index, columns=['users_rated']).reset_index()
movie_info_df.head()


Unnamed: 0,movieId,users_rated
0,1,247
1,2,107
2,3,59
3,4,13
4,5,56


In [19]:
# split train and test set
train_df, test_df = train_test_split(ratings_df.reset_index(), test_size=.2, random_state=123)
print(train_df.shape)
print(test_df.shape)

(80003, 4)
(20001, 4)


In [20]:
train_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
69293,480,6870,4.0,1272667994
60116,434,590,1.0,886376254
99806,668,1089,3.0,993613415
22307,157,47,3.5,1292893066
22097,152,54997,4.0,1335901908


In [22]:
# configure cold-start case 
print("user: ", len(list(set(test_df['userId'].unique())-set(train_df['userId'].unique()))))
print("movie: ", len(list(set(test_df['movieId'].unique())-set(train_df['movieId'].unique()))))

user:  0
movie:  656


In [23]:
movies_not_included = list(set(test_df['movieId'].unique())-set(train_df['movieId'].unique()))
cold_start_df = test_df[test_df['movieId'].isin(movies_not_included)].sort_values('movieId')
print("cold start case in test set: ", cold_start_df.shape)
cold_start_df.head(10)

cold start case in test set:  (723, 4)


Unnamed: 0,userId,movieId,rating,timestamp
46719,344,53,5.0,850726154
6876,41,130,4.5,1093889645
85584,575,148,4.0,1012605106
98424,659,167,4.0,836137550
96602,647,245,3.0,947292322
6799,39,285,4.0,832523436
45944,330,389,4.0,948577776
10303,73,409,2.5,1255501745
36779,264,499,3.0,995664664
83155,564,561,3.0,974712445


### Simple functions for recommendation
* Random value assign
* Average rating by movies
* Average rating by users
* Rule based

In [24]:
# Random value assign
ratings_range = np.arange(0.5, 5.5, step=0.5)
ratings_range

array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])

In [25]:
import random
pred_random = [random.choice(ratings_range) for x in range(len(test_df))]
pred_random[:10]

[1.0, 3.5, 3.0, 3.5, 0.5, 4.0, 3.5, 2.0, 0.5, 4.0]

In [26]:
test_df['pred_ratings_random'] = pred_random
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_ratings_random'].values)
rmse = np.sqrt(mse)
print('MSE >>> {}'.format(mse))
print('RMSE >>> {}'.format(rmse)) 

MSE >>> 3.770586470676466
RMSE >>> 1.9417998019045284


In [27]:
# Average rating by movies
# cold-start case >> random value assign

train_movie_avg_rating = train_df.groupby('movieId')['rating'].mean()
train_movie_avg_rating.head()

movieId
1    3.912060
2    3.423529
3    3.156250
4    2.555556
5    3.318182
Name: rating, dtype: float64

In [30]:
def avg_rating_pred(train_set , x):
    if x in train_set.index:
        pred_rating = train_set.loc[x]
    else:
        pred_rating = random.choice(ratings_range)
    return pred_rating

In [31]:
test_df['pred_avg_rating_movie'] = test_df['movieId'].apply(lambda x: avg_rating_pred(train_movie_avg_rating, x))
test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_avg_rating_movie
4241,23,1625,4.5,1148671498,1.0,3.939655
84223,564,2801,3.0,974716060,3.5,4.5
99408,665,1541,3.0,995232789,3.0,2.909091
85520,574,49530,4.0,1232810927,3.5,4.016667
67363,472,841,4.0,1006929182,0.5,3.7


In [32]:
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_avg_rating_movie'].values)
rmse = np.sqrt(mse)
print('MSE >>> {}'.format(mse))
print('RMSE >>> {}'.format(rmse))

MSE >>> 1.0842284656702035
RMSE >>> 1.0412629186090339


In [33]:
# Average rating by users
train_user_avg_rating = train_df.groupby('userId')['rating'].mean()
train_user_avg_rating.head()

userId
1    2.382353
2    3.491803
3    3.569767
4    4.388535
5    3.944444
Name: rating, dtype: float64

In [34]:
test_df['pred_avg_rating_user'] = test_df['userId'].apply(lambda x: avg_rating_pred(train_user_avg_rating, x))
test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_avg_rating_movie,pred_avg_rating_user
4241,23,1625,4.5,1148671498,1.0,3.939655,3.634234
84223,564,2801,3.0,974716060,3.5,4.5,3.538
99408,665,1541,3.0,995232789,3.0,2.909091,3.271137
85520,574,49530,4.0,1232810927,3.5,4.016667,3.484733
67363,472,841,4.0,1006929182,0.5,3.7,3.778906


In [35]:
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_avg_rating_user'].values)
rmse = np.sqrt(mse)
print('MSE >>> {}'.format(mse))
print('RMSE >>> {}'.format(rmse))

MSE >>> 0.9384635153771168
RMSE >>> 0.9687432659776876


In [37]:
# Rule based prediction
# Rule1. Avg. rating by genre of movie
train_user_movie_matrix = train_df.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)

In [38]:
train_user_movie_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,...,0.0,4.0,3.5,0.0,0.0,0.0,0.0,0.0,4.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
genres_df = movies_df['genres'].str.get_dummies(sep='|')
print(genres_df.shape)
genres_df = genres_df.loc[train_df.movieId.unique()]
print(genres_df.shape)
genres_df.head()

(9125, 20)
(8410, 20)


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
6870,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0
590,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
1089,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0
47,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
54997,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1


In [40]:
train_movie_avg_ratings_df = train_user_movie_matrix.copy()
train_movie_avg_ratings_df = train_movie_avg_ratings_df.replace(0, np.NaN)
train_movie_avg_ratings_df = train_movie_avg_ratings_df.mean(axis = 1)

train_movie_avg_ratings_df.head()

movieId
1    3.912060
2    3.423529
3    3.156250
4    2.555556
5    3.318182
dtype: float64

In [41]:
genres_avg_ratings_df = pd.DataFrame(index=genres_df.columns, columns=['avg_ratings'])

for genre in genres_avg_ratings_df.index:
    genre_avg_rating = train_movie_avg_ratings_df.loc[genres_df[genres_df[genre].isin([1])].index].mean()
    genres_avg_ratings_df.loc[genre]['avg_ratings'] = genre_avg_rating

genres_avg_ratings_df

Unnamed: 0,avg_ratings
(no genres listed),3.807692
Action,3.105919
Adventure,3.245913
Animation,3.469709
Children,3.170427
Comedy,3.203584
Crime,3.300188
Documentary,3.693583
Drama,3.460119
Fantasy,3.210882


In [42]:
def get_genre_avg_ratings(x):
    genres_list = movies_df.loc[x]['genres'].split('|')
    rating = 0
    for genre in genres_list:
        rating += genres_avg_ratings_df.loc[genre]['avg_ratings']
    
    return rating / len(genres_list)

In [43]:
tqdm.pandas()
test_df['pred_rating_genre'] = test_df['movieId'].progress_apply(lambda x: get_genre_avg_ratings(x))

100%|██████████| 20001/20001 [00:01<00:00, 18948.31it/s]


In [44]:
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_genre'].values)
rmse = np.sqrt(mse)
print('MSE >>> {}'.format(mse))
print('RMSE >>> {}'.format(rmse))

MSE >>> 1.186823715001605
RMSE >>> 1.0894143908548322


In [45]:
# Avg. user avg. rating add genre coef. (using Dummy linear regression)
from sklearn.linear_model import LinearRegression
genres_df.head()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
6870,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0
590,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
1089,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0
47,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
54997,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1


In [49]:
genres_df.columns[0]

'(no genres listed)'

In [47]:
ratings_df.head()

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,31,2.5,1260759144
1,1029,3.0,1260759179
1,1061,3.0,1260759182
1,1129,2.0,1260759185
1,1172,4.0,1260759205


In [70]:
user_avg_ratings = train_df.reset_index().groupby('userId')['rating'].mean().reset_index()
user_genre_cnt = pd.merge(train_df.reset_index()[['userId','movieId']], genres_df.reset_index().drop(columns='(no genres listed)'), on='movieId',how='left')
user_genre_cnt.head()

Unnamed: 0,userId,movieId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,480,6870,0,0,0,0,0,1,0,1,...,0,0,0,0,1,0,0,0,0,0
1,434,590,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,668,1089,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
3,157,47,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,152,54997,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1


In [71]:
# 벡터화 연산을 사용하여 one-hot encoding
user_genre_cnt.iloc[:,2:] = (user_genre_cnt.iloc[:,2:] != 0).astype(int)
user_genre_cnt.head()

Unnamed: 0,userId,movieId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,480,6870,0,0,0,0,0,1,0,1,...,0,0,0,0,1,0,0,0,0,0
1,434,590,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,668,1089,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
3,157,47,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,152,54997,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1


In [72]:
user_genre_cnt = pd.merge(user_genre_cnt,user_avg_ratings, on='userId',how='left')
user_genre_cnt.head()

Unnamed: 0,userId,movieId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,rating
0,480,6870,0,0,0,0,0,1,0,1,...,0,0,0,1,0,0,0,0,0,3.98495
1,434,590,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,3.70625
2,668,1089,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,3.583333
3,157,47,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,3.401515
4,152,54997,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,1,3.408824


In [74]:
user_genre_cnt['y_rating'] = train_df.reset_index()['rating']

In [85]:
user_genre_cnt.iloc[:,2:-1].head()

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,rating
0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,3.98495
1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,3.70625
2,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,3.583333
3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,3.401515
4,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,3.408824


In [86]:
lr_model = LinearRegression()
lr_model.fit(user_genre_cnt.iloc[:,2:-1], user_genre_cnt['y_rating'])

In [87]:
def convert_feature_set(X):
    user_avg_ratings = train_df.reset_index().groupby('userId')['rating'].mean().reset_index()
    user_genre_cnt = pd.merge(X.reset_index()[['userId','movieId']], genres_df.reset_index().drop(columns='(no genres listed)'), on='movieId',how='left')
    user_genre_cnt.iloc[:,2:] = (user_genre_cnt.iloc[:,2:] != 0).astype(int)
    user_genre_cnt = pd.merge(user_genre_cnt,user_avg_ratings, on='userId',how='left')
    return user_genre_cnt

In [88]:
X_test = convert_feature_set(test_df[['userId','movieId']])
X_test.head()

Unnamed: 0,userId,movieId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,rating
0,23,1625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.634234
1,564,2801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.538
2,665,1541,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.271137
3,574,49530,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3.484733
4,472,841,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.778906


In [90]:
y_pred = lr_model.predict(X_test.iloc[:,2:].values)

In [92]:
y_pred[y_pred>5] = 5

In [93]:
test_df['pred_rating_genre'] = y_pred

In [94]:
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_genre'].values)
rmse = np.sqrt(mse)
print('MSE >>> {}'.format(mse))
print('RMSE >>> {}'.format(rmse))

MSE >>> 0.9602922741265181
RMSE >>> 0.9799450362783201
