<a href="https://colab.research.google.com/github/KevinTheRainmaker/Recommendation_Algorithms/blob/main/colab/fastcampus/Recommender_Trial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 간단한 추천시스템 만들기

- Movielens Dataset 이용
- 3가지 방식의 간단한 추천 시스템을 구현해보고 평가
  - 영화 평균 평점기반 예측
  - 사용자 평균 평점기반 예측
  - Rule-Based 평점 예측
  
- 평가 기준은 RMSE

## 0. Configuration


In [5]:
import os
from tqdm import tqdm # 진행률 Process Bar
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix # 압축희소행렬(Compressed Sparse Row)

import warnings
warnings.filterwarnings('ignore')

 **CSR이란**

 Compressed sparse row (CSR): 가로의 순서대로 재정렬하는 방법으로 행에 관여하여 정리 압축

장점: 산술연산, 행 슬라이싱, 행렬 벡터 곱이 효율적이고 빠름

단점: 느린 열 슬라이싱(Compressed Sparse Column 고려), 희소성 구조 변화 시 연산비용 큼 (LIL, DOK 고려)

## 1. Dataset

- ratings.csv
- movies.csv
- tags.csv

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
path = '/content/drive/My Drive/data/ml-latest-small'

In [8]:
ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), index_col='movieId', encoding='utf-8')
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')

In [9]:
print(ratings_df.shape)
print(ratings_df.sample(5))

(100836, 4)
       userId  movieId  rating   timestamp
19427     125    88129     4.5  1474294429
69589     448     7454     2.0  1084951203
79688     495    70286     5.0  1458635551
86465     560    60069     4.0  1469648050
98463     606    64839     3.0  1291933030


In [10]:
print(movies_df.shape)
print(movies_df.sample(5))

(9742, 2)
                                     title                          genres
movieId                                                                   
114670                         Tusk (2014)             Comedy|Drama|Horror
1272                         Patton (1970)                       Drama|War
589      Terminator 2: Judgment Day (1991)                   Action|Sci-Fi
1267      Manchurian Candidate, The (1962)              Crime|Thriller|War
63992                      Twilight (2008)  Drama|Fantasy|Romance|Thriller


In [11]:
print(tags_df.shape)
print(tags_df.sample(5))

(3683, 4)
      userId  movieId                    tag   timestamp
147       62    34150  heroine in tight suit  1525554138
3199     567   108932                 cheeky  1525283702
131       62    27831              confusing  1532723364
774      424     1240                 Action  1457901308
3098     567    27773         claustrophobic  1525283317


## 2. About ratings_df

In [12]:
num_users = ratings_df['userId'].unique()
num_movies = ratings_df['movieId'].unique()

print('Total Users:', len(num_users))
print('Total Movies:', len(num_movies))

Total Users: 610
Total Movies: 9724


In [13]:
# pivot ratings into movie features

user_movie_matrix = ratings_df.pivot(
    index = 'movieId',
    columns = 'userId',
    values = 'rating'
).fillna(0) # NaN to 0

# convert dataframe of movie features to scipy sparse matrix
sparse_mat = csr_matrix(user_movie_matrix.values)

In [14]:
print(user_movie_matrix)

userId   1    2    3    4    5    6    7    ...  604  605  606  607  608  609  610
movieId                                     ...                                   
1        4.0  0.0  0.0  0.0  4.0  0.0  4.5  ...  3.0  4.0  2.5  4.0  2.5  3.0  5.0
2        0.0  0.0  0.0  0.0  0.0  4.0  0.0  ...  5.0  3.5  0.0  0.0  2.0  0.0  0.0
3        4.0  0.0  0.0  0.0  0.0  5.0  0.0  ...  0.0  0.0  0.0  0.0  2.0  0.0  0.0
4        0.0  0.0  0.0  0.0  0.0  3.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0
5        0.0  0.0  0.0  0.0  0.0  5.0  0.0  ...  3.0  0.0  0.0  0.0  0.0  0.0  0.0
...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...
193581   0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0
193583   0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0
193585   0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0
193587   0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0
1936

In [15]:
print(sparse_mat)

  (0, 0)	4.0
  (0, 4)	4.0
  (0, 6)	4.5
  (0, 14)	2.5
  (0, 16)	4.5
  (0, 17)	3.5
  (0, 18)	4.0
  (0, 20)	3.5
  (0, 26)	3.0
  (0, 30)	5.0
  (0, 31)	3.0
  (0, 32)	3.0
  (0, 39)	5.0
  (0, 42)	5.0
  (0, 43)	3.0
  (0, 44)	4.0
  (0, 45)	5.0
  (0, 49)	3.0
  (0, 53)	3.0
  (0, 56)	5.0
  (0, 62)	5.0
  (0, 63)	4.0
  (0, 65)	4.0
  (0, 67)	2.5
  (0, 70)	5.0
  :	:
  (9700, 337)	2.5
  (9701, 337)	3.0
  (9702, 183)	4.0
  (9702, 247)	3.5
  (9703, 317)	2.5
  (9704, 209)	1.0
  (9705, 461)	2.5
  (9706, 49)	3.5
  (9707, 337)	1.5
  (9708, 337)	4.0
  (9709, 337)	1.0
  (9710, 337)	1.5
  (9711, 337)	1.0
  (9712, 337)	1.0
  (9713, 183)	4.5
  (9714, 183)	3.5
  (9715, 183)	3.0
  (9716, 183)	4.0
  (9717, 183)	4.0
  (9718, 183)	3.5
  (9719, 183)	4.0
  (9720, 183)	3.5
  (9721, 183)	3.5
  (9722, 183)	3.5
  (9723, 330)	4.0


In [16]:
print(sparse_mat.indptr)
print(sparse_mat.indices)
print(sparse_mat.data)

[     0    215    325 ... 100834 100835 100836]
[  0   4   6 ... 183 183 330]
[4.  4.  4.5 ... 3.5 3.5 4. ]


In [17]:
user_info_df = pd.DataFrame(data = [sum(list(user_movie_matrix[int(x)].value_counts())[1:]) for x in user_movie_matrix.columns],
                            index = user_movie_matrix.columns, 
                            columns = ['movies_rated'])

# sum(list(user_movie_matrix[1].value_counts())[1:])

In [18]:
user_info_df

Unnamed: 0_level_0,movies_rated
userId,Unnamed: 1_level_1
1,232
2,29
3,39
4,216
5,44
...,...
606,1115
607,187
608,831
609,37


In [19]:
movie_info_df = pd.DataFrame(data = [sum(list(user_movie_matrix.loc[int(x)].value_counts())[1:]) for x in user_movie_matrix.index],
                            index = user_movie_matrix.index, 
                            columns = ['movies_rated'])

In [20]:
movie_info_df

Unnamed: 0_level_0,movies_rated
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
...,...
193581,1
193583,1
193585,1
193587,1


## 3. Split Dataset

In [21]:
train_df, test_df = train_test_split(ratings_df, test_size = 0.2, random_state = 1234)

In [22]:
print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


### Unseen data rate in test set

In [23]:
# userId
print('User:', len(list(set(test_df['userId'].unique()) - set(train_df['userId'].unique()))))

# movieId
print('Movie:', len(list(set(test_df['movieId'].unique()) - set(train_df['movieId'].unique()))))
print('Total movie nums in test set:', len(test_df['movieId'].unique()))

User: 0
Movie: 786
Total movie nums in test set: 5171


In [24]:
movies_not_included = list(set(test_df['movieId'].unique()) - set(train_df['movieId'].unique()))
print(sorted(movies_not_included)[:10])

not_included_df = test_df[test_df.movieId.isin(movies_not_included)].sort_values(by='movieId')
print(not_included_df.sample(10))

print('\nNumber of Unseen data in test set:', not_included_df.shape)

[49, 117, 137, 178, 241, 320, 359, 478, 488, 495]
       userId  movieId  rating   timestamp
63263     414     2893     3.0   961437647
94725     599    71327     3.5  1498797401
67030     432    71902     3.5  1315243103
70275     448   115151     2.0  1433082348
49324     318     7238     4.0  1428491623
88040     567   110603     0.5  1525290024
16897     105   158402     2.5  1526208164
63881     414     5209     2.0  1064941928
70170     448   105121     1.0  1415740266
69376     448     4660     4.0  1019127974

Number of Unseen data in test set: (852, 4)


## 4. Simple Recommender Practice


### 4-1. 영화 평균 평점기반 예측
- train set의 모든 영화에 대해서 평균 평점 계산
- test set 예측 시, train set의 영화 평균 평점 활용 / 없을 경우 random

In [25]:
import random
ratings_range = np.arange(0.5, 5.5, step=0.5)
ratings_range

array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])

In [26]:
pred_random = [random.choice(ratings_range) for x in range(len(test_df))]
pred_random[:10]

[1.0, 2.5, 2.5, 2.5, 1.5, 4.0, 1.0, 4.5, 5.0, 2.0]

In [27]:
train_movie_df = train_df.groupby('movieId').mean()

print(train_movie_df.shape)
print(train_movie_df.sample(5))

(8938, 3)
             userId    rating     timestamp
movieId                                    
2880     370.200000  3.200000  1.110195e+09
107348   327.222222  3.333333  1.463103e+09
26599    606.000000  4.000000  1.171410e+09
34321    347.333333  3.000000  1.309196e+09
168612   366.000000  3.750000  1.515317e+09


In [28]:
def avg_rating_prediction(training_set, x):
  if x in training_set.index:
    pred_rating = training_set.loc[x]['rating']
  else:
    pred_rating = random.choice(ratings_range)
  return pred_rating

In [29]:
test_df['pred_rating_movie'] = test_df['movieId'].apply(lambda x: avg_rating_prediction(train_movie_df, x))

test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,pred_rating_movie
99731,610,3527,5.0,1479545223,3.604167
97583,606,1250,3.5,1171376891,4.180556
38197,262,213,5.0,840310907,3.75
11474,68,69406,3.0,1261622505,3.571429
34105,232,4728,3.0,1218166950,2.769231


In [30]:
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_movie'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.0513017822006636 1.0253300845096975


### 4-2. 사용자 평균 평점기반 예측
- train set의 모든 유저가 준 평균 평점
- test set 예측할 때, 유저가 train set에서 준 평균 평점을 활용 / 유저가 없을 경우 random 평점 적용

In [31]:
train_user_df = train_df.groupby('userId').mean()

print(train_user_df.shape)
print(train_user_df.head())

(610, 3)
             movieId    rating     timestamp
userId                                      
1        1891.168478  4.320652  9.649865e+08
2       70402.760000  3.940000  1.445715e+09
3        8394.733333  2.516667  1.306464e+09
4        1957.923077  3.631868  9.655941e+08
5         337.606061  3.636364  8.474351e+08


In [32]:
test_df['pred_rating_user'] = test_df['userId'].apply(lambda x: avg_rating_prediction(train_user_df, x))

test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,pred_rating_movie,pred_rating_user
99731,610,3527,5.0,1479545223,3.604167,3.678709
97583,606,1250,3.5,1171376891,4.180556,3.649718
38197,262,213,5.0,840310907,3.75,2.925
11474,68,69406,3.0,1261622505,3.571429,3.229331
34105,232,4728,3.0,1218166950,2.769231,3.242268


In [33]:
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_user'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

0.8905889036428333 0.9437101798978504


### 4-3. Rule-Based 평점 예측
- train set에 포함된 유저의 영화 평균 평점과 영화의 장르를 활용하여, 장르별 평균 평점 계산 -> test set의 영화 장르의 평균 평점으로 예측
- user의 평균 영화 평점을 normalize해서 확인: 평점 측정 수, 표준편차 등 활용 가능

In [34]:
train_user_info_df = pd.DataFrame({
    'avg_ratings':train_df.groupby('userId')['rating'].mean(),
    'std_ratings':train_df.groupby('userId')['rating'].std(),
    'count_ratings':train_df.groupby('userId')['rating'].count()
})

train_user_info_df

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4.320652,0.836600,184
2,3.940000,0.820569,25
3,2.516667,2.127340,30
4,3.631868,1.317823,182
5,3.636364,1.084498,33
...,...,...,...
606,3.649718,0.734887,885
607,3.772414,0.955574,145
608,3.145865,1.071503,665
609,3.275862,0.454859,29


In [35]:
min_count = train_user_info_df['count_ratings'].min()
max_count = train_user_info_df['count_ratings'].max()
avg_count = train_user_info_df['count_ratings'].mean()

train_user_info_df['weights'] = train_user_info_df['count_ratings'].apply(lambda x: (x-avg_count)/(max_count-min_count))

In [36]:
train_user_info_df

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings,weights
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,4.320652,0.836600,184,0.023995
2,3.940000,0.820569,25,-0.049718
3,2.516667,2.127340,30,-0.047400
4,3.631868,1.317823,182,0.023068
5,3.636364,1.084498,33,-0.046010
...,...,...,...,...
606,3.649718,0.734887,885,0.348983
607,3.772414,0.955574,145,0.005914
608,3.145865,1.071503,665,0.246990
609,3.275862,0.454859,29,-0.047864


In [40]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(train_user_info_df)
df_normalized = pd.DataFrame(np_scaled, columns = train_user_info_df.columns, index = train_user_info_df.index)
df_normalized

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings,weights
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.822227,0.393261,0.079740,0.079740
2,0.722617,0.385725,0.006027,0.006027
3,0.350156,1.000000,0.008345,0.008345
4,0.641984,0.619470,0.078813,0.078813
5,0.643161,0.509791,0.009736,0.009736
...,...,...,...,...
606,0.646655,0.345449,0.404729,0.404729
607,0.678762,0.449188,0.061660,0.061660
608,0.514806,0.503682,0.302735,0.302735
609,0.548824,0.213816,0.007881,0.007881


In [41]:
df_normalized['normalized_avg_ratings'] = df_normalized['avg_ratings'] * 5
df_normalized

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings,weights,normalized_avg_ratings
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.822227,0.393261,0.079740,0.079740,4.111134
2,0.722617,0.385725,0.006027,0.006027,3.613084
3,0.350156,1.000000,0.008345,0.008345,1.750779
4,0.641984,0.619470,0.078813,0.078813,3.209921
5,0.643161,0.509791,0.009736,0.009736,3.215803
...,...,...,...,...,...
606,0.646655,0.345449,0.404729,0.404729,3.233275
607,0.678762,0.449188,0.061660,0.061660,3.393812
608,0.514806,0.503682,0.302735,0.302735,2.574029
609,0.548824,0.213816,0.007881,0.007881,2.744119


In [42]:
test_df['pred_rating_normalized'] = test_df['userId'].apply(lambda x: df_normalized.loc[x]['normalized_avg_ratings'])
test_df

Unnamed: 0,userId,movieId,rating,timestamp,pred_rating_movie,pred_rating_user,pred_rating_normalized
99731,610,3527,5.0,1479545223,3.604167,3.678709,3.271208
97583,606,1250,3.5,1171376891,4.180556,3.649718,3.233275
38197,262,213,5.0,840310907,3.750000,2.925000,2.285047
11474,68,69406,3.0,1261622505,3.571429,3.229331,2.683236
34105,232,4728,3.0,1218166950,2.769231,3.242268,2.700164
...,...,...,...,...,...,...,...
41080,279,593,4.0,1506394242,4.127907,3.666667,3.255452
4897,31,780,4.0,850466616,3.470760,3.911765,3.576141
8023,56,410,3.0,835799188,3.131148,3.837838,3.479414
77467,483,2291,4.0,1415579167,3.734375,3.598940,3.166837


In [43]:
mse = mean_squared_error(y_true = test_df['rating'].values, y_pred = test_df['pred_rating_normalized'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.120579096060227 1.05857408624065
