<a href="https://colab.research.google.com/github/KevinTheRainmaker/Recommendation_Algorithms/blob/main/colab/fastcampus/Recommender_Trial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 간단한 추천시스템 만들기

- Movielens Dataset 이용
- 3가지 방식의 간단한 추천 시스템을 구현해보고 평가
  - 영화 평균 평점기반 예측
  - 사용자 평균 평점기반 예측
  - Rule-Based 평점 예측
  
- 평가 기준은 RMSE

## 0. Configuration


In [2]:
import os
from tqdm import tqdm # progress bar
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix # Compressed Sparse Row (CSR) sparse matrix

import warnings
# Suppress all warnings: no category or module filter is given, so every
# warning raised after this point is hidden from the cell outputs.
warnings.filterwarnings('ignore')

 **CSR이란**

Compressed Sparse Row (CSR): 0이 아닌 원소만 행 순서대로 저장하는 희소행렬 압축 형식. 값 배열(`data`), 각 값의 열 인덱스(`indices`), 각 행이 시작되는 위치(`indptr`) 세 배열로 행렬을 표현한다.

장점: 산술연산, 행 슬라이싱, 행렬 벡터 곱이 효율적이고 빠름

단점: 느린 열 슬라이싱(Compressed Sparse Column 고려), 희소성 구조 변화 시 연산비용 큼 (LIL, DOK 고려)

## 1. Dataset

- ratings.csv
- movies.csv
- tags.csv

In [4]:
# Mount Google Drive into the Colab runtime at /content/drive so the
# dataset files stored on Drive can be read through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Directory on the mounted Drive that holds the MovieLens CSV files
# (ratings.csv, movies.csv, tags.csv).
path = '/content/drive/My Drive/data/ml-latest-small'

In [11]:
# Load the three MovieLens tables from `path`. All files are read as UTF-8;
# only movies_df is indexed by its movieId column.
csv_options = {'encoding': 'utf-8'}

ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), **csv_options)
movies_df = pd.read_csv(
    os.path.join(path, 'movies.csv'),
    index_col='movieId',
    **csv_options,
)
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), **csv_options)

In [10]:
# Quick look at the ratings table: (rows, columns) followed by 5 random rows.
print(ratings_df.shape, ratings_df.sample(5), sep='\n')

(100836, 4)
       userId  movieId  rating   timestamp
63732     414     4591     3.0  1026225501
16889     105   147300     5.0  1526207029
95901     601     4993     5.0  1441639520
24569     169     7004     3.5  1073179957
16169     104    47629     4.0  1325962571


In [12]:
# Quick look at the movies table: (rows, columns) followed by 5 random rows.
print(movies_df.shape, movies_df.sample(5), sep='\n')

(9742, 2)
                                    title              genres
movieId                                                      
4881     Man Who Wasn't There, The (2001)         Crime|Drama
76077         Hot Tub Time Machine (2010)       Comedy|Sci-Fi
6027                      Dogfight (1991)       Drama|Romance
110669             Honest Liar, An (2014)  Comedy|Documentary
26467               Day After, The (1983)        Drama|Sci-Fi


In [13]:
# Quick look at the tags table: (rows, columns) followed by 5 random rows.
print(tags_df.shape, tags_df.sample(5), sep='\n')

(3683, 4)
      userId  movieId                 tag   timestamp
347       62   135536  Horrible directing  1525555099
899      424    48516      Jack Nicholson  1457843173
2998     567     4144           nocturnal  1525283537
788      424     1258            suspense  1457843361
818      424     2700       controversial  1457844399


## 2. About ratings_df

In [16]:
# NOTE: despite the `num_` prefix, these variables hold the arrays of distinct
# ids found in ratings_df; the actual counts are taken with len() below.
num_users = ratings_df.userId.unique()
num_movies = ratings_df.movieId.unique()

print(f'Total Users: {len(num_users)}')
print(f'Total Movies: {len(num_movies)}')

Total Users: 610
Total Movies: 9724


In [26]:
# Reshape the long ratings table into a wide grid with one row per movieId and
# one column per userId (so, despite its name, the frame is movie x user).
# (movie, user) pairs without a rating come out of the pivot as NaN and are
# replaced by 0.
user_movie_matrix = (
    ratings_df
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)

# Store the same values as a SciPy CSR matrix, which keeps only the non-zero
# entries.
sparse_mat = csr_matrix(user_movie_matrix.to_numpy())

In [27]:
# Show the dense pivoted ratings frame (pandas elides the middle rows/columns
# of a frame this large when printing).
print(user_movie_matrix)

userId   1    2    3    4    5    6    7    ...  604  605  606  607  608  609  610
movieId                                     ...                                   
1        4.0  0.0  0.0  0.0  4.0  0.0  4.5  ...  3.0  4.0  2.5  4.0  2.5  3.0  5.0
2        0.0  0.0  0.0  0.0  0.0  4.0  0.0  ...  5.0  3.5  0.0  0.0  2.0  0.0  0.0
3        4.0  0.0  0.0  0.0  0.0  5.0  0.0  ...  0.0  0.0  0.0  0.0  2.0  0.0  0.0
4        0.0  0.0  0.0  0.0  0.0  3.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0
5        0.0  0.0  0.0  0.0  0.0  5.0  0.0  ...  3.0  0.0  0.0  0.0  0.0  0.0  0.0
...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...
193581   0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0
193583   0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0
193585   0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0
193587   0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0
1936

In [28]:
# Print the sparse matrix: each output line is a (row, column) position
# followed by the value stored there.
print(sparse_mat)

  (0, 0)	4.0
  (0, 4)	4.0
  (0, 6)	4.5
  (0, 14)	2.5
  (0, 16)	4.5
  (0, 17)	3.5
  (0, 18)	4.0
  (0, 20)	3.5
  (0, 26)	3.0
  (0, 30)	5.0
  (0, 31)	3.0
  (0, 32)	3.0
  (0, 39)	5.0
  (0, 42)	5.0
  (0, 43)	3.0
  (0, 44)	4.0
  (0, 45)	5.0
  (0, 49)	3.0
  (0, 53)	3.0
  (0, 56)	5.0
  (0, 62)	5.0
  (0, 63)	4.0
  (0, 65)	4.0
  (0, 67)	2.5
  (0, 70)	5.0
  :	:
  (9700, 337)	2.5
  (9701, 337)	3.0
  (9702, 183)	4.0
  (9702, 247)	3.5
  (9703, 317)	2.5
  (9704, 209)	1.0
  (9705, 461)	2.5
  (9706, 49)	3.5
  (9707, 337)	1.5
  (9708, 337)	4.0
  (9709, 337)	1.0
  (9710, 337)	1.5
  (9711, 337)	1.0
  (9712, 337)	1.0
  (9713, 183)	4.5
  (9714, 183)	3.5
  (9715, 183)	3.0
  (9716, 183)	4.0
  (9717, 183)	4.0
  (9718, 183)	3.5
  (9719, 183)	4.0
  (9720, 183)	3.5
  (9721, 183)	3.5
  (9722, 183)	3.5
  (9723, 330)	4.0


In [29]:
# Dump the three arrays backing the CSR matrix, separated by blank lines:
# indptr (row boundaries), indices (column index per stored value) and
# data (the stored values themselves).
print(sparse_mat.indptr, sparse_mat.indices, sparse_mat.data, sep='\n\n')

[     0    215    325 ... 100834 100835 100836]

[  0   4   6 ... 183 183 330]

[4.  4.  4.5 ... 3.5 3.5 4. ]
