## 3. Collaborative Filtering (협업 필터링: 사용자 리뷰 기반)

In [4]:
# Use the surprise package; display the installed version
import surprise
surprise.__version__

'1.1.3'

In [5]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import KFold, cross_validate

In [6]:
# Load the user ratings table from a CSV file (relative path) and preview the first rows
ratings = pd.read_csv('ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [7]:
# Minimum value in the rating column
ratings['rating'].min()

0.5

In [8]:
# Maximum value in the rating column
ratings['rating'].max()

5.0

In [9]:
reader = Reader(rating_scale=(0.5, 5)) # Create the Reader object (minimum: 0.5, maximum: 5.0, matching the rating column above)

In [14]:
# Build a Surprise dataset from the userId, movieId and rating columns (in that order)
data = Dataset.load_from_df(
    ratings[['userId', 'movieId', 'rating']],
    reader=reader,
)
data

<surprise.dataset.DatasetAutoFolds at 0x2466f3e4610>

In [16]:
svd = SVD(random_state=0) # Use the SVD model, with a fixed random_state

In [17]:
# Evaluate the SVD model with 5-fold cross-validation (RMSE and MAE).
# Passing cv=5 lets Surprise shuffle the folds with an unseeded RNG, so the
# scores change on every run; a seeded KFold makes the splits reproducible.
cross_validate(
    svd,
    data,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=0),
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8919  0.8962  0.8956  0.8978  0.9008  0.8964  0.0029  
MAE (testset)     0.6878  0.6895  0.6879  0.6898  0.6940  0.6898  0.0022  
Fit time          1.34    1.30    1.28    1.38    1.32    1.33    0.04    
Test time         0.19    0.17    0.21    0.25    0.17    0.20    0.03    


{'test_rmse': array([0.89188728, 0.89616924, 0.8956013 , 0.89783231, 0.90075809]),
 'test_mae': array([0.68781459, 0.68949982, 0.68794683, 0.6897523 , 0.6939553 ]),
 'fit_time': (1.3444042205810547,
  1.2995233535766602,
  1.2775843143463135,
  1.3842973709106445,
  1.3244569301605225),
 'test_time': (0.18949222564697266,
  0.17154288291931152,
  0.21043658256530762,
  0.2533226013183594,
  0.17253875732421875)}

In [None]:
# Train on the full dataset

In [None]:
# Build a trainset from the whole dataset and fit the SVD model on it.
# NOTE(review): this cell has no execution count while the prediction cells
# below do — confirm it was run before relying on their outputs.
trainset = data.build_full_trainset()
svd.fit(trainset)

In [18]:
# Look at the ratings recorded for userId = 1 before predicting for that user
ratings.loc[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [21]:
# User 1 actually rated movie 1029 with 3 points; what rating does the model estimate?
svd.predict(uid=1, iid=1029, r_ui=3)

Prediction(uid=1, iid=1029, r_ui=3, est=2.7679593688019137, details={'was_impossible': False})

In [23]:
# Look at the ratings recorded for userId = 100
ratings.loc[ratings['userId'] == 100]

Unnamed: 0,userId,movieId,rating,timestamp
15273,100,1,4.0,854193977
15274,100,3,4.0,854194024
15275,100,6,3.0,854194023
15276,100,7,3.0,854194024
15277,100,25,4.0,854193977
15278,100,32,5.0,854193977
15279,100,52,3.0,854194056
15280,100,62,3.0,854193977
15281,100,86,3.0,854194208
15282,100,88,2.0,854194208


In [26]:
# Estimate the rating of userId = 100 for movieId = 1029 (no true rating supplied)
svd.predict(uid=100, iid=1029)

Prediction(uid=100, iid=1029, r_ui=None, est=3.5631713340423006, details={'was_impossible': False})