## 3. Collaborative Filtering (협업 필터링 : 사용자 리뷰 기반)

In [1]:
import surprise
# Display the installed surprise version so it is recorded next to the results.
surprise.__version__

'1.1.3'

In [2]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import KFold, cross_validate

In [4]:
# Load the ratings table; later cells use its userId, movieId and rating columns.
ratings = pd.read_csv('ratings_small.csv')
# Preview the first rows to check that the columns loaded as expected.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
# Lowest rating present in the data (compare with the lower bound given to Reader).
ratings['rating'].min()

0.5

In [6]:
# Highest rating present in the data (compare with the upper bound given to Reader).
ratings['rating'].max()

5.0

In [7]:
# rating_scale is the (min, max) range of valid ratings; 0.5 and 5.0 are the
# smallest and largest values found in ratings['rating'].
reader = Reader(rating_scale=(0.5, 5.0))

In [10]:
# surprise reads the frame positionally: user id, item id, rating — in that
# order — so select exactly those three columns.
data = Dataset.load_from_df(
    ratings.loc[:, ['userId', 'movieId', 'rating']],
    reader=reader,
)

In [11]:
# Matrix-factorization model; random_state=0 seeds the model's random
# initialization so repeated fits start from the same state.
svd = SVD(random_state=0)

In [12]:
# Evaluate SVD with 5-fold cross-validation, reporting RMSE and MAE per fold.
# Passing cv=5 lets surprise shuffle the folds with an unseeded RNG, so the
# scores change from run to run even though the model itself is seeded.
# A seeded KFold keeps the splits (and therefore the scores) repeatable.
cross_validate(
    svd,
    data,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=0),
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8912  0.9013  0.8889  0.9023  0.8952  0.8958  0.0053  
MAE (testset)     0.6847  0.6947  0.6843  0.6945  0.6917  0.6900  0.0046  
Fit time          1.96    1.63    1.57    1.48    1.47    1.62    0.18    
Test time         0.26    0.19    0.19    0.27    0.20    0.22    0.04    


{'test_rmse': array([0.89115275, 0.90130924, 0.8888667 , 0.90230536, 0.89519018]),
 'test_mae': array([0.68472031, 0.69473322, 0.68433748, 0.6944822 , 0.6917164 ]),
 'fit_time': (1.9633088111877441,
  1.6259019374847412,
  1.5687668323516846,
  1.4790856838226318,
  1.4702067375183105),
 'test_time': (0.2610962390899658,
  0.18679165840148926,
  0.18957829475402832,
  0.27000951766967773,
  0.20353078842163086)}

교차 검증 (cross-validation)
예: 데이터가 100개 있다고 가정
cv=5면 데이터를 5개 세트(fold)로 나누는 것

A: 1-20
B: 21-40
C: 41-60
D: 61-80
E: 81-100

ABCD (train set) E (test set)
ABCE (train set) D (test set)
ABDE (train set) C (test set)
ACDE (train set) B (test set)
BCDE (train set) A (test set)
이렇게 매번 4개 세트로 학습하고 나머지 1개 세트로 평가하는 과정을 5번 반복해서 검증함 (출력 표의 Fold 1~5가 각 회차의 결과)


In [14]:
# Build a training set from all of the loaded ratings (no held-out fold).
trainset = data.build_full_trainset()
# Refit the model on it; fit() hands back the algorithm object, which is what
# the cell echoes as its output.
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x175a240ad10>

In [16]:
# Every rating submitted by user 1, for comparison with the predictions below.
ratings.loc[ratings['userId'].eq(1)]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [19]:
# Model's estimated rating of movie 302 for user 1.
svd.predict(uid=1, iid=302)

Prediction(uid=1, iid=302, r_ui=None, est=2.7142061734434044, details={'was_impossible': False})

In [21]:
# User 1 actually rated movie 1029 as 3; passing that as r_ui stores the true
# rating next to the model's estimate in the returned Prediction.
svd.predict(uid=1, iid=1029, r_ui=3)

Prediction(uid=1, iid=1029, r_ui=3, est=2.8814455446761933, details={'was_impossible': False})

In [22]:
# Every rating submitted by user 100, for comparison with the prediction below.
ratings.loc[ratings['userId'].eq(100)]

Unnamed: 0,userId,movieId,rating,timestamp
15273,100,1,4.0,854193977
15274,100,3,4.0,854194024
15275,100,6,3.0,854194023
15276,100,7,3.0,854194024
15277,100,25,4.0,854193977
15278,100,32,5.0,854193977
15279,100,52,3.0,854194056
15280,100,62,3.0,854193977
15281,100,86,3.0,854194208
15282,100,88,2.0,854194208


In [24]:
# Model's estimated rating of movie 1029 (not 302) for user 100.
svd.predict(uid=100, iid=1029)

Prediction(uid=100, iid=1029, r_ui=None, est=3.7705476478414846, details={'was_impossible': False})