## 3. Collaborative Filtering (협업 필터링: 사용자 리뷰 기반)

In [1]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [2]:
# Load the Excel workbook into a DataFrame and preview its first five rows.
# NOTE(review): the preview saved with this cell shows per-customer columns
# (cust, A01 ... label) and no 'movieId' / 'rating' columns, while later cells
# index 'userId', 'movieId' and 'rating' -- confirm this is the intended file.
ratings = pd.read_excel('data_45.xlsx')
ratings.head(5)

Unnamed: 0,cust,ma_fem_dv,A01,A02,A03,A04,A05,A06,B01,C01,C02,D01,D02,E01,L00,L01,오프라인,온라인,count_days,label
0,M000136117,1,0.25,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.75,4,0
1,M000419293,1,0.36,0.02,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.1,0.0,0.0,0.02,0.45,0.38,0.62,30,1
2,M000494848,1,0.05,0.0,0.0,0.0,0.0,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.51,0.06,0.94,75,1
3,M000557840,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,37,0
4,M000871427,1,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.2,0.0,0.0,0.0,0.8,0.2,3,0


In [3]:
# Reader describes the rating column to Surprise; the scale is declared as 0.5 (min) to 5 (max).
reader = Reader(rating_scale=(0.5,5))

In [8]:
# Build the Surprise dataset from the (user, item, rating) columns, in that order.
# The user column must be 'userId': it is the column the later cells filter on, and
# svd.predict is called with the integer ids (1, 100) taken from it. A frame that
# holds 'movieId' and 'rating' has no 'cust' column, so selecting 'cust' raises KeyError.
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader=reader)

In [9]:
# SVD matrix-factorisation model; random_state is pinned to 0 so reruns are repeatable.
svd = SVD(random_state = 0)

In [11]:
# Run 5-fold cross-validation of the SVD model, reporting RMSE and MSE for each fold.
# verbose=True prints the summary table; the returned dict of scores is the cell output.
cross_validate(
    algo=svd,
    data=data,
    measures=['RMSE', 'MSE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8968  0.9019  0.8918  0.8951  0.8928  0.8957  0.0035  
MSE (testset)     0.8042  0.8134  0.7953  0.8013  0.7972  0.8023  0.0064  
Fit time          3.35    3.33    3.34    3.38    3.30    3.34    0.03    
Test time         0.09    0.17    0.09    0.09    0.16    0.12    0.04    


{'test_rmse': array([0.8967697 , 0.90186424, 0.8918109 , 0.89513149, 0.89284531]),
 'test_mse': array([0.80419589, 0.81335911, 0.79532667, 0.80126038, 0.79717275]),
 'fit_time': (3.3526554107666016,
  3.327029228210449,
  3.3422439098358154,
  3.3840460777282715,
  3.3041064739227295),
 'test_time': (0.09395718574523926,
  0.16716980934143066,
  0.09277963638305664,
  0.09186768531799316,
  0.16298317909240723)}

교차 검증 (K-Fold 교차 검증)

예: 100개 데이터를 20개씩 5개의 폴드(A ~ E)로 나눈다 (위의 `cv = 5`에 해당)

A: 1 ~ 20

B: 21 ~ 40

C: 41 ~ 60

D: 61 ~ 80

E: 81 ~ 100

각 반복의 train set   /   test set (모든 폴드가 한 번씩 test set이 된다)

ABCD       /     E

ABCE       /     D

ABDE       /     C

ACDE       /     B

BCDE       /     A

In [None]:
# Refit the model on the full dataset: build_full_trainset() turns every rating in
# `data` into one training set (no held-out fold), and fit() trains svd on it.
# NOTE(review): this cell carries no execution count in the saved notebook -- run it
# before the predict cells so their estimates come from this full-data fit.
trainset = data.build_full_trainset()
svd.fit(trainset)

In [12]:
# Show the rows of the ratings table whose userId is 1.
ratings.loc[ratings['userId'].eq(1)]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [13]:
# Estimate the rating user 1 would give movie 302 (the model's estimate is about 2.78).
svd.predict(uid=1, iid=302)

Prediction(uid=1, iid=302, r_ui=None, est=2.783908476623709, details={'was_impossible': False})

In [14]:
# User 1 actually rated movie 1029 with 3 points; pass that as r_ui and
# see what rating the model predicts for the same (user, movie) pair.
svd.predict(uid=1, iid=1029, r_ui=3)

Prediction(uid=1, iid=1029, r_ui=3, est=3.1240271398860338, details={'was_impossible': False})

In [15]:
# Show the rows of the ratings table whose userId is 100.
ratings.loc[ratings['userId'].eq(100)]

Unnamed: 0,userId,movieId,rating,timestamp
15273,100,1,4.0,854193977
15274,100,3,4.0,854194024
15275,100,6,3.0,854194023
15276,100,7,3.0,854194024
15277,100,25,4.0,854193977
15278,100,32,5.0,854193977
15279,100,52,3.0,854194056
15280,100,62,3.0,854193977
15281,100,86,3.0,854194208
15282,100,88,2.0,854194208


In [17]:
# Estimate the rating user 100 would give movie 1029.
svd.predict(uid=100, iid=1029)

Prediction(uid=100, iid=1029, r_ui=None, est=3.6518575902935324, details={'was_impossible': False})