In [12]:
import surprise
print(surprise.__version__)

1.1.4


### 1. Surprise 를 이용한 추천시스템 구축
- 컨텐츠 기반 필터링
- 아이템 기반 협업 필터링
- 잠재 요인 협업 필터링

In [13]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

In [14]:
# 1. 데이터불러오기
data = Dataset.load_builtin(name='ml-100k')

In [15]:
trainset, testset = train_test_split(data, test_size=0.25, random_state=11)

In [16]:
## SVD 적용해보기
algo = SVD(random_state=11)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2b67efcfd00>

In [17]:
predictions = algo.test(testset)

In [18]:
print('최초 5개 이상 예상 결과 가져오기',predictions[:5])

최초 5개 이상 예상 결과 가져오기 [Prediction(uid='339', iid='550', r_ui=2.0, est=3.092687634878143, details={'was_impossible': False}), Prediction(uid='795', iid='419', r_ui=3.0, est=3.3086407514703615, details={'was_impossible': False}), Prediction(uid='14', iid='507', r_ui=4.0, est=4.21209649956976, details={'was_impossible': False}), Prediction(uid='276', iid='1157', r_ui=2.0, est=3.027511815120117, details={'was_impossible': False}), Prediction(uid='116', iid='259', r_ui=4.0, est=2.1942123269649687, details={'was_impossible': False})]


In [19]:
for i in predictions[:5]:
    print(i)

user: 339        item: 550        r_ui = 2.00   est = 3.09   {'was_impossible': False}
user: 795        item: 419        r_ui = 3.00   est = 3.31   {'was_impossible': False}
user: 14         item: 507        r_ui = 4.00   est = 4.21   {'was_impossible': False}
user: 276        item: 1157       r_ui = 2.00   est = 3.03   {'was_impossible': False}
user: 116        item: 259        r_ui = 4.00   est = 2.19   {'was_impossible': False}


In [20]:
for item in predictions[:5]:
    print(item.uid, item.iid, item.est)

339 550 3.092687634878143
795 419 3.3086407514703615
14 507 4.21209649956976
276 1157 3.027511815120117
116 259 2.1942123269649687


In [21]:
accuracy.rmse(predictions)

RMSE: 0.9453


0.9453218904734639

### 2. 실제 데이터 프레임으로부터 가져와서 사용해보기

In [22]:
import pandas as pd

In [23]:
from surprise import Reader
df = pd.read_csv("./ml-latest-small/ratings.csv")

In [24]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader=reader)

In [25]:
trainset, testset = train_test_split(data, test_size=0.25, random_state=11)

In [26]:
predictions = algo.test(testset)

In [27]:
accuracy.rmse(predictions)

RMSE: 1.0444


1.0443943935199096

### 교차 검증과 그리드 서치를 이용해서 하이퍼 파라미터 튜닝

In [28]:
# 1. 교차검증
from surprise.model_selection import cross_validate

algo = SVD(random_state=11)
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8835  0.8629  0.8744  0.8816  0.8623  0.8729  0.0090  
MAE (testset)     0.6782  0.6622  0.6716  0.6760  0.6637  0.6703  0.0064  
Fit time          0.86    0.87    0.85    0.85    0.87    0.86    0.01    
Test time         0.17    0.08    0.16    0.08    0.07    0.11    0.05    


{'test_rmse': array([0.88349608, 0.86293117, 0.87442363, 0.88156426, 0.8623126 ]),
 'test_mae': array([0.67816361, 0.6621662 , 0.67156923, 0.67602243, 0.66372676]),
 'fit_time': (0.864084005355835,
  0.8698835372924805,
  0.8464171886444092,
  0.849297046661377,
  0.8720278739929199),
 'test_time': (0.17399382591247559,
  0.07543325424194336,
  0.1574704647064209,
  0.07573080062866211,
  0.06756329536437988)}

In [29]:
## 2. 그리드 서치
from surprise.model_selection import GridSearchCV

# 최적화할 파라미터를 설정 - 딕셔너리 저장
param_grid = {"n_epochs": [20, 40, 60],
              "n_factors":[50, 100, 200]}

# 그리드 서치 실행
gs = GridSearchCV(SVD, param_grid, measures=['r
mse', 'mae'], cv=3)
gs.fit(data)

In [30]:
print("가장 좋은 RMSE", gs.best_score['rmse'])
print("가장 좋은 파라미터는", gs.best_params['rmse'])

가장 좋은 RMSE 0.878092191354833
가장 좋은 파라미터는 {'n_epochs': 20, 'n_factors': 50}
