In [30]:
import numpy as np
from surprise import NormalPredictor
from surprise import BaselineOnly
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from surprise import KNNBasic
from surprise import KNNBaseline
from surprise.model_selection import cross_validate


import os
import pandas as pd

In [2]:
def data_load():
    files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')
    m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
    movies = pd.read_csv(files_dir+'u.item', sep='|', names=m_cols, usecols=range(5),
    encoding='latin-1')
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv(files_dir+'u.data', sep='\t', names=r_cols)
    return movies['title'], ratings

In [4]:
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset() # 전체 데이터를 학습 데이터로 사용

In [12]:
algo = NormalPredictor()
algo.fit(trainset) # 학습
testset = trainset.build_anti_testset() # 학습 데이터를 제외한 모두를 테스트 데이터로 사용
NormalPredictor_predictions = algo.test(testset) # 평점 예측

In [13]:
accuracy.rmse(NormalPredictor_predictions, verbose=True)
mae = accuracy.mae(NormalPredictor_predictions, verbose=False)
print("NormalPredictor_predictions MAE:", mae)

RMSE: 1.0225
NormalPredictor_predictions MAE: 0.8433498188902117


In [27]:
algo1 = BaselineOnly()
algo1.fit(trainset)
BaselineOnly_predictions = algo1.test(testset)

Estimating biases using als...


In [28]:
accuracy.rmse(BaselineOnly_predictions, verbose=True)
mae = accuracy.mae(BaselineOnly_predictions, verbose=False)
print("BaselineOnly_predictions_predictions MAE:", mae)

RMSE: 0.5159
BaselineOnly_predictions_predictions MAE: 0.40504696681503494


In [23]:
param_grid = {'k': [5, 10],
'sim_options': {'name': ['msd', 'cosine'],'user_based': [False, True]} }

gs = GridSearchCV(KNNBaseline, param_grid, measures=['rmse', 'mae'], cv=2)
gs.fit(data)

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

#상세정보용
#results_df = pd.DataFrame.from_dict(gs.cv_results)
#print(results_df)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd sim

In [31]:
sim_options = {'name': 'msd', # 'cosine', 'msd', 'pearson'
                'user_based': True,
                'shrinkage': 0 } # default 100


algo2 = KNNBaseline(k=10, min_k=1, sim_options=sim_options)
#algo2.fit(trainset)
#KNNBaseline_predictions = algo2.test(testset)

cross_validate(algo2, data, measures=['RMSE','MAE'], cv=5, verbose=True)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9504  0.9449  0.9522  0.9535  0.9502  0.9502  0.0029  
MAE (testset)     0.7478  0.7451  0.7476  0.7506  0.7493  0.7481  0.0018  
Fit time          0.75    0.74    0.74    0.74    0.74    0.74    0.00    
Test time         2.35    2.35    2.38    2.34    2.34    2.35    0.02    


{'fit_time': (0.7456357479095459,
  0.7389087677001953,
  0.7415494918823242,
  0.7375540733337402,
  0.7406699657440186),
 'test_mae': array([0.74777491, 0.74508798, 0.74758769, 0.7505549 , 0.74932627]),
 'test_rmse': array([0.95035531, 0.94493103, 0.95222021, 0.95347215, 0.9501893 ]),
 'test_time': (2.354562759399414,
  2.347822666168213,
  2.3810982704162598,
  2.340524196624756,
  2.3378024101257324)}

In [26]:
accuracy.rmse(KNNBaseline_predictions, verbose=True)
mae = accuracy.mae(KNNBaseline_predictions, verbose=False)
print("KNNBaseline_predictions_predictions MAE:", mae)

RMSE: 0.8163
KNNBaseline_predictions_predictions MAE: 0.6458349623955439


In [None]:
sim_options = {'name': 'cosine', # 'cosine', 'msd', 'pearson'
'user_based': False,
'shrinkage': 0 } # default 100

In [5]:
all_predicted_ratings = np.zeros((trainset.n_users+1, trainset.n_items+1))

for uid, iid, true_r, est, _ in predictions:
    all_predicted_ratings[int(uid), int(iid)] = est

my_id = 2
index = np.argsort(all_predicted_ratings[my_id])
index = index[::-1]
index = index+1
movies, ratings= data_load() # 데이터 로드
print(movies[index[0:3]])

380     Muriel's Wedding (1994)
672            Cape Fear (1962)
1325                Boys (1996)
Name: title, dtype: object


In [6]:
my_id = 2
ratings = ratings[ratings['user_id']==my_id]
mv= ratings['movie_id']
rating= ratings['rating']
index = np.argsort(rating)
index = index[::-1]
index = index+1
print(movies[index[0:3]])

15    French Twist (Gazon maudit) (1995)
53                       Outbreak (1995)
29                  Belle de jour (1967)
Name: title, dtype: object
