# Collaborative Filtering

In [246]:
%matplotlib inline

import random
from pathlib import Path

import heapq
from collections import defaultdict

import pandas as pd
import matplotlib.pylab as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

from itertools import chain
from surprise import Dataset, Reader, KNNBasic, accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
import numpy as np

##### 아래 주어진 평점 DataFrame을 활용하여 Collaborative Filtering을 적용한 결과를 출력하시오.

In [218]:
ratings = pd.DataFrame([
    [30878, 1, 4], [30878, 5, 1], [30878, 18, 3], [30878, 28, 3], [30878, 30, 4], [30878, 44, 5], 
    [124105, 1, 4], 
    [822109, 1, 5], 
    [823519, 1, 3], [823519, 8, 1], [823519, 17, 4], [823519, 28, 4], [823519, 30, 5], 
    [885013, 1, 4], [885013, 5, 5], 
    [893988, 1, 3], [893988, 30, 4], [893988, 44, 4], 
    [1248029, 1, 3], [1248029, 28, 2], [1248029, 30, 4], [1248029, 48, 3], 
    [1503895, 1, 4], 
    [1842128, 1, 4], [1842128, 30, 3], 
    [2238063, 1, 3], 
], columns=['customerID', 'movieID', 'rating'])

### Memory-based

In [219]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['customerID', 'movieID', 'rating']], reader)
trainset = data.build_full_trainset()
sim_options = {'name': 'cosine', 'user_based': False}  # compute cosine similarities between items
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)
pred = algo.predict('823519', '30')
pred

Computing the cosine similarity matrix...
Done computing similarity matrix.


Prediction(uid='823519', iid='30', r_ui=None, est=3.5384615384615383, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

##### 아래 랜덤하게 만든 평점 DataFrame을 활용하여 Collaborative Filtering을 적용한 결과를 출력하시오.
- 주어진 get_top_n 함수 활용

In [220]:
random.seed(0)
nratings = 5000
randomData = pd.DataFrame({
    'itemID': [random.randint(0,99) for _ in range(nratings)],
    'userID': [random.randint(0,999) for _ in range(nratings)],
    'rating': [random.randint(1,5) for _ in range(nratings)],
})
randomData.head()

Unnamed: 0,itemID,userID,rating
0,49,665,1
1,97,974,5
2,53,542,5
3,5,634,3
4,33,694,2


In [221]:
def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    byUser = defaultdict(list)
    for p in predictions:
        byUser[p.uid].append(p)
    
    byUserSorted = {}
    for uid in byUser:
        byUserSorted[uid] = sorted(byUser[uid], key=lambda x: x.est, reverse=True)[:n]
    return byUserSorted

- user 기반

In [222]:
# Convert thes data set into the format required by the surprise package
# The columns must correspond to user id, item id and ratings (in that order)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(randomData[['userID', 'itemID', 'rating']], reader)

# Split into training and test set
trainset, testset = train_test_split(data, test_size=.25, random_state=1)

## User-based filtering
# compute cosine similarity between users 
sim_options = {'name': 'cosine', 'user_based': True}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

# Than predict ratings for all pairs (u, i) that are NOT in the training set.
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=4)

# Print the recommended items for each user
print('Top-3 recommended items for each user')
for uid, user_ratings in list(top_n.items())[:5]:
    print('User {}'.format(uid))
    for prediction in user_ratings:
        print('  Item {0.iid} ({0.est:.2f})'.format(prediction), end='')
    print()
print()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Top-3 recommended items for each user
User 6
  Item 6 (5.00)  Item 77 (2.50)  Item 60 (1.00)
User 222
  Item 77 (3.50)  Item 75 (2.78)
User 424
  Item 14 (3.50)  Item 45 (3.10)  Item 54 (2.34)
User 87
  Item 27 (3.00)  Item 54 (3.00)  Item 82 (3.00)  Item 32 (1.00)
User 121
  Item 98 (3.48)  Item 32 (2.83)



In [223]:
accuracy.rmse(predictions)

RMSE: 1.6030


1.602978765654451

- item 기반

In [225]:
## Item-based filtering
# compute cosine similarity between users 
sim_options = {'name': 'cosine', 'user_based': False}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

# Than predict ratings for all pairs (u, i) that are NOT in the training set.
predictions = algo.test(testset)
top_n = get_top_n(predictions, n=4)

# Print the recommended items for each user
print()
print('Top-3 recommended items for each user')
for uid, user_ratings in list(top_n.items())[:5]:
    print('User {}'.format(uid))
    for prediction in user_ratings:
        print('  Item {0.iid} ({0.est:.2f})'.format(prediction), end='')
    print()

Computing the cosine similarity matrix...
Done computing similarity matrix.

Top-3 recommended items for each user
User 6
  Item 77 (3.00)  Item 60 (3.00)  Item 6 (3.00)
User 222
  Item 77 (2.24)  Item 75 (2.00)
User 424
  Item 54 (3.47)  Item 14 (3.44)  Item 45 (3.00)
User 87
  Item 27 (3.00)  Item 32 (3.00)  Item 82 (3.00)  Item 54 (2.50)
User 121
  Item 32 (3.06)  Item 98 (2.31)


In [226]:
accuracy.rmse(predictions)

RMSE: 1.7063


1.706273729603763

### Model-based

* SVD 기반

In [227]:
from surprise import SVD

In [228]:
algo = SVD()
algo.fit(trainset)

predictions = algo.test(testset)
top_n = get_top_n(predictions, n=4)

# Print the recommended items for each user
print()
print('Top-3 recommended items for each user')
for uid, user_ratings in list(top_n.items())[:5]:
    print('User {}'.format(uid))
    for prediction in user_ratings:
        print('  Item {0.iid} ({0.est:.2f})'.format(prediction), end='')
    print()


Top-3 recommended items for each user
User 6
  Item 6 (3.26)  Item 60 (3.18)  Item 77 (2.96)
User 222
  Item 75 (2.86)  Item 77 (2.52)
User 424
  Item 14 (3.04)  Item 54 (2.98)  Item 45 (2.72)
User 87
  Item 82 (3.02)  Item 27 (2.93)  Item 54 (2.74)  Item 32 (2.71)
User 121
  Item 98 (3.23)  Item 32 (3.16)


In [229]:
accuracy.rmse(predictions)

RMSE: 1.4464


1.4463662780451023

### Cross validation

Surprise 홈페이지: https://surprise.readthedocs.io/en/stable/index.html

In [235]:
data = Dataset.load_builtin('ml-100k')

In [236]:
algo1 = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
algo2 = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
algo3 = SVD()

In [241]:
cross_validate(algo1, data, measures=['RMSE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0262  1.0152  1.0138  1.0074  1.0235  1.0173  0.0068  
Fit time          0.54    0.55    0.55    0.54    0.54    0.54    0.00    
Test time         1.94    1.84    1.95    1.85    1.94    1.90    0.05    


{'test_rmse': array([1.02624769, 1.01522444, 1.01384497, 1.00744995, 1.02350943]),
 'fit_time': (0.5433268547058105,
  0.5461905002593994,
  0.5468482971191406,
  0.541083574295044,
  0.5435516834259033),
 'test_time': (1.9365429878234863,
  1.8359925746917725,
  1.9520387649536133,
  1.8521714210510254,
  1.942430019378662)}

In [242]:
cross_validate(algo2, data, measures=['RMSE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0192  1.0214  1.0319  1.0289  1.0291  1.0261  0.0049  
Fit time          0.96    1.01    1.02    0.99    1.01    1.00    0.02    
Test time         2.09    2.22    2.11    2.27    2.09    2.16    0.07    


{'test_rmse': array([1.0192115 , 1.02144182, 1.0318935 , 1.02885539, 1.02910201]),
 'fit_time': (0.9594085216522217,
  1.0097758769989014,
  1.0174012184143066,
  0.9948179721832275,
  1.013617753982544),
 'test_time': (2.091670036315918,
  2.2182188034057617,
  2.112368583679199,
  2.2716832160949707,
  2.0942490100860596)}

In [243]:
cross_validate(algo3, data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9321  0.9342  0.9419  0.9388  0.9338  0.9362  0.0036  
Fit time          2.49    2.51    2.49    2.50    2.49    2.50    0.01    
Test time         0.19    0.10    0.19    0.10    0.19    0.15    0.05    


{'test_rmse': array([0.93205707, 0.93422931, 0.94194469, 0.93878855, 0.93380797]),
 'fit_time': (2.4908695220947266,
  2.511514663696289,
  2.4877610206604004,
  2.4956037998199463,
  2.4919655323028564),
 'test_time': (0.19041728973388672,
  0.09635305404663086,
  0.19119501113891602,
  0.09602499008178711,
  0.190643310546875)}

### Grid search

In [254]:
param_grid = {'k': [10, 20, 30, 40], 
              'sim_options': {'name': ['cosine', 'pearson_baseline'], 'user_based': [False, False]
              }}
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity mat

In [249]:
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.9645789297273387
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
