## MovieLens 데이터셋을 이용한 Collaborative Filtering 연습

## Collaborative Filtering Open-source Libraries

1. implicit : https://github.com/benfred/implicit

2. Surprise : https://github.com/NicolasHug/Surprise

-----
3. LightFM : https://github.com/lyst/lightfm

4. Spotlight : https://github.com/maciejkula/spotlight (pytorch)

5. Buffalo : https://github.com/kakao/buffalo  (Win10에서 설치 안됨)

### 1. Implicit 

- Item-based CF https://github.com/benfred/implicit/blob/master/implicit/nearest_neighbours.py

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): no cell visible in this notebook reads from the mounted drive —
# confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install implicit

Collecting implicit
  Downloading implicit-0.4.8.tar.gz (1.1 MB)
[?25l[K     |▎                               | 10 kB 20.3 MB/s eta 0:00:01[K     |▋                               | 20 kB 24.1 MB/s eta 0:00:01[K     |▉                               | 30 kB 12.8 MB/s eta 0:00:01[K     |█▏                              | 40 kB 9.6 MB/s eta 0:00:01[K     |█▍                              | 51 kB 5.5 MB/s eta 0:00:01[K     |█▊                              | 61 kB 6.0 MB/s eta 0:00:01[K     |██                              | 71 kB 5.8 MB/s eta 0:00:01[K     |██▎                             | 81 kB 6.4 MB/s eta 0:00:01[K     |██▋                             | 92 kB 4.9 MB/s eta 0:00:01[K     |██▉                             | 102 kB 5.3 MB/s eta 0:00:01[K     |███▏                            | 112 kB 5.3 MB/s eta 0:00:01[K     |███▍                            | 122 kB 5.3 MB/s eta 0:00:01[K     |███▊                            | 133 kB 5.3 MB/s eta 0:00:01[K     |██

In [None]:
import numpy as np
from time import time
from tqdm import tqdm_notebook
import implicit

from implicit.nearest_neighbours import CosineRecommender
from implicit.datasets.movielens import get_movielens

titles, ratings = get_movielens("100k")

0.00B [00:00, ?B/s]

In [None]:
ratings.toarray().shape

(1683, 944)

In [None]:
titles

array([b'', b'Toy Story (1995)', b'GoldenEye (1995)', ...,
       b'Sliding Doors (1998)', b'You So Crazy (1994)',
       b'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object)

In [None]:
ratings

<1683x944 sparse matrix of type '<class 'numpy.float32'>'
	with 100000 stored elements in Compressed Sparse Row format>

In [None]:
# implicit's CosineRecommender model (item-based nearest neighbours, cosine similarity)
model = CosineRecommender()
# Wall-clock timing of the fit call only.
start_t = time()
model.fit(ratings)
end_t = time()
print("Elapsed Time for CF : %.3fsec" % (end_t - start_t))

  0%|          | 0/1683 [00:00<?, ?it/s]

Elapsed Time for CF : 0.192sec


In [None]:
# `ratings` is laid out item x user (1683 x 944); transpose it to user x item and
# convert back to CSR. This is the matrix handed to `recommend` in later cells.
user_items = ratings.T.tocsr()

In [None]:
model.recommend(1, user_items)

[(423, 120.23832527080934),
 (385, 75.83986249299838),
 (403, 69.91658430995363),
 (568, 57.16350758378815),
 (405, 56.051047115754045),
 (474, 44.25502688952575),
 (393, 42.65925502490433),
 (357, 41.67387639963299),
 (550, 39.8505997586499),
 (318, 38.622309611229106)]

In [None]:
# Print the recommendations for user indices 0-9.
# Index 0 produces an empty list in the saved output — presumably because
# MovieLens user ids start at 1, so row 0 holds no ratings (TODO confirm).
for xi in range(10):
    recommendations = model.recommend(xi, user_items)
    print(recommendations)

[]
[(423, 120.23832527080934), (385, 75.83986249299838), (403, 69.91658430995363), (568, 57.16350758378815), (405, 56.051047115754045), (474, 44.25502688952575), (393, 42.65925502490433), (357, 41.67387639963299), (550, 39.8505997586499), (318, 38.622309611229106)]
[(121, 40.225367066630014), (117, 39.29225874430751), (181, 35.68579095309081), (7, 29.68970226641666), (405, 29.361682795225203), (742, 22.891083166822884), (328, 21.00804009048448), (118, 20.30540230794754), (546, 20.09678616983666), (347, 20.060081658847135)]
[(313, 28.68715943980728), (286, 17.723778908110333), (748, 15.203835211489709), (879, 14.950572215973814), (678, 14.083828353493598), (269, 13.449728706253074), (315, 13.361294557864941), (301, 10.656446304592563), (316, 10.604457263734151), (895, 10.537210220642704)]
[(313, 18.048566260402367), (302, 17.55588572442778), (333, 17.518365289014476), (748, 15.372360308008071), (286, 14.206705726612451), (181, 13.898964689572573), (7, 13.25499305832247), (307, 12.480327

In [None]:
# implicit ALS (alternating least squares) model with 50 latent factors
model2 = implicit.als.AlternatingLeastSquares(factors=50)

In [None]:
# Train ALS on the same item x user matrix that CosineRecommender was fitted on.
model2.fit(ratings)

  0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
# recommend items for a user
# NOTE(review): the saved output of this cell is a RuntimeError; its cause is not
# visible in the notebook — re-run and inspect the full traceback before relying
# on this cell.
recommendations = model2.recommend(1, user_items)
print('recommended ',recommendations)

RuntimeError: ignored

In [None]:
# find related items: the items most similar to item id 1, printed as
# (item id, score) pairs; in the saved output the item itself comes first (~1.0)
related = model2.similar_items(1)
print(related)

[(1, 1.0000000000000069), (50, 0.7345720560109757), (181, 0.6999249712920779), (121, 0.6897856040758548), (117, 0.66455478870842), (405, 0.6413217603411484), (151, 0.6381576334869846), (222, 0.6367273145962258), (100, 0.630600760761389), (237, 0.6240747014811365)]


- Understand `indptr` in csr_matrix

![indptr](figs/indptr.png)

In [None]:

# Number of stored ratings in each row of `ratings` (one row per movie):
# the differences between consecutive CSR `indptr` offsets.
user_count = np.ediff1d(ratings.indptr)
# Movie ids ordered from the most-rated movie to the least-rated one.
to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])

In [None]:
ratings.indptr

array([     0,      0,    452, ...,  99998,  99999, 100000], dtype=int32)

In [None]:
user_count

array([  0, 452, 131, ...,   1,   1,   1], dtype=int32)

In [None]:
# Preview only the ten most-rated movie ids; displaying the full list dumps
# one line per movie (1683 lines) into the notebook.
to_generate[:10]

[50,
 258,
 100,
 181,
 294,
 286,
 288,
 1,
 300,
 121,
 174,
 127,
 56,
 7,
 98,
 237,
 117,
 172,
 222,
 204,
 313,
 405,
 79,
 210,
 151,
 173,
 69,
 168,
 748,
 269,
 257,
 195,
 423,
 9,
 276,
 318,
 22,
 302,
 96,
 328,
 15,
 25,
 118,
 183,
 216,
 176,
 64,
 202,
 234,
 28,
 191,
 89,
 111,
 275,
 12,
 742,
 357,
 82,
 135,
 289,
 97,
 238,
 268,
 546,
 70,
 186,
 196,
 333,
 475,
 153,
 132,
 125,
 228,
 144,
 483,
 194,
 245,
 323,
 185,
 197,
 11,
 282,
 496,
 301,
 568,
 265,
 655,
 182,
 273,
 143,
 179,
 180,
 471,
 71,
 161,
 8,
 95,
 427,
 678,
 322,
 235,
 435,
 508,
 88,
 215,
 271,
 4,
 187,
 603,
 175,
 385,
 200,
 211,
 597,
 588,
 403,
 515,
 208,
 230,
 134,
 272,
 250,
 527,
 298,
 474,
 284,
 393,
 209,
 274,
 340,
 307,
 124,
 147,
 13,
 14,
 23,
 203,
 514,
 732,
 751,
 479,
 480,
 566,
 419,
 591,
 283,
 83,
 845,
 58,
 326,
 327,
 24,
 154,
 432,
 511,
 99,
 255,
 133,
 137,
 218,
 229,
 433,
 651,
 188,
 367,
 402,
 451,
 321,
 628,
 684,
 319,
 582,
 226,

In [None]:
from tqdm.auto import tqdm  # replaces the deprecated `tqdm.tqdm_notebook`


def _as_text(value):
    """Return `value` as str, decoding bytes as UTF-8 (the titles are bytes objects)."""
    if isinstance(value, bytes):
        return value.decode("utf-8", errors="replace")
    return str(value)


# For every movie that has at least one rating, write the 11 entries returned by
# similar_items to result.txt, one tab-separated record per line:
#   <title>\t<similar title>\t<score>
with open("result.txt", "w", encoding="utf-8") as f:
    for movieid in tqdm(to_generate):
        # Equal consecutive indptr offsets mean the movie's row is empty (no ratings).
        if ratings.indptr[movieid] == ratings.indptr[movieid + 1]:
            continue
        title = _as_text(titles[movieid])
        for other, score in model.similar_items(movieid, 11):
            # write() rather than print(): the format string already ends with
            # "\n", so print() produced an extra blank line after every record.
            f.write("%s\t%s\t%s\n" % (title, _as_text(titles[other]), score))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


  0%|          | 0/1683 [00:00<?, ?it/s]

### 2. Surprise 

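- Item-based CF with `KNNBasic` (note: `KNNBasic()` with default options is *user-based*; pass `sim_options={'user_based': False}` for the item-based variant)
https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBasic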

In [None]:
from surprise import SVD
from surprise import KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')
# Seed the split so the held-out 25% — and therefore the RMSE and the best/worst
# prediction tables computed from it — are identical on every run.
trainset, testset = train_test_split(data, test_size=.25, random_state=42)


# Train the algorithm on the trainset, and predict ratings for the testset.
# NOTE: KNNBasic() with default sim_options is user-based with MSD similarity;
# pass sim_options={'user_based': False} to get the item-based variant.
algo = KNNBasic()
algo.fit(trainset)
predictions = algo.test(testset)

def get_Iu(uid):
    """Count the items a user rated in the training set.

    Args:
        uid: raw id of the user.

    Returns:
        Number of training-set ratings made by that user, or 0 when the
        user does not appear in the training set.
    """
    try:
        inner_uid = trainset.to_inner_uid(uid)
    except ValueError:  # the user is unknown to the trainset
        return 0
    return len(trainset.ur[inner_uid])
    
def get_Ui(iid):
    """Count the users who rated an item in the training set.

    Args:
        iid: raw id of the item.

    Returns:
        Number of training-set ratings received by that item, or 0 when
        the item does not appear in the training set.
    """
    try:
        inner_iid = trainset.to_inner_iid(iid)
    except ValueError:  # the item is unknown to the trainset
        return 0
    return len(trainset.ir[inner_iid])


# Then compute RMSE over the test-set predictions; the call prints the value
# and also returns it (shown as the cell result).
accuracy.rmse(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9801


0.9800729778583429

In [None]:
import pandas as pd

# One row per test-set prediction, extended with:
#   Iu  - number of items the user rated in the trainset
#   Ui  - number of users who rated the item in the trainset
#   err - absolute difference between estimated and true rating
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df = df.assign(
    Iu=df['uid'].apply(get_Iu),
    Ui=df['iid'].apply(get_Ui),
    err=(df['est'] - df['rui']).abs(),
)
# Sort once by error; the two ends are the 10 best and 10 worst predictions.
by_error = df.sort_values(by='err')
best_predictions = by_error.head(10)
worst_predictions = by_error.tail(10)

In [None]:
df

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,458,483,5.0,4.546551,"{'actual_k': 40, 'was_impossible': False}",130,193,0.453449
1,312,185,5.0,4.231579,"{'actual_k': 40, 'was_impossible': False}",162,180,0.768421
2,250,179,4.0,4.123165,"{'actual_k': 40, 'was_impossible': False}",93,163,0.123165
3,825,276,1.0,3.685472,"{'actual_k': 40, 'was_impossible': False}",102,221,2.685472
4,130,315,4.0,4.166190,"{'actual_k': 40, 'was_impossible': False}",274,117,0.166190
...,...,...,...,...,...,...,...,...
24995,880,177,5.0,4.037615,"{'actual_k': 40, 'was_impossible': False}",264,88,0.962385
24996,472,100,5.0,4.335637,"{'actual_k': 40, 'was_impossible': False}",191,385,0.664363
24997,109,71,4.0,4.027099,"{'actual_k': 40, 'was_impossible': False}",176,170,0.027099
24998,454,77,4.0,3.213800,"{'actual_k': 40, 'was_impossible': False}",177,107,0.786200


In [None]:
best_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
2847,782,1664,4.0,4.0,"{'actual_k': 1, 'was_impossible': False}",164,1,0.0
13071,896,1672,2.0,2.0,"{'actual_k': 1, 'was_impossible': False}",270,1,0.0
563,655,1379,3.0,3.0,"{'actual_k': 2, 'was_impossible': False}",501,2,0.0
23628,5,439,1.0,1.0,"{'actual_k': 2, 'was_impossible': False}",136,2,0.0
5488,222,247,1.0,1.0,"{'actual_k': 2, 'was_impossible': False}",300,2,0.0
24403,776,439,1.0,1.0,"{'actual_k': 2, 'was_impossible': False}",81,2,0.0
17844,181,1354,1.0,1.0,"{'actual_k': 1, 'was_impossible': False}",321,1,0.0
7399,5,437,1.0,1.0,"{'actual_k': 4, 'was_impossible': False}",136,4,0.0
5790,519,1293,5.0,5.0,"{'actual_k': 2, 'was_impossible': False}",32,2,0.0
22139,181,1162,1.0,1.0,"{'actual_k': 2, 'was_impossible': False}",321,2,0.0


In [None]:
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
6742,401,127,1.0,4.314673,"{'actual_k': 40, 'was_impossible': False}",113,314,3.314673
16592,562,114,1.0,4.345554,"{'actual_k': 40, 'was_impossible': False}",57,54,3.345554
18456,405,851,1.0,4.347989,"{'actual_k': 2, 'was_impossible': False}",544,2,3.347989
20399,707,641,1.0,4.348378,"{'actual_k': 24, 'was_impossible': False}",163,24,3.348378
19766,517,1177,5.0,1.624446,"{'actual_k': 4, 'was_impossible': False}",29,4,3.375554
21638,427,263,5.0,1.609333,"{'actual_k': 15, 'was_impossible': False}",27,15,3.390667
18622,286,285,1.0,4.398816,"{'actual_k': 40, 'was_impossible': False}",220,122,3.398816
9561,681,1394,5.0,1.587868,"{'actual_k': 5, 'was_impossible': False}",14,5,3.412132
17900,472,375,5.0,1.580821,"{'actual_k': 16, 'was_impossible': False}",191,16,3.419179
13618,38,247,5.0,1.0,"{'actual_k': 2, 'was_impossible': False}",88,2,4.0
