# Reference
- https://github.com/benfred/implicit/blob/main/examples/movielens.py


In [1]:
from __future__ import print_function

import argparse
import codecs
import logging
import time

import numpy as np
import tqdm

from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.datasets.movielens import get_movielens
from implicit.lmf import LogisticMatrixFactorization
from implicit.nearest_neighbours import (
    BM25Recommender,
    CosineRecommender,
    TFIDFRecommender,
    bm25_weight,
)

log = logging.getLogger("implicit")

In [2]:
variant = '100k'

titles, ratings = get_movielens(variant)




- movie lens 100k



    

In [3]:
print("titles 자료구조: ", type(titles))
print("rating 자료구조: ", type(ratings))
print("영화 수: ",len(titles))
print("영화수 x 사용자: ",ratings.shape)

for i in range(6):
    # 0은 !=을 이용하라는 Warning
    rate = np.sum(ratings == i) / (ratings.shape[0] * ratings.shape[1])
    print(f'{i}의 비율: ',rate)



titles 자료구조:  <class 'numpy.ndarray'>
rating 자료구조:  <class 'scipy.sparse.csr.csr_matrix'>
영화 수:  1683
영화수 x 사용자:  (1683, 944)
0의 비율:  0.9370575143257097
1의 비율:  0.0038457858746991347
2의 비율:  0.007156560621166802
3의 비율:  0.01708573773628609
4의 비율:  0.021509965054331955
5의 비율:  0.013344436387806278


  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
set(ratings.data)

{1.0, 2.0, 3.0, 4.0, 5.0}

In [5]:
min_rating = 4
ratings.data[ratings.data < min_rating] = 0
ratings.eliminate_zeros()
ratings.data = np.ones(len(ratings.data))


In [6]:
set(ratings.data)

{1.0}

In [7]:
for i in range(6):
    # 0은 !=을 이용하라는 Warning
    rate = np.sum(ratings == i) / (ratings.shape[0] * ratings.shape[1])
    print(f'{i}의 비율: ',rate)


0의 비율:  0.9651455985578618
1의 비율:  0.03485440144213823
2의 비율:  0.0
3의 비율:  0.0
4의 비율:  0.0
5의 비율:  0.0


- bm25 가중치
    - https://inyl.github.io/search_engine/2017/04/01/bm25.html
    


In [8]:
# def bm25_weight(X, K1=100, B=0.8):
#     """Weighs each row of a sparse matrix X  by BM25 weighting"""
#     # calculate idf per term (user)
#     X = coo_matrix(X)

#     N = float(X.shape[0])
#     idf = log(N) - log1p(bincount(X.col))

#     # calculate length_norm per document (artist)
#     row_sums = np.ravel(X.sum(axis=1))
#     average_length = row_sums.mean()
#     length_norm = (1.0 - B) + B * row_sums / average_length

#     # weight matrix rows by bm25
#     X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
#     return X

In [9]:
ratings_als = (bm25_weight(ratings, B=0.9) * 5).tocsr()
als = AlternatingLeastSquares(use_gpu=False)

bpr = BayesianPersonalizedRanking(use_gpu=False)
lmf = LogisticMatrixFactorization(use_gpu=False)

# similarity based recsys
tfidf = TFIDFRecommender()
cr = CosineRecommender()
bm25 = BM25Recommender(B=0.2)




In [10]:
models_data = [(als, ratings_als)]

In [39]:
models = [als, bpr, lmf, tfidf, cr, bm25]
datas = [ratings_als] + [ratings] * 5
model_names = ["als", "bpr", "lmf","tfidf", "cr", "bm25"]

trained_models = []

In [38]:
user_count = np.ediff1d(ratings.indptr)

to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])
    
    
    

In [41]:
for i, model in enumerate(models):
    model.fit(datas[i])
    
    trained_models.append(model)
    print(f"{model_names[i]} trained")
    

  0%|          | 0/15 [00:00<?, ?it/s]

als trained


  0%|          | 0/100 [00:00<?, ?it/s]

bpr trained


  0%|          | 0/30 [00:00<?, ?it/s]

lmf trained


  0%|          | 0/944 [00:00<?, ?it/s]

tfidf trained


  0%|          | 0/944 [00:00<?, ?it/s]

cr trained


  0%|          | 0/944 [00:00<?, ?it/s]

bm25 trained


In [58]:
ratings[to_generate[0]]

<1x944 sparse matrix of type '<class 'numpy.float64'>'
	with 501 stored elements in Compressed Sparse Row format>

In [64]:
models[0].similar_items(to_generate[0],12)

(array([ 50, 789, 150, 582, 736,  27, 501, 869, 529, 241, 105, 266],
       dtype=int32),
 array([0.9999998 , 0.41210485, 0.36812308, 0.33589113, 0.33532268,
        0.3136305 , 0.31298774, 0.30450404, 0.28677464, 0.28241003,
        0.27786496, 0.2726795 ], dtype=float32))

In [60]:
models[0].recommend(to_generate[0], ratings[to_generate[0]], 12)

(array([919, 327, 259, 881, 931, 870, 821, 941, 390,  30, 417, 677],
       dtype=int32),
 array([0.93889844, 0.93549114, 0.8187277 , 0.7528515 , 0.7081747 ,
        0.6909183 , 0.688522  , 0.6808297 , 0.6799457 , 0.67739385,
        0.67010164, 0.6612667 ], dtype=float32))

In [59]:
models[0].recommend(to_generate[0], ratings[to_generate[0]])

(array([919, 327, 259, 881, 931, 870, 821, 941, 390,  30], dtype=int32),
 array([0.93889844, 0.93549114, 0.8187277 , 0.7528515 , 0.7081747 ,
        0.6909183 , 0.688522  , 0.6808297 , 0.6799457 , 0.67739385],
       dtype=float32))