# 추천 시스템 (Recommender Systems)

* 추천 시스템은 크게 두가지로 구분 가능
  * 컨텐츠 기반 필터링 (content-based filtering)
  * 협업 필터링 (collaborative filtering)
* 두가지를 조합한 hybrid 방식도 가능
* 컨텐츠 기반 필터링은 지금까지 사용자의 이전 행동과 명시적 피드백을 통해 사용자가 좋아하는 것과 유사한 항목을 추천
* 협업 필터링은 사용자와 항목간의 유사성을 동시에 사용해 추천

## Surprise

* 추천 시스템 개발을 위한 라이브러리
* 다양한 모델과 데이터 제공
* scikit-learn과 유사한 사용 방법

간단한 surprise 실습

In [1]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

data= Dataset.load_builtin('ml-100k',prompt=False) #영화 데이터 movie lens
data.raw_ratings[:10] #user_id movie_id rating time

[('196', '242', 3.0, '881250949'),
 ('186', '302', 3.0, '891717742'),
 ('22', '377', 1.0, '878887116'),
 ('244', '51', 2.0, '880606923'),
 ('166', '346', 1.0, '886397596'),
 ('298', '474', 4.0, '884182806'),
 ('115', '265', 2.0, '881171488'),
 ('253', '465', 5.0, '891628467'),
 ('305', '451', 3.0, '886324817'),
 ('6', '86', 3.0, '883603013')]

In [10]:
model=SVD()
cross_validate(model,data,measures=['rmse','mae',],cv=5,verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9357  0.9313  0.9410  0.9401  0.9322  0.9361  0.0040  
MAE (testset)     0.7365  0.7339  0.7426  0.7406  0.7359  0.7379  0.0032  
Fit time          2.59    2.54    2.57    2.56    2.56    2.56    0.02    
Test time         0.09    0.11    0.06    0.10    0.06    0.09    0.02    


{'test_rmse': array([0.93570073, 0.93125747, 0.94102406, 0.9400586 , 0.93222811]),
 'test_mae': array([0.73654774, 0.73392256, 0.74263196, 0.7405616 , 0.73585339]),
 'fit_time': (2.5923054218292236,
  2.543876886367798,
  2.5687904357910156,
  2.5590498447418213,
  2.5552289485931396),
 'test_time': (0.0895543098449707,
  0.10596251487731934,
  0.06447315216064453,
  0.10464048385620117,
  0.06414318084716797)}

## 컨텐츠 기반 필터링 (Content-based Filtering)

* 사용자가 과거에 좋아했던 아이템을 파악하고, 그 아이템과 비슷한 아이템을 추천
  * ex) 스파이더맨에 평점 4.5점 부여한 유저 A -> 타이타닉보다 아이언맨을 더 선호할 것이다!
  * 1. 유저가 과거에 접한 아이템이면서 만족한 아이템
    2. 유저가 좋아했던 아이템 중 일부 또는 전체와 비슷한 아이템 선정
    3. 선정된 아이템을 유저에게 추천

* 장단점
  * 장점
    * 많은 수의 사용자를 대상으로 쉽게 확장 가능
  * 단점
    * 입력 특성을 직접 설계해야 하기 때문에 많은 도메인 지식이 필요
    * 사용자의 기존 관심사항을 기반으로만 추천 가능

## 협업 필터링(Collaborative Filtering)

* 비슷한 성향 또는 취향을 갖는 다른 유저가 좋아한 아이템을 현재 유저에게 추천
* 1. 스파이더맨에 4.5점을 준 2명의 유저 A,B
  2. 유저 A는 아이언맨도 좋게 평점을 줌
  3. 유저 A와 B의 성향은 비슷할 것이므로, 아이언맨을 B에게도 추천해줌
  
* 간단하면서도 수준 높은 정확도가 나타남

* 장단점
  * 장점
    * 자동으로 임베딩을 학습하기 때문에 도메인 지식이 필요 없다.
    * 기존의 관심사가 아니더라도 추천 가능
  * 단점
    * 학습 과정에 나오지 않은 항목은 임베딩을 만들 수 없음
    * 추가 특성을 사용하기 어려움

In [11]:
import numpy as np
from surprise import Dataset
import pandas as pd

* 이진 벡터의 내적을 통해 다른 사용자들과의 유사도 구하기
* 나와 가장 높은 유사도를 가진 사용자의 시청 목록을 추천

In [12]:
data=Dataset.load_builtin('ml-100k',prompt=False)
raw_data=np.array(data.raw_ratings,dtype=int)

In [13]:
raw_data[:,0]-=1 #0부터 시작하도록 조정
raw_data[:,1]-=1

In [14]:
#인접행렬 크기
n_users=np.max(raw_data[:,0])
n_movies=np.max(raw_data[:,1])
shape=(n_users+1,n_movies+1)
shape # 943 user, 1682 movies

(943, 1682)

In [46]:
#인접행렬(user_id와 movie_id)
adj_matrix=np.ndarray(shape,dtype=int)
for user_id, movie_id,rating,time in raw_data:
    adj_matrix[user_id][movie_id]=1
adj_matrix

# adj_matrix[2].sum()
# sum(raw_data[:,0]==2)

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [48]:
my_id,my_vector = 0,adj_matrix[0]
best_match,best_match_id,best_match_vector=-1,-1,[]

for user_id,user_vector in enumerate(adj_matrix):
    if my_id!=user_id:
        similarity=np.dot(my_vector,user_vector)
        if similarity>best_match:
            best_match=similarity
            best_match_id=user_id
            best_match_vector=user_vector

print('Best Match:{},Best Match ID:{}'.format(best_match,best_match_id)) #183개 같은 사람이 최대 유사도, np.dot 단순 행렬 곱

Best Match:183,Best Match ID:275


In [79]:
recommend_list=[]
for i,log in enumerate(zip(my_vector,best_match_vector)):
    log1,log2=log
    if log1<1 and log2>0:
        recommend_list.append(i)
print(recommend_list) # ID:275는 봤지만 ID:0은 안 본 영화  , 518 - 183 = 335

[272, 273, 275, 280, 281, 283, 287, 288, 289, 290, 292, 293, 297, 299, 300, 301, 302, 306, 312, 314, 315, 316, 317, 321, 322, 323, 324, 327, 330, 331, 332, 333, 339, 342, 345, 346, 353, 354, 355, 356, 357, 363, 364, 365, 366, 372, 374, 378, 379, 381, 382, 383, 384, 385, 386, 387, 390, 391, 392, 394, 395, 396, 398, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 412, 414, 416, 417, 418, 419, 420, 422, 424, 425, 426, 427, 428, 430, 431, 432, 435, 442, 446, 447, 448, 449, 450, 451, 452, 454, 455, 457, 460, 461, 462, 468, 469, 470, 471, 472, 473, 474, 478, 495, 500, 507, 517, 522, 525, 530, 539, 540, 543, 545, 546, 548, 549, 550, 551, 553, 557, 558, 560, 561, 562, 563, 565, 566, 567, 568, 570, 571, 574, 575, 576, 577, 580, 581, 582, 585, 587, 589, 590, 594, 596, 602, 623, 626, 627, 630, 633, 635, 639, 646, 648, 651, 652, 654, 657, 664, 668, 671, 677, 678, 681, 683, 684, 685, 690, 691, 692, 695, 696, 708, 709, 714, 718, 719, 720, 724, 726, 727, 731, 733, 734, 736, 738, 741, 742, 745,

* 유클리드 거리를 사용해 추천
$$euclidean = \sqrt{\sum_{d=1}^{D}(A_i - B_i)^2}$$
* 거리가 가까울 수록(값이 작을 수록) 나와 유사한 사용자

In [139]:
my_id,my_vector=0,adj_matrix[0]
best_match,best_match_id,best_match_vector=9999,-1,[]

for user_id,user_vector in enumerate(adj_matrix):
    if my_id!=user_id:
        euclidean_dist=np.sqrt(np.sum(np.square(my_vector-user_vector)))
        if euclidean_dist<best_match:
            best_match=euclidean_dist
            best_match_id=user_id
            best_match_vector=user_vector

print('Best Match:{},Best Match ID:{}'.format(best_match,best_match_id))

Best Match:14.832396974191326,Best Match ID:737


In [140]:
recommend_list=[]
for i,log in enumerate(zip(my_vector,best_match_vector)):
    log1,log2=log
    if log1<1 and log2>0:
        recommend_list.append(i)
print(recommend_list)

[0, 1, 3, 6, 21, 27, 38, 41, 46, 49, 53, 55, 62, 63, 68, 70, 78, 80, 81, 87, 88, 90, 94, 95, 96, 97, 99, 108, 116, 117, 120, 126, 127, 134, 135, 140, 143, 146, 150, 151, 152, 153, 160, 163, 167, 168, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 185, 187, 188, 190, 192, 194, 195, 196, 198, 199, 201, 202, 203, 204, 205, 207, 208, 209, 210, 213, 215, 221, 224, 225, 226, 227, 228, 229, 230, 232, 233, 234, 237, 239, 249, 251, 253, 256, 257, 259, 264, 268, 270]


* 코사인 유사도를 사용해 추천

\begin{equation}
cos \theta = \frac{A \cdot B}{||A|| \times ||B||}
\end{equation}
* 두 벡터가 이루고 있는 각을 계산

In [141]:
def compute_cos_similarity(v1,v2):
    norm1=np.sqrt(np.sum(np.square(v1)))
    norm2=np.sqrt(np.sum(np.square(v2)))
    dot=np.dot(v1,v2)
    return dot/(norm1*norm2)

In [142]:
my_id,my_vector=0,adj_matrix[0]
best_match,best_match_id,best_match_vector=-1,-1,[]

for user_id,user_vector in enumerate(adj_matrix):
    if my_id!=user_id:
        cos_similarity=compute_cos_similarity(my_vector,user_vector)
        if cos_similarity>best_match:
            best_match=cos_similarity
            best_match_id=user_id
            best_match_vector=user_vector

print('Best Match:{},Best Match ID:{}'.format(best_match,best_match_id))

Best Match:0.5278586163659506,Best Match ID:915


In [144]:
recommend_list=[]
for i,log in enumerate(zip(my_vector,best_match_vector)):
    log1,log2=log
    if log1<1 and log2>0:
        recommend_list.append(i)
print(recommend_list)

[272, 275, 279, 280, 283, 285, 289, 294, 297, 316, 317, 355, 365, 366, 368, 379, 380, 381, 384, 386, 392, 398, 401, 404, 416, 420, 422, 424, 426, 427, 430, 432, 450, 460, 461, 466, 469, 471, 473, 474, 475, 479, 482, 483, 497, 505, 508, 510, 511, 522, 526, 527, 529, 530, 534, 536, 540, 545, 548, 549, 556, 557, 558, 560, 565, 567, 568, 569, 577, 580, 581, 582, 592, 596, 630, 635, 639, 641, 649, 651, 654, 673, 677, 678, 683, 684, 692, 696, 701, 703, 707, 708, 709, 712, 714, 719, 720, 726, 731, 734, 736, 738, 740, 745, 747, 754, 755, 761, 762, 763, 766, 780, 789, 791, 805, 819, 823, 824, 830, 843, 862, 865, 918, 929, 930, 938, 942, 943, 947, 958, 959, 960, 970, 977, 1004, 1008, 1009, 1010, 1013, 1041, 1045, 1069, 1072, 1073, 1078, 1097, 1100, 1108, 1112, 1118, 1134, 1193, 1205, 1207, 1216, 1219, 1267, 1334, 1400, 1427, 1596, 1681]


기존 방법에 명시적 피드백(사용자가 평가한 영화 점수)을 추가해 실험

In [145]:
adj_matrix=np.ndarray(shape,dtype=int)
for user_id,movie_id,rating,time in raw_data:
    adj_matrix[user_id][movie_id]=rating
adj_matrix

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0]])

In [148]:
my_id,my_vector=0,adj_matrix[0]
best_match,best_match_id,best_match_vector=9999,-1,[]

for user_id,user_vector in enumerate(adj_matrix):
    if my_id!=user_id:
        euclidean_dist=np.sqrt(np.sum(np.square(my_vector-user_vector)))
        if euclidean_dist<best_match:
            best_match=euclidean_dist
            best_match_id=user_id
            best_match_vector=user_vector

print('Best Match:{},Best Match ID:{}'.format(best_match,best_match_id))

Best Match:55.06359959174482,Best Match ID:737


In [149]:
my_id,my_vector=0,adj_matrix[0]
best_match,best_match_id,best_match_vector=-1,-1,[]

for user_id,user_vector in enumerate(adj_matrix):
    if my_id!=user_id:
        cos_similarity=compute_cos_similarity(my_vector,user_vector)
        if cos_similarity>best_match:
            best_match=cos_similarity
            best_match_id=user_id
            best_match_vector=user_vector

print('Best Match:{},Best Match ID:{}'.format(best_match,best_match_id))

Best Match:0.569065731527988,Best Match ID:915


In [150]:
from surprise import KNNBasic,SVD,SVDpp,NMF
from surprise import Dataset
from surprise.model_selection import cross_validate

In [151]:
data=Dataset.load_builtin('ml-100k',prompt=False)

* KNN을 사용한 협업 필터링

In [153]:
model=KNNBasic()
cross_validate(model,data,measures=['rmse','mae'],cv=5,n_jobs=4,verbose=True)#병렬

Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9818  0.9750  0.9717  0.9794  0.9848  0.9785  0.0047  
MAE (testset)     0.7741  0.7729  0.7680  0.7734  0.7762  0.7729  0.0027  
Fit time          0.27    0.28    0.26    0.28    0.27    0.27    0.01    
Test time         1.45    1.48    1.48    1.48    1.46    1.47    0.01    


{'test_rmse': array([0.98181652, 0.97497625, 0.97166455, 0.97940306, 0.98484347]),
 'test_mae': array([0.77405919, 0.77290013, 0.76797358, 0.77343902, 0.77616571]),
 'fit_time': (0.274580717086792,
  0.275393009185791,
  0.2623255252838135,
  0.28393054008483887,
  0.27086496353149414),
 'test_time': (1.4538841247558594,
  1.4757111072540283,
  1.483393669128418,
  1.4820780754089355,
  1.4642000198364258)}

* SVD를 사용한 협업 필터링

In [154]:
model=SVD()
cross_validate(model,data,measures=['rmse','mae'],cv=5,n_jobs=4,verbose=True)#병렬

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9300  0.9464  0.9360  0.9391  0.9348  0.9372  0.0054  
MAE (testset)     0.7320  0.7478  0.7379  0.7414  0.7367  0.7391  0.0053  
Fit time          3.00    3.00    3.02    2.84    2.44    2.86    0.22    
Test time         0.10    0.10    0.12    0.09    0.09    0.10    0.01    


{'test_rmse': array([0.92999806, 0.94637878, 0.93600267, 0.93906617, 0.93477233]),
 'test_mae': array([0.73195729, 0.74778831, 0.73792763, 0.74137244, 0.73670084]),
 'fit_time': (3.004208564758301,
  3.0032451152801514,
  3.017526626586914,
  2.8437561988830566,
  2.4402852058410645),
 'test_time': (0.09600257873535156,
  0.09604191780090332,
  0.1157999038696289,
  0.09131431579589844,
  0.0924217700958252)}

* NMF를 사용한 협업 필터링

In [25]:
model=NMF()
cross_validate(model,data,measures=['rmse','mae'],cv=5,n_jobs=4,verbose=True)#병렬

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9630  0.9628  0.9533  0.9643  0.9592  0.9605  0.0040  
MAE (testset)     0.7562  0.7559  0.7497  0.7589  0.7567  0.7555  0.0031  
Fit time          14.35   18.12   17.80   15.44   10.54   15.25   2.75    
Test time         0.67    0.49    0.39    0.24    0.14    0.39    0.18    


* SVD++를 사용한 협업 필터링

In [155]:
#시간이 오래걸려 실용적이지 않음
model=SVDpp()
cross_validate(model,data,measures=['rmse','mae'],cv=5,n_jobs=4,verbose=True)#병렬

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9255  0.9173  0.9162  0.9217  0.9206  0.9202  0.0033  
MAE (testset)     0.7243  0.7238  0.7183  0.7191  0.7198  0.7211  0.0025  
Fit time          112.87  113.80  115.51  113.93  105.26  112.27  3.61    
Test time         1.92    1.99    1.68    1.96    1.77    1.86    0.12    


{'test_rmse': array([0.92545322, 0.91728146, 0.91616368, 0.92170431, 0.92059477]),
 'test_mae': array([0.72434932, 0.72382328, 0.71833454, 0.71910427, 0.71983739]),
 'fit_time': (112.87064480781555,
  113.79991912841797,
  115.5105288028717,
  113.92655563354492,
  105.26371145248413),
 'test_time': (1.9249086380004883,
  1.9927406311035156,
  1.682854413986206,
  1.9579977989196777,
  1.7661612033843994)}

## 하이브리드(Hybrid)

* 컨텐츠 기반 필터링과 협업 필터링을 조합한 방식
* 많은 하이브리드 방식이 존재
* 실습에서는 협업 필터링으로 임베딩을 학습하고 컨텐츠 기반 필터링으로 유사도 기반 추천을 수행하는 추천 엔진 개발

In [104]:
import numpy as np
from sklearn.decomposition import randomized_svd,non_negative_factorization
from surprise import Dataset

In [105]:
data=Dataset.load_builtin('ml-100k',prompt=False)
raw_data=np.array(data.raw_ratings,dtype=int)
raw_data[:,0]-=1
raw_data[:,1]-=1

In [106]:
n_users=np.max(raw_data[:,0])
n_movies=np.max(raw_data[:,1])
shape=(n_users+1,n_movies+1)

In [107]:
adj_matrix=np.ndarray(shape,dtype=int)
for user_id,movie_id,rating,time in raw_data:
    adj_matrix[user_id][movie_id]=rating

In [108]:
adj_matrix

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0]])

In [109]:
U,S,V=randomized_svd(adj_matrix,n_components=2) #U:사용자 S:특이값 벡터 V:항목
S=np.diag(S)
print(U.shape)
print(S.shape)
print(V.shape)

(943, 2)
(2, 2)
(2, 1682)




In [110]:
np.matmul(np.matmul(U,S),V)

array([[ 3.91732663e+00,  1.47276644e+00,  7.98261988e-01, ...,
         6.24907189e-04,  1.41100852e-02,  1.36545878e-02],
       [ 1.85777226e+00,  3.96191175e-01,  5.05705740e-01, ...,
         5.38862978e-03,  1.77237914e-03,  5.26968095e-04],
       [ 8.94989517e-01,  1.71578497e-01,  2.51738682e-01, ...,
         2.92094923e-03,  5.39937171e-04, -1.25733753e-04],
       ...,
       [ 9.92051955e-01,  2.10814957e-01,  2.70363365e-01, ...,
         2.89019297e-03,  9.34221962e-04,  2.66612193e-04],
       [ 1.30425401e+00,  5.27669941e-01,  2.50080165e-01, ...,
        -4.20677765e-04,  5.30525683e-03,  5.28069948e-03],
       [ 2.82999397e+00,  9.70812247e-01,  6.15871694e-01, ...,
         2.02091492e-03,  8.67740813e-03,  8.03107892e-03]])

* 사용자 기반 추천
* 나와 비슷한 취향을 가진 다른 사용자의 행동을 추천
* 사용자 특징 벡터의 유사도 사용

In [167]:
my_id,my_vector=0,U[0]
best_match,best_match_id,best_match_vector=-1,-1,[]

for user_id,user_vector in enumerate(U):
    if my_id!=user_id:
        cos_similarity=compute_cos_similarity(my_vector,user_vector)
        if cos_similarity>best_match:
            best_match=cos_similarity
            best_match_id=user_id
            best_match_vector=user_vector

print('Best Match:{},Best Match ID:{}'.format(best_match,best_match_id)) #99퍼센트

Best Match:0.9999942295956322,Best Match ID:235


In [168]:
recommend_list=[]
for i,log in enumerate(zip(adj_matrix[my_id],adj_matrix[best_match_id])):
    log1,log2=log
    if log1<1 and log2>0:
        recommend_list.append(i)
print(recommend_list)

[272, 273, 274, 281, 285, 288, 293, 297, 303, 306, 312, 317, 327, 332, 369, 410, 418, 419, 422, 426, 428, 431, 434, 442, 461, 475, 477, 482, 495, 503, 504, 505, 506, 509, 519, 520, 522, 525, 531, 545, 548, 590, 594, 595, 613, 631, 654, 658, 660, 672, 684, 685, 691, 695, 698, 704, 716, 728, 734, 749, 755, 863, 865, 933, 1012, 1038, 1101, 1327, 1400]


* 항목 기반 추천
* 내가 본 항목과 비슷한 항목을 추천
* 항목 특징 벡터의 유사도 사용

In [169]:
my_id,my_vector=0,V.T[0]
best_match,best_match_id,best_match_vector=-1,-1,[]

for user_id,user_vector in enumerate(V.T):
    if my_id!=user_id:
        cos_similarity=compute_cos_similarity(my_vector,user_vector)
        if cos_similarity>best_match:
            best_match=cos_similarity
            best_match_id=user_id
            best_match_vector=user_vector

print('Best Match:{},Best Match ID:{}'.format(best_match,best_match_id)) #99퍼센트

Best Match:0.9999999951364144,Best Match ID:1287


In [170]:
recommend_list = []
for i, user_vector in enumerate(adj_matrix):
  if adj_matrix[i][my_id] > 0.9:
    recommend_list.append(i)
print(recommend_list)

[0, 1, 4, 5, 9, 12, 14, 15, 16, 17, 19, 20, 22, 24, 25, 37, 40, 41, 42, 43, 44, 48, 53, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 69, 71, 72, 74, 76, 78, 80, 81, 82, 83, 88, 91, 92, 93, 94, 95, 96, 98, 100, 101, 105, 107, 108, 116, 119, 120, 123, 124, 127, 129, 130, 133, 136, 137, 140, 143, 144, 147, 149, 150, 156, 157, 159, 161, 167, 173, 176, 177, 180, 181, 183, 188, 192, 193, 197, 198, 199, 200, 201, 202, 203, 208, 209, 212, 215, 221, 222, 229, 230, 231, 233, 234, 241, 242, 243, 245, 246, 247, 248, 249, 250, 251, 252, 253, 255, 261, 262, 264, 267, 270, 273, 274, 275, 276, 278, 279, 285, 286, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 300, 302, 304, 306, 307, 310, 311, 312, 313, 319, 321, 323, 324, 325, 326, 329, 330, 331, 335, 337, 338, 339, 342, 343, 344, 346, 347, 349, 356, 358, 359, 362, 364, 370, 373, 377, 378, 379, 380, 386, 387, 388, 389, 392, 393, 394, 395, 397, 398, 400, 401, 402, 405, 406, 410, 411, 415, 416, 418, 421, 423, 424, 428, 431, 433, 434, 437, 440, 444, 

* 비음수 행렬 분해를 사용한 하이브리드 추천

In [171]:
adj_matrix

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0]])

In [172]:
#NNF
A,B,iter=non_negative_factorization(adj_matrix,n_components=2)

In [173]:
np.matmul(A,B)

array([[3.71107433e+00, 1.48461856e+00, 7.39541570e-01, ...,
        3.64501983e-03, 1.45513751e-02, 1.44116215e-02],
       [2.11729713e+00, 2.37145679e-01, 5.51637757e-01, ...,
        4.76290749e-03, 2.84605931e-05, 0.00000000e+00],
       [9.85325089e-01, 1.10360320e-01, 2.56715279e-01, ...,
        2.21651094e-03, 1.32446864e-05, 0.00000000e+00],
       ...,
       [1.04478344e+00, 1.17019891e-01, 2.72206478e-01, ...,
        2.35026384e-03, 1.40439224e-05, 0.00000000e+00],
       [1.45769331e+00, 5.42108391e-01, 2.99217251e-01, ...,
        1.61232500e-03, 5.15892655e-03, 5.10748255e-03],
       [2.44709957e+00, 9.41278705e-01, 4.95671746e-01, ...,
        2.56934867e-03, 9.08400301e-03, 8.99501717e-03]])

* 사용자 기반 추천

In [174]:
my_id,my_vector=0,U[0]
best_match,best_match_id,best_match_vector=-1,-1,[]

for user_id,user_vector in enumerate(U):
    if my_id!=user_id:
        cos_similarity=compute_cos_similarity(my_vector,user_vector)
        if cos_similarity>best_match:
            best_match=cos_similarity
            best_match_id=user_id
            best_match_vector=user_vector

print('Best Match:{},Best Match ID:{}'.format(best_match,best_match_id)) #99퍼센트

Best Match:0.9999942295956322,Best Match ID:235


In [175]:
recommend_list = []
for i, log in enumerate(zip(adj_matrix[my_id], adj_matrix[best_match_id])):
  log1, log2 = log
  if log1 < 1. and log2 > 0.:
    recommend_list.append(i)
print(recommend_list)

[272, 273, 274, 281, 285, 288, 293, 297, 303, 306, 312, 317, 327, 332, 369, 410, 418, 419, 422, 426, 428, 431, 434, 442, 461, 475, 477, 482, 495, 503, 504, 505, 506, 509, 519, 520, 522, 525, 531, 545, 548, 590, 594, 595, 613, 631, 654, 658, 660, 672, 684, 685, 691, 695, 698, 704, 716, 728, 734, 749, 755, 863, 865, 933, 1012, 1038, 1101, 1327, 1400]


* 항목 기반 추천

In [179]:
my_id, my_vector = 0, V.T[0]  # 기준 항목이 0번 이라고 가정
best_match, best_match_id, best_match_vector = -1, -1, []

for item_id, item_vector in enumerate(V.T):
  if my_id != item_id:
    cos_similarity = compute_cos_similarity(my_vector, item_vector)
    if cos_similarity > best_match:
      best_match = cos_similarity
      best_match_id = item_id
      best_match_vector = item_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 0.9999999951364144, Best Match ID: 1287


In [180]:
recommend_list = []
for i, user_vector in enumerate(adj_matrix):
  if adj_matrix[i][my_id] > 0.9:
    recommend_list.append(i)
print(recommend_list)

[0, 1, 4, 5, 9, 12, 14, 15, 16, 17, 19, 20, 22, 24, 25, 37, 40, 41, 42, 43, 44, 48, 53, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 69, 71, 72, 74, 76, 78, 80, 81, 82, 83, 88, 91, 92, 93, 94, 95, 96, 98, 100, 101, 105, 107, 108, 116, 119, 120, 123, 124, 127, 129, 130, 133, 136, 137, 140, 143, 144, 147, 149, 150, 156, 157, 159, 161, 167, 173, 176, 177, 180, 181, 183, 188, 192, 193, 197, 198, 199, 200, 201, 202, 203, 208, 209, 212, 215, 221, 222, 229, 230, 231, 233, 234, 241, 242, 243, 245, 246, 247, 248, 249, 250, 251, 252, 253, 255, 261, 262, 264, 267, 270, 273, 274, 275, 276, 278, 279, 285, 286, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 300, 302, 304, 306, 307, 310, 311, 312, 313, 319, 321, 323, 324, 325, 326, 329, 330, 331, 335, 337, 338, 339, 342, 343, 344, 346, 347, 349, 356, 358, 359, 362, 364, 370, 373, 377, 378, 379, 380, 386, 387, 388, 389, 392, 393, 394, 395, 397, 398, 400, 401, 402, 405, 406, 410, 411, 415, 416, 418, 421, 423, 424, 428, 431, 433, 434, 437, 440, 444, 