<a href="https://colab.research.google.com/github/GaGyeong-Kim/RecommenderSystem/blob/main/Recommender_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Surprise
- **추천 시스템** 개발을 위한 라이브러리
- 다양한 모델과 데이터 제공
- **scikit-learn** 과 유사한 사용 방법

In [1]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162678 sha256=cc85a78e3e6799ca480831e04cd738524d58bcadc5b95f7b96d448dd5d9ead9e
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [2]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

## 데이터 불러오기
- 데이터 형식 : 사용자, 영화, 별점, id

In [5]:
data = Dataset.load_builtin('ml-100k', prompt = False)
data.raw_ratings[:10]

[('196', '242', 3.0, '881250949'),
 ('186', '302', 3.0, '891717742'),
 ('22', '377', 1.0, '878887116'),
 ('244', '51', 2.0, '880606923'),
 ('166', '346', 1.0, '886397596'),
 ('298', '474', 4.0, '884182806'),
 ('115', '265', 2.0, '881171488'),
 ('253', '465', 5.0, '891628467'),
 ('305', '451', 3.0, '886324817'),
 ('6', '86', 3.0, '883603013')]

In [11]:
model = SVD()

In [12]:
cross_validate(model, data, measures = ['rmse','mae'], cv = 5, verbose = True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9293  0.9339  0.9369  0.9423  0.9423  0.9369  0.0050  
MAE (testset)     0.7305  0.7366  0.7394  0.7433  0.7415  0.7383  0.0045  
Fit time          0.95    0.96    0.95    0.95    0.96    0.95    0.01    
Test time         0.09    0.08    0.17    0.10    0.18    0.12    0.04    


{'test_rmse': array([0.92927599, 0.93392179, 0.93685059, 0.94231711, 0.94229168]),
 'test_mae': array([0.73045067, 0.73659106, 0.7393722 , 0.74334064, 0.7415279 ]),
 'fit_time': (0.9490604400634766,
  0.9584696292877197,
  0.9475162029266357,
  0.9542264938354492,
  0.96329665184021),
 'test_time': (0.08787417411804199,
  0.08193445205688477,
  0.16666078567504883,
  0.09796810150146484,
  0.17565345764160156)}

# 컨텐츠 기반 필터링(Content_based Filtering)

- 이전의 행동과 명시적 피드백을 통해 좋아하는 것과 유사한 항목 추천
  - 내가 지금까지 시청한 영화 목록과 다른 사용자의 시청 목록을 비교해 나와 비슷한 취향의 사용자가 시청한 영화 추천
- 유사도 기반으로 추천


In [13]:
import numpy as np
from surprise import Dataset

In [24]:
data = Dataset.load_builtin('ml-100k', prompt = False)
raw_data = np.array(data.raw_ratings,dtype = int)

In [25]:
# raw data를 0부터 사용할 수 있도록 조정
raw_data[:,0] -= 1
raw_data[:,1] -= 1

### 인접 행렬을 활용한 유사도 계산

In [26]:
# 인접 행렬의 크기
n_users = np.max(raw_data[:,0])
n_movies = np.max(raw_data[:,1])
shape = (n_users+1, n_movies+1)
shape

(943, 1682)

In [27]:
# 1이 있는 위치가 데이터가 있는 위치 (1: 본 영화 / 0 : 안 본 영))
adj_matrix = np.ndarray(shape, dtype = int)
for user_id, movie_id, raing, time in raw_data:
  adj_matrix[user_id][movie_id] = 1.
adj_matrix

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [28]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = -1,-1,[]

# 유사도 계산(다른 아이디와 유사도 계산해야 함)
for user_id, user_vector in enumerate(adj_matrix):
  if my_id != user_id:
    similarity = np.dot(my_vector, user_vector)
    if similarity > best_match:
      best_match = similarity
      best_match_id = user_id
      best_match_vector = user_vector
print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 183, Best Match ID: 275


In [29]:
# ID 275가 볼 만한 영화 추천
recommend_list = []
for i, log in enumerate(zip(my_vector, best_match_vector)):
  log1, log2 = log
  if log1 < 1. and log2 > 0. :
    recommend_list.append(i)
print(recommend_list)

[272, 273, 275, 280, 281, 283, 287, 288, 289, 290, 292, 293, 297, 299, 300, 301, 302, 306, 312, 314, 315, 316, 317, 321, 322, 323, 324, 327, 330, 331, 332, 333, 339, 342, 345, 346, 353, 354, 355, 356, 357, 363, 364, 365, 366, 372, 374, 378, 379, 381, 382, 383, 384, 385, 386, 387, 390, 391, 392, 394, 395, 396, 398, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 412, 414, 416, 417, 418, 419, 420, 422, 424, 425, 426, 427, 428, 430, 431, 432, 435, 442, 446, 447, 448, 449, 450, 451, 452, 454, 455, 457, 460, 461, 462, 468, 469, 470, 471, 472, 473, 474, 478, 495, 500, 507, 517, 522, 525, 530, 539, 540, 543, 545, 546, 548, 549, 550, 551, 553, 557, 558, 560, 561, 562, 563, 565, 566, 567, 568, 570, 571, 574, 575, 576, 577, 580, 581, 582, 585, 587, 589, 590, 594, 596, 602, 623, 626, 627, 630, 633, 635, 639, 646, 648, 651, 652, 654, 657, 664, 668, 671, 677, 678, 681, 683, 684, 685, 690, 691, 692, 695, 696, 708, 709, 714, 718, 719, 720, 724, 726, 727, 731, 733, 734, 736, 738, 741, 742, 745,

### 유클리드 거리를 사용한 추천

d(P, Q) = sqrt((x₂ - x₁)² + (y₂ - y₁)²)"

In [32]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector =9999,-1,[]

# 유클리디안 거리를 계산(다른 아이디와 계산해야 함)
for user_id, user_vector in enumerate(adj_matrix):
  if my_id != user_id:
    euclidean_dist = np.sqrt(np.sum(np.square(my_vector-user_vector)))
    if euclidean_dist < best_match:
      best_match = euclidean_dist
      best_match_id = user_id
      best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 14.832396974191326, Best Match ID: 737


In [33]:
# ID 275가 볼 만한 영화 추천
recommend_list = []
for i, log in enumerate(zip(my_vector, best_match_vector)):
  log1, log2 = log
  if log1 < 1. and log2 > 0. :
    recommend_list.append(i)
print(recommend_list)

[297, 312, 317, 342, 356, 366, 379, 384, 392, 402, 404, 407, 417, 422, 428, 433, 448, 454, 469, 473, 495, 510, 516, 526, 527, 549, 567, 602, 635, 649, 650, 654, 658, 661, 664, 696, 731, 746, 750, 754, 915, 918, 925, 929, 950, 968, 1015, 1046]


### 코사인 유사도를 사용


In [37]:
def compute_cos_similar(v1,v2):
  norm1 = np.sqrt(np.sum(np.square(v1)))
  norm2 = np.sqrt(np.sum(np.square(v2)))
  dot = np.dot(v1,v2)
  return dot / (norm1 * norm2)

In [40]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector =-1,-1,[]

# 코사인 유사도를 계산(다른 아이디와 계산해야 함)
for user_id, user_vector in enumerate(adj_matrix):
  if my_id != user_id:
    cos_similarity = compute_cos_similar(my_vector, user_vector)
    if cos_similarity > best_match:
      best_match = cos_similarity
      best_match_id = user_id
      best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 0.5278586163659506, Best Match ID: 915


In [41]:
# ID 915는 봤지만 나는 안 본 영화
recommend_list = []
for i, log in enumerate(zip(my_vector, best_match_vector)):
  log1, log2 = log
  if log1 < 1. and log2 > 0. :
    recommend_list.append(i)
print(recommend_list)

[272, 275, 279, 280, 283, 285, 289, 294, 297, 316, 317, 355, 365, 366, 368, 379, 380, 381, 384, 386, 392, 398, 401, 404, 416, 420, 422, 424, 426, 427, 430, 432, 450, 460, 461, 466, 469, 471, 473, 474, 475, 479, 482, 483, 497, 505, 508, 510, 511, 522, 526, 527, 529, 530, 534, 536, 540, 545, 548, 549, 556, 557, 558, 560, 565, 567, 568, 569, 577, 580, 581, 582, 592, 596, 630, 635, 639, 641, 649, 651, 654, 673, 677, 678, 683, 684, 692, 696, 701, 703, 707, 708, 709, 712, 714, 719, 720, 726, 731, 734, 736, 738, 740, 745, 747, 754, 755, 761, 762, 763, 766, 780, 789, 791, 805, 819, 823, 824, 830, 843, 862, 865, 918, 929, 930, 938, 942, 943, 947, 958, 959, 960, 970, 977, 1004, 1008, 1009, 1010, 1013, 1041, 1045, 1069, 1072, 1073, 1078, 1097, 1100, 1108, 1112, 1118, 1134, 1193, 1205, 1207, 1216, 1219, 1267, 1334, 1400, 1427, 1596, 1681]


### 기존 방법에 명시적 피드백(별점)을 추가해 실험


In [43]:
# 별점이 들어간 인접 행렬로 변환
adj_matrix = np.ndarray(shape, dtype = int)
for user_id, movie_id, rating, time in raw_data:
  adj_matrix[user_id][movie_id] = rating
adj_matrix

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0]])

In [44]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector =9999,-1,[]

# 유클리디안 거리를 계산(다른 아이디와 계산해야 함)
for user_id, user_vector in enumerate(adj_matrix):
  if my_id != user_id:
    euclidean_dist = np.sqrt(np.sum(np.square(my_vector-user_vector)))
    if euclidean_dist < best_match:
      best_match = euclidean_dist
      best_match_id = user_id
      best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 55.06359959174482, Best Match ID: 737


In [45]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector =-1,-1,[]

# 코사인 유사도를 계산(다른 아이디와 계산해야 함)
for user_id, user_vector in enumerate(adj_matrix):
  if my_id != user_id:
    cos_similarity = compute_cos_similar(my_vector, user_vector)
    if cos_similarity > best_match:
      best_match = cos_similarity
      best_match_id = user_id
      best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 0.569065731527988, Best Match ID: 915


# 협업 필터링(Collaborative Filtering)

- 사용자와 항목의 유사성을 동시에 고려
- 사용자의 기존 관심사가 아니라도 추천 가능
- 자동으로 임베딩 학습 가능

In [47]:
from surprise import KNNBasic, SVD, SVDpp, NMF
from surprise import Dataset
from surprise.model_selection import cross_validate

In [48]:
data = Dataset.load_builtin('ml-100k',prompt = False)

### KNN을 사용한 협업 필터링

In [52]:
model = KNNBasic()
cross_validate(model, data, measures = ['rmse','mae'], cv = 5, n_jobs = 4, verbose = 1)

Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9849  0.9813  0.9747  0.9718  0.9802  0.9786  0.0047  
MAE (testset)     0.7758  0.7743  0.7702  0.7689  0.7737  0.7726  0.0026  
Fit time          0.97    1.56    1.63    1.23    1.07    1.29    0.26    
Test time         6.11    6.50    6.29    5.81    1.60    5.26    1.85    


{'test_rmse': array([0.98487925, 0.98132948, 0.97473473, 0.97179677, 0.98023873]),
 'test_mae': array([0.77577055, 0.77432673, 0.77023423, 0.76892282, 0.77371393]),
 'fit_time': (0.971315860748291,
  1.5648481845855713,
  1.629490852355957,
  1.231464147567749,
  1.0705578327178955),
 'test_time': (6.110782861709595,
  6.498441219329834,
  6.290028095245361,
  5.812669992446899,
  1.595560073852539)}

### SVD를 활용한 협업 필터링

In [53]:
model = SVD()
cross_validate(model, data, measures = ['rmse','mae'], cv = 5, n_jobs = 4, verbose = 1)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9320  0.9363  0.9353  0.9389  0.9379  0.9361  0.0024  
MAE (testset)     0.7362  0.7372  0.7399  0.7381  0.7407  0.7384  0.0017  
Fit time          1.70    1.42    1.92    2.03    1.06    1.63    0.35    
Test time         0.57    0.44    0.30    0.24    0.25    0.36    0.13    


{'test_rmse': array([0.93197507, 0.93631666, 0.935342  , 0.9388834 , 0.93786088]),
 'test_mae': array([0.7361661 , 0.73715678, 0.73985434, 0.73809656, 0.74065642]),
 'fit_time': (1.702657699584961,
  1.4243323802947998,
  1.9215400218963623,
  2.025596857070923,
  1.0578153133392334),
 'test_time': (0.5727307796478271,
  0.4417154788970947,
  0.29927611351013184,
  0.239882230758667,
  0.2544403076171875)}

### SVD++를 활용한 협업 필터링

In [54]:
model = SVDpp()
cross_validate(model, data, measures = ['rmse','mae'], cv = 5, n_jobs = 4, verbose = 1)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9197  0.9134  0.9203  0.9183  0.9224  0.9188  0.0030  
MAE (testset)     0.7234  0.7146  0.7233  0.7201  0.7239  0.7211  0.0035  
Fit time          73.55   73.47   82.66   74.51   25.71   65.98   20.42   
Test time         8.72    9.17    4.74    8.21    2.87    6.74    2.49    


{'test_rmse': array([0.91966665, 0.91342252, 0.92034265, 0.91829986, 0.92244539]),
 'test_mae': array([0.7234183 , 0.71461233, 0.72326327, 0.72012366, 0.72388141]),
 'fit_time': (73.5452036857605,
  73.46793532371521,
  82.6584107875824,
  74.5053358078003,
  25.710814714431763),
 'test_time': (8.721893072128296,
  9.168325901031494,
  4.736532688140869,
  8.210378408432007,
  2.8716914653778076)}

### NMF를 사용한 협업 필터링

In [56]:
model = NMF()
cross_validate(model, data, measures = ['rmse','mae'], cv = 5, n_jobs = 4, verbose = 1)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9557  0.9692  0.9563  0.9558  0.9597  0.9593  0.0052  
MAE (testset)     0.7545  0.7617  0.7527  0.7495  0.7524  0.7542  0.0041  
Fit time          2.22    2.59    3.12    2.75    1.80    2.50    0.45    
Test time         0.25    0.47    0.35    0.35    0.09    0.30    0.13    


{'test_rmse': array([0.95566286, 0.96923224, 0.95628856, 0.95575761, 0.95965619]),
 'test_mae': array([0.7544562 , 0.76168174, 0.75270228, 0.74954949, 0.75236549]),
 'fit_time': (2.2213518619537354,
  2.5934672355651855,
  3.116603136062622,
  2.754748821258545,
  1.802133560180664),
 'test_time': (0.2519080638885498,
  0.46538829803466797,
  0.35364699363708496,
  0.34750914573669434,
  0.0902094841003418)}

# 하이브리드(Hybrid)
: 성능 개굿

- 컨텐츠 기반 필터링 + 협업 필터링 조합
- 많은 하이브리드 방식 존재
- 협업 필터링으로 임베딩 학습 ➡️ 컨텐츠 기반 필터링으로 유사도 기반 추천

In [59]:
import numpy as np
from sklearn.decomposition import randomized_svd, non_negative_factorization
from surprise import Dataset

In [64]:
data = Dataset.load_builtin('ml-100k', prompt = False)
raw_data = np.array(data.raw_ratings, dtype = int)
raw_data[:,0] -= 1
raw_data[:,1] -= 1

In [65]:
n_users = np.max(raw_data[:,0])
n_movies = np.max(raw_data[:,1])
shape = (n_users + 1, n_movies + 1)
shape

(943, 1682)

In [66]:
adj_matrix = np.ndarray(shape, dtype = int)
for user_id, movie_id, rating, time in raw_data:
  adj_matrix[user_id][movie_id] = rating
adj_matrix

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0]])

In [67]:
# user, 특이값 , 인접 행렬
U, S, V = randomized_svd(adj_matrix, n_components=2)
S = np.diag(S)

In [69]:
print(U.shape)
print(S.shape) # 잠재 요인을 찾을 수 있는 특이값 벡터
print(V.shape)

(943, 2)
(2, 2)
(2, 1682)


In [74]:
# 복원된 행렬(SVD)
np.matmul(np.matmul(U,S),V)

array([[ 3.91732674e+00,  1.47276647e+00,  7.98262063e-01, ...,
         6.24907868e-04,  1.41100863e-02,  1.36545884e-02],
       [ 1.85777212e+00,  3.96191001e-01,  5.05705398e-01, ...,
         5.38862827e-03,  1.77236929e-03,  5.26961356e-04],
       [ 8.94989107e-01,  1.71578375e-01,  2.51738387e-01, ...,
         2.92094671e-03,  5.39932159e-04, -1.25736691e-04],
       ...,
       [ 9.92051633e-01,  2.10814844e-01,  2.70363102e-01, ...,
         2.89019091e-03,  9.34216975e-04,  2.66609145e-04],
       [ 1.30425387e+00,  5.27669959e-01,  2.50080172e-01, ...,
        -4.20678316e-04,  5.30525873e-03,  5.28070102e-03],
       [ 2.82999382e+00,  9.70812197e-01,  6.15871575e-01, ...,
         2.02091395e-03,  8.67740597e-03,  8.03107761e-03]])

#### 사용자 기반 추천

In [75]:
my_id, my_vector = 0,U[0]
best_match, best_match_id, best_match_vector =-1,-1,[]

# 코사인 유사도를 계산(다른 아이디와 계산해야 함)
for user_id, user_vector in enumerate(U):
  if my_id != user_id:
    cos_similarity = compute_cos_similar(my_vector, user_vector)
    if cos_similarity > best_match:
      best_match = cos_similarity
      best_match_id = user_id
      best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 0.9999942292403424, Best Match ID: 235


In [78]:
# 사용자 기반 추천 영화 리스트
recommend_list = []
for i, log in enumerate(zip(adj_matrix[my_id], adj_matrix[best_match_id])):
  log1, log2 = log
  if log1 < 1. and log2 > 0. :
    recommend_list.append(i)
print(recommend_list)

[272, 273, 274, 281, 285, 288, 293, 297, 303, 306, 312, 317, 327, 332, 369, 410, 418, 419, 422, 426, 428, 431, 434, 442, 461, 475, 477, 482, 495, 503, 504, 505, 506, 509, 519, 520, 522, 525, 531, 545, 548, 590, 594, 595, 613, 631, 654, 658, 660, 672, 684, 685, 691, 695, 698, 704, 716, 728, 734, 749, 755, 863, 865, 933, 1012, 1038, 1101, 1327, 1400]


#### 항목 기반 추천

- 내가 본 항목과 비슷한 항목을 추천
- 항목 특징 벡터의 유사도 사용

In [91]:
my_id, my_vector = 0,V.T[0]
best_match, best_match_id, best_match_vector =-1,-1,[]

# 코사인 유사도를 계산(다른 아이디와 계산해야 함)
for user_id, user_vector in enumerate(V.T):
  if my_id != user_id:
    cos_similarity = compute_cos_similar(my_vector, user_vector)
    if cos_similarity > best_match:
      best_match = cos_similarity
      best_match_id = user_id
      best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 0.9999999949421506, Best Match ID: 1287


In [92]:
# 사용자 기반 추천 영화 리스트
recommend_list = []
for i, user_vector in enumerate(adj_matrix):
  if adj_matrix[i][my_id] > 0.9 :
    recommend_list.append(i)
print(recommend_list)

[0, 1, 4, 5, 9, 12, 14, 15, 16, 17, 19, 20, 22, 24, 25, 37, 40, 41, 42, 43, 44, 48, 53, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 69, 71, 72, 74, 76, 78, 80, 81, 82, 83, 88, 91, 92, 93, 94, 95, 96, 98, 100, 101, 105, 107, 108, 116, 119, 120, 123, 124, 127, 129, 130, 133, 136, 137, 140, 143, 144, 147, 149, 150, 156, 157, 159, 161, 167, 173, 176, 177, 180, 181, 183, 188, 192, 193, 197, 198, 199, 200, 201, 202, 203, 208, 209, 212, 215, 221, 222, 229, 230, 231, 233, 234, 241, 242, 243, 245, 246, 247, 248, 249, 250, 251, 252, 253, 255, 261, 262, 264, 267, 270, 273, 274, 275, 276, 278, 279, 285, 286, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 300, 302, 304, 306, 307, 310, 311, 312, 313, 319, 321, 323, 324, 325, 326, 329, 330, 331, 335, 337, 338, 339, 342, 343, 344, 346, 347, 349, 356, 358, 359, 362, 364, 370, 373, 377, 378, 379, 380, 386, 387, 388, 389, 392, 393, 394, 395, 397, 398, 400, 401, 402, 405, 406, 410, 411, 415, 416, 418, 421, 423, 424, 428, 431, 433, 434, 437, 440, 444, 

#### 비음수 행렬 분해를 사용한 하이브리드 추천

In [97]:
adj_matrix

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0]])

In [98]:
A,B, iter = non_negative_factorization(adj_matrix, n_components = 2)

In [99]:
np.matmul(A,B)

array([[3.71107433e+00, 1.48461856e+00, 7.39541570e-01, ...,
        3.64501983e-03, 1.45513751e-02, 1.44116215e-02],
       [2.11729713e+00, 2.37145679e-01, 5.51637757e-01, ...,
        4.76290749e-03, 2.84605930e-05, 0.00000000e+00],
       [9.85325089e-01, 1.10360320e-01, 2.56715279e-01, ...,
        2.21651094e-03, 1.32446863e-05, 0.00000000e+00],
       ...,
       [1.04478344e+00, 1.17019891e-01, 2.72206478e-01, ...,
        2.35026384e-03, 1.40439223e-05, 0.00000000e+00],
       [1.45769331e+00, 5.42108391e-01, 2.99217251e-01, ...,
        1.61232500e-03, 5.15892655e-03, 5.10748255e-03],
       [2.44709957e+00, 9.41278705e-01, 4.95671746e-01, ...,
        2.56934867e-03, 9.08400301e-03, 8.99501717e-03]])

In [100]:
my_id, my_vector = 0,U[0]
best_match, best_match_id, best_match_vector =-1,-1,[]

# 코사인 유사도를 계산(다른 아이디와 계산해야 함)
for user_id, user_vector in enumerate(U):
  if my_id != user_id:
    cos_similarity = compute_cos_similar(my_vector, user_vector)
    if cos_similarity > best_match:
      best_match = cos_similarity
      best_match_id = user_id
      best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 0.9999942292403424, Best Match ID: 235


In [101]:
# 사용자 기반 추천 영화 리스트
recommend_list = []
for i, log in enumerate(zip(adj_matrix[my_id], adj_matrix[best_match_id])):
  log1, log2 = log
  if log1 < 1. and log2 > 0. :
    recommend_list.append(i)
print(recommend_list)

[272, 273, 274, 281, 285, 288, 293, 297, 303, 306, 312, 317, 327, 332, 369, 410, 418, 419, 422, 426, 428, 431, 434, 442, 461, 475, 477, 482, 495, 503, 504, 505, 506, 509, 519, 520, 522, 525, 531, 545, 548, 590, 594, 595, 613, 631, 654, 658, 660, 672, 684, 685, 691, 695, 698, 704, 716, 728, 734, 749, 755, 863, 865, 933, 1012, 1038, 1101, 1327, 1400]


In [102]:
my_id, my_vector = 0,V.T[0]
best_match, best_match_id, best_match_vector =-1,-1,[]

# 코사인 유사도를 계산(다른 아이디와 계산해야 함)
for user_id, user_vector in enumerate(V.T):
  if my_id != user_id:
    cos_similarity = compute_cos_similar(my_vector, user_vector)
    if cos_similarity > best_match:
      best_match = cos_similarity
      best_match_id = user_id
      best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 0.9999999949421506, Best Match ID: 1287


In [103]:
# 사용자 기반 추천 영화 리스트
recommend_list = []
for i, user_vector in enumerate(adj_matrix):
  if adj_matrix[i][my_id] > 0.9 :
    recommend_list.append(i)
print(recommend_list)

[0, 1, 4, 5, 9, 12, 14, 15, 16, 17, 19, 20, 22, 24, 25, 37, 40, 41, 42, 43, 44, 48, 53, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 69, 71, 72, 74, 76, 78, 80, 81, 82, 83, 88, 91, 92, 93, 94, 95, 96, 98, 100, 101, 105, 107, 108, 116, 119, 120, 123, 124, 127, 129, 130, 133, 136, 137, 140, 143, 144, 147, 149, 150, 156, 157, 159, 161, 167, 173, 176, 177, 180, 181, 183, 188, 192, 193, 197, 198, 199, 200, 201, 202, 203, 208, 209, 212, 215, 221, 222, 229, 230, 231, 233, 234, 241, 242, 243, 245, 246, 247, 248, 249, 250, 251, 252, 253, 255, 261, 262, 264, 267, 270, 273, 274, 275, 276, 278, 279, 285, 286, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 300, 302, 304, 306, 307, 310, 311, 312, 313, 319, 321, 323, 324, 325, 326, 329, 330, 331, 335, 337, 338, 339, 342, 343, 344, 346, 347, 349, 356, 358, 359, 362, 364, 370, 373, 377, 378, 379, 380, 386, 387, 388, 389, 392, 393, 394, 395, 397, 398, 400, 401, 402, 405, 406, 410, 411, 415, 416, 418, 421, 423, 424, 428, 431, 433, 434, 437, 440, 444, 