# 협업 필터링


데이터:
https://www.kaggle.com/sengzhaotoo/movielens-small?select=movies.csv

## 사용자 기반 협업 필터링

In [47]:
import pandas as pd 
import numpy as np 

In [48]:
pd.set_option('display.max_columns', 6)
pd.set_option('display.width', 300)

In [49]:
ratings = pd.read_csv('./data/ratings.csv')
movies = pd.read_csv('./data/movies.csv')

In [50]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [51]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [52]:
# 두 DataFrame 합치기
movie_ratings = pd.merge(ratings, movies, on='movieId')

movie_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama
1,7,31,3.0,851868750,Dangerous Minds (1995),Drama
2,31,31,4.0,1273541953,Dangerous Minds (1995),Drama
3,32,31,4.0,834828440,Dangerous Minds (1995),Drama
4,36,31,3.0,847057202,Dangerous Minds (1995),Drama


In [53]:
title_user = movie_ratings.pivot_table('rating', index='userId', columns='title')

# 결측치를 0으로 변경
title_user.fillna(0, inplace=True)
print(title_user)

title   "Great Performances" Cats (1998)  $9.99 (2008)  'Hellboy': The Seeds of Creation (2004)  ...  ¡Three Amigos! (1986)  À nous la liberté (Freedom for Us) (1931)  İtirazım Var (2014)
userId                                                                                           ...                                                                                       
1                                    0.0           0.0                                      0.0  ...                    0.0                                        0.0                  0.0
2                                    0.0           0.0                                      0.0  ...                    0.0                                        0.0                  0.0
3                                    0.0           0.0                                      0.0  ...                    0.0                                        0.0                  0.0
4                                    0.0           0.0      

In [54]:
# 코사인 유사도
from sklearn.metrics.pairwise import cosine_similarity

In [55]:
user_based_collab = cosine_similarity(title_user, title_user)

print(user_based_collab)

[[1.         0.         0.         ... 0.06291708 0.         0.01746565]
 [0.         1.         0.12429498 ... 0.02413984 0.17059464 0.1131753 ]
 [0.         0.12429498 1.         ... 0.08098382 0.13660585 0.17019275]
 ...
 [0.06291708 0.02413984 0.08098382 ... 1.         0.04260878 0.08520194]
 [0.         0.17059464 0.13660585 ... 0.04260878 1.         0.22867673]
 [0.01746565 0.1131753  0.17019275 ... 0.08520194 0.22867673 1.        ]]


In [56]:
user_based_collab = pd.DataFrame(user_based_collab, index=title_user.index, columns=title_user.index)

print(user_based_collab)

userId       1         2         3    ...       669       670       671
userId                                ...                              
1       1.000000  0.000000  0.000000  ...  0.062917  0.000000  0.017466
2       0.000000  1.000000  0.124295  ...  0.024140  0.170595  0.113175
3       0.000000  0.124295  1.000000  ...  0.080984  0.136606  0.170193
4       0.074482  0.118821  0.081640  ...  0.104309  0.054512  0.211609
5       0.016818  0.103646  0.151531  ...  0.038358  0.062642  0.225086
...          ...       ...       ...  ...       ...       ...       ...
667     0.000000  0.425462  0.124562  ...  0.018416  0.153111  0.127995
668     0.000000  0.084646  0.124911  ...  0.000000  0.178017  0.135387
669     0.062917  0.024140  0.080984  ...  1.000000  0.042609  0.085202
670     0.000000  0.170595  0.136606  ...  0.042609  1.000000  0.228677
671     0.017466  0.113175  0.170193  ...  0.085202  0.228677  1.000000

[671 rows x 671 columns]


In [57]:
# 1번 유저와 유사한 유저 10명 
id = 1
print(user_based_collab[id].sort_values(ascending=False)[:10])

userId
1      1.000000
325    0.371852
634    0.194093
341    0.162819
310    0.157524
207    0.152746
35     0.130585
195    0.122647
485    0.114021
130    0.112817
Name: 1, dtype: float64


In [58]:
# 가장 유사한 유저를 뽑아서 그 유저가 본 영화를 추천
id = 1
user = user_based_collab[id].sort_values(ascending=False)[:10].index[1]

print(user)

result = title_user.query(f"userId == {user}").sort_values(ascending=False, by=user, axis=1)

print(result)

325
title   Beverly Hills Cop (1984)  Dangerous Minds (1995)  Brady Bunch Movie, The (1995)  ...  Frogs for Snakes (1998)  Fritz the Cat (1972)  İtirazım Var (2014)
userId                                                                                   ...                                                                    
325                          4.5                     4.5                            4.0  ...                      0.0                   0.0                  0.0

[1 rows x 9064 columns]


In [59]:
# 1번 유저와 유사한 유저 9명을 뽑아서, 그 유저들이 특정 영화에 대해서 부여햔 평점에 
# 유사도만큼의 가중치를 부여해서 이걸 토대로 1번 유저가 부여햘 평점을 계산/예측
# 가중치 --> 유저 9명 유사도의 합 중에서 해당 유저가 차지하는 유사도의 비율

id = 1
user_index_list = user_based_collab[id].sort_values(ascending=False)[1:10].index.tolist()
user_weight_list = user_based_collab[id].sort_values(ascending=False)[1:10].tolist()

print(user_index_list)
print(user_weight_list)


print(sum(user_weight_list))

[325, 634, 341, 310, 207, 35, 195, 485, 130]
[0.3718515795200445, 0.19409305170790575, 0.16281928881328767, 0.1575243302750048, 0.15274612900892096, 0.13058496348265256, 0.12264701454037472, 0.11402063453702121, 0.11281730419223501]
1.519104296077447


In [60]:
movie_title = 'Dark Knight, The (2008)'
# code 수정 해야함
weighted_sum = []
for i in range(9):
    weighted_sum.append(title_user[movie_title][user_index_list[i]] * user_weight_list[i])

print(sum(weighted_sum))

0.0


In [61]:
# 유사한 유저 9명이 해당 영화를 봤는지 확인
# 안봤다..

movie_ratings[movie_ratings.title == movie_title].userId == title_user[movie_title][user_index_list[i]]

12816    False
12817    False
12818    False
12819    False
12820    False
         ...  
12932    False
12933    False
12934    False
12935    False
12936    False
Name: userId, Length: 121, dtype: bool

## 아이템 기반 협업 필터링

In [73]:
user_title = movie_ratings.pivot_table('rating', index='title', columns='userId')

# 결측치를 0으로 변경
user_title.fillna(0, inplace=True)
# print(user_title)

item_based_collab = cosine_similarity(title_user, title_user)
# print(item_based_collab)

item_based_collab = pd.DataFrame(item_based_collab, index=user_title.index, columns=user_title.index)

print(item_based_collab)

movie_title = 'Dark Knight, The (2008)'
user = item_based_collab[movie_title].sort_values(ascending=False)[:10]

ValueError: Shape of passed values is (671, 671), indices imply (9064, 9064)