In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy

In [8]:
ds_ratings = pd.read_csv("../ml-latest-small/ratings.csv")
ds_movies = pd.read_csv("../ml-latest-small/movies.csv")

In [5]:
def create_user_item_matrix(ratings) -> pd.DataFrame:
  mat = ratings.pivot(index="userId", columns="movieId", values="rating")
  # 유저가 평점을 매긴 영화는 '본 영화'로 간주하고 1로 지정합니다
  mat[~mat.isna()] = 1
  # 유저가 평점을 매기지 않은 영화는 NaN인데
  # '보지 않은 영화'로 간주하고 0으로 바꿔줍니다
  mat.fillna(0, inplace=True)
  return mat

user_item_matrix = create_user_item_matrix(ds_ratings)
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
def get_svd_prediction(user_item_matrix, k):
    # U, sigma, V 를 얻어옵니다.
    u, s, vh = scipy.sparse.linalg.svds(user_item_matrix.to_numpy(), k=k)
    # 이들을 다시 곱해서 원본 행렬을 복원해봅니다
    # sigma는 주대각선 성분만 반환되므로 np.diag 함수로 대각행렬로 바꿔줍니다
    preds = np.dot(np.dot(u, np.diag(s)), vh)

    # 결과를 DataFrma
    preds = pd.DataFrame(preds, columns=user_item_matrix.columns, index=user_item_matrix.index)
    preds = (preds - preds.min()) / (preds.max() - preds.min())
    return preds

predictions = get_svd_prediction(user_item_matrix, k=64)
predictions

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.428114,0.425604,0.348175,0.154023,0.243343,0.453558,0.219816,0.196430,0.190059,0.365286,...,0.116342,0.116342,0.116342,0.116342,0.116342,0.116342,0.116342,0.116342,0.116342,0.092452
2,0.265159,0.209954,0.215911,0.136243,0.255729,0.187805,0.250365,0.153909,0.174337,0.123947,...,0.222027,0.222027,0.222027,0.222027,0.222027,0.222027,0.222027,0.222027,0.222027,0.452406
3,0.228959,0.227407,0.233126,0.151581,0.254409,0.215054,0.248156,0.163065,0.171641,0.143253,...,0.200178,0.200178,0.200178,0.200178,0.200178,0.200178,0.200178,0.200178,0.200178,0.356912
4,0.410161,0.140274,0.153194,0.167781,0.308183,0.314373,0.374465,0.203160,0.126510,0.268821,...,0.209514,0.209514,0.209514,0.209514,0.209514,0.209514,0.209514,0.209514,0.209514,0.342163
5,0.407273,0.325954,0.205385,0.161983,0.298501,0.263657,0.275638,0.178712,0.158161,0.333431,...,0.201163,0.201163,0.201163,0.201163,0.201163,0.201163,0.201163,0.201163,0.201163,0.355053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.690775,0.217291,0.250497,0.143019,0.212394,0.141855,0.905293,0.147106,0.177363,0.105533,...,0.124452,0.124452,0.124452,0.124452,0.124452,0.124452,0.124452,0.124452,0.124452,0.191845
607,0.559043,0.408386,0.269015,0.131456,0.193766,0.344019,0.271290,0.146729,0.221170,0.349969,...,0.200401,0.200401,0.200401,0.200401,0.200401,0.200401,0.200401,0.200401,0.200401,0.283258
608,0.549373,0.980488,0.807716,0.204844,0.222560,0.213664,0.331879,0.182127,0.111172,0.592941,...,0.173623,0.173623,0.173623,0.173623,0.173623,0.173623,0.173623,0.173623,0.173623,0.751350
609,0.383871,0.296912,0.196343,0.127700,0.240714,0.213925,0.246829,0.138400,0.174708,0.328211,...,0.221609,0.221609,0.221609,0.221609,0.221609,0.221609,0.221609,0.221609,0.221609,0.346883


movieId
6145      0.657699
124851    0.619278
114265    0.619278
113829    0.619278
6055      0.618473
            ...   
70703     0.011296
6722      0.011296
174727    0.011296
92674     0.011296
26622     0.011296
Name: 3, Length: 9724, dtype: float64

In [39]:
user_id = 609
user_movie_ids = ds_ratings[ds_ratings.userId == user_id].movieId
user_movies = ds_movies[ds_movies.movieId.isin(user_movie_ids)]
print(user_id, "유저가 본 영화 목록")
print(user_movies)

# 유저가 보지 않은 영화들을
user_predictions = predictions.loc[user_id].sort_values(ascending=False)
user_predictions = user_predictions[~user_predictions.index.isin(user_movie_ids)]
user_predictions = user_predictions.head(10)
user_recommendations = ds_movies[ds_movies.movieId.isin(user_predictions.index)]
print(user_recommendations)

609 유저가 본 영화 목록
     movieId                                              title  \
0          1                                   Toy Story (1995)   
9         10                                   GoldenEye (1995)   
97       110                                  Braveheart (1995)   
101      116                       Anne Frank Remembered (1995)   
114      137                             Man of the Year (1995)   
123      150                                   Apollo 13 (1995)   
134      161                                Crimson Tide (1995)   
156      185                                    Net, The (1995)   
176      208                                  Waterworld (1995)   
197      231             Dumb & Dumber (Dumb and Dumber) (1994)   
217      253  Interview with the Vampire: The Vampire Chroni...   
249      288                        Natural Born Killers (1994)   
253      292                                    Outbreak (1995)   
257      296                                Pu

In [40]:
user_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
9,10,GoldenEye (1995),Action|Adventure|Thriller
97,110,Braveheart (1995),Action|Drama|War
101,116,Anne Frank Remembered (1995),Documentary
114,137,Man of the Year (1995),Documentary
123,150,Apollo 13 (1995),Adventure|Drama|IMAX
134,161,Crimson Tide (1995),Drama|Thriller|War
156,185,"Net, The (1995)",Action|Crime|Thriller
176,208,Waterworld (1995),Action|Adventure|Sci-Fi
197,231,Dumb & Dumber (Dumb and Dumber) (1994),Adventure|Comedy


In [41]:
user_recommendations

Unnamed: 0,movieId,title,genres
4224,6145,Venom (1982),Horror|Thriller
6616,55854,"Fugitive, The (1947)",Drama
6628,56169,Awake (2007),Drama|Thriller
6759,59738,All the Boys Love Mandy Lane (2006),Horror|Mystery|Thriller
6806,60832,Pathology (2008),Crime|Horror|Thriller
7265,74580,"Spy Next Door, The (2010)",Action|Children|Comedy
7717,90384,Behind Enemy Lines II: Axis of Evil (2006),Action|Thriller|War
8504,113829,"One I Love, The (2014)",Comedy|Drama|Romance
8516,114265,Laggies (2014),Comedy|Romance
8663,121129,The Hungover Games (2014),Comedy
