In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas_datareader.data as web
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib
matplotlib.rcParams['axes.unicode_minus'] = False

%matplotlib inline

from fbprophet import Prophet
from datetime import datetime
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import ast

In [2]:
# 한글 폰트 문제 해결
import platform

from matplotlib import font_manager, rc
# plt.rcParams['axes.unicode_minus'] = False

if platform.system() == 'Darwin':
    rc('font', family='AppleGothic')
elif platform.system() == 'Windows':
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    print('Unknown system... sorry~~~~')

In [3]:
#아이템 기반 협업필터링
rating_data = pd.read_csv('./data/small_movie/ratings.csv')
movie_data = pd.read_csv('./data/small_movie/movies.csv')

In [4]:
rating_data.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179


In [5]:
movie_data.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [6]:
#전처리
rating_data.drop('timestamp', axis = 1, inplace =True)
rating_data.head(2)

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0


In [8]:
user_movie_rating = pd.merge(rating_data, movie_data, on = 'movieId')

In [9]:
user_movie_rating.head(2)

Unnamed: 0,userId,movieId,rating,title,genres
0,1,31,2.5,Dangerous Minds (1995),Drama
1,7,31,3.0,Dangerous Minds (1995),Drama


In [10]:
movie_user_rating = user_movie_rating.pivot_table('rating',index='title',columns='userId')
user_movie_rating = user_movie_rating.pivot_table('rating',index='userId',columns='title')

In [11]:
user_movie_rating.head(2)

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,


In [12]:
movie_user_rating.head(2)

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",,,,,,,,,,,...,,,,,,,,,,
$9.99 (2008),,,,,,,,,,,...,,,,,,,,,,


In [13]:
movie_user_rating.fillna(0, inplace = True)
movie_user_rating.head(2)

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$9.99 (2008),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
from sklearn.metrics.pairwise import cosine_similarity
item_based_collabor = cosine_similarity(movie_user_rating)
item_based_collabor

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.05821787, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.05821787, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [16]:
item_based_collabor = pd.DataFrame(data = item_based_collabor, index = movie_user_rating.index, columns = movie_user_rating.index)

In [17]:
item_based_collabor.head()

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",1.0,0.0,0.0,0.164399,0.020391,0.0,0.014046,0.0,0.0,0.003166,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$9.99 (2008),0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.079474,0.0,0.15633,...,0.0,0.0,0.0,0.0,0.0,0.013899,0.0,0.058218,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.217357,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Neath the Arizona Skies (1934),0.164399,0.0,0.0,1.0,0.124035,0.0,0.085436,0.0,0.0,0.019259,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.020391,0.0,0.0,0.124035,1.0,0.0,0.010597,0.143786,0.0,0.136163,...,0.0,0.0,0.0,0.121567,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
def get_item_based_collabor(title):
    return item_based_collabor[title].sort_values(ascending=False)[:6]

In [19]:
get_item_based_collabor('Godfather, The (1972)') #Godfather를 보고 좋은평점을 매겼으면 유사한 영화를 5개추천

title
Godfather, The (1972)                        1.000000
Godfather: Part II, The (1974)               0.773685
Goodfellas (1990)                            0.620349
One Flew Over the Cuckoo's Nest (1975)       0.568244
American Beauty (1999)                       0.557997
Star Wars: Episode IV - A New Hope (1977)    0.546750
Name: Godfather, The (1972), dtype: float64

In [39]:
## 잠재요인 기반 협업필터링(행렬분해 사용), 사용자 개인맞춤형 방식 목표
df_ratings = pd.read_csv('./data/small_movie/ratings.csv')
df_movies = pd.read_csv('./data/small_movie/movies.csv')

In [40]:
df_user_movie_ratings = df_ratings.pivot(

    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)

In [41]:
df_user_movie_ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
#사용자-영화 피봇테이블을 사용자 별 각 영화평점 방식으로 변경

# matrix는 피봇 테이블 값을 넘파이 매트릭스로 만든 것
matrix = df_user_movie_ratings.values

# user_ratings_mean은 사용자 평균 평점
user_ratings_mean = np.mean(matrix, axis=1)

# R_user_mean : 사용자-영화에서 사용자 평균 평점을 뺀 것.
matrix_user_mean = matrix - user_ratings_mean.reshape(-1,1)

In [45]:
matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

In [46]:
pd.DataFrame(matrix_user_mean,columns = df_user_movie_ratings.columns).head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
0,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,...,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625
1,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,3.97077,...,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923
2,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,...,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075
3,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,3.902162,...,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838
4,-0.043128,-0.043128,3.956872,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,...,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128


In [48]:
from scipy.sparse.linalg import svds
from sklearn.decomposition import TruncatedSVD #행렬분해
# scipy에서 제공하는 svd.
# U행렬, sigma행렬, v전치행렬 반환
u, sigma, vt = svds(matrix_user_mean, k = 12)

In [49]:
print(u.shape)
print(sigma.shape)
print(vt.shape)

(671, 12)
(12,)
(12, 9066)


In [50]:
sigma = np.diag(sigma)
sigma.shape

(12, 12)

In [51]:
sigma[0]

array([105.72437051,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ])

In [52]:
# U, Sigma, Vt의 내적을 수행하면, 다시 원본 행렬로 복원이 된다
# 거기에 + 사용자 평균 rating을 적용한다
svd_user_predicted_ratings = np.dot(np.dot(u, sigma),vt) + user_ratings_mean.reshape(-1,1)

In [53]:
df_svd_preds = pd.DataFrame(svd_user_predicted_ratings, columns = df_user_movie_ratings.columns)
df_svd_preds.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
0,-0.079686,0.021779,-0.013837,-0.00587,-0.028877,0.032371,0.000715,-0.004428,-0.005219,0.038195,...,-0.004324,-0.004352,0.010478,-0.004256,-0.003944,-0.005674,0.018157,-0.005575,-0.005297,-0.003766
1,1.428452,1.608841,0.529476,0.168278,0.520809,1.107473,0.529719,0.089376,0.29627,1.970031,...,0.013227,-0.002275,0.02068,-0.005245,-0.007644,-0.021019,0.031243,-0.000957,-0.000753,0.026901
2,0.977246,0.396971,0.000299,0.027444,0.021287,0.141458,-0.057134,0.031633,-0.012538,0.383576,...,0.002761,0.004907,-0.01419,-0.000251,-0.006007,-0.003189,-0.026916,0.014637,0.013287,-0.005741
3,1.870844,1.169993,0.252202,0.094831,-0.181713,-0.511953,-0.02782,-0.14308,0.013247,1.461694,...,0.026412,-0.027245,0.054681,0.01845,0.034544,-0.03574,0.088889,-0.019365,-0.017113,0.066559
4,1.182777,0.924903,0.075998,0.061505,0.60268,-0.159825,0.339925,0.081534,-0.079666,0.535018,...,-0.029124,-0.029357,0.009064,-0.029092,-0.03089,-0.057453,0.026344,-0.024027,-0.024614,-0.032752


In [58]:
# 인자로 유저 아이디, 영화 정보 테이블, 평점 테이블 등을 받음
# 사용자 아이디에 SVD로 나온 결과의 영화 평점이 가장 높은 데이터 순으로 정렬
# 사용자가 본 데이터 제외
# 사용자가 안 본 영화에서 평점이 높은 것 추천

def recommend_movies(df_svd_preds, user_id, ori_movies_df, ori_ratings_df, num_recommendations=5):
    
    #현재는 인덱스로 적용이 되어있으므로 user_id -1 해야함.
    user_row_number = user_id -1
    
    #최종적으로 만든 pred_df에서 사용자 인덱스에 따라 영화 데이터 정렬 -> 영화 평점이 높은 순 정렬됨
    sorted_user_predictions = df_svd_preds.iloc[user_row_number].sort_values(ascending=False)
    
    #원본 평점 데이터에서 user_id에 해당하는 데이터를 뽑는다
    user_data = ori_ratings_df[ori_ratings_df.userId == user_id]
    
    #위에서 뽑은 user_data와 원본 영화 데이터를 합친다
    user_history = user_data.merge(ori_movies_df, on='movieId').sort_values(['rating'],ascending=False)
    
    #원본 영화 데이터에서 사용자가 본 영화 데이터를 제외한 데이터 추출
    recommendations = ori_movies_df[~ori_movies_df['movieId'].isin(user_history['movieId'])]
    
    #사용자의 영화평점이 높은 순으로 정렬된 데이터와 위 recommendations를 합친다
    recommendations = recommendations.merge(pd.DataFrame(sorted_user_predictions).reset_index(),on='movieId')
    
    #컬럼이름 바꾸고 정렬해서 리턴
    recommendations = recommendations.rename(columns = {user_row_number : 'Predictions'}).sort_values('Predictions',ascending=False)
    
    return user_history, recommendations

In [59]:
# 330번 유저가 10개의 영화를 추천받는다고 가정
already_rated, predictions = recommend_movies(df_svd_preds, 330, df_movies, df_ratings, 10)

In [60]:
already_rated.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
47,330,1094,5.0,948577200,"Crying Game, The (1992)",Drama|Romance|Thriller
14,330,213,5.0,948577039,Burnt by the Sun (Utomlyonnye solntsem) (1994),Drama
34,330,527,5.0,948577164,Schindler's List (1993),Drama|War
45,330,1035,5.0,948574126,"Sound of Music, The (1965)",Musical|Romance
48,330,1172,5.0,948575236,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama
53,330,1249,5.0,948576522,"Femme Nikita, La (Nikita) (1990)",Action|Crime|Romance|Thriller
27,330,373,5.0,948576522,Red Rock West (1992),Thriller
58,330,1428,5.0,948731410,Angel Baby (1995),Drama
61,330,1575,5.0,948577442,Gabbeh (1996),Drama
67,330,1683,5.0,948578017,"Wings of the Dove, The (1997)",Drama|Romance


In [61]:
predictions

Unnamed: 0,movieId,title,genres,Predictions
246,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,2.060064
294,356,Forrest Gump (1994),Comedy|Drama|Romance|War,1.772814
497,608,Fargo (1996),Comedy|Crime|Drama|Thriller,1.526111
487,590,Dances with Wolves (1990),Adventure|Drama|Western,1.508960
316,380,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller,1.276646
...,...,...,...,...
1619,2140,"Dark Crystal, The (1982)",Adventure|Fantasy,-0.148684
5028,7373,Hellboy (2004),Action|Adventure|Fantasy|Horror,-0.150847
918,1215,Army of Darkness (1993),Action|Adventure|Comedy|Fantasy|Horror,-0.175392
1490,2005,"Goonies, The (1985)",Action|Adventure|Children|Comedy|Fantasy,-0.219911
