In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, accuracy
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds

In [None]:
# Load the ratings table and the movie table from CSV files in the working directory.
rating_data = pd.read_csv("./ratings_new.csv")
movie_data = pd.read_csv("./movies.csv")
# Preview the first rows of the ratings table.
rating_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [None]:
# Preview the first rows of the movie table.
movie_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
print(movie_data.shape) #(rows, columns) of the movie table; the recorded run shows 9125 movies
print(rating_data.shape) #(rows, columns) of the ratings table; the recorded run shows 100004 ratings

(9125, 3)
(100004, 4)


# 특정 영화와 비슷한 영화 추천

In [None]:
# timestamp and genres are not needed for model-based filtering, so remove them.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it no longer raises KeyError because
# the column has already been dropped.
rating_data = rating_data.drop(columns='timestamp', errors='ignore')
movie_data = movie_data.drop(columns='genres', errors='ignore')
print(rating_data.head())
print(movie_data.head())

   userId  movieId  rating
0       1       31     2.5
1       1     1029     3.0
2       1     1061     3.0
3       1     1129     2.0
4       1     1172     4.0
   movieId                               title
0        1                    Toy Story (1995)
1        2                      Jumanji (1995)
2        3             Grumpier Old Men (1995)
3        4            Waiting to Exhale (1995)
4        5  Father of the Bride Part II (1995)


In [None]:
# Join every rating with the title of the rated movie, matching on movieId.
user_movie_data = rating_data.merge(movie_data, on='movieId')
print(user_movie_data.head())  # combined rows
print(user_movie_data.shape)   # each row now shows which title a given user rated

   userId  movieId  rating                   title
0       1       31     2.5  Dangerous Minds (1995)
1       7       31     3.0  Dangerous Minds (1995)
2      31       31     4.0  Dangerous Minds (1995)
3      32       31     4.0  Dangerous Minds (1995)
4      36       31     3.0  Dangerous Minds (1995)
(100004, 4)


In [None]:
# Reshape into a wide table: one row per userId, one column per title, cell = rating.
# (user, title) pairs without a rating become NaN and are then filled with 0.
# NOTE(review): pivot_table aggregates rows that share the same (userId, title)
# pair, so movies with identical titles end up in one column — confirm this is acceptable.
user_movie_rating = (
    user_movie_data
    .pivot_table(values='rating', index='userId', columns='title')
    .fillna(0)
)
print(user_movie_rating.head())
print(user_movie_rating.shape)  # shape of the user x title rating table

title   "Great Performances" Cats (1998)  $9.99 (2008)  \
userId                                                   
1                                    0.0           0.0   
2                                    0.0           0.0   
3                                    0.0           0.0   
4                                    0.0           0.0   
5                                    0.0           0.0   

title   'Hellboy': The Seeds of Creation (2004)  \
userId                                            
1                                           0.0   
2                                           0.0   
3                                           0.0   
4                                           0.0   
5                                           0.0   

title   'Neath the Arizona Skies (1934)  'Round Midnight (1986)  \
userId                                                            
1                                   0.0                     0.0   
2                                 

In [None]:
movie_user_rating = user_movie_rating.values.T #transpose the user x title array into a title x user array

In [None]:
# Reduce the title x user matrix to 12 latent components with TruncatedSVD.
# The estimator gets its own name so it no longer shadows the `SVD` class
# imported from `surprise`, and a fixed random_state makes the decomposition
# reproducible from one run to the next.
truncated_svd = TruncatedSVD(n_components=12, random_state=42)
matrix = truncated_svd.fit_transform(movie_user_rating)
matrix.shape #one row per movie title, 12 latent-factor values per row

(9064, 12)

In [None]:
# Pearson correlation between the latent-factor rows of every pair of movies.
# `matrix` must still be the TruncatedSVD output here; a later cell rebinds the name.
cor = np.corrcoef(matrix)
movie_title = user_movie_rating.columns
movie_title_list = list(movie_title)

# Query title and similarity cut-off, named so they are easy to find and change.
target_title = "Toy Story (1995)"
min_correlation = 0.9

# Row of `cor` holding the target movie's correlation with every movie.
target_idx = movie_title_list.index(target_title)
target_correlations = cor[target_idx]
# Up to 50 titles whose correlation with the target reaches the cut-off.
list(movie_title[(target_correlations >= min_correlation)])[:50]

['Before Sunrise (1995)', 'Sleepers (1996)', 'Toy Story (1995)']

# 사용자 개인 영화 히스토리 기반

In [None]:
# Wide user x movieId rating table; pairs without a rating are filled with 0.
df_user_movie_ratings = rating_data.pivot(index='userId', columns='movieId', values='rating')
df_user_movie_ratings = df_user_movie_ratings.fillna(0)
print(df_user_movie_ratings.head())  # ratings of each movie per user
print(df_user_movie_ratings.shape)

movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           0.0     0.0     4.0     0.0     0.0     0.0     0.0     0.0   

movieId  9       10      ...  161084  161155  161594  161830  161918  161944  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     4.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     4.0  ...     0.0     0.0     0.0     0.0     0.0     0

In [None]:
# Mean rating of each user (one value per row of the user x movie table).
# NOTE: this rebinds `matrix`, which earlier held the TruncatedSVD output.
matrix = df_user_movie_ratings.values
# NOTE(review): unrated cells were filled with 0 in the pivot step, so each mean is
# taken over every movie column, not only the movies the user rated — confirm intended.
user_ratings_mean = matrix.mean(axis=1)
# Centre each user's row by subtracting that user's mean (broadcast as a column vector).
matrix_user_mean = matrix - user_ratings_mean[:, np.newaxis]
pd.DataFrame(matrix_user_mean, columns=df_user_movie_ratings.columns).head()  # mean-centred user x movie ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
0,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,...,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625
1,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,3.97077,...,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923
2,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,...,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075
3,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,3.902162,...,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838
4,-0.043128,-0.043128,3.956872,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,...,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128


In [None]:
# Truncated SVD with 12 factors: svds returns U, the singular values, and V transposed.
U, sigma, Vt = svds(matrix_user_mean, k=12)
print(U.shape, sigma.shape, Vt.shape, sep="\n")

(671, 12)
(12,)
(12, 9066)


In [None]:
# svds returns only the singular values as a 1-D vector; expand them into a
# diagonal matrix so they can be used in the reconstruction product.
# The ndim guard keeps the cell idempotent: np.diag applied to an already
# 2-D matrix would extract its diagonal and turn it back into a vector.
if sigma.ndim == 1:
    sigma = np.diag(sigma)
sigma.shape

(12, 12)

In [None]:
# First row of the diagonal sigma matrix; in the recorded run only its leading entry is non-zero.
sigma[0]

array([105.72437051,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ])

In [None]:
# Rebuild the rating matrix from the factors as U . Sigma . Vt, then add each
# user's mean back (as a column vector) to undo the earlier centring.
svd_user_predicted_ratings = U @ sigma @ Vt + user_ratings_mean.reshape(-1, 1)

In [None]:
# Wrap the predicted ratings in a DataFrame whose columns are the movieIds of the
# pivot table. No index is passed, so rows are labelled by 0-based position,
# not by userId.
df_svd_preds = pd.DataFrame(svd_user_predicted_ratings,columns=df_user_movie_ratings.columns)
df_svd_preds.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
0,-0.079686,0.021779,-0.013837,-0.00587,-0.028877,0.032371,0.000715,-0.004428,-0.005219,0.038195,...,-0.004324,-0.004352,0.010478,-0.004256,-0.003944,-0.005674,0.018157,-0.005575,-0.005297,-0.003766
1,1.428452,1.608841,0.529476,0.168278,0.520809,1.107473,0.529719,0.089376,0.29627,1.970031,...,0.013227,-0.002275,0.02068,-0.005245,-0.007644,-0.021019,0.031243,-0.000957,-0.000753,0.026901
2,0.977246,0.396971,0.000299,0.027444,0.021287,0.141458,-0.057134,0.031633,-0.012538,0.383576,...,0.002761,0.004907,-0.01419,-0.000251,-0.006007,-0.003189,-0.026916,0.014637,0.013287,-0.005741
3,1.870844,1.169993,0.252202,0.094831,-0.181713,-0.511953,-0.02782,-0.14308,0.013247,1.461694,...,0.026412,-0.027245,0.054681,0.01845,0.034544,-0.03574,0.088889,-0.019365,-0.017113,0.066559
4,1.182777,0.924903,0.075998,0.061505,0.60268,-0.159825,0.339925,0.081534,-0.079666,0.535018,...,-0.029124,-0.029357,0.009064,-0.029092,-0.03089,-0.057453,0.026344,-0.024027,-0.024614,-0.032752


함수 정의: 인자로 SVD 예측 평점 테이블, 사용자 아이디, 영화 정보, 평점 테이블을 받음  
해당 사용자의 SVD 예측 평점을 높은 순으로 정렬  
사용자가 이미 평점을 매긴(본) 영화는 제외  
사용자가 보지 않은 영화 중 예측 평점이 높은 영화를 추천

In [None]:
def recommend_movies(df_svd_preds, user_id, ori_movies_df, ori_ratings_df, num_recommendations=10):
    """Recommend movies a user has not rated yet, ranked by predicted rating.

    Parameters
    ----------
    df_svd_preds : pandas.DataFrame
        Predicted ratings with one row per user and one column per movieId.
        Rows are addressed by position: ``user_id - 1`` is used as the row
        position, so user ids are assumed to be contiguous and 1-based.
    user_id : int
        Id of the user to recommend for.
    ori_movies_df : pandas.DataFrame
        Movie table containing at least a ``movieId`` column.
    ori_ratings_df : pandas.DataFrame
        Ratings table with ``userId``, ``movieId`` and ``rating`` columns.
    num_recommendations : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``user_history``: the user's ratings joined with the movie table,
        sorted by rating in descending order.
        ``recommendations``: movies absent from the user's history, with a
        ``Predictions`` column, sorted by predicted rating in descending order.

    Raises
    ------
    IndexError
        If ``user_id - 1`` is not a valid row position of ``df_svd_preds``.
    """
    user_row_number = user_id - 1
    if not 0 <= user_row_number < len(df_svd_preds):
        # A negative position would wrap around to another user's row instead
        # of failing, so reject it up front with a clear message.
        raise IndexError(
            "user_id {} maps to row {}, outside the {} rows of df_svd_preds".format(
                user_id, user_row_number, len(df_svd_preds)
            )
        )

    # Name the prediction column explicitly instead of relying on the row label
    # being equal to the row position, and make sure the join key is 'movieId'.
    user_predictions = (
        df_svd_preds.iloc[user_row_number]
        .rename('Predictions')
        .rename_axis('movieId')
        .reset_index()
    )

    # Everything this user has rated, joined with the movie table, best-rated first.
    user_data = ori_ratings_df[ori_ratings_df.userId == user_id]
    user_history = user_data.merge(ori_movies_df, on='movieId').sort_values(['rating'], ascending=False)

    # Movies the user has not rated, ranked by predicted rating.
    unseen_movies = ori_movies_df[~ori_movies_df['movieId'].isin(user_history['movieId'])]
    recommendations = (
        unseen_movies.merge(user_predictions, on='movieId')
        .sort_values('Predictions', ascending=False)
        .iloc[:num_recommendations, :]
    )

    return user_history, recommendations

In [None]:
# Recommend 10 unrated movies for user 15, then show that user's 10 highest-rated movies.
already_rated, predictions = recommend_movies(
    df_svd_preds,
    user_id=15,
    ori_movies_df=movie_data,
    ori_ratings_df=rating_data,
    num_recommendations=10,
)
already_rated.head(10)

Unnamed: 0,userId,movieId,rating,title
659,15,3300,5.0,Pitch Black (2000)
778,15,3996,5.0,"Crouching Tiger, Hidden Dragon (Wo hu cang lon..."
495,15,2395,5.0,Rushmore (1998)
757,15,3915,5.0,Girlfight (2000)
759,15,3948,5.0,Meet the Parents (2000)
760,15,3949,5.0,Requiem for a Dream (2000)
143,15,593,5.0,"Silence of the Lambs, The (1991)"
769,15,3983,5.0,You Can Count on Me (2000)
171,15,800,5.0,Lone Star (1996)
187,15,912,5.0,Casablanca (1942)


In [None]:
predictions #recommended movies with their predicted ratings

Unnamed: 0,movieId,title,Predictions
662,1080,Monty Python's Life of Brian (1979),3.231573
669,1090,Platoon (1986),3.031839
224,348,Bullets Over Broadway (1994),2.967341
515,852,Tin Cup (1996),2.918055
486,805,"Time to Kill, A (1996)",2.851778
413,653,Dragonheart (1996),2.783178
216,337,What's Eating Gilbert Grape (1993),2.730816
422,671,Mystery Science Theater 3000: The Movie (1996),2.721821
240,376,"River Wild, The (1994)",2.673036
33,58,"Postman, The (Postino, Il) (1994)",2.637897


latent factor 이용하는 방법  
-장르에 정확히 대응하는 특성을 추출하기는 어려울 수 있음  
-지난 프로젝트에서도 추출된 latent factor를 해석할 수 없다는 어려움이 있었음  
-> 잠재 요인이 무엇을 의미하는지 명확히 알 수 없음

-> 모델 기반 방식(여기서는 SVD)은 scalability와 속도 면에서 장점이 있으나, 전체 데이터셋을 그대로 이용하지 않고 SVD로 차원을 축소한 모델을 이용하기 때문에 다른 추천 시스템에 비해 정확성이 떨어질 수 있다.

평점 외에 러닝타임이나 출연 배우 등의 변수를 고려하는 방법  
 -> 일반적인 MF 모델에서는 쉽지 않을 수도..  
 -> 평점 외에 다른 변수를 추가하려면 딥러닝 모델이나 FM(Factorization Machine) 모델을 이용  
 -> 또는 MF 모델을 여러 개 만들어 조합하는 방법  