In [110]:
import os
import pandas as pd
import pickle
from pathlib import Path

data_path = os.getcwd()

# Load the ratings table from "modified.csv" in the working directory.
# Only the four columns needed downstream are read, with compact dtypes.
ratings_csv = os.path.join(data_path, "modified.csv")
column_dtypes = {
    'userId': 'int32',
    'movieId': 'int32',
    'rating': 'float32',
    'title': 'str',
}
df_ratings = pd.read_csv(
    ratings_csv,
    usecols=list(column_dtypes),
    dtype=column_dtypes,
)

df_ratings

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5
5,1,Toy Story (1995),18,3.5
6,1,Toy Story (1995),19,4.0
7,1,Toy Story (1995),21,3.5
8,1,Toy Story (1995),27,3.0
9,1,Toy Story (1995),31,5.0


In [111]:
from scipy.sparse import csr_matrix

# Build a title x user matrix from the ratings each user gave.
# Movies a user has not rated get a rating of 0.
df_ratings = df_ratings.drop_duplicates(subset=['userId', 'title'])
ratings_by_title = df_ratings.pivot(index='title', columns='userId', values='rating')
df_ratings_pivot = ratings_by_title.fillna(0)
# Sparse copy of the same matrix, used to fit the KNN model.
df_ratings_matrix = csr_matrix(df_ratings_pivot.values)
df_ratings_pivot

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Tis the Season for Love (2015),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"'burbs, The (1989)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'night Mother (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
*batteries not included (1987),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [112]:
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour search over movie rating vectors, using cosine distance.
knn_settings = {'metric': 'cosine', 'algorithm': 'brute'}
model_knn = NearestNeighbors(**knn_settings)
model_knn.fit(df_ratings_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [6]:
# Sanity-check that the KNN model recommends sensible movies.

import numpy as np

# Pick one movie (a row of the pivot table) at random and ask for similar movies.
query_index = np.random.choice(df_ratings_pivot.shape[0])

# Request 6 neighbours: the query movie is normally one of them, leaving 5 others.
distances, indices = model_knn.kneighbors(
    df_ratings_pivot.iloc[query_index, :].values.reshape(1, -1),
    n_neighbors=6)

# Exclude the query movie by its row position instead of assuming it is the
# first hit: a movie with an identical rating vector ties at the same distance
# and may be returned ahead of the query itself.
neighbours = [
    (idx, dist)
    for idx, dist in zip(indices.flatten(), distances.flatten())
    if idx != query_index
][:5]

# Print the neighbours in the order the model returned them (closest first).
print('Recommendations for {0}:\n'.format(df_ratings_pivot.index[query_index]))
for rank, (idx, dist) in enumerate(neighbours, start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, df_ratings_pivot.index[idx], dist))
    

Recommendations for Home Alone (1990):

1: Mrs. Doubtfire (1993), with distance of 0.39578694105148315:
2: Lion King, The (1994), with distance of 0.4414706230163574:
3: Pretty Woman (1990), with distance of 0.4444250464439392:
4: Jurassic Park (1993), with distance of 0.4748850464820862:
5: Jumanji (1995), with distance of 0.47512364387512207:


In [7]:
# Refit the KNN model before wrapping the lookup in a function.
knn_options = dict(metric='cosine', algorithm='brute')
model_knn = NearestNeighbors(**knn_options)
model_knn.fit(df_ratings_matrix)

def get_movie_rec(movie, released_year, model_knn, df_ratings_pivot, n_recommendations=5):
    """Return a text reply listing the movies most similar to the given one.

    Args:
        movie: Movie name without the year, e.g. "Titanic".
        released_year: Release year; a str or an int (it is formatted into the title).
        model_knn: Fitted object exposing ``kneighbors(X, n_neighbors=...)`` and
            returning ``(distances, indices)``.
            Assumes it was fitted on the rows of ``df_ratings_pivot`` in the same
            order, so returned indices are row positions -- TODO confirm at call sites.
        df_ratings_pivot: DataFrame indexed by "Name (Year)" titles, one row per movie.
        n_recommendations: How many movies to list (default 5, as before).

    Returns:
        A string starting with "Recommendations for <movie>:" followed by one
        numbered line per recommended title.

    Raises:
        KeyError: If "<movie> (<released_year>)" is not in the pivot index.
    """
    # Rebuild the title in the same format as the index, e.g. "Name (Year)".
    movie_year = "{0} ({1})".format(movie, released_year)

    query_vector = df_ratings_pivot.loc[movie_year, :].values.reshape(1, -1)
    query_position = df_ratings_pivot.index.get_loc(movie_year)

    # Ask for one extra neighbour because the query movie is normally returned
    # too; never ask for more neighbours than there are rows.
    n_query = min(n_recommendations + 1, df_ratings_pivot.shape[0])
    _distances, indices = model_knn.kneighbors(query_vector, n_neighbors=n_query)

    # Remove the query movie by position rather than assuming it is the first
    # hit: movies with identical rating vectors tie on distance and can be
    # returned ahead of it.
    neighbour_positions = [pos for pos in indices.flatten() if pos != query_position]
    neighbour_positions = neighbour_positions[:n_recommendations]

    # Build the reply text that is sent back to the user.
    lines = ['Recommendations for {0}:\n'.format(movie)]
    for rank, pos in enumerate(neighbour_positions, start=1):
        lines.append('{0}: {1}\n'.format(rank, df_ratings_pivot.index[pos]))

    return "".join(lines)
    

In [65]:
# Check that the helper works on a known title.
titanic_reply = get_movie_rec("Titanic", "1997", model_knn, df_ratings_pivot)
print (titanic_reply)

Recommendations for Titanic:
1: Men in Black (a.k.a. MIB) (1997)
2: Star Wars: Episode I - The Phantom Menace (1999)
3: Saving Private Ryan (1998)
4: Shrek (2001)
5: Catch Me If You Can (2002)



# knn train 시킨 결과를 export 하기

In [100]:
# Get the recommendations for every movie and store them in a dictionary
# keyed by (movie name, year).
movie_rec = {}
skipped_titles = []  # titles that could not be matched back to the pivot index

# df_ratings has one row per rating, so the same title repeats many times;
# visit each distinct title once (in first-appearance order) instead of
# re-running the neighbour search for every rating.
for title in df_ratings.title.unique():
    cleaned = title.strip()
    year = cleaned[-5:-1]           # the four characters inside the trailing "(YYYY)"
    movie = cleaned[:-6].strip()    # everything before the trailing "(YYYY)"
    try:
        movie_rec[(movie, year)] = get_movie_rec(movie, year, model_knn, df_ratings_pivot)
    except KeyError:
        # The rebuilt "Name (Year)" string is not in the pivot index (e.g. the
        # title does not end with a year). Skip it, but keep a record rather
        # than silently swallowing every possible error.
        skipped_titles.append(title)

In [10]:
# Save the dictionary as a pickle file.
# The `with` block closes the file so the data is flushed to disk.
with open(os.path.join(data_path, "movie_rec.pkl"), "wb") as pkl_file:
    pickle.dump(movie_rec, pkl_file)

In [108]:
# Reload the pickled dictionary and spot-check one entry.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(os.path.join(data_path, "movie_rec.pkl"), "rb") as pkl_file:
    movie_rec = pickle.load(pkl_file)
movie_rec[('Toy Story', '1995')]

'Recommendations for Toy Story:\n1: Toy Story 2 (1999)\n2: Jurassic Park (1993)\n3: Independence Day (a.k.a. ID4) (1996)\n4: Star Wars: Episode IV - A New Hope (1977)\n5: Forrest Gump (1994)\n'