In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import operator


In [2]:
ratings = pd.read_csv('ratings_small.csv')
ratings = ratings[['userId','movieId','rating']]
ratings.head()
ratings.movieId = pd.to_numeric(ratings.movieId, errors='coerce')
ratings.userId = pd.to_numeric(ratings.userId, errors='coerce')
ratings.rating = pd.to_numeric(ratings.rating, errors='coerce')

In [3]:
#여기서 결측치를 처리하고 movieId를 count했을때 개수에 대해 70%의 것들만 가져옴
# 적은수의 평가가 있는 영화는 제외
ratings = ratings[pd.notnull(ratings['rating'])]
f = ['count', 'mean']
df_movie_summary = ratings.groupby('movieId')['rating'].agg(f)
df_movie_summary.index = df_movie_summary.index.map(int) #map함수 쓰면 한번에 형변환 처리 가능
movie_benchmark = round(df_movie_summary['count'].quantile(0.7),0) #quantile사분위수
drop_movie_list = df_movie_summary[df_movie_summary['count']<movie_benchmark].index
print('Movie minimum times of review: {}'.format(movie_benchmark))

Movie minimum times of review: 7.0


In [4]:
#영화리뷰가 너무 적은것들 제외
df_cust_summary = ratings.groupby('userId')['rating'].agg(f)
df_cust_summary.index = df_cust_summary.index.map(int)
cust_benchmark = round(df_cust_summary['count'].quantile(0.7),0)
drop_cust_list = df_cust_summary[df_cust_summary['count']<cust_benchmark].index
print('Customer minimum times of review: {}'.format(cust_benchmark))

Customer minimum times of review: 138.0


In [5]:
#위에서 제외할 리스트들을 넣어주어 drop
# ratings의 'Movie_Id'에서 drop_movie_list의 값이 있으면 True
ratings = ratings[~ratings['movieId'].isin(drop_movie_list)]
ratings = ratings[~ratings['userId'].isin(drop_cust_list)]

In [11]:
meta = pd.read_csv('movies_metadata.csv',low_memory=False)
meta = meta[['id','original_title','original_language']]
meta = meta.rename(columns={'id':'movieId'})
meta.movieId = pd.to_numeric(meta.movieId,errors='coerce')
meta.head()

Unnamed: 0,movieId,original_title,original_language
0,862.0,Toy Story,en
1,8844.0,Jumanji,en
2,15602.0,Grumpier Old Men,en
3,31357.0,Waiting to Exhale,en
4,11862.0,Father of the Bride Part II,en


In [12]:
user_movie = pd.merge(ratings, meta, on = 'movieId')
user_movie.head()

Unnamed: 0,userId,movieId,rating,original_title,original_language
0,4,112,5.0,Italiensk for begyndere,da
1,15,112,2.5,Italiensk for begyndere,da
2,19,112,3.0,Italiensk for begyndere,da
3,21,112,4.0,Italiensk for begyndere,da
4,73,112,3.5,Italiensk for begyndere,da


In [13]:
piv = user_movie.pivot_table('rating',index = 'userId',columns='original_title').fillna(0)
piv.shape

(202, 1113)

In [14]:
piv.head()

original_title,...Più forte ragazzi!,10 Items or Less,10 Things I Hate About You,12 + 1,1984,2 Days in Paris,"20,000 Leagues Under the Sea",2001: A Space Odyssey,24 Hour Party People,25th Hour,...,隠し砦の三悪人,風の谷のナウシカ,鬼婆,거룩한 계보,괴물,밀양,빈집,사마리아,해안선,활
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,1.5,3.0,0.0,1.0,0.0,4.0,0.0,2.0,0.0,0.0,...,1.5,0.0,2.0,2.0,0.0,0.0,3.0,0.0,0.0,0.0
17,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
piv = piv.T
piv = piv.loc[:,(piv!=0).any(axis=0)]

In [53]:
piv.head()

userId,4,15,17,19,21,22,23,26,30,33,...,627,641,646,647,648,652,654,659,664,665
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
...Più forte ragazzi!,0.0,1.5,0.5,0.0,0.0,3.0,5.0,0.0,4.0,0.0,...,3.5,0.0,0.0,0.0,1.0,0.0,5.0,0.0,0.0,3.0
10 Items or Less,0.0,3.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
12 + 1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,1.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# Our data needs to be in a sparse matrix format to be read by the following functions
piv_sparse = sp.sparse.csr_matrix(piv.values)

item_similarity = cosine_similarity(piv_sparse)
user_similarity = cosine_similarity(piv_sparse.T)
# Inserting the similarity matricies into dataframe objects
item_sim_df = pd.DataFrame(item_similarity, index = piv.index, columns = piv.index)
user_sim_df = pd.DataFrame(user_similarity, index = piv.columns, columns = piv.columns)

In [46]:
user_sim_df.head()

userId,4,15,17,19,21,22,23,26,30,33,...,627,641,646,647,648,652,654,659,664,665
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,1.0,0.303341,0.163605,0.267443,0.278762,0.201712,0.177498,0.09802,0.329623,0.129482,...,0.138512,0.161257,0.293616,0.176093,0.120672,0.147283,0.35796,0.139495,0.145146,0.286119
15,0.303341,1.0,0.411392,0.413817,0.301702,0.317426,0.515829,0.295397,0.502896,0.202182,...,0.340263,0.246228,0.183757,0.40348,0.327811,0.21655,0.49567,0.296973,0.492153,0.331544
17,0.163605,0.411392,1.0,0.272256,0.176268,0.27109,0.424943,0.305088,0.338456,0.044119,...,0.293802,0.070617,0.098503,0.37834,0.350864,0.081229,0.264525,0.180035,0.369918,0.18084
19,0.267443,0.413817,0.272256,1.0,0.482937,0.195461,0.393612,0.130314,0.402556,0.070634,...,0.215849,0.33425,0.161669,0.291741,0.197046,0.128516,0.33194,0.441613,0.213778,0.254076
21,0.278762,0.301702,0.176268,0.482937,1.0,0.230237,0.303997,0.14067,0.322255,0.082047,...,0.223057,0.170723,0.165814,0.231981,0.192954,0.091928,0.290541,0.229557,0.159711,0.190618


In [56]:
# This function will return the top 5 users with the highest similarity value 
def top_users(user):
    if user not in piv.columns:
        return('No data available on user {}'.format(user))

    print('유사한 사용자:\n')
    result = user_sim_df.sort_values(by=user, ascending=False).loc[:,user][1:11]
    for user, sim in result.items():
        print('User #{0}, 유사도값: {1:.2f}'.format(user, sim)) 

In [57]:
top_users(15)

유사한 사용자:

User #509, 유사도값: 0.59
User #73, 유사도값: 0.59
User #388, 유사도값: 0.58
User #547, 유사도값: 0.58
User #468, 유사도값: 0.57
User #461, 유사도값: 0.57
User #580, 유사도값: 0.57
User #452, 유사도값: 0.54
User #472, 유사도값: 0.53
User #624, 유사도값: 0.53


In [58]:
# This function constructs a list of lists containing the highest rated shows per similar user
# and returns the name of the show along with the frequency it appears in the list

def similar_user_recs(user):
    if user not in piv.columns:
        return('No data available on user {}'.format(user))

    # 유사도가 높은 10명의 사용자를 가져옵니다. 
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:11] 
    best = []
    most_common = {}


    for i in sim_users:
        # 유사도가 높은 10명의 사용자들이 평가점수를 높게 주었던 item list를 가져옵니다. 
        # 단, 주의해야할 점은 추천하려고 하는 대상 user가 평가하지 않았던 아이템이어야 합니다. 
        result_sorted = piv.loc[:, i][(piv.loc[:,user] == 0)].sort_values(ascending = False)
        best.append(result_sorted.index[:10].tolist())
#     print(best)
    for i in range(len(best)):
        for j in best[i]:
            if j in most_common:
                most_common[j] += 1
            else:
                most_common[j] = 1
    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:10]    

In [59]:
similar_user_recs(15)

[('The Thirteenth Floor', 4),
 ("One Night at McCool's", 4),
 ('Totally Blonde', 3),
 ('The Remains of the Day', 3),
 ('To Kill a Mockingbird', 3),
 ('Once in a Lifetime: The Extraordinary Story of the New York Cosmos', 3),
 ('Enough', 2),
 ('Les Quatre Cents Coups', 2),
 ('An Unfinished Life', 2),
 ("Vampire's Kiss", 2)]