In [1]:
# coding: utf-8
import pandas as pd
import numpy as np
from matplotlib import rcParams
import matplotlib.pyplot as plt
from collections import defaultdict
from datetime import datetime
import matplotlib.patches as mpatches
import matplotlib
import time
from __future__ import print_function
import math
from operator import itemgetter
from scipy.spatial import distance
%matplotlib inline  

# Plot styling: NanumGothic font (presumably chosen for Korean labels --
# confirm the font is installed), 12pt text, and the ggplot theme.
rcParams['font.family'] = 'NanumGothic'
rcParams.update({'font.size': 12})
matplotlib.style.use('ggplot')
# Show at most 14 rows whenever a DataFrame is displayed.
pd.options.display.max_rows=14

In [2]:
# Load the rating and movie tables from CSV files located relative to the
# current working directory.
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

In [3]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392
5,1,329,5.0,838983392
6,1,355,5.0,838984474
...,...,...,...,...
10000047,71567,2012,3.0,912580722
10000048,71567,2028,5.0,912580344


In [4]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
...,...,...,...
10674,65027,"Death Kiss, The (1933)",Comedy|Mystery
10675,65037,Ben X (2007),Drama


In [5]:
# Reshape the long ratings table into a user x movie matrix: one row per
# userId, one column per movieId, each cell holding that user's rating.
# (userId, movieId) combinations without a rating become NaN.
UM_matrix_ds = ratings.pivot(index = 'userId', columns = 'movieId', values = 'rating')
UM_matrix_ds

movieId,1,2,3,4,5,6,7,8,9,10,...,65006,65011,65025,65027,65037,65088,65091,65126,65130,65133
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1.0,,,,,,3.0,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71561,,,,,,,,,,,...,,,,,,,,,,
71562,,3.5,,,,,,,,,...,,,,,,,,,,


In [6]:
def distance_cosine(a,b):
    """Cosine similarity between two equal-length rating vectors.

    ``distance.cosine`` returns a dissimilarity (1 - cosine of the angle),
    so it is subtracted from 1 to recover the similarity.
    """
    cosine_dissimilarity = distance.cosine(a, b)
    return 1 - cosine_dissimilarity

def distance_corr(a,b):
    """Pearson correlation between two equal-length rating vectors.

    ``distance.correlation`` returns the correlation distance
    (1 - correlation), so it is subtracted from 1 to recover the
    correlation itself.
    """
    return 1 - distance.correlation(a, b)

# Backward-compatible alias: the function was originally published under this
# misspelled name, so existing callers keep working.
disance_corr = distance_corr

def distance_euclidean(a,b):
    """Similarity derived from the Euclidean distance between two vectors.

    Identical vectors score 1; the score shrinks towards 0 as the vectors
    move apart. Adding 1 to the distance keeps the denominator non-zero.
    """
    gap = distance.euclidean(a, b)
    return 1 / (gap + 1)

In [7]:
def nearest_neighbor_user(user, topN, simFunc, matrix=None, min_common=3) :
    """Return the ``topN`` users most similar to ``user``.

    Similarity is computed only over the movies both users have rated.

    Parameters
    ----------
    user : row label of the target user in the rating matrix.
    topN : maximum number of neighbours to return.
    simFunc : callable ``simFunc(ratings_a, ratings_b) -> float`` receiving
        two equal-length lists of ratings (target user first).
    matrix : user x movie DataFrame with NaN for missing ratings. Defaults to
        the notebook-level ``UM_matrix_ds``.
    min_common : minimum number of co-rated movies a candidate needs in
        order to be scored (default 3).

    Returns
    -------
    list of ``(user_id, similarity)`` tuples, highest similarity first.
    Candidates whose similarity is NaN are left out.

    Raises
    ------
    KeyError if ``user`` is not a row label of the matrix.
    """
    if matrix is None:
        matrix = UM_matrix_ds

    target = matrix.loc[user].dropna()
    # Only the target's rated movies can ever be co-rated, so narrow the
    # matrix to those columns once instead of probing every rated movie on a
    # full-width row for each candidate.
    candidates = matrix.loc[:, target.index]
    nn = {}

    for uid, row in candidates.iterrows():
        if uid == user:
            continue

        common = row.dropna()
        if len(common) < min_common:
            continue

        sim = simFunc(target.loc[common.index].tolist(), common.tolist())

        if not math.isnan(sim):
            nn[uid] = sim

    # Ascending sort read backwards from the end: the topN largest
    # similarities, best first.
    return sorted(nn.items(), key = itemgetter(1))[:-(topN + 1):-1]

In [8]:
# Users whose ratings are predicted and evaluated further down.
user_list = [8, 18, 34, 36, 47, 56, 65, 73, 82, 92]
# Neighbour cache keyed by str(userId); predictRating looks users up here.
neighbor_list = {}

for userId in user_list :
    # Up to 20000 neighbours per user, scored with the Euclidean similarity.
    neighbor = nearest_neighbor_user(userId, 20000, distance_euclidean)
    # Dump the neighbour list to a file named after the user id. The `with`
    # block closes the handle even when the write raises.
    with open(str(userId), 'w') as file_neighbor:
        file_neighbor.write(str(neighbor))
    neighbor_list[str(userId)] = neighbor
    print(str(userId) + ' -> ' + str(neighbor[:3]))

8 -> [(12847, 1.0), (65233, 0.6666666666666666), (59117, 0.6666666666666666)]
18 -> [(63194, 1.0), (59356, 1.0), (43932, 1.0)]
34 -> [(66779, 1.0), (55019, 1.0), (1513, 1.0)]
36 -> [(64966, 1.0), (64851, 1.0), (58872, 1.0)]
47 -> [(71129, 1.0), (66681, 1.0), (65164, 1.0)]
56 -> [(70798, 1.0), (69561, 1.0), (68984, 1.0)]
65 -> [(71027, 1.0), (64394, 1.0), (62795, 1.0)]
73 -> [(70933, 1.0), (69725, 1.0), (69694, 1.0)]
82 -> [(70915, 1.0), (70812, 1.0), (69603, 1.0)]
92 -> [(71459, 1.0), (69313, 1.0), (68797, 1.0)]


In [9]:
def predictRating(userid, nn=50, simFunc=distance_euclidean) :
    """Predict ratings for ``userid`` from the ratings of similar users.

    Each prediction is the similarity-weighted mean of the ratings that the
    selected neighbours gave to a movie.

    Parameters
    ----------
    userid : user whose neighbour list is read from ``neighbor_list``
        (keyed by ``str(userid)``).
    nn : number of most-similar neighbours to use (default 50).
    simFunc : not used by the computation; kept so existing callers that
        pass it keep working.

    Returns
    -------
    list of ``[movieId, predicted_rating]`` pairs, one per movie rated by at
    least 4 of the selected neighbours. Movies whose similarity weights sum
    to zero are skipped because no weighted mean is defined for them.

    Raises
    ------
    KeyError if no neighbour list was stored for ``userid``.
    """
    # Neighbour lists are stored best-first, so slicing keeps the nn most
    # similar users.
    neighbor = neighbor_list[str(userid)][:nn]
    neighbor_id = [uid for uid, _ in neighbor]
    neighbor_dic = dict(neighbor)

    # Keep only movies with at least 4 ratings among the selected neighbours.
    # `thresh` alone expresses this; recent pandas rejects passing it together
    # with `how`.
    neighbor_movie = UM_matrix_ds.loc[neighbor_id].dropna(axis=1, thresh=4)
    ret = []

    for movieId, column in neighbor_movie.items():
        jsum, wsum = 0, 0
        for uid, rating in column.dropna().items():
            sim = neighbor_dic.get(uid, 0)
            jsum += sim
            wsum += rating * sim
        if jsum == 0:
            # No usable weight for this movie.
            continue
        ret.append([movieId, wsum/jsum])

    return ret

In [10]:
# Second name for the same DataFrame object as `ratings` (no copy is made).
eval_ratings = ratings

def eval_prediction( predict_users,  n_users=50 ):
    """Build an evaluation table of actual versus predicted ratings.

    Parameters
    ----------
    predict_users : iterable of user ids to predict for.
    n_users : neighbour count forwarded to ``predictRating``.

    Returns
    -------
    DataFrame with the rows of ``eval_ratings`` that received a positive
    prediction. Two columns are added: ``mean_rating`` (the movie's mean
    rating over ``ratings``) and ``euclidean`` (the neighbour-based
    prediction). Row labels are those of the merged ratings table.
    """
    movie_means = ratings[['movieId','rating']].groupby(['movieId']).mean().reset_index()
    ds = pd.merge(eval_ratings, movie_means, on='movieId', how='left')
    ds = ds.rename(columns= {'rating_x':'rating', 'rating_y':'mean_rating'})

    # Collect every prediction first. A later entry for the same
    # (user, movie) pair overwrites an earlier one.
    predictions = {}
    for userId in predict_users:
        for movieId, value in predictRating(userId, n_users, distance_euclidean):
            predictions[(userId, movieId)] = value

    # Only rows of the predicted users can carry a prediction. Looking each
    # row up once replaces a full-table boolean-mask assignment per
    # predicted movie.
    ds = ds[ds.userId.isin(list(predict_users))].copy()
    ds['euclidean'] = [predictions.get(key, 0) for key in zip(ds.userId, ds.movieId)]

    return ds[ds.euclidean > 0]

In [11]:
# Predict for every user in user_list. 20000 is the same neighbour count
# that was requested when the neighbour lists were built.
predicted = eval_prediction(user_list, 20000)
predicted

Unnamed: 0,userId,movieId,rating,timestamp,mean_rating,euclidean
351,8,2,2.5,1115858432,3.208070,3.335316
352,8,5,3.0,1116550582,3.077435,3.173791
353,8,6,4.0,1116547028,3.813011,3.767617
354,8,16,3.0,1115859664,3.749974,3.647759
355,8,19,3.5,1115859653,2.578199,2.960988
356,8,22,2.5,1111545739,3.331164,3.233836
357,8,31,3.5,1116547192,3.273417,3.391018
...,...,...,...,...,...,...
10013,92,42718,4.0,1162164412,3.688612,4.239403
10014,92,44191,3.0,1162163793,3.872191,3.928420


In [12]:
# Save the prediction table as comma-separated text; missing values are
# written as the literal string NaN.
predicted.to_csv('predict.csv', sep=',', na_rep='NaN')

In [13]:
# Load the ratings used as test cases.
# NOTE(review): the cell below finds every one of these (userId, movieId)
# pairs in `predicted`, which is built from ratings.csv -- confirm that the
# test ratings are really held out from the data used for prediction.
test = pd.read_csv('test.csv')
test

Unnamed: 0,userId,movieId,rating,timestamp
0,8,5678,3.5,1111624249
1,18,2378,3.0,1111554408
2,34,552,4.0,981825279
3,36,4544,4.0,1049942449
4,47,778,4.0,1162150265
5,56,3095,5.0,1162156484
6,65,1353,2.0,970835728
7,73,3115,2.0,974326984
8,82,49272,2.0,1216275888
9,92,8798,4.0,1162164173


In [14]:
# Attach the mean-rating baseline and the neighbour-based prediction to each
# test row, matched on (userId, movieId). A left merge keeps every test row
# in place: a row without a prediction gets NaN instead of silently shifting
# the following values onto the wrong rows.
match_keys = ['userId', 'movieId']
# Keep the first prediction per (user, movie) pair, i.e. "first match wins".
lookup = predicted[match_keys + ['mean_rating', 'euclidean']].drop_duplicates(match_keys)
matched = test[match_keys].merge(lookup, on=match_keys, how='left')

predict_ratings = matched['mean_rating'].tolist()
predict_euclidean = matched['euclidean'].tolist()

# Assign by position so the result does not depend on the index of `test`.
test['mean_rating'] = matched['mean_rating'].values
test['euclidean'] = matched['euclidean'].values
test

Unnamed: 0,userId,movieId,rating,timestamp,mean_rating,euclidean
0,8,5678,3.5,1111624249,3.004808,3.049289
1,18,2378,3.0,1111554408,2.901833,3.018252
2,34,552,4.0,981825279,3.228934,3.190423
3,36,4544,4.0,1049942449,2.41657,3.209735
4,47,778,4.0,1162150265,4.011534,4.189934
5,56,3095,5.0,1162156484,4.050864,4.563958
6,65,1353,2.0,970835728,3.186354,3.209394
7,73,3115,2.0,974326984,3.190594,2.811132
8,82,49272,2.0,1216275888,3.881323,3.635516
9,92,8798,4.0,1162164173,3.704308,3.744532


In [24]:
def RMSE(X, left_col, right_col):
    """Root mean squared difference between two columns of ``X``."""
    residual = X[left_col] - X[right_col]
    mean_squared = np.mean(residual ** 2)
    return np.sqrt(mean_squared)

def MAE(X, left_col, right_col):
    """Mean absolute difference between two columns of ``X``."""
    residual = X[left_col] - X[right_col]
    magnitude = np.absolute(residual)
    return np.mean(magnitude)

In [27]:
# Mean absolute error of the movie-mean baseline, then of the neighbour-based
# prediction.
# NOTE(review): both are computed over `predicted`, not over the `test`
# frame prepared above -- confirm this is the intended evaluation set.
print(MAE(predicted, 'rating', 'mean_rating'))
print(MAE(predicted, 'rating', 'euclidean'))

0.688974335481705
0.5529019214712361


In [28]:
# Root mean squared error of the movie-mean baseline, then of the
# neighbour-based prediction.
# NOTE(review): both are computed over `predicted`, not over the `test`
# frame prepared above -- confirm this is the intended evaluation set.
print(RMSE(predicted, 'rating', 'mean_rating'))
print(RMSE(predicted, 'rating', 'euclidean'))

0.866917353671
0.710008557079
