In [1]:
import pandas as pd
import csv
from collections import defaultdict
from datetime import datetime
import matplotlib.patches as mpatches
import matplotlib
import time
import math
from operator import itemgetter
from scipy.spatial import distance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
ratings = pd.read_csv('./data/ml-20m/ratings.csv')
ratings = ratings.iloc[:10000, :]
ratings.tail(5)

Unnamed: 0,userId,movieId,rating,timestamp
9995,91,2797,3.5,1112061221
9996,91,2857,4.0,1111966776
9997,91,2858,4.5,1111557477
9998,91,2863,4.5,1111558557
9999,91,2890,2.5,1113202901


In [3]:
movies = pd.read_csv('./data/ml-20m/movies.csv')
movies.tail(5)

Unnamed: 0,movieId,title,genres
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)
27277,131262,Innocence (2014),Adventure|Fantasy|Horror


## 사용자 샘플 선출

In [4]:
UM_matrix_ds = ratings.pivot(index = 'userId', columns = 'movieId', values='rating')
UM_matrix_ds.head(5)

movieId,1,2,3,4,5,6,7,8,9,10,...,111921,112138,112290,112556,112852,116797,117511,117590,118696,125916
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.5,,,,,,,,,...,,,,,,,,,,
2,,,4.0,,,,,,,,...,,,,,,,,,,
3,4.0,,,,,,,,,,...,,,,,,,,,,
4,,,,,,3.0,,,,4.0,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,


In [5]:
selected_user = pd.DataFrame(columns = ratings.columns)
for i in range(0, 10000, 1000):
    selected_user = selected_user.append(ratings.iloc[i])
selected_user.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1.0,2.0,3.5,1112486000.0
1000,11.0,527.0,4.5,1251171000.0
2000,18.0,4967.0,5.0,1196423000.0
3000,24.0,4321.0,3.0,994232800.0
4000,35.0,110.0,4.5,1164499000.0
5000,50.0,1094.0,4.0,1182678000.0
6000,54.0,3198.0,2.0,975440600.0
7000,59.0,1.0,4.5,1380401000.0
8000,70.0,2916.0,2.0,1020294000.0
9000,83.0,342.0,3.5,1112724000.0


## 1-a 사용자 유사도 측정

In [6]:
def distance_cosine(a, b):
    return 1-distance.cosine(a,b)

In [7]:
def nearest_neighbor_user(user, topN, simFunc):
    ul = UM_matrix_ds.loc[user].dropna()
    ratedIndex = ul.index
    nn = {}
    
    for uid, row in UM_matrix_ds.iterrows():
        interSectionU1 = []
        interSectionU2 = []
        if uid == user: continue
            
        for i in ratedIndex:
            if not math.isnan(row[i]):
                interSectionU1.append(ul[i])
                interSectionU2.append(row[i])
        interSection = len(interSectionU1)
            
        if interSection < 3: continue
        sim = simFunc(interSectionU1, interSectionU2)
            
        if not math.isnan(sim): nn[uid] = sim
        
    return sorted(nn.items(), key = itemgetter(1))[:-(topN+1):-1]

In [8]:
for user in selected_user['userId']:
    print('User {0} neighbors : {1}'.format(user, nearest_neighbor_user(int(user), 3, distance_cosine)))

User 1.0 neighbors : [(81, 0.9984603532054125), (59, 0.9980449936610162), (15, 0.997785157856609)]
User 11.0 neighbors : [(81, 0.997040538050167), (62, 0.9948894628669589), (42, 0.9947454674783927)]
User 18.0 neighbors : [(71, 0.9967171510149001), (59, 0.996551724137931), (63, 0.9953106423895011)]
User 24.0 neighbors : [(71, 0.9926288007437607), (81, 0.9916744114284443), (10, 0.9887979383153774)]
User 35.0 neighbors : [(20, 0.9988356607724258), (9, 0.9946717954563347), (49, 0.992319826667701)]
User 50.0 neighbors : [(84, 0.9998513121692769), (81, 0.9982743731749959), (77, 0.9945378653782108)]
User 54.0 neighbors : [(79, 0.9918819421856829), (57, 0.986758804781948), (55, 0.9838393259779317)]
User 59.0 neighbors : [(26, 0.9984038297885895), (82, 0.9982743731749959), (1, 0.9980449936610162)]
User 70.0 neighbors : [(10, 0.9943767126843691), (59, 0.9928083647904367), (44, 0.9878048780487805)]
User 83.0 neighbors : [(89, 0.9984427709212877), (49, 0.9962422739487996), (67, 0.9961110313020488)

## 1-b 평점 예측

In [9]:
def distance_euclidean(a, b):
    return distance.euclidean

In [10]:
print(distance_euclidean(1,3))

<function euclidean at 0x000001DA77CF9D90>


In [11]:
def predict_rating(userid, nn=100, simFunc=distance_euclidean):
    neighbor = nearest_neighbor_user(userid, nn, simFunc)
    neighbor_id = [id for id, sim in neighbor]
    
    neighbor_movie = UM_matrix_ds.loc[neighbor_id].dropna(1, how='all', thresh=4)
    neighbor_dict = (dict(neighbor))
    ret = []
    
    for movieId, row in neighbor_movie.iteritems():
        jsum, wsum = 0, 0
        for v in row.dropna().iteritems():
            sim = neighbor_dict.get(v[0],0)
            jsum += sim
            wsum += (v[1]*sim)
        ret.append([movieId, wsum/jsum])
        
    return ret

In [12]:
result = []
for i in range(10):
    userId = int(selected_user.iloc[i].userId)
    movieId = int(selected_user.iloc[i].movieId)
    predict = predict_rating(userId, 300, distance_cosine)
    
    for movie in predict:
        if movieId == movie[0]:
            result.append([int(userId), int(movieId), movie[1]])

resultdf = pd.DataFrame(result, columns=['userId', 'movieId', 'rating'])
resultdf

Unnamed: 0,userId,movieId,rating
0,1,2,2.802626
1,11,527,4.30801
2,35,110,4.112809
3,50,1094,3.75047
4,59,1,3.940832
5,70,2916,3.999457
6,83,342,3.777067


## 1-c 에러 측정하기

In [13]:
realdata_rating = []
for userid in resultdf['userId']:
    realdata_rating.append(float(selected_user[selected_user['userId']==userid]['rating']))

resultdata_rating = resultdf.rating.tolist()

error_rate_absol = mean_absolute_error(realdata_rating, resultdata_rating)
error_rate_squared = mean_squared_error(realdata_rating, resultdata_rating)
print("\nError Rate(Absolute) : ", error_rate_absol)
print("Error Rate(Squared) : ", error_rate_squared)


Error Rate(Absolute) :  0.6231109979919659
Error Rate(Squared) :  0.7318051701956932
