In [1]:
import pandas as pd
import csv
from collections import defaultdict
from datetime import datetime
import matplotlib.patches as mpatches
import matplotlib
import time
import math
from operator import itemgetter
from scipy.spatial import distance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# [과제1] 네이버 평점 데이터 수집

In [2]:
naver_user = pd.read_csv('./data/naver_user.csv')
print(len(naver_user))
naver_user.tail()

100


Unnamed: 0,reviewNo,userId
95,15771929,eyku
96,15771927,geon
97,15771926,myil
98,15771925,hous
99,15771922,huya


In [3]:
rating_data = pd.read_csv('./data/final_rating.csv')
del rating_data['Unnamed: 0']
print(len(rating_data))
rating_data.tail()

5124


Unnamed: 0,userId,rating,movieId
5119,ebc8,1,174065
5120,glab,8,137327
5121,zxcv,1,174065
5122,kktw,7,47701
5123,kktw,6,62328


In [4]:
user_num = rating_data.userId.unique().shape[0]
movie_num = rating_data.movieId.unique().shape[0]

print('유저의 수는 {}명이고, \n영화의 수는 {}개 입니다.'.format(user_num, movie_num))

유저의 수는 100명이고, 
영화의 수는 2697개 입니다.


# [과제 2-a] 유사 사용자 탐색

## ★top10 뽑기★

### <사전작업1> final_rating.csv 읽어오기

In [5]:
ratings = pd.read_csv('./data/final_rating.csv')
del ratings['Unnamed: 0']
ratings.tail(5)

Unnamed: 0,userId,rating,movieId
5119,ebc8,1,174065
5120,glab,8,137327
5121,zxcv,1,174065
5122,kktw,7,47701
5123,kktw,6,62328


### <사전작업2> naver_user.csv 읽어오기

In [6]:
users = pd.read_csv('./data/naver_user.csv')
users.head(5)

Unnamed: 0,reviewNo,userId
0,15772038,airf
1,15772037,nanw
2,15772036,zxcv
3,15772035,sdh1
4,15772032,guan


### <사전작업3> final_rating 파일과 user 파일 join

In [7]:
join_table = pd.merge(ratings, users, how='inner')
join_table.tail()

Unnamed: 0,userId,rating,movieId,reviewNo
5118,huya,10,36666,15771922
5119,huya,10,37235,15771922
5120,patl,10,161967,15771989
5121,yoya,10,161967,15771960
5122,ebc8,1,174065,15771979


### <사전작업4> join한 파일에서 userId로 groupby -> 가장 큰 10개 뽑기

In [8]:
top10_user = join_table.groupby(['userId']).size().nlargest(10).reset_index()
top10_user

Unnamed: 0,userId,0
0,ykm3,700
1,sang,691
2,tsp0,677
3,hosu,564
4,zxcv,357
5,zard,276
6,artn,192
7,suha,108
8,ldsl,105
9,imag,102


### <사전작업5> top10_user와 ratings를 outer join

In [9]:
top10_table = pd.merge(top10_user, ratings, how='outer')
top10_table

Unnamed: 0,userId,0,rating,movieId
0,ykm3,700.0,1,137938
1,ykm3,700.0,3,145162
2,ykm3,700.0,8,86343
3,ykm3,700.0,8,127459
4,ykm3,700.0,8,127767
5,ykm3,700.0,10,153729
6,ykm3,700.0,10,98438
7,ykm3,700.0,8,152680
8,ykm3,700.0,9,125466
9,ykm3,700.0,3,127382


### <사전작업6> ppt에 있는 movieId는 해당 userId의 index=1인 데이터이기 때문에 iloc[1]

In [10]:
movieId_list = []
# 'userId == imag'은 새로운 데이터가 추가되어서 따로 처리
for i in range(10):
    movieId_list.append(top10_table[top10_table['userId']==top10_user['userId'][i]].iloc[1].movieId)
    
print(movieId_list)

[145162, 161967, 163788, 180399, 86507, 158653, 172174, 180399, 157297, 181409]


### <사전작업7> movieId 칼럼 추가

In [11]:
top10_user['movieId'] = movieId_list
top10_user

Unnamed: 0,userId,0,movieId
0,ykm3,700,145162
1,sang,691,161967
2,tsp0,677,163788
3,hosu,564,180399
4,zxcv,357,86507
5,zard,276,158653
6,artn,192,172174
7,suha,108,180399
8,ldsl,105,157297
9,imag,102,181409


### <사전작업8> 위의 top10_user와 join_table을 inner join

In [12]:
top10_user = pd.merge(top10_user, join_table, how='inner')
top10_user

Unnamed: 0,userId,0,movieId,rating,reviewNo
0,ykm3,700,145162,3,15771936
1,sang,691,161967,10,15771961
2,tsp0,677,163788,7,15771934
3,hosu,564,180399,7,15771998
4,zxcv,357,86507,10,15772036
5,zard,276,158653,10,15772012
6,artn,192,172174,10,15771948
7,suha,108,180399,7,15771976
8,ldsl,105,157297,1,15771977
9,imag,102,181409,9,15771940


### <사전작업9> 불필요한 column 제거

In [13]:
del top10_user[0]
top10_user

Unnamed: 0,userId,movieId,rating,reviewNo
0,ykm3,145162,3,15771936
1,sang,161967,10,15771961
2,tsp0,163788,7,15771934
3,hosu,180399,7,15771998
4,zxcv,86507,10,15772036
5,zard,158653,10,15772012
6,artn,172174,10,15771948
7,suha,180399,7,15771976
8,ldsl,157297,1,15771977
9,imag,181409,9,15771940


### <사전작업10> columns 순서 변경

In [14]:
columnList = ['userId','rating','movieId','reviewNo']
top10_user = top10_user[columnList]
top10_user

Unnamed: 0,userId,rating,movieId,reviewNo
0,ykm3,3,145162,15771936
1,sang,10,161967,15771961
2,tsp0,7,163788,15771934
3,hosu,7,180399,15771998
4,zxcv,10,86507,15772036
5,zard,10,158653,15772012
6,artn,10,172174,15771948
7,suha,7,180399,15771976
8,ldsl,1,157297,15771977
9,imag,9,181409,15771940


### <사전작업11> UM_matrix_ds 생성

In [15]:
UM_matrix_ds = join_table.pivot(index = 'reviewNo', columns = 'movieId', values='rating')
UM_matrix_ds.head(100)

movieId,10002,10003,10004,10005,10006,10008,10009,10012,10016,10018,...,181409,181410,181411,181414,181419,181711,182348,182360,183132,183877
reviewNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15771922,,,,,,,,,,,...,,,,,,,,,,
15771925,,,,,,,,,,,...,,8.0,,,,,,,,
15771926,,,,,,,,,,,...,,,,,,,,,,
15771927,,,,,,,,,,,...,,,,,,,,,,
15771929,,,,,,,,,,,...,,,,,,,,,,
15771931,,,,,,,,,,,...,,,,,,,,,,
15771932,,,,,,,,,,,...,,,,,,,,,,
15771933,,,,,,,,,,,...,,,,,,,,,,
15771934,,,,,,,,9.0,9.0,,...,,,,,,,,,,
15771935,,,,,,,,,,,...,,,,,,,,,,


## 2-a. Cosine

In [16]:
def distance_cosine(a, b):
    return 1-distance.cosine(a,b)

In [17]:
def nearest_neighbor_user(user, topN, simFunc):
    ul = UM_matrix_ds.loc[user].dropna()
    ratedIndex = ul.index
    nn = {}
    
    for uid, row in UM_matrix_ds.iterrows():
        interSectionU1 = []
        interSectionU2 = []
        if uid == user: continue
            
        for i in ratedIndex:
            if not math.isnan(row[i]):
                interSectionU1.append(ul[i])
                interSectionU2.append(row[i])
        interSection = len(interSectionU1)
            
        if interSection < 3: continue
        sim = simFunc(interSectionU1, interSectionU2)
            
        if not math.isnan(sim): nn[uid] = round(sim,2)
        
    return sorted(nn.items(), key = itemgetter(1))[:-(topN+1):-1]

In [18]:
for user in top10_user['reviewNo']:
    print('User {0} neighbors : {1}'.format(user, nearest_neighbor_user(int(user), 3, distance_cosine)))

User 15771936 neighbors : [(15771972, 1.0), (15771993, 0.97), (15772003, 0.95)]
User 15771961 neighbors : [(15771970, 1.0), (15771966, 1.0), (15771947, 1.0)]
User 15771934 neighbors : [(15772009, 1.0), (15771942, 1.0), (15772030, 0.99)]
User 15771998 neighbors : [(15771970, 1.0), (15771965, 1.0), (15771947, 1.0)]
User 15772036 neighbors : [(15771959, 1.0), (15772000, 0.98), (15771966, 0.98)]
User 15772012 neighbors : [(15772003, 1.0), (15772016, 0.95), (15772000, 0.92)]
User 15771948 neighbors : [(15771966, 1.0), (15771945, 1.0), (15771943, 0.98)]
User 15771976 neighbors : [(15771974, 1.0), (15771966, 1.0), (15771947, 1.0)]
User 15771977 neighbors : [(15771971, 1.0), (15771959, 1.0), (15771983, 0.99)]
User 15771940 neighbors : [(15771993, 1.0), (15771981, 1.0), (15771947, 1.0)]


## 2-a. Correlation

In [19]:
def distance_correlation(a, b):
    return 1-distance.correlation(a,b)

In [20]:
for user in top10_user['reviewNo']:
    print('User {0} neighbors : {1}'.format(user, nearest_neighbor_user(int(user), 3, distance_correlation)))

  dist = 1.0 - uv / np.sqrt(uu * vv)


User 15771936 neighbors : [(15771993, 1.0), (15772022, 0.69), (15772019, 0.5)]
User 15771961 neighbors : [(15771972, 1.0), (15772015, 0.94), (15771974, 0.93)]
User 15771934 neighbors : [(15771942, 1.0), (15772030, 0.98), (15771947, 0.96)]
User 15771998 neighbors : [(15772005, 0.89), (15771922, 0.88), (15771980, 0.66)]
User 15772036 neighbors : [(15771966, 0.97), (15771943, 0.94), (15771971, 0.62)]
User 15772012 neighbors : [(15772016, 0.94), (15771974, 0.58), (15771969, 0.58)]
User 15771948 neighbors : [(15771943, 0.94), (15771938, 0.73), (15771926, 0.59)]
User 15771976 neighbors : [(15771940, 0.85), (15771974, 0.58), (15771927, 0.53)]
User 15771977 neighbors : [(15771971, 1.0), (15771959, 1.0), (15771998, 0.66)]
User 15771940 neighbors : [(15771981, 0.98), (15771995, 0.97), (15771974, 0.94)]


## 2-a. Euclidean

In [21]:
def distance_euclidean(a, b):
    return 1 / (1+distance.euclidean(a,b))

In [22]:
for user in top10_user['reviewNo']:
    print('User {0} neighbors : {1}'.format(user, nearest_neighbor_user(int(user), 3, distance_euclidean)))

User 15771936 neighbors : [(15771972, 0.25), (15771993, 0.2), (15772020, 0.12)]
User 15771961 neighbors : [(15772031, 0.33), (15771970, 0.33), (15771954, 0.33)]
User 15771934 neighbors : [(15772030, 0.41), (15771988, 0.29), (15772003, 0.25)]
User 15771998 neighbors : [(15771922, 0.17), (15772030, 0.16), (15772005, 0.15)]
User 15772036 neighbors : [(15771959, 1.0), (15771943, 0.24), (15772022, 0.19)]
User 15772012 neighbors : [(15772016, 0.22), (15772003, 0.15), (15771974, 0.13)]
User 15771948 neighbors : [(15771945, 0.41), (15771943, 0.23), (15771938, 0.15)]
User 15771976 neighbors : [(15771974, 0.31), (15771926, 0.31), (15771966, 0.29)]
User 15771977 neighbors : [(15771971, 1.0), (15771959, 0.5), (15771938, 0.25)]
User 15771940 neighbors : [(15771993, 1.0), (15771947, 1.0), (15771981, 0.33)]


## 2-b. 영화 평점 예측

In [23]:
def predict_rating(userid, nn, simFunc):
    neighbor = nearest_neighbor_user2(userid, nn, simFunc)
    neighbor_id = [id for id, sim in neighbor]
    
    neighbor_movie = UM_matrix_ds.loc[neighbor_id].dropna(1, how='all', thresh=1)
    neighbor_dict = (dict(neighbor))
    ret = []
    
    for movieId, row in neighbor_movie.iteritems():
        jsum, wsum = 0, 0
        for v in row.dropna().iteritems():
            sim = neighbor_dict.get(v[0],0)
            jsum += sim
            wsum += (v[1]*sim)
        ret.append([movieId, wsum/jsum])
        
    return ret

In [24]:
def nearest_neighbor_user2(user, topN, simFunc):
    ul = UM_matrix_ds.loc[user].dropna()
    ratedIndex = ul.index
    nn = {}
    
    for uid, row in UM_matrix_ds.iterrows():
        interSectionU1 = []
        interSectionU2 = []
        if uid == user: continue
            
        for i in ratedIndex:
            if not math.isnan(row[i]):
                interSectionU1.append(ul[i])
                interSectionU2.append(row[i])
        interSection = len(interSectionU1)
            
        if interSection < 3: continue
        sim = simFunc(interSectionU1, interSectionU2)
            
        if not math.isnan(sim): nn[uid] = sim
        
    return sorted(nn.items(), key = itemgetter(1))[:-(topN+1):-1]

### 2-b. Cosine

In [25]:
result = []
for i in range(10):
    reviewNo = int(top10_user.iloc[i].reviewNo)
    movieId = int(top10_user.iloc[i].movieId)
    predict = predict_rating(reviewNo, 100, distance_cosine)
    
    for movie in predict:
        if movieId == movie[0]:
            result.append([int(reviewNo), int(movieId), movie[1]])

resultdf = pd.DataFrame(result, columns=['userId', 'movieId', 'rating'])
resultdf

Unnamed: 0,userId,movieId,rating
0,15771936,145162,5.796201
1,15771961,161967,8.59564
2,15771934,163788,9.150525
3,15771998,180399,7.0
4,15772036,86507,8.561077
5,15771976,180399,7.0
6,15771977,157297,5.80092


#### Error Rate

In [26]:
realdata_rating = []
for userid in resultdf['userId']:
    realdata_rating.append(float(top10_user[top10_user['reviewNo']==userid]['rating']))

resultdata_rating = resultdf.rating.tolist()

error_rate_absol = mean_absolute_error(realdata_rating, resultdata_rating)
error_rate_squared = mean_squared_error(realdata_rating, resultdata_rating)
print("\nError Rate(Absolute) : ", error_rate_absol)
print("Error Rate(Squared) : ", error_rate_squared)


Error Rate(Absolute) :  1.798704122760153
Error Rate(Squared) :  5.6478653086387975


### 2-b. Euclidean

In [27]:
result = []
for i in range(10):
    reviewNo = int(top10_user.iloc[i].reviewNo)
    movieId = int(top10_user.iloc[i].movieId)
    predict = predict_rating(reviewNo, 100, distance_euclidean)
    
    for movie in predict:
        if movieId == movie[0]:
            result.append([int(reviewNo), int(movieId), movie[1]])

resultdf = pd.DataFrame(result, columns=['userId', 'movieId', 'rating'])
resultdf

Unnamed: 0,userId,movieId,rating
0,15771936,145162,4.649008
1,15771961,161967,9.263031
2,15771934,163788,8.956179
3,15771998,180399,7.0
4,15772036,86507,8.054289
5,15771976,180399,7.0
6,15771977,157297,6.588583


#### Error Rate

In [28]:
realdata_rating = []
for userid in resultdf['userId']:
    realdata_rating.append(float(top10_user[top10_user['reviewNo']==userid]['rating']))

resultdata_rating = resultdf.rating.tolist()

error_rate_absol = mean_absolute_error(realdata_rating, resultdata_rating)
error_rate_squared = mean_squared_error(realdata_rating, resultdata_rating)
print("\nError Rate(Absolute) : ", error_rate_absol)
print("Error Rate(Squared) : ", error_rate_squared)


Error Rate(Absolute) :  1.6966357708958737
Error Rate(Squared) :  6.0152917257030625


### 2-b. Correlation

In [29]:
result = []
for i in range(10):
    reviewNo = int(top10_user.iloc[i].reviewNo)
    movieId = int(top10_user.iloc[i].movieId)
    predict = predict_rating(reviewNo, 100, distance_correlation)
    
    for movie in predict:
        if movieId == movie[0]:
            result.append([int(reviewNo), int(movieId), movie[1]])

resultdf = pd.DataFrame(result, columns=['userId', 'movieId', 'rating'])
resultdf

  dist = 1.0 - uv / np.sqrt(uu * vv)
  from ipykernel import kernelapp as app


Unnamed: 0,userId,movieId,rating
0,15771936,145162,5.433217
1,15771961,161967,10.715807
2,15771934,163788,15.406481
3,15771998,180399,7.0
4,15772036,86507,11.304849
5,15771976,180399,7.0
6,15771977,157297,0.954683


#### Error Rate

In [30]:
realdata_rating = []
for userid in resultdf['userId']:
    realdata_rating.append(float(top10_user[top10_user['reviewNo']==userid]['rating']))

resultdata_rating = resultdf.rating.tolist()

error_rate_absol = mean_absolute_error(realdata_rating, resultdata_rating)
error_rate_squared = mean_squared_error(realdata_rating, resultdata_rating)
print("\nError Rate(Absolute) : ", error_rate_absol)
print("Error Rate(Squared) : ", error_rate_squared)


Error Rate(Absolute) :  1.8436674517673026
Error Rate(Squared) :  11.258077133072339
