In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Load the ratings table; the first CSV column is used as the row index.
dummy_rating = pd.read_csv("data/dummy_rating.csv", index_col=0)

In [10]:
# Check the dimensions of the loaded frame as (rows, columns).
dummy_rating.shape

(5, 6)

In [11]:
# Preview the first rows; entries a user has not rated show up as missing values.
dummy_rating.head()

Unnamed: 0,scifi1,scifi2,scifi3,comedy1,comedy2,comedy3
user1,4.0,5.0,3.0,,2.0,1.0
user2,5.0,3.0,3.0,2.0,2.0,
user3,1.0,,,4.0,5.0,4.0
user4,,2.0,1.0,4.0,,3.0
user5,1.0,,2.0,3.0,3.0,4.0


In [12]:
# Replace missing ratings with 0. The name is rebound to the returned frame
# rather than using inplace=True, so the originally loaded object is not
# mutated behind the reader's back.
# NOTE(review): after this step a 0 is indistinguishable from a real rating,
# so any per-user mean or similarity computed on this frame counts unrated
# movies as zeros -- confirm this is the intended treatment of missing data.
dummy_rating = dummy_rating.fillna(0)
dummy_rating

Unnamed: 0,scifi1,scifi2,scifi3,comedy1,comedy2,comedy3
user1,4.0,5.0,3.0,0.0,2.0,1.0
user2,5.0,3.0,3.0,2.0,2.0,0.0
user3,1.0,0.0,0.0,4.0,5.0,4.0
user4,0.0,2.0,1.0,4.0,0.0,3.0
user5,1.0,0.0,2.0,3.0,3.0,4.0


In [13]:
# Adjust each user's ratings. Applying cosine similarity to the adjusted rows
# afterwards is equivalent to Pearson correlation between the original rows
# (cosine ignores the per-row scale factor, so only the mean-centering matters).
def standardize(row):
    """Mean-center a row of ratings and scale it by the row's value range.

    Args:
        row: pandas Series (or any object offering ``mean``/``max``/``min``
            and element-wise arithmetic) holding one row of ratings.

    Returns:
        ``(row - row.mean()) / (row.max() - row.min())``. When every value in
        the row is identical the range is zero; in that case the centered row
        (all zeros) is returned instead of dividing, which would yield NaN.
    """
    centered = row - row.mean()
    value_range = row.max() - row.min()
    if value_range == 0:
        # Constant row: 0/0 would produce NaN and propagate into anything
        # computed from the result, so keep the all-zero centered row.
        return centered
    return centered / value_range

# Run standardize once per row (i.e. across the columns of each row) and
# preview the adjusted ratings.
dummy_rating_std = dummy_rating.apply(
    standardize,
    axis="columns",
)
dummy_rating_std.head()

Unnamed: 0,scifi1,scifi2,scifi3,comedy1,comedy2,comedy3
user1,0.3,0.5,0.1,-0.5,-0.1,-0.3
user2,0.5,0.1,0.1,-0.1,-0.1,-0.5
user3,-0.266667,-0.466667,-0.466667,0.333333,0.533333,0.333333
user4,-0.416667,0.083333,-0.166667,0.583333,-0.416667,0.333333
user5,-0.291667,-0.541667,-0.041667,0.208333,0.208333,0.458333


In [14]:
# Build the similarity matrix WITHOUT standardization. Each row of
# dummy_rating is compared with every other row, so the result is a
# row-by-row (user-by-user) matrix labelled with the frame's index.
corrMatrix_wo_std = pd.DataFrame(
    data=cosine_similarity(dummy_rating),
    index=dummy_rating.index,
    columns=dummy_rating.index,
)
corrMatrix_wo_std

Unnamed: 0,user1,user2,user3,user4,user5
user1,1.0,0.906306,0.318696,0.393893,0.431834
user2,0.906306,1.0,0.422891,0.434613,0.515716
user3,0.318696,0.422891,1.0,0.671249,0.925138
user4,0.393893,0.434613,0.671249,1.0,0.760117
user5,0.431834,0.515716,0.925138,0.760117,1.0


In [15]:
# Build the similarity matrix from the standardized ratings. Rows of
# dummy_rating_std are compared pairwise, giving a row-by-row (user-by-user)
# matrix labelled with the index of dummy_rating.
corrMatrix = pd.DataFrame(
    data=cosine_similarity(dummy_rating_std),
    index=dummy_rating.index,
    columns=dummy_rating.index,
)
corrMatrix

Unnamed: 0,user1,user2,user3,user4,user5
user1,1.0,0.68313,-0.807391,-0.589188,-0.907841
user2,0.68313,1.0,-0.648886,-0.596285,-0.785553
user3,-0.807391,-0.648886,1.0,0.253917,0.824965
user4,-0.589188,-0.596285,0.253917,1.0,0.360555
user5,-0.907841,-0.785553,0.824965,0.360555,1.0


In [16]:
def get_similar(userId):
    """Return the similarity scores for ``userId``, highest first.

    Looks up the ``userId`` column of the module-level ``corrMatrix`` and
    sorts it in descending order. The column is returned whole, so the entry
    for ``userId`` itself is included in the result.
    """
    return corrMatrix[userId].sort_values(ascending=False)

In [17]:
# Pick the reference user and rank every user by similarity to them.
scifi_lover = "user1"
similar_users = get_similar(userId=scifi_lover)

# Show at most the ten highest-scoring entries.
similar_users.head(10)

user1    1.000000
user2    0.683130
user4   -0.589188
user3   -0.807391
user5   -0.907841
Name: user1, dtype: float64