In [1]:
import pandas as pd
import numpy as np

In [2]:
class kNNWithMeans_Recommender:
    def __init__(self,sim_dataframe,average_ratings_by_item,k_factor): ## 생성자
        self.sim_dataframe = sim_dataframe
        self.average_ratings_by_item = average_ratings_by_item
        self.k_factor=k_factor
    def estimate_all_for_user(self,user_input_dict):
        ## Estimate all the unknown items to a particular user
        estimated_ratings={}
        for item_to_estimate in self.sim_dataframe.columns:
            if item_to_estimate in user_input_dict.keys():
            # in case we ever want to estimate known ratings
            # 이미 방문한 카테고리는 필요없음
                pass
            else:
                # we need to estimate that particular item
                # 방문하지 않은 카테고리
                sim_from_item = \
                    self.sim_dataframe[item_to_estimate].copy()
                    # 방문하지 않은 카테고리의 지금 까지의 유사도 
                sim_from_item.sort_values(
                    inplace=True, ascending = False)       #유사도를 정렬  가까운 노드부터 보기 위해서 ->> k 개만 비교할 것이기 때문
                    # 해당 사용지의 방문하지 않은 카테고리의 유사도 계산
                estimated_ratings[item_to_estimate] = \
                self.estimate_item(
                    item_to_estimate, sim_from_item, user_input_dict)
                
        return estimated_ratings
    
    def estimate_item(self, item_to_estimate, sim_from_item, user_input_dict):
        ## Estimate the rating of one particular item, based on a user's ratings.
        
        numerator=0 #분자
        denominator =0 # 분모
        number_of_ratings_used =0 
        
        i=1
        while i < len(sim_from_item):
            #looping through all the items
            if sim_from_item.index[i] in user_input_dict.keys():
                number_of_ratings_used += 1
                if number_of_ratings_used == self.k_factor:
                    # if we already used up k items, skip the others
                    i = len(sim_from_items)+1
                
                
                current_item=sim_from_item.index[i]
                current_sim = sim_from_item[current_item]
                current_rating = user_input_dict[current_item]
                denominator += current_sim
                numerator += current_sim * \
                (user_input_dict[current_item] - \
                self.average_ratings_by_item.loc[current_item])
                
            i+=1
        if denominator == 0 : ## 해당 카테고리는 모든 유사도가 0이다
            print(item_to_estimate)
            estimation = np.nan
        else:
            estimation = \
            self.average_ratings_by_item.loc[item_to_estimate] + \
            numerator / denominator
        
        return estimation

In [3]:
df = pd.read_csv("log_parsing.csv")
avg_ratings_by_item = df.groupby('category').mean()['stayTime'].copy()
avg_ratings_by_item.loc['cat']

20.666666666666668

In [4]:
from surprise import Reader,Dataset
from surprise.model_selection import cross_validate
from surprise import accuracy

reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(df,reader)
data = data.build_full_trainset()

In [5]:
from surprise import KNNWithMeans
my_sim_option = {'name':'pearson','user_based':False}
model = KNNWithMeans(sim_options = my_sim_option,verbose=False)
model.fit(data)
np.savetxt('./results/pearson_sim.csv',model.sim,delimiter=',')

In [6]:
import csv

data_iids = list(data.all_items())
iid_converter = lambda x : data.to_raw_iid(x)
data_raw_iids = list(map(iid_converter,data_iids))

with open('./results/item_ids_for_sim_matrix.csv','w') as f:
    writer = csv.writer(f)
    writer.writerows(zip(data_raw_iids, data_iids))

In [7]:
pearson_sim = \
    pd.read_csv("./results/pearson_sim.csv",header=None)
sim_matrix_itemlist = \
    pd.read_csv("./results/item_ids_for_sim_matrix.csv",header=None)

pearson_sim.columns = sim_matrix_itemlist[0]
pearson_sim.index = sim_matrix_itemlist[0]
pearson_sim

Unnamed: 0_level_0,dog,model-and-actor,body,seminude,bj,withyou,person,animalcompany,artwork-dark,wedding,...,landscape,uniform,fashionshow,street,club,portrait-strong,good-omen,panorama,tour,first-birthday
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
dog,1.0,0.679228,0.595619,1.0,0.0,0.192846,0.241774,0.060694,1.0,0.917663,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
model-and-actor,0.679228,1.0,0.253374,-0.078392,0.243509,0.0,0.171157,0.0,0.0,0.0,...,0.0,0.764309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
body,0.595619,0.253374,1.0,0.794544,0.095293,0.0,0.035788,0.39736,0.460615,0.0,...,0.0,0.537931,0.0,0.0,0.0,1.0,0.563621,0.0,0.0,0.0
seminude,1.0,-0.078392,0.794544,1.0,0.042224,0.0,0.133969,0.0,-0.056115,0.54139,...,0.0,0.764309,0.0,0.0,0.0,0.0,0.563621,0.0,0.0,0.0
bj,0.0,0.243509,0.095293,0.042224,1.0,0.0,0.566638,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
withyou,0.192846,0.0,0.0,0.0,0.0,1.0,0.029153,0.985453,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
person,0.241774,0.171157,0.035788,0.133969,0.566638,0.029153,1.0,0.608495,0.678179,0.839985,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
animalcompany,0.060694,0.0,0.39736,0.0,0.0,0.985453,0.608495,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
artwork-dark,1.0,0.0,0.460615,-0.056115,0.0,0.0,0.678179,0.0,1.0,-0.30182,...,0.0,0.0,0.0,0.0,0.0,-0.016335,-0.166876,-0.904194,0.0,0.0
wedding,0.917663,0.0,0.0,0.54139,0.0,0.0,0.839985,0.0,-0.30182,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0


In [8]:
k  = 5 # 근접한 몇개의 노드를 비교 할 것인지

recommender = kNNWithMeans_Recommender(pearson_sim,avg_ratings_by_item,k)

In [9]:
user_input={
    "dog":10
}

In [12]:
estimated_ratings = recommender.estimate_all_for_user(user_input)
result_sr = pd.Series(estimated_ratings)
result_sr = result_sr.dropna()
result_sr = result_sr.sort_values(ascending=False)
print(result_sr)

# sorted_ratings = \
#     sorted(estimated_ratings.items(),key = lambda x : x[1])
# sorted_ratings




seminude
bj
artwork-dark
video-excercise
video-fashion
event
artwork-creative-h-and-m
evil-omen
kids
group
musician
lookbook
video-entertainment
zoo
4050
ceo
video-wedding
video-club
video-music
video-sketch
location
landscape
uniform
fashionshow
street
club
portrait-strong
good-omen
panorama
tour
first-birthday
body               30.800529
withyou            30.773731
person              8.808079
product             7.162822
model-and-actor     3.374610
cat                 2.920398
animalcompany       0.987065
food               -3.246269
wedding            -5.099210
couple            -12.579602
dtype: float64
