# データ前処理

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors


# Raw inputs: purchase logs, user master and product master.
logs = pd.read_csv('../sample/medium/log_medium.csv')
users = pd.read_csv('../sample/medium/user_medium.csv')
products = pd.read_csv('../sample/medium/product_medium.csv')

# Integer code assigned to each product category.
category_list = {"tops": 0, "pants": 1, "outer": 2, "bag": 3, "shoes": 4}

# Drop the master-table columns that are not needed downstream.
df_u = users.drop(columns=['age', 'address', 'last_login', 'last_purchase'])
df_p = products.drop(columns=['shop', 'tag_price'])
df_p["categoryId"] = df_p["category"].map(category_list)

# Join products to their log rows, then attach the purchasing user.
df_po = df_p.merge(logs, on='product_id')
df_pud = df_po.merge(df_u, on='user_id', how='inner')
df_pu = df_pud.drop(
    columns=['product_id', 'sub_category', 'order_id', 'user_id', 'price', 'tag_price']
)


In [2]:
# Add the sale flag: a purchase counts as a sale purchase when the paid price is
# below the tag price (price / tag_price < 1). The comparison is vectorised;
# rows whose ratio is NaN compare False and therefore get 0.
is_saled = (df_pud['price'] / df_pud['tag_price'] < 1).astype('int64')

# df_pud is rebound to the trimmed frame, so `price` and `tag_price` no longer
# exist afterwards: this cell must be re-run together with the merge cell.
df_pud = df_pud.drop(
    columns=['product_id', 'sub_category', 'user_id', 'order_id', 'price', 'tag_price']
).assign(isSaled=is_saled)
df_pud

Unnamed: 0,product_name,category,brand,categoryId,quantity,user_name,isSaled
0,Nike log pull over,tops,nike,0,1,C,1
1,Nike sports hoodie,tops,nike,0,1,C,1
2,Adidas logo-hoodie,tops,adidas,0,1,C,1
3,Adidas track pants,pants,adidas,1,1,C,0
4,Nike Air VAPORMAX,shoes,nike,4,1,C,0
5,Nike log pull over,tops,nike,0,1,J,1
6,Nike sports hoodie,tops,nike,0,1,J,0
7,Nike Sports wear jogger pants,pants,nike,1,1,J,0
8,ADICOLOR BACKPACK,bag,adidas,3,1,J,1
9,Nike log pull over,tops,nike,0,1,A,1


In [3]:
# User x product purchase matrix; user/product pairs with no log row become 0.
# pivot_table(aggfunc='sum') is used rather than pivot() because pivot() raises
# ValueError as soon as one user has two log rows for the same product; with
# 'sum' the quantities of such repeat purchases are added together instead.
df_pud_pivot = df_pud.pivot_table(
    index='user_name', columns='product_name', values='quantity', aggfunc='sum'
).fillna(0)
df_pud_pivot

product_name,ADICOLOR BACKPACK,Adidas logo-hoodie,Adidas track pants,Danton 2way bag,Danton boa-jacket,Danton inner down jacket,Freaks Store Boa-jacket,Freaks Store Super big hoodie,Freaks Store noldick knit,Nike Air VAPORMAX,Nike OH pants,Nike Sports wear jogger pants,Nike air force 1,Nike log pull over,Nike sports hoodie,The North Face Down jacket,The North Face Swead boots
user_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
B,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
C,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
E,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
F,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
G,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
I,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
J,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0


### calc_posDegree_for_sale_product関数を用いユーザーのセールへのポジティブ度を計算する
### 【購買ログ】
購入商品が同じ２人のユーザーを比較する


In [4]:
# Purchase rows of user A
df_pud.loc[df_pud['user_name'].eq('A')]

Unnamed: 0,product_name,category,brand,categoryId,quantity,user_name,isSaled
9,Nike log pull over,tops,nike,0,1,A,1
10,Nike sports hoodie,tops,nike,0,1,A,1
11,Nike air force 1,shoes,nike,4,1,A,0


In [5]:
# Purchase rows of user K
df_pud.loc[df_pud['user_name'].eq('K')]

Unnamed: 0,product_name,category,brand,categoryId,quantity,user_name,isSaled
12,Nike log pull over,tops,nike,0,1,K,0
13,Nike sports hoodie,tops,nike,0,1,K,0
14,Nike air force 1,shoes,nike,4,1,K,0


### 【考察】

AさんとKさんの購買商品は同じであるが、Aさんはそのうち2つをセール価格で購入しているのに対し、Kさんはすべて定価の時期に購入している。
この結果より、Aさんはセール商品にポジティブな反応を示し、Kさんは興味が無いと推測できる。

したがって、セール商品に対するポジティブ度合いの計算において以下の仮定を考える。

### 【仮定】
***ユーザーの購入商品に占めるセール商品の割合が高ければ高いほど、そのユーザーのセール商品に対するポジティブ度は高い***

この仮定をもとにユーザーのセール商品に対するポジティブ度を算出する関数を考える。
具体的なアプローチとしては、ユーザーの商品購入ログのセール商品の割合をセール商品へのポジティブ度として扱う。

### 【アルゴリズム】

In [6]:
# Positivity degree of a user towards sale products
def calc_posDegree_for_sale_product(pro_list, user):
    """Return the share of a user's purchase rows that are flagged as sale purchases.

    Parameters
    ----------
    pro_list : pandas.DataFrame
        Purchase log with at least the columns ``user_name`` and ``isSaled``.
    user : object
        Value compared against the ``user_name`` column.

    Returns
    -------
    float
        ``sum(isSaled) / number_of_rows`` over the rows of ``user``.
        ``0.0`` when the user has no rows at all, so that callers never receive
        NaN from a 0/0 division.
    """
    # Filter once and reuse the selection for both the count and the sum.
    user_flags = pro_list.loc[pro_list['user_name'] == user, 'isSaled']
    if user_flags.empty:
        return 0.0
    return float(user_flags.sum() / len(user_flags))

### 【計算例】

In [7]:
# Sale-positivity degree of user A
calc_posDegree_for_sale_product(pro_list=df_pud, user='A')

0.6666666666666666

In [8]:
# Sale-positivity degree of user K
calc_posDegree_for_sale_product(pro_list=df_pud, user='K')

0.0

# レコメンド関数

In [9]:
# Inputs shared by the recommender functions below
product_list = df_pud
matrix = df_pud_pivot
matrix_sparse = csr_matrix(matrix.values)

In [10]:

# Build the nearest-neighbour model with scikit-learn
# (brute-force search, cosine distance, N neighbours per query)
N = 10
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=N)

# Fit the model on the preprocessed sparse user x product matrix
model_knn = knn.fit(matrix_sparse)

# Similarity between two users
def get_sim(matrix, user1, user2):
    """Return the cosine similarity of ``user2`` to ``user1``, or ``None``.

    The N nearest neighbours of ``user1``'s row are looked up in the module-level
    ``model_knn``; if ``user2`` is among them, ``1 - distance`` is returned.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Frame indexed by user whose rows are the purchase vectors.
        NOTE(review): presumably the same frame ``model_knn`` was fitted on, since
        neighbour positions are mapped back through ``matrix.index`` - confirm.
    user1 : object
        Index label of the query user.
    user2 : object
        Index label of the user whose similarity is wanted.

    Returns
    -------
    float or None
        ``None`` when ``user2`` is not among the N neighbours, or when
        ``user1 == user2``.
    """
    if user1 == user2:
        return None

    query = matrix.iloc[matrix.index == user1].values.reshape(1, -1)
    distances, indices = model_knn.kneighbors(query, n_neighbors=N)

    # Match neighbours by label, not by position: when another user has exactly
    # the same purchase vector as user1, that user may be ranked ahead of user1
    # itself, so "skip position 0" would drop a genuine neighbour.
    for dist, idx in zip(distances.flatten(), indices.flatten()):
        if matrix.index[idx] == user2:
            return 1 - dist
    return None

# Set of products a user has purchased
def get_product_set(user):
    """Return the set of column labels of ``matrix`` whose value in ``user``'s row is > 0."""
    purchases = matrix.loc[user].values
    return {
        product
        for product, quantity in zip(matrix.columns, purchases)
        if quantity > 0
    }

# Check whether a product is a sale product
def check_sale(product_name, product_list):
    """Return the ``isSaled`` value of the first row of ``product_list`` whose
    ``product_name`` equals the given name.

    Raises IndexError when no row matches.
    """
    is_target = product_list['product_name'] == product_name
    sale_flags = product_list.loc[is_target, 'isSaled']
    return sale_flags.values[0]



import copy 
def get_recommend(user, top_N, mode):
    """Recommend up to ``top_N`` products that ``user`` has not purchased yet.

    For every other user for whom ``get_sim`` returns a similarity, each product
    that user bought and ``user`` did not is scored with ``quantity * similarity``.
    The per-product scores are summed and divided by the summed similarities, and
    the products are returned in descending order of that value.

    Parameters
    ----------
    user : object
        Index label of the target user in the module-level ``matrix``.
    top_N : int
        Maximum number of product names to return.
    mode : int
        When ``1``, the score of a product for which ``check_sale`` is truthy is
        multiplied by ``1 + calc_posDegree_for_sale_product(product_list, user)``.
        Any other value leaves the scores unweighted.

    Returns
    -------
    list
        Product names, best first.

    Raises
    ------
    ValueError
        If ``user`` is not in ``matrix.index``.
    """
    totals = {}
    simSums = {}

    # Every user except the target, taken directly from the index. Iterating over
    # the row count while indexing matrix.columns would raise IndexError as soon
    # as there are more users than products.
    list_others = list(matrix.index)
    list_others.remove(user)

    # Products the target user already owns
    set_user = get_product_set(user)

    # The sale boost depends only on the target user, so compute it once.
    sale_boost = None
    if mode == 1:
        sale_boost = 1 + calc_posDegree_for_sale_product(product_list, user)

    for other in list_others:
        # Similarity between this user and the target; None means "not a neighbour"
        sim = get_sim(matrix, user, other)
        if sim is None:
            continue

        # Products the other user bought that the target has not
        for item in get_product_set(other) - set_user:
            score = matrix.loc[other, item] * sim
            if sale_boost is not None and check_sale(item, product_list):
                score *= sale_boost
            totals[item] = totals.get(item, 0) + score
            # Running sum of similarities, used to normalise the total
            simSums[item] = simSums.get(item, 0) + sim

    # Ranking of (normalised score, product), highest first
    rankings = sorted(
        (
            (total / simSums[item], item)
            for item, total in totals.items()
            if simSums[item] != 0
        ),
        reverse=True,
    )
    return [item for _, item in rankings[:top_N]]

In [11]:
# Product recommendations for user K
get_recommend(user='K', top_N=5, mode=1)

['Nike Sports wear jogger pants',
 'Nike OH pants',
 'Nike Air VAPORMAX',
 'Freaks Store noldick knit',
 'Freaks Store Boa-jacket']

In [12]:
# Product recommendations for user A
get_recommend(user='A', top_N=5, mode=1)

['Adidas logo-hoodie',
 'ADICOLOR BACKPACK',
 'Nike OH pants',
 'Nike Sports wear jogger pants',
 'Nike Air VAPORMAX']