In [1]:
import pandas as pd
import numpy as np

# データの前処理

In [2]:
# Read the log, user and product CSVs of the medium sample set, in that order.
logs, users, products = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../sample/medium/log_medium.csv',
        '../sample/medium/user_medium.csv',
        '../sample/medium/product_medium.csv',
    )
)

In [3]:
# Display the frame read from log_medium.csv.
logs

Unnamed: 0,order_id,user_id,product_id,tag_price,price,quantity
0,1000001,1000001,1000002,3600,1800,1
1,1000002,1000001,1000010,9800,9800,1
2,1000003,1000002,1000014,23760,23760,1
3,1000004,1000002,1000012,9500,9500,1
4,1000005,1000002,1000006,9800,9800,1
5,1000006,1000003,1000003,14000,12600,1
6,1000007,1000003,1000001,3900,2000,1
7,1000008,1000004,1000015,6000,6000,1
8,1000009,1000004,1000011,9700,9700,1
9,1000010,1000005,1000008,17000,17000,1


In [4]:
# Display the last rows of the frame read from user_medium.csv.
users.tail()

Unnamed: 0,user_id,user_name,age,address,last_login,last_purchase
3,1000004,D,27,G city,2018-11-21 12:11:04 UTC,2018-10-17 19:56:03 UTC
4,1000005,E,54,I city,2018-11-02 13:21:56 UTC,2018-10-27 01:43:03 UTC
5,1000006,F,50,A city,2018-11-01 01:43:02 UTC,2018-10-17 19:56:03 UTC
6,1000007,G,22,A city,2018-11-01 01:43:02 UTC,2018-10-17 19:56:03 UTC
7,1000008,H,21,E city,2018-11-21 12:11:04 UTC,2018-10-17 19:56:03 UTC


In [5]:
# Display the first rows of the frame read from product_medium.csv.
products.head()

Unnamed: 0,product_id,product_name,sub_category,category,brand,shop,tag_price
0,1000001,Nike log pull over,sweat,tops,nike,nike,3700
1,1000002,Nike sports hoodie,hoodie,tops,nike,nike,3600
2,1000003,Adidas logo-hoodie,hoodie,tops,adidas,adidas,14000
3,1000004,Freaks Store Boa-jacket,other jacket,outer,Freaks Store,Freaks Store,8000
4,1000005,Freaks Store noldick knit,knit,tops,Freaks Store,Freaks Store,6000


# データ整形
不要なカラムを削る

In [6]:
# Strip the columns that are not needed from the user and product frames.
df_u = users.drop(columns=['age', 'address', 'last_login', 'last_purchase'])
df_p = products.drop(columns=['category', 'shop', 'brand', 'tag_price'])

# Join products onto the log rows by product_id, then join users by user_id.
df_po = df_p.merge(logs, on='product_id')
df_pu = (
    df_po
    .merge(df_u, on='user_id', how='inner')
    # Drop the join keys and price columns once the merges are done.
    .drop(columns=['product_id', 'sub_category', 'order_id', 'user_id', 'price', 'tag_price'])
)

## ユーザー*商品の行列を作成
値は購入した個数

In [7]:
# Build the user x product matrix: one row per user_name, one column per
# product_name, each cell holding the purchased quantity (0 when the user
# never bought the product).
#
# pivot_table(aggfunc='sum') is used rather than pivot because pivot raises
# ValueError as soon as the same (user_name, product_name) pair occurs in more
# than one row, e.g. when a user buys the same product again in a later order.
# Summing keeps those repeat purchases as a total quantity.
df_pu_pivot = df_pu.pivot_table(
    index='user_name',
    columns='product_name',
    values='quantity',
    aggfunc='sum',
).fillna(0)
df_pu_pivot

product_name,Adidas logo-hoodie,Adidas track pants,Danton 2way bag,Danton boa-jacket,Danton inner down jacket,Freak Store Super big hoodie,Freaks Store Boa-jacket,Freaks Store noldick knit,Nike OH pants,Nike air force 1,Nike log pull over,Nike sports hoodie,The North Face Down jacket,The North Face Swead boots
user_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
B,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
C,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
D,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
E,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
F,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
G,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [8]:
# Boolean Series over the product columns: True where user 'A' has a quantity above 0.
df_pu_pivot.loc['A'] > 0

product_name
Adidas logo-hoodie              False
Adidas track pants              False
Danton 2way bag                 False
Danton boa-jacket               False
Danton inner down jacket        False
Freak Store Super big hoodie    False
Freaks Store Boa-jacket         False
Freaks Store noldick knit       False
Nike OH pants                   False
Nike air force 1                 True
Nike log pull over              False
Nike sports hoodie               True
The North Face Down jacket      False
The North Face Swead boots      False
Name: A, dtype: bool

# 学習
## アプローチ
1. k近傍法を用いユーザー同士の距離を計算
1. 距離から類似度を求め、レコメンド関数を作成
1. レコメンド関数にレコメンドしたいユーザーを入力として渡し、商品のレコメンドランキングを返す

In [9]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Convert the user x product matrix values into a SciPy CSR sparse matrix.
df_pu_pivot_sparse = csr_matrix(df_pu_pivot.values)

In [10]:
# Create the model with scikit-learn: nearest-neighbour search using the
# 'brute' algorithm and the 'cosine' metric, with N neighbours per query.
N = 5
knn = NearestNeighbors(n_neighbors=N,algorithm= 'brute', metric= 'cosine')
 
# Fit the model on the preprocessed (sparse) user x product matrix.
model_knn = knn.fit(df_pu_pivot_sparse)

### ユーザーの検索関数

In [11]:
def search_user(string):
    """Print the row labels of ``df_pu_pivot`` (user names) that match ``string``.

    ``string`` is passed unchanged to ``Index.str.contains``. The function
    prints the matching labels as a pandas ``Index`` and returns nothing.
    """
    is_match = df_pu_pivot.index.str.contains(string)
    print(df_pu_pivot.index[is_match])

### アイテムの検索関数

In [12]:
def search_item(string):
    """Print the column labels of ``df_pu_pivot`` (product names) that match ``string``.

    ``string`` is passed unchanged to ``Index.str.contains``. The function
    prints the matching labels as a pandas ``Index`` and returns nothing.
    """
    product_names = df_pu_pivot.columns
    is_match = product_names.str.contains(string)
    print(product_names[is_match])

#### 検索関数使用例

In [13]:
# Example: print the products matching 's', then the users matching 'A'.
search_item('s')
search_user('A')

Index(['Adidas logo-hoodie', 'Adidas track pants', 'Freaks Store Boa-jacket',
       'Freaks Store noldick knit', 'Nike OH pants', 'Nike sports hoodie',
       'The North Face Swead boots'],
      dtype='object', name='product_name')
Index(['A'], dtype='object', name='user_name')


### 類似度を求める関数

In [14]:
def get_sim(user1,user2):
    """Return the cosine similarity between two users, or ``None``.

    The row of ``user1`` in ``df_pu_pivot`` is used to query the fitted
    ``model_knn`` for its nearest neighbours. If ``user2`` is one of the
    returned neighbours, ``1 - distance`` is returned as the similarity.

    Args:
        user1: Row label in ``df_pu_pivot`` of the query user.
        user2: Row label of the user to compare against.

    Returns:
        ``1 - distance`` for ``user2`` when it is among the neighbours found
        for ``user1``; ``None`` when it is not, or when both names are equal.

    Raises:
        ValueError: If ``user1`` is not a row label of ``df_pu_pivot``.
    """
    # A user is never reported as their own neighbour.
    if user1 == user2:
        return None
    if user1 not in df_pu_pivot.index:
        raise ValueError('unknown user: {!r}'.format(user1))

    # Double brackets keep the row two-dimensional: shape (1, n_products).
    query = df_pu_pivot.loc[[user1]].values
    # Never ask for more neighbours than there are rows in the matrix.
    n_query = min(N, len(df_pu_pivot.index))
    distances, indices = model_knn.kneighbors(query, n_neighbors=n_query)

    # Match the neighbour by name instead of skipping position 0: when several
    # users are at the same distance, the query user is not guaranteed to be
    # the first result, so a positional skip can discard a real neighbour.
    for dist, idx in zip(distances.ravel(), indices.ravel()):
        if df_pu_pivot.index[idx] == user2:
            return 1 - dist
    return None

In [15]:
# Usage example: similarity between users 'B' and 'C' based on their purchase logs.
# The printed label names the same pair that is passed to get_sim.
print('simirality(B,C) = ',get_sim('B','C'))

simirality(A,G) =  None


### 購入商品を集合にする関数

In [16]:
def get_product_set(user):
    """Return the set of product names whose quantity for ``user`` is above 0.

    The quantities are read from the ``user`` row of ``df_pu_pivot`` and the
    product names from its column labels.
    """
    purchase_row = df_pu_pivot.loc[user].values
    return {
        product
        for product, quantity in zip(df_pu_pivot.columns, purchase_row)
        if quantity > 0
    }

### レコメンド関数

In [17]:
import copy 
def get_recommend(user, top_N):
    """Recommend up to ``top_N`` products that ``user`` has not bought yet.

    For every other user for whom ``get_sim`` returns a similarity, each
    product that user bought and ``user`` did not is scored with the
    similarity-weighted mean of the purchased quantity:
    ``sum(quantity * sim) / sum(sim)``.

    Args:
        user: Row label in ``df_pu_pivot`` of the user to recommend for.
        top_N: Maximum number of product names to return.

    Returns:
        A list of product names ordered by descending score; ties are ordered
        by descending product name. It may hold fewer than ``top_N`` items.

    Raises:
        ValueError: If ``user`` is not a row label of ``df_pu_pivot``.
    """
    # Take the user list straight from the row index. Walking the rows while
    # also indexing the product columns with the same counter would raise
    # IndexError whenever there are more users than products.
    list_others = list(df_pu_pivot.index)
    # Everyone except the target user; raises ValueError for an unknown user.
    list_others.remove(user)

    # Products the target user already owns.
    set_user = get_product_set(user)

    totals = {}   # item -> sum of quantity * similarity
    simSums = {}  # item -> sum of similarity
    for other in list_others:
        # Similarity between the target user and this user; None means the
        # user is not among the target's nearest neighbours.
        sim = get_sim(user, other)
        if sim is None:
            continue

        # Products this user bought that the target user has not.
        set_new_product = get_product_set(other).difference(set_user)
        for item in set_new_product:
            totals[item] = totals.get(item, 0) + df_pu_pivot.loc[other, item] * sim
            simSums[item] = simSums.get(item, 0) + sim

    # Build the ranking, skipping items whose similarity sum is zero so the
    # division below is always defined.
    rankings = [
        (total / simSums[item], item)
        for item, total in totals.items()
        if simSums[item] != 0
    ]
    rankings.sort(reverse=True)
    return [item for _, item in rankings][:top_N]

In [18]:
# Example: ask for up to 6 recommended products for user 'C'.
get_recommend('C',6)

['Nike air force 1', 'Freaks Store noldick knit', 'Freaks Store Boa-jacket']

#### 参考
- [Pythonで簡単な協調フィルタリングを実装するためのノート](https://qiita.com/hik0107/items/96c483afd6fb2f077985)
- [機械学習を使って630万件のレビューに基づいたアニメのレコメンド機能を作ってみよう（機械学習 k近傍法 初心者向け）](https://www.codexa.net/collaborative-filtering-k-nearest-neighbor/)