# H&M購買予測コンペ -RFM分析と協調フィルタリング-

目的：顧客ごとに将来の購入商品を予測する。
目標：customer_idからarticle_idを予測し、最大12個出力する。
方針：
1. 顧客をRFM分析によってグループ分けし、その中から有用なグループを抽出して将来の購買予測に用いる。
2. 1.で絞り込んだ購買履歴について、商品ベースの協調フィルタリングを実施。

（参考）https://www.kaggle.com/code/luisrodri97/item-based-collaborative-filtering

In [1]:
# インポート
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

plt.style.use('seaborn-white')
sns.set_style("whitegrid")
sns.despine()
plt.rc("figure", autolayout = True)
plt.rc("axes", labelweight = "bold", labelsize = "large", titleweight = "bold", titlesize = 14, titlepad = 10)

mpl.rcParams['axes.spines.left'] = False
mpl.rcParams['axes.spines.right'] = False
mpl.rcParams['axes.spines.top'] = False
mpl.rcParams['axes.spines.bottom'] = False
plt.rcParams["font.weight"] = "bold"
plt.rcParams["axes.labelweight"] = "bold"

<Figure size 432x288 with 0 Axes>

In [2]:
# ファイル読み込み
# transactions =  pd.read_csv("../input/i-r-tbl/for_uploading/transactions_rakus_train.csv",
#                            encoding="ISO-8859-1", dtype={'article_id':str}).drop_duplicates()
transactions =  pd.read_csv("./for_uploading/transactions_rakus_train.csv",
                            encoding = "ISO-8859-1", dtype = {'article_id':str})

In [3]:
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2
...,...,...,...,...,...
31218439,2020-09-06,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,720125039,0.033881,1
31218440,2020-09-06,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,740922009,0.025407,1
31218441,2020-09-06,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,791587007,0.025407,1
31218442,2020-09-06,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,804992033,0.025407,1


# RFM Analysis


In [14]:
# インポート
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree
import datetime as dt
import plotly.express as px

In [5]:
# 't_dat'列のdatetime64型への変換
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

# 後段の集計で使う列を生成
transactions['date'] = pd.to_datetime(transactions['t_dat'],format='%Y-%m-%d')

In [6]:
# データサイズの確認
transactions.shape

(31218444, 6)

In [7]:
# データ中のnullの割合を確認
df_null = round(100 * (transactions.isnull().sum()) / len(transactions), 2)
df_null

t_dat               0.0
customer_id         0.0
article_id          0.0
price               0.0
sales_channel_id    0.0
date                0.0
dtype: float64

In [8]:
# 2020-03-01以降のデータのみで分析を行う
start_date = dt.datetime(2020, 3, 1)
end_date = dt.datetime(2020, 9, 6)
transactions = transactions.loc[transactions["t_dat"] >= start_date]

In [9]:
# 顧客ごとに最近購入した日の近さ(Recency), 購入回数(Frequency), 購入合計額(Monetary)を集計
rfm = transactions.groupby('customer_id').agg({
    't_dat': lambda x: (end_date - x.max()).days,
    'date': 'count',
    'price': 'sum'})
rfm.columns = ["Recency","Frequency","Monetary"]

# 購入額が0のcustomerは省く
rfm = rfm[rfm["Monetary"] > 0]

rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,1,6,0.144475
0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,60,16,0.386169
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,141,10,0.428644
00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,25,7,0.255814
00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,116,10,0.174746


In [10]:
# 最後の購買日付が新しい順に5段階でクラス分け（5が新しく、1が古くなるように）
rfm["recency_score"] = pd.qcut(rfm['Recency'], 5, labels = [5, 4, 3, 2, 1])
# 購入回数の多い順に5段階でクラス分け（5が最も多く、1が最も少なくなるように）
rfm["frequency_score"] = pd.qcut(rfm["Frequency"].rank(method = "first"), 5, labels = [1, 2, 3, 4, 5])
# 購入額の多い順に5段階でクラス分け（5が最も多く、1が最も少なくなるように）
rfm["monetary_score"] = pd.qcut(rfm["Monetary"], 5, labels = [1, 2, 3, 4, 5])
# recency_score & frequency_scoreで"RF_score"とする
rfm["RF_score"] = (rfm["recency_score"].astype(str) + rfm["frequency_score"].astype(str))

rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary,recency_score,frequency_score,monetary_score,RF_score
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,1,6,0.144475,5,3,3,53
0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,60,16,0.386169,3,4,4,34
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,141,10,0.428644,1,4,5,14
00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,25,7,0.255814,4,3,4,43
00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,116,10,0.174746,1,4,3,14


In [12]:
# RF_scoreの値ごとに'segment'に分ける
seg_map = {
    r'[1-2][1-2]': 'hibernating',
    r'[1-2][3-4]': 'at_Risk',
    r'[1-2]5': 'cant_loose',
    r'3[1-2]': 'about_to_sleep',
    r'33': 'need_attention',
    r'[3-4][4-5]': 'loyal_customers',
    r'41': 'promising',
    r'51': 'new_customers',
    r'[4-5][2-3]': 'potential_loyalists',
    r'5[4-5]': 'champions'
}
rfm['segment'] = rfm['RF_score'].replace(seg_map, regex = True)

# 'segment'ごとのRecency, Frequency, Monetaryの平均値、要素数、最大値を集計
rfm[["segment", "Recency", "Frequency", "Monetary"]].groupby("segment").agg(["mean", "count", "max"]).round()

Unnamed: 0_level_0,Recency,Recency,Recency,Frequency,Frequency,Frequency,Monetary,Monetary,Monetary
Unnamed: 0_level_1,mean,count,max,mean,count,max,mean,count,max
segment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
about_to_sleep,55.0,55862,70,3.0,55862,5,0.0,55862,1.0
at_Risk,112.0,108819,189,8.0,108819,16,0.0,108819,1.0
cant_loose,101.0,20954,189,25.0,20954,192,1.0,20954,7.0
champions,7.0,97010,15,28.0,97010,629,1.0,97010,26.0
hibernating,125.0,170728,189,2.0,170728,5,0.0,170728,1.0
loyal_customers,38.0,138404,70,21.0,138404,524,1.0,138404,14.0
need_attention,54.0,33327,70,6.0,33327,9,0.0,33327,1.0
new_customers,7.0,15361,15,2.0,15361,2,0.0,15361,1.0
potential_loyalists,18.0,93847,38,5.0,93847,9,0.0,93847,1.0
promising,27.0,18220,38,2.0,18220,2,0.0,18220,1.0


In [17]:
# treemapにて図示
x = rfm.segment.value_counts()
fig = px.treemap(x, path = [x.index], values = x)
fig.update_layout(title_text = 'Distribution of the RFM Segments',
                  title_x = 0.5,
                  title_font = dict(size = 20))
fig.update_traces(textinfo = "label+value+percent root")
fig.show()

# Item-Based Collaborative Filtering



・時間とともに流行が衰えることを考慮し、重要でないと判断した購買履歴（transactions）は省いて予測を行う。

・商品（articles）についても取引量が不十分（10個未満）なものは取り除いて予測を行う。

（参考）
https://www.kaggle.com/code/luisrodri97/item-based-collaborative-filtering

In [28]:
# インポート
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,date
22886405,2020-03-01,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,751628002,0.022017,1,2020-03-01
22886406,2020-03-01,0008968c0d451dbc5a9968da03196fe20051965edde741...,675662028,0.035576,2,2020-03-01
22886407,2020-03-01,001127bffdda108579e6cb16080440e89bf1250a776c6e...,821152004,0.025407,2,2020-03-01
22886408,2020-03-01,001127bffdda108579e6cb16080440e89bf1250a776c6e...,860738001,0.025407,2,2020-03-01
22886409,2020-03-01,00117f79ce61af038e143ee26448e8401fdbff51f48d5a...,822957002,0.045746,2,2020-03-01


In [24]:
# customer_idをindexから外す
rfm = rfm.reset_index()
rfm.head()

Unnamed: 0,index,customer_id,Recency,Frequency,Monetary,recency_score,frequency_score,monetary_score,RF_score,segment
0,0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,1,6,0.144475,5,3,3,53,potential_loyalists
1,1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,60,16,0.386169,3,4,4,34,loyal_customers
2,2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,141,10,0.428644,1,4,5,14,at_Risk
3,3,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,25,7,0.255814,4,3,4,43,potential_loyalists
4,4,00007d2de826758b65a93dd24ce629ed66842531df6699...,116,10,0.174746,1,4,3,14,at_Risk


In [25]:
# transactionsとrfmを結合
transactions = pd.merge(transactions, rfm[["customer_id", "segment"]], how = 'inner', on = 'customer_id')
# segmentのうち、訓練に用いる要素を指定し、他は削除
training_segment = ['champions', 'potential_loyalists', 'new_customers', 'promising', 'loyal_customers']
transactions = transactions[transactions['segment'].isin(training_segment)]
transactions = transactions.drop('segment', axis = 1)
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,date
0,2020-03-01,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,751628002,0.022017,1,2020-03-01
1,2020-04-22,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,599580055,0.016932,2,2020-04-22
2,2020-04-22,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,599580055,0.016932,2,2020-04-22
3,2020-04-22,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,811835004,0.030492,2,2020-04-22
4,2020-04-22,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,811835004,0.030492,2,2020-04-22
...,...,...,...,...,...,...
8332034,2020-09-06,ff614d10bc983fbcdbcca38a944ddc11633409325b9c50...,841383003,0.008458,2,2020-09-06
8332035,2020-09-06,ff614d10bc983fbcdbcca38a944ddc11633409325b9c50...,841383003,0.008458,2,2020-09-06
8332036,2020-09-06,ff614d10bc983fbcdbcca38a944ddc11633409325b9c50...,841383003,0.008458,2,2020-09-06
8332037,2020-09-06,ff614d10bc983fbcdbcca38a944ddc11633409325b9c50...,841383003,0.008458,2,2020-09-06


In [26]:
# 商品を絞り込む（売上個数が10個未満の物を除く）
article_bought_count = transactions[['article_id', 'date']].groupby('article_id').count().reset_index().rename(columns = {'date': 'count'})
most_bought_articles = article_bought_count[article_bought_count['count'] > 10]['article_id'].values
transactions = transactions[transactions['article_id'].isin(most_bought_articles)]
transactions["bought"] = 1 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [27]:
# 'negative samples'の生成
np.random.seed(0)
negative_samples = pd.DataFrame({
    'article_id': np.random.choice(transactions.article_id.unique(), transactions.shape[0]),
    'customer_id': np.random.choice(transactions.customer_id.unique(), transactions.shape[0]),
    'bought': np.zeros(transactions.shape[0])
})

In [39]:
# 予測モデルのクラス
class ItemBased_RecSys:
    ''' Collaborative filtering using a custom sim(u,u'). '''

    def __init__(self, positive_transactions, negative_transactions, num_components = 10):
        ''' Constructor '''
        self.positive_transactions = positive_transactions
        self.transactions = pd.concat([positive_transactions, negative_transactions])
        self.customers = self.transactions.customer_id.values
        self.articles = self.transactions.article_id.values
        self.bought = self.transactions.bought.values
        self.num_components = num_components

        # idとインデックスの対応関係を変数に格納
        self.customer_id2index = {c: i for i, c in enumerate(np.unique(self.customers))}
        self.article_id2index = {a: i for i, a in enumerate(np.unique(self.articles))}
        
    def __sdg__(self):
        for idx in tqdm(self.training_indices):
            # idxの番号に相当するデータを一つ取得
            customer_id = self.customers[idx]
            article_id = self.articles[idx]
            bought = self.bought[idx]

            # idが一致する顧客、商品のindexを取得
            customer_index = self.customer_id2index[customer_id]
            article_index = self.article_id2index[article_id]

            # 予測・誤差の計算
            prediction = self.predict_single(customer_index, article_index)
            error = (bought - prediction) # error
            
            # 学習率（learning_rate）と誤差（error）に基づいて値（customer/articleのlatent factor）をそれぞれ更新
            self.customers_latent_matrix[customer_index] += self.learning_rate * \
                                    (error * self.articles_latent_matrix[article_index] - \
                                     self.lmbda * self.customers_latent_matrix[customer_index])
            self.articles_latent_matrix[article_index] += self.learning_rate * \
                                    (error * self.customers_latent_matrix[customer_index] - \
                                     self.lmbda * self.articles_latent_matrix[article_index])
                
    def fit(self, n_epochs = 10, learning_rate = 0.001, lmbda = 0.1):
        ''' Compute the matrix factorization R = P x Q '''
        self.learning_rate = learning_rate
        self.lmbda = lmbda
        n_samples = self.transactions.shape[0]
        
        # 'latent matrices'の初期化
        self.customers_latent_matrix = np.random.normal(scale = 1., size = (len(np.unique(self.customers)), self.num_components))
        self.articles_latent_matrix = np.random.normal(scale = 1., size = (len(np.unique(self.articles)), self.num_components))

        for epoch in range(n_epochs):
            print('Epoch: {}'.format(epoch))
            self.training_indices = np.arange(n_samples)
            
            # 訓練データのシャッフルと確率的勾配降下法の実行
            np.random.shuffle(self.training_indices)
            self.__sdg__()

    def predict_single(self, customer_index, article_index):
        ''' Make a prediction for an specific user and article '''
        prediction = np.dot(self.customers_latent_matrix[customer_index], self.articles_latent_matrix[article_index])
        prediction = np.clip(prediction, 0, 1)
        return prediction

    def default_recommendation(self):
        ''' Calculate time decaying popularity '''
        # 人気度の経時的減衰を計算し、直近で買われた商品の重要度を高く見積もる。
        self.positive_transactions['pop_factor'] = self.positive_transactions['t_dat'].apply(lambda x: 1 / (datetime.datetime(2020, 9, 21) - x).days)
        transactions_by_article = self.positive_transactions[['article_id', 'pop_factor']].groupby('article_id').sum().reset_index()
        return transactions_by_article.sort_values(by = 'pop_factor', ascending = False)['article_id'].values[:12]

    def predict(self, customers):
        ''' Make recommendations '''
        recommendations = []
        self.articles_latent_matrix[np.isnan(self.articles_latent_matrix)] = 0
        # コサイン類似度の計算
        similarity_matrix = cosine_similarity(self.articles_latent_matrix, self.articles_latent_matrix, dense_output = False)

        # 各商品に対し、最も類似した12個の商品のインデックスを含む行列となるよう、類似度行列を変換する
        similarity_matrix = np.argsort(similarity_matrix, axis = 1)
        similarity_matrix = similarity_matrix[:, -12:]

        # 初期の予測を算出（人気の経時的減衰を考慮したもの）
        default_recommendation = self.default_recommendation()

        # 顧客、商品ごとに12個の商品をグループ化し、顧客の商品購入回数を計算
        transactions_by_customer = self.positive_transactions[['customer_id', 'article_id', 'bought']].groupby(['customer_id', 'article_id']).count().reset_index()
        most_bought_article = transactions_by_customer.loc[transactions_by_customer.groupby('customer_id').bought.idxmax()]['article_id'].values

        # 予測
        for customer in tqdm(customers):
            try:
                rec_aux1 = []
                rec_aux2 = []
                aux = []

                # 顧客別に最も購入された商品を取得する
                user_most_bought_article_id = most_bought_article[self.customer_id2index[customer]]

                # 類似度行列を用いて、最も類似している6つの商品を取得
                rec_aux1 = self.articles[similarity_matrix[self.article_id2index[user_most_bought_article_id]]]
                # デフォルト予測の半分を返す
                rec_aux2 = default_recommendation

                # レコメンドリストをマージ
                for rec_idx in range(6):
                    aux.append(rec_aux2[rec_idx])
                    aux.append(rec_aux1[rec_idx])

                recommendations.append(' '.join(aux))
            except:
                # デフォルト予測を返す
                recommendations.append(' '.join(default_recommendation))
        
        return pd.DataFrame({
            'customer_id': customers,
            'prediction': recommendations,
        })

In [40]:
# ハイパーパラメータを指定しモデルを実行。
rec = ItemBased_RecSys(transactions, negative_samples, num_components = 1000)
rec.fit(n_epochs = 1)

Epoch: 0


100%|██████████| 12069616/12069616 [06:59<00:00, 28757.27it/s]


In [41]:
# テストデータ読み込み
test = pd.read_csv('./for_uploading/sample_submission_rakus_latest.csv',
                   encoding="ISO-8859-1", 
                   dtype={'article_id':str}).customer_id.unique()
# test = pd.read_csv('../input/i-r-tbl/for_uploading/sample_submission_rakus_latest.csv',
#                   encoding="ISO-8859-1", 
#                   dtype={'article_id':str}).customer_id.unique()

In [42]:
recommendations = rec.predict(test)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

100%|██████████| 1362281/1362281 [00:03<00:00, 403969.25it/s]


In [43]:
recommendations.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,706016001 562245025 751471001 732842021 610776...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,706016001 716672019 751471001 866218009 610776...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,706016001 751471001 610776002 372860002 918292...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,706016001 751471001 610776002 372860002 918292...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,706016001 679854011 751471001 802871004 610776...


In [56]:
# 提出用にarticle_idの文字列を整形
recommendations.prediction = '0' + recommendations.prediction
recommendations.prediction = recommendations.prediction.str.replace(' ', ' 0')
recommendations.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0562245025 0751471001 0732842021 06...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0716672019 0751471001 0866218009 06...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0751471001 0610776002 0372860002 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0751471001 0610776002 0372860002 09...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0679854011 0751471001 0802871004 06...


In [57]:
# 予測結果をcsvファイルへ出力
recommendations.to_csv('submission.csv', index = False)