データの読み込みと前処理を行うためのnotebookです。  
モデルの学習と予測にはここで処理をかけたデータを利用するようにして下さい。

## 必要なライブラリのimport

In [1]:
import warnings
import time
import sys
import numpy as np
import pandas as pd
import datetime
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from lifetimes import BetaGeoFitter, GammaGammaFitter
from lifetimes.utils import summary_data_from_transaction_data

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

warnings.simplefilter(action='ignore', category=FutureWarning)

## データの読み込み

In [2]:
def reduce_mem_usage(df, verbose=True):
    """
    Downcast numeric columns to the smallest dtype that can hold their values.

    The data frame is modified in place and is also returned.

    Parameters
    ----------
    df : pd.DataFrame
        Data frame whose memory footprint should be reduced.
    verbose : bool, optional
        Whether to print the resulting memory usage (default is True).

    Returns
    -------
    pd.DataFrame
        The same data frame with its numeric columns downcast.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, ordered from smallest to largest.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE: float16 keeps only about 3 significant decimal digits, so
            # this trades precision for memory.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-size frame does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


def binarize(df):
    """
    Convert the 'Y'/'N' flag columns to 1/0.

    The columns 'authorized_flag' and 'category_1' are replaced in place.
    Any value other than 'Y' or 'N' becomes NaN.

    Parameters
    ----------
    df : pd.DataFrame
        Data frame containing the flag columns.

    Returns
    -------
    pd.DataFrame
        The same data frame with the flag columns binarized.
    """
    flag_to_int = {'Y': 1, 'N': 0}
    for flag_column in ('authorized_flag', 'category_1'):
        encoded = df[flag_column].map(flag_to_int)
        df[flag_column] = encoded
    return df


def read_data(input_file):
    """
    Load a card-level CSV and derive the elapsed-time feature.

    'first_active_month' is parsed as a datetime, and 'elapsed_time' is the
    number of days from that date to the fixed reference date 2018-02-01.

    Parameters
    ----------
    input_file : str
        Path of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        The loaded data frame with the parsed date and 'elapsed_time' column.
    """
    reference_date = pd.Timestamp('2018-02-01')

    frame = pd.read_csv(input_file)
    activation = pd.to_datetime(frame['first_active_month'])
    frame['first_active_month'] = activation
    frame['elapsed_time'] = (reference_date - activation).dt.days
    return frame

In [17]:
# Card-level tables (first_active_month parsed, elapsed_time added).
train = read_data('../data/row/train.csv')
test = read_data('../data/row/test.csv')

# Transaction-level tables: purchase_date is parsed while reading, and the
# 'Y'/'N' flag columns are converted to 1/0 right after loading.
historical_transactions = binarize(
    pd.read_csv('../data/row/historical_transactions.csv',
                parse_dates=['purchase_date'])
)
new_transactions = binarize(
    pd.read_csv('../data/row/new_merchant_transactions.csv',
                parse_dates=['purchase_date'])
)

## 特徴量作成

In [18]:
def calculate_month_diff(transactions, reference_date=None):
    """
    Add a 'month_diff' column derived from purchase_date and month_lag.

    month_diff = (days between the reference date and purchase_date) // 30
                 + month_lag

    Parameters
    ----------
    transactions : pd.DataFrame
        Transaction data with 'purchase_date' (datetime) and 'month_lag'
        columns. The frame is modified in place.
    reference_date : datetime-like, optional
        Date the purchase dates are measured against. When omitted, the
        current local time is used, so the result depends on the day the code
        runs; pass a fixed date to get reproducible features.

    Returns
    -------
    pd.DataFrame
        The same data frame with the 'month_diff' column added.
    """
    if reference_date is None:
        # Original behaviour: measure against "now" (not reproducible).
        current_date = pd.Timestamp(datetime.datetime.today())
    else:
        current_date = pd.Timestamp(reference_date)

    elapsed_days = (current_date - transactions['purchase_date']).dt.days
    transactions['month_diff'] = elapsed_days // 30 + transactions['month_lag']
    return transactions


def encode_categorical_columns(df, columns):
    """
    One-hot encode the given categorical columns.

    Thin wrapper around ``pd.get_dummies``: each listed column is replaced by
    one indicator column per distinct value, named ``<column>_<value>``.

    Parameters
    ----------
    df : pd.DataFrame
        Data frame to encode.
    columns : list of str
        Names of the categorical columns to encode.

    Returns
    -------
    pd.DataFrame
        A new data frame with the listed columns one-hot encoded.
    """
    encoded = pd.get_dummies(df, columns=columns)
    return encoded


def reduce_mem_usage(df, verbose=True):
    """
    Downcast numeric columns to the smallest dtype that can hold their values.

    The data frame is modified in place and is also returned.

    Parameters
    ----------
    df : pd.DataFrame
        Data frame whose memory footprint should be reduced.
    verbose : bool, optional
        Whether to print the resulting memory usage (default is True).

    Returns
    -------
    pd.DataFrame
        The same data frame with its numeric columns downcast.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, ordered from smallest to largest.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE: float16 keeps only about 3 significant decimal digits, so
            # this trades precision for memory.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-size frame does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


def aggregate_transactions(history):
    """
    Aggregate transaction rows into one feature row per card.

    'purchase_date' is converted to seconds since the Unix epoch for the
    aggregation. The conversion is done on a shallow copy, so the caller's
    data frame keeps its datetime column.

    Parameters
    ----------
    history : pd.DataFrame
        Transaction data. Must contain 'card_id' and every column listed in
        the aggregation specification below (including the one-hot encoded
        'category_2_*' and 'category_3_*' columns).

    Returns
    -------
    pd.DataFrame
        One row per card_id with 'transactions_count' followed by the
        aggregated columns, each named '<column>_<statistic>'.
    """
    def ptp(values):
        """Return the peak-to-peak range (max - min) of one group."""
        return values.max() - values.min()

    # A shallow copy shares the column data but lets us swap purchase_date
    # without touching the caller's frame.
    frame = history.copy(deep=False)
    # Normalise to nanosecond resolution before taking the integer view, so
    # the 1e-9 factor always yields seconds.
    timestamps_ns = pd.DatetimeIndex(history['purchase_date']).values.astype('datetime64[ns]')
    frame['purchase_date'] = timestamps_ns.astype(np.int64) * 1e-9

    agg_func = {
        'category_1': ['sum', 'mean'],
        'category_2_1.0': ['mean'],
        'category_2_2.0': ['mean'],
        'category_2_3.0': ['mean'],
        'category_2_4.0': ['mean'],
        'category_2_5.0': ['mean'],
        'category_3_A': ['mean'],
        'category_3_B': ['mean'],
        'category_3_C': ['mean'],
        'merchant_id': ['nunique'],
        'merchant_category_id': ['nunique'],
        'state_id': ['nunique'],
        'city_id': ['nunique'],
        'subsector_id': ['nunique'],
        'purchase_amount': ['sum', 'mean', 'max', 'min', 'std'],
        'installments': ['sum', 'mean', 'max', 'min', 'std'],
        'purchase_month': ['mean', 'max', 'min', 'std'],
        # The local function is named `ptp` so the output column is still
        # called 'purchase_date_ptp'.
        'purchase_date': [ptp, 'min', 'max'],
        'month_lag': ['mean', 'max', 'min', 'std'],
        'month_diff': ['mean']
    }

    agg_history = frame.groupby(['card_id']).agg(agg_func)
    # Flatten the (column, statistic) MultiIndex into '<column>_<statistic>'.
    agg_history.columns = ['_'.join(col).strip() for col in agg_history.columns.values]
    agg_history.reset_index(inplace=True)

    # Number of transactions per card.
    counts = (frame.groupby('card_id')
              .size()
              .reset_index(name='transactions_count'))

    return pd.merge(counts, agg_history, on='card_id', how='left')


def aggregate_per_month(history):
    """
    Summarise per-month transaction statistics for each card.

    Transactions are first aggregated per (card_id, month_lag); the monthly
    statistics (and month_lag itself) are then reduced to their mean and
    standard deviation per card.

    Parameters
    ----------
    history : pd.DataFrame
        Transaction data with 'card_id', 'month_lag', 'purchase_amount' and
        'installments' columns.

    Returns
    -------
    pd.DataFrame
        One row per card_id with columns named
        '<column>_<monthly statistic>_<mean|std>'.
    """
    monthly_stats = ['count', 'sum', 'mean', 'min', 'max', 'std']

    # Step 1: statistics per card and month.
    per_month = history.groupby(['card_id', 'month_lag']).agg({
        'purchase_amount': monthly_stats,
        'installments': monthly_stats,
    })
    per_month.columns = ['_'.join(pair).strip() for pair in per_month.columns.values]
    per_month = per_month.reset_index()

    # Step 2: mean and std of every monthly column per card.
    per_card = per_month.groupby('card_id').agg(['mean', 'std'])
    per_card.columns = ['_'.join(pair).strip() for pair in per_card.columns.values]
    return per_card.reset_index()


def successive_aggregates(df, field1, field2):
    """
    Aggregate a field in two successive steps.

    First the mean of ``field2`` is taken per (card_id, field1); those group
    means are then summarised per card with mean, min, max and std.

    Parameters
    ----------
    df : pd.DataFrame
        Transaction data containing 'card_id', ``field1`` and ``field2``.
    field1 : str
        Column used as the intermediate grouping key.
    field2 : str
        Column whose values are aggregated.

    Returns
    -------
    pd.DataFrame
        One row per card_id with columns named
        '<field1>_<field2>_<mean|min|max|std>'.
    """
    prefix = field1 + '_' + field2 + '_'

    group_means = df.groupby(['card_id', field1])[field2].mean().reset_index()
    summary = group_means.groupby('card_id')[field2].agg(['mean', 'min', 'max', 'std'])
    summary.columns = [prefix + stat for stat in summary.columns.values]
    return summary.reset_index()


In [19]:
# Data preparation: make sure purchase_date is a datetime column.
historical_transactions['purchase_date'] = pd.to_datetime(historical_transactions['purchase_date'])
new_transactions['purchase_date'] = pd.to_datetime(new_transactions['purchase_date'])

# Add the month_diff column.
historical_transactions = calculate_month_diff(historical_transactions)
new_transactions = calculate_month_diff(new_transactions)

# One-hot encode the categorical columns.
historical_transactions = encode_categorical_columns(historical_transactions, ['category_2', 'category_3'])
new_transactions = encode_categorical_columns(new_transactions, ['category_2', 'category_3'])

# Reduce memory usage.
historical_transactions = reduce_mem_usage(historical_transactions)
new_transactions = reduce_mem_usage(new_transactions)

# Mean of authorized_flag per card (computed before the split below).
agg_fun = {'authorized_flag': ['mean']}
auth_mean = historical_transactions.groupby(['card_id']).agg(agg_fun)
auth_mean.columns = ['_'.join(col).strip() for col in auth_mean.columns.values]
auth_mean.reset_index(inplace=True)

# Add purchase_month BEFORE splitting: assigning a new column onto the
# filtered slices triggers SettingWithCopyWarning and computes it twice.
historical_transactions['purchase_month'] = historical_transactions['purchase_date'].dt.month
new_transactions['purchase_month'] = new_transactions['purchase_date'].dt.month

# Split on authorized_flag. From here on, `historical_transactions` holds only
# the NON-authorized rows.
authorized_transactions = historical_transactions[historical_transactions['authorized_flag'] == 1]
historical_transactions = historical_transactions[historical_transactions['authorized_flag'] == 0]

# Per-card aggregates, prefixed by their source table.
history = aggregate_transactions(historical_transactions)
history.columns = ['hist_' + c if c != 'card_id' else c for c in history.columns]

authorized = aggregate_transactions(authorized_transactions)
authorized.columns = ['auth_' + c if c != 'card_id' else c for c in authorized.columns]

new = aggregate_transactions(new_transactions)
new.columns = ['new_' + c if c != 'card_id' else c for c in new.columns]

# Per-month aggregates of the authorized transactions.
final_group = aggregate_per_month(authorized_transactions)

# Successive aggregates on the new transactions, merged on card_id.
additional_fields = successive_aggregates(new_transactions, 'category_1', 'purchase_amount')
for field1, field2 in [('installments', 'purchase_amount'),
                       ('city_id', 'purchase_amount'),
                       ('category_1', 'installments')]:
    additional_fields = additional_fields.merge(
        successive_aggregates(new_transactions, field1, field2),
        on='card_id', how='left')


Mem. usage decreased to 1332.66 Mb (57.1% reduction)
Mem. usage decreased to 86.12 Mb (58.9% reduction)


In [None]:
# LTV calculation
def _purchase_summary_with_ltv(transactions, ltv_column):
    """
    Build the per-card purchase summary and LTV for one transaction table.

    Adds a 'transformed' column to ``transactions`` in place, then aggregates
    it per card.

    Parameters
    ----------
    transactions : pd.DataFrame
        Table with 'card_id', 'purchase_amount' and 'purchase_date' columns.
    ltv_column : str
        Name of the LTV column to create in the summary.

    Returns
    -------
    pd.DataFrame
        One row per card_id with the purchase summary and ``ltv_column``.
    """
    # Rescale, shift so the minimum is 0, then log1p.
    # NOTE(review): 0.00001503 presumably undoes the scaling applied to
    # purchase_amount in the raw data - TODO confirm its origin.
    scaled = transactions['purchase_amount'] / 0.00001503
    scaled = scaled - scaled.min()
    transactions['transformed'] = np.log1p(scaled)

    # Purchase history per card.
    summary = transactions.groupby('card_id').agg({
        'transformed': ['sum', 'count'],
        'purchase_date': ['min', 'max']
    }).reset_index()

    # Flatten the column names.
    summary.columns = ['card_id', 'total_purchase_amount', 'transaction_count', 'first_purchase_date', 'last_purchase_date']

    # Convert the dates to datetime.
    summary['first_purchase_date'] = pd.to_datetime(summary['first_purchase_date'])
    summary['last_purchase_date'] = pd.to_datetime(summary['last_purchase_date'])

    # Length of the purchasing period in days.
    summary['lifetime_days'] = (summary['last_purchase_date'] - summary['first_purchase_date']).dt.days

    # Average purchase value.
    summary['avg_purchase_value'] = summary['total_purchase_amount'] / summary['transaction_count']

    # Average purchase frequency, expressed per year.
    # NOTE(review): lifetime_days == 0 gives an infinite frequency here, and
    # inf * 0 below makes the LTV NaN for those cards.
    summary['avg_purchase_frequency'] = summary['transaction_count'] / (summary['lifetime_days'] / 365)

    # LTV.
    # NOTE(review): lifetime_days cancels out, so this equals
    # total_purchase_amount * 365 wherever lifetime_days > 0 - confirm that
    # this is the intended definition.
    summary[ltv_column] = summary['avg_purchase_value'] * summary['avg_purchase_frequency'] * summary['lifetime_days']
    return summary


# 1. Load only the columns needed from the raw transaction files.
n_merc = pd.read_csv('../data/row/new_merchant_transactions.csv', usecols = ['card_id', 'purchase_amount', 'purchase_date'])
h_merc = pd.read_csv('../data/row/historical_transactions.csv', usecols = ['card_id', 'purchase_amount', 'purchase_date'])
# Combined table, built before the per-table 'transformed' columns are added.
merc = pd.concat([n_merc, h_merc], axis = 0)

# 2. Per-card summary and LTV for the new, historical and combined tables.
n_customer_transactions = _purchase_summary_with_ltv(n_merc, 'new_LTV')
h_customer_transactions = _purchase_summary_with_ltv(h_merc, 'hist_LTV')
m_customer_transactions = _purchase_summary_with_ltv(merc, 'total_LTV')

n_df_ltv = n_customer_transactions[['card_id', 'new_LTV']]
h_df_ltv = h_customer_transactions[['card_id', 'hist_LTV']]
m_df_ltv = m_customer_transactions[['card_id', 'total_LTV']]

In [None]:
# CLV calculation: fit BG/NBD and Gamma-Gamma models (lifetimes package) on the
# new, historical and combined transaction tables and derive per-card
# predictions.
from lifetimes import BetaGeoFitter, GammaGammaFitter
from lifetimes.utils import summary_data_from_transaction_data

# Load only the columns needed from the raw transaction files.
n_merc = pd.read_csv('../data/row/new_merchant_transactions.csv', usecols = ['card_id', 'purchase_amount', 'purchase_date'])
h_merc = pd.read_csv('../data/row/historical_transactions.csv', usecols = ['card_id', 'purchase_amount', 'purchase_date'])
merc = pd.concat([n_merc, h_merc], axis = 0)

# Transform the purchase amount: rescale, shift so the minimum is 0, then log1p.
# NOTE(review): 0.00001503 presumably undoes the scaling applied to
# purchase_amount in the raw data - TODO confirm its origin.
n_merc['transformed'] = (n_merc['purchase_amount'] / 0.00001503)
n_merc['transformed'] = n_merc['transformed'] - n_merc['transformed'].min()
n_merc['transformed'] = np.log1p(n_merc['transformed'])

h_merc['transformed'] = (h_merc['purchase_amount'] / 0.00001503)
h_merc['transformed'] = h_merc['transformed'] - h_merc['transformed'].min()
h_merc['transformed'] = np.log1p(h_merc['transformed'])

merc['transformed'] = (merc['purchase_amount'] / 0.00001503)
merc['transformed'] = merc['transformed'] - merc['transformed'].min()
merc['transformed'] = np.log1p(merc['transformed'])

# Convert purchase_date to datetime.
n_merc['purchase_date'] = pd.to_datetime(n_merc['purchase_date'])
h_merc['purchase_date'] = pd.to_datetime(h_merc['purchase_date'])
merc['purchase_date'] = pd.to_datetime(merc['purchase_date'])

# Build the RFM summary tables; the observation period of each table ends at
# its own latest purchase date.
n_rfm_summary = summary_data_from_transaction_data(n_merc, 'card_id', 'purchase_date', monetary_value_col = 'transformed', observation_period_end = n_merc['purchase_date'].max())
h_rfm_summary = summary_data_from_transaction_data(h_merc, 'card_id', 'purchase_date', monetary_value_col = 'transformed', observation_period_end = h_merc['purchase_date'].max())
m_rfm_summary = summary_data_from_transaction_data(merc, 'card_id', 'purchase_date', monetary_value_col = 'transformed', observation_period_end = merc['purchase_date'].max())

# Rename the summary columns with a per-table prefix (new_ / hist_ / total_).
n_rfm_summary.columns = ['new_frequency_rfm', 'new_recency_rfm', 'new_T_rfm', 'new_monetary_value']
h_rfm_summary.columns = ['hist_frequency_rfm', 'hist_recency_rfm', 'hist_T_rfm', 'hist_monetary_value']
m_rfm_summary.columns = ['total_frequency_rfm', 'total_recency_rfm', 'total_T_rfm', 'total_monetary_value']

# Keep only cards with a positive monetary value in the new-transactions table.
# NOTE(review): the historical and combined tables are NOT filtered the same
# way; the Gamma-Gamma fit presumably requires positive monetary values, so
# confirm whether those fits can fail or whether the asymmetry is intended.
n_rfm_summary = n_rfm_summary[n_rfm_summary['new_monetary_value'] > 0]

# Fit the BG/NBD models (with a penalizer).
n_bgf = BetaGeoFitter(penalizer_coef = 1)
n_bgf.fit(n_rfm_summary['new_frequency_rfm'], n_rfm_summary['new_recency_rfm'], n_rfm_summary['new_T_rfm'])

h_bgf = BetaGeoFitter(penalizer_coef = 1)
h_bgf.fit(h_rfm_summary['hist_frequency_rfm'], h_rfm_summary['hist_recency_rfm'], h_rfm_summary['hist_T_rfm'])

m_bgf = BetaGeoFitter(penalizer_coef = 1)
m_bgf.fit(m_rfm_summary['total_frequency_rfm'], m_rfm_summary['total_recency_rfm'], m_rfm_summary['total_T_rfm'])

# Predicted number of purchases per card.
# NOTE(review): the horizon is 12 for the new/historical tables but 15 for the
# combined table, and is presumably in the same time unit as T (not months) -
# confirm both the values and the unit.
n_rfm_summary['new_predicted_purchases'] = n_bgf.conditional_expected_number_of_purchases_up_to_time(12, n_rfm_summary['new_frequency_rfm'], n_rfm_summary['new_recency_rfm'], n_rfm_summary['new_T_rfm'])
h_rfm_summary['hist_predicted_purchases'] = h_bgf.conditional_expected_number_of_purchases_up_to_time(12, h_rfm_summary['hist_frequency_rfm'], h_rfm_summary['hist_recency_rfm'], h_rfm_summary['hist_T_rfm'])
m_rfm_summary['total_predicted_purchases'] = m_bgf.conditional_expected_number_of_purchases_up_to_time(15, m_rfm_summary['total_frequency_rfm'], m_rfm_summary['total_recency_rfm'], m_rfm_summary['total_T_rfm'])

# Fit the Gamma-Gamma models.
n_ggf = GammaGammaFitter(penalizer_coef = 1)
n_ggf.fit(n_rfm_summary['new_frequency_rfm'], n_rfm_summary['new_monetary_value'])

h_ggf = GammaGammaFitter(penalizer_coef = 1)
h_ggf.fit(h_rfm_summary['hist_frequency_rfm'], h_rfm_summary['hist_monetary_value'])

m_ggf = GammaGammaFitter(penalizer_coef = 1)
m_ggf.fit(m_rfm_summary['total_frequency_rfm'], m_rfm_summary['total_monetary_value'])

# Predicted average monetary value per card.
n_rfm_summary['new_predicted_monetary_value'] = n_ggf.conditional_expected_average_profit(n_rfm_summary['new_frequency_rfm'], n_rfm_summary['new_monetary_value'])
h_rfm_summary['hist_predicted_monetary_value'] = h_ggf.conditional_expected_average_profit(h_rfm_summary['hist_frequency_rfm'], h_rfm_summary['hist_monetary_value'])
m_rfm_summary['total_predicted_monetary_value'] = m_ggf.conditional_expected_average_profit(m_rfm_summary['total_frequency_rfm'], m_rfm_summary['total_monetary_value'])

# CLV per card.
n_rfm_summary['new_clv'] = n_ggf.customer_lifetime_value(
    n_bgf,
    n_rfm_summary['new_frequency_rfm'],
    n_rfm_summary['new_recency_rfm'],
    n_rfm_summary['new_T_rfm'],
    n_rfm_summary['new_monetary_value'],
    time=12, # predict CLV over a 12-month horizon
    discount_rate=0.01
)

h_rfm_summary['hist_clv'] = h_ggf.customer_lifetime_value(
    h_bgf,
    h_rfm_summary['hist_frequency_rfm'],
    h_rfm_summary['hist_recency_rfm'],
    h_rfm_summary['hist_T_rfm'],
    h_rfm_summary['hist_monetary_value'],
    time=12, # predict CLV over a 12-month horizon
    discount_rate=0.01
)

m_rfm_summary['total_clv'] = m_ggf.customer_lifetime_value(
    m_bgf,
    m_rfm_summary['total_frequency_rfm'],
    m_rfm_summary['total_recency_rfm'],
    m_rfm_summary['total_T_rfm'],
    m_rfm_summary['total_monetary_value'],
    time=15, # horizon of 15 here (the original comment said 12 months) - NOTE(review): confirm the intended value
    discount_rate=0.01
)

In [None]:
# Churn calculation
# Feature columns fed to the churn classifiers.
CHURN_FEATURE_COLUMNS = ['total_purchase_amount', 'average_purchase_amount', 'max_purchase_amount', 'min_purchase_amount', 'purchase_count', 'days_since_last_purchase']


def _prepare_churn_transactions(transactions):
    """Parse purchase_date and add 'transformed' and 'year_month' in place."""
    transactions['purchase_date'] = pd.to_datetime(transactions['purchase_date'])

    # Rescale, shift so the minimum is 0, then log1p.
    # NOTE(review): 0.00001503 presumably undoes the scaling applied to
    # purchase_amount in the raw data - TODO confirm its origin.
    scaled = transactions['purchase_amount'] / 0.00001503
    scaled = scaled - scaled.min()
    transactions['transformed'] = np.log1p(scaled)

    transactions['year_month'] = transactions['purchase_date'].dt.to_period('M')
    return transactions


def _build_churn_features(transactions, amount_column='transformed', inactivity_days=30):
    """
    Build per-card purchase features and the churn label.

    "Now" is taken to be one day after the latest purchase in the table; a
    card is labelled as churned when its last purchase is more than
    ``inactivity_days`` days before that date.
    """
    features = transactions.groupby('card_id').agg({
        amount_column: ['sum', 'mean', 'max', 'min'],
        'purchase_date': ['count']
    }).reset_index()
    features.columns = ['card_id', 'total_purchase_amount', 'average_purchase_amount', 'max_purchase_amount', 'min_purchase_amount', 'purchase_count']

    # Last purchase date per card.
    last_purchase = transactions.groupby('card_id')['purchase_date'].max().reset_index()
    last_purchase.columns = ['card_id', 'last_purchase_date']

    # Reference date: latest purchase in the table + 1 day.
    current_date = transactions['purchase_date'].max() + pd.Timedelta(days=1)

    features = features.merge(last_purchase, on = 'card_id')
    features['days_since_last_purchase'] = (current_date - features['last_purchase_date']).dt.days
    features['churn'] = (features['days_since_last_purchase'] > inactivity_days).astype(int)
    return features


def _fit_churn_model(customer_features):
    """Split the features 80/20, fit a random forest and return all the parts."""
    X = customer_features[CHURN_FEATURE_COLUMNS]
    y = customer_features['churn']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    return model, X, y, X_train, X_test, y_train, y_test


# 1. Load only the columns needed from the raw transaction files.
n_merc = pd.read_csv('../data/row/new_merchant_transactions.csv', usecols = ['card_id', 'purchase_amount', 'purchase_date'])
h_merc = pd.read_csv('../data/row/historical_transactions.csv', usecols = ['card_id', 'purchase_amount', 'purchase_date'])
merc = pd.concat([n_merc, h_merc], axis = 0)

# 2. Parse dates and transform the purchase amount.
n_merc = _prepare_churn_transactions(n_merc)
h_merc = _prepare_churn_transactions(h_merc)
merc = _prepare_churn_transactions(merc)

# 3. Per-card features and churn label. All three tables now aggregate the
#    'transformed' amount; previously only the new-transactions table did,
#    while the other two used the raw purchase_amount.
n_merc_customer_features = _build_churn_features(n_merc)
h_merc_customer_features = _build_churn_features(h_merc)
merc_customer_features = _build_churn_features(merc)

# 4. Fit one classifier per table.
# NOTE(review): 'churn' is derived directly from days_since_last_purchase,
# which is also a model feature, so the classifier can reproduce the label
# from that single column. The predictions below are also made on rows the
# model was trained on. Confirm whether this is intended.
n_merc_features = list(CHURN_FEATURE_COLUMNS)
h_merc_features = list(CHURN_FEATURE_COLUMNS)
merc_features = list(CHURN_FEATURE_COLUMNS)

(n_merc_model, n_merc_X, n_merc_y,
 n_merc_X_train, n_merc_X_test, n_merc_y_train, n_merc_y_test) = _fit_churn_model(n_merc_customer_features)
n_merc_y_pred = n_merc_model.predict(n_merc_X_test)

(h_merc_model, h_merc_X, h_merc_y,
 h_merc_X_train, h_merc_X_test, h_merc_y_train, h_merc_y_test) = _fit_churn_model(h_merc_customer_features)
h_merc_y_pred = h_merc_model.predict(h_merc_X_test)

(merc_model, merc_X, merc_y,
 merc_X_train, merc_X_test, merc_y_train, merc_y_test) = _fit_churn_model(merc_customer_features)
merc_y_pred = merc_model.predict(merc_X_test)

# 5. Predicted churn for every card, used as a feature downstream.
n_merc_customer_features['new_predicted_churn'] = n_merc_model.predict(n_merc_X)
h_merc_customer_features['hist_predicted_churn'] = h_merc_model.predict(h_merc_X)
merc_customer_features['total_predicted_churn'] = merc_model.predict(merc_X)

n_df_churn = n_merc_customer_features[['card_id', 'new_predicted_churn']]
h_df_churn = h_merc_customer_features[['card_id', 'hist_predicted_churn']]
m_df_churn = merc_customer_features[['card_id', 'total_predicted_churn']]

In [20]:
# Join every feature table onto train and test with a left join on card_id.
# The order of the list is the order in which the columns are appended.
feature_tables = [
    history,            # non-authorized historical transactions (hist_*)
    authorized,         # authorized historical transactions (auth_*)
    new,                # new merchant transactions (new_*)
    final_group,        # per-month aggregates
    auth_mean,          # mean of authorized_flag
    additional_fields,  # successive aggregates
    n_df_ltv,
    h_df_ltv,
    m_df_ltv,
    n_rfm_summary,
    h_rfm_summary,
    m_rfm_summary,
    n_df_churn,
    h_df_churn,
    m_df_churn,
]

for table in feature_tables:
    train = pd.merge(train, table, on='card_id', how='left')
    test = pd.merge(test, table, on='card_id', how='left')

## 前処理終了後のデータの保存
- 基本的にモデルの学習・ハイパーパラメータチューニングを行う際にはここで作成した同じデータを使い回して下さい。
- 適宜前処理を変更した場合はファイル名を変えるなどして管理して下さい。

In [22]:
# Save the processed train and test tables as CSV, without the index column.
for frame, output_path in (
    (train, '../data/processed/processed20240614_train.csv'),
    (test, '../data/processed/processed20240614_test.csv'),
):
    frame.to_csv(output_path, index=None)