In [1]:
import warnings
import time
import sys
import datetime
import pickle
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Ignore every FutureWarning raised for the rest of the session.
# NOTE(review): this also hides library deprecation notices (for example
# pandas warnings about chained in-place assignment) — consider narrowing it.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Allow up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns to the smallest dtype that can hold their range.

    Only columns whose dtype is int16/int32/int64/float16/float32/float64 are
    touched; everything else (strings, datetimes, int8, unsigned ints, ...) is
    left as is. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are overwritten with the downcast values.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.
    use_float16 : bool, default True
        Allow float columns to be stored as float16 when their range fits.
        float16 keeps only about three significant decimal digits, so pass
        ``False`` to stop at float32 when the values are aggregated later.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column whose extremes are exactly the
            # dtype limits (e.g. -128..127) still fits that dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit (or min/max are NaN/inf): keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [3]:
def preprocessing(df):
    """Encode the letter-coded columns of a transactions frame as numbers.

    ``authorized_flag`` and ``category_1`` are mapped Y -> 1 / N -> 0, and
    ``category_3`` is mapped A -> 1 / B -> 2 / C -> 3. Any value missing from
    those tables (including missing values) becomes NaN, because that is what
    ``Series.map`` yields for unmatched keys. The frame is updated in place
    and returned.
    """
    yes_no = {'Y':1, 'N':0}
    letter_rank = {'A':1, 'B':2,'C':3}
    column_encoders = {
        'authorized_flag': yes_no,
        'category_1': yes_no,
        'category_3': letter_rank,
    }
    for name, table in column_encoders.items():
        df[name] = df[name].map(table)
    return df

def read_data(input_file):
    """Read a card-level CSV and add an ``elapsed_time`` column.

    ``first_active_month`` is parsed to datetimes, and ``elapsed_time`` is the
    number of days from that date to the fixed reference date 2018-02-01.

    Parameters
    ----------
    input_file : path or file-like
        Anything ``pandas.read_csv`` accepts; it must contain a
        ``first_active_month`` column.

    Returns
    -------
    pandas.DataFrame
    """
    reference_date = pd.Timestamp('2018-02-01')
    df = pd.read_csv(input_file)
    first_active = pd.to_datetime(df['first_active_month'])
    df['first_active_month'] = first_active
    df['elapsed_time'] = (reference_date - first_active).dt.days
    return df

In [4]:
# Load the data
path_new = '../../../../data/raw/new_merchant_transactions.csv'
path_hist = '../../../../data/raw/historical_transactions.csv'
path_train = '../../../../data/raw/train.csv'
path_test = '../../../../data/raw/test.csv'

# Transaction logs: parse purchase_date, encode the letter-coded columns,
# then downcast the numeric columns. `new` is processed before `hist`, so the
# two memory reports are printed in that order.
new = reduce_mem_usage(preprocessing(pd.read_csv(path_new, parse_dates=['purchase_date'])))
hist = reduce_mem_usage(preprocessing(pd.read_csv(path_hist, parse_dates=['purchase_date'])))

# Card-level tables, with first_active_month parsed as a datetime.
train, test = (
    pd.read_csv(csv_path, parse_dates=['first_active_month'])
    for csv_path in (path_train, path_test)
)

# Keep a handle on the label column; it is not removed from `train`.
target = train['target']

Mem. usage decreased to 76.76 Mb (63.4% reduction)
Mem. usage decreased to 1193.84 Mb (61.6% reduction)


In [14]:
# Re-read the card-level tables so the cells below start from the raw files.
train, test = (
    pd.read_csv(csv_path, parse_dates=['first_active_month'])
    for csv_path in (path_train, path_test)
)

## train, test 処理

In [15]:
# Test rows with a missing first_active_month are filled with the first day of
# the month of that card's earliest purchase in `hist`. Every affected card
# gets its own value, and nothing happens when there is no missing row.
missing_month = test['first_active_month'].isna()
if missing_month.any():
    missing_ids = test.loc[missing_month, 'card_id']
    first_purchase_month = (
        hist.loc[hist['card_id'].isin(missing_ids)]
        .groupby('card_id')['purchase_date']
        .min()
        .dt.to_period('M')
        .dt.to_timestamp()  # month start as a Timestamp, not a string
    )
    # Series.map keeps the index of `missing_ids`, so .loc aligns row by row;
    # cards that never appear in `hist` stay NaT.
    test.loc[missing_month, 'first_active_month'] = missing_ids.map(first_purchase_month)

In [16]:
# Days between each card's first active month and the fixed reference date.
reference_date = pd.Timestamp('2018-02-01')
for frame in (test, train):
    frame['elapsed_time'] = (reference_date - frame['first_active_month']).dt.days

In [17]:
# Replace first_active_month with the number of days since the earliest month
# found in EITHER frame. Using one shared origin guarantees that the same
# calendar month is encoded as the same number in train and test; subtracting
# each frame's own minimum would shift the two encodings whenever the minima
# differ.
common_origin = pd.concat([train['first_active_month'], test['first_active_month']]).min()
for df in [test,train]:
    df['first_active_month']  = (df['first_active_month'] - common_origin).dt.days

In [18]:
# For classification: binary label, 0 where target < -33 and 1 otherwise.
train_cls = train.copy()
train_cls['target'] = (~(train['target'] < -33)).astype('int64')

# For regression: keep only the rows with target > -30.
train_reg = train[train['target'] > -30].reset_index(drop=True)

## 特徴量作成

In [19]:
# Replacement for missing values, per column. The original author notes that
# these are the most frequent values of the respective columns.
fill_values = {
    'category_2': 1,
    'category_3': 1,
    'merchant_id': 'M_ID_00a6ca8a8a',
}
for df in [hist, new]:
    for col, value in fill_values.items():
        # Assign the result back: `df[col].fillna(..., inplace=True)` works on
        # an intermediate Series and leaves `df` untouched under pandas
        # Copy-on-Write.
        df[col] = df[col].fillna(value)

In [26]:
# Authorization flag: number and share of Yes (1) / No (0) rows per card.
# Built from hist only (original author's note: new contains only "Yes" rows).
# Step 1: one row per (card_id, authorized_flag) pair with its row count.
authorized_flag = hist[['card_id','authorized_flag']].groupby(['card_id','authorized_flag']).agg(count=('card_id','count')).reset_index()

# Step 2: total number of rows per card_id.
total_counts = authorized_flag.groupby('card_id')['count'].sum().reset_index()
total_counts = total_counts.rename(columns={'count': 'total_count'})

# Step 3: attach the per-card total to every (card_id, authorized_flag) row.
authorized_flag = authorized_flag.merge(total_counts, on='card_id')

# Step 4: share of each flag value within the card.
authorized_flag['ratio'] = authorized_flag['count'] / authorized_flag['total_count']
#hist[['card_id','authorized_flag']].groupby(['card_id']).agg(count=('authorized_flag','mean')).reset_index()

# Step 5: reshape to one row per card. The pivot yields two-level column labels
# (value name, flag); flags 0/1 are renamed to No/Yes, then the levels are
# joined with "_" to give ratio_No, ratio_Yes, count_No, count_Yes.
# A card with no rows for one flag value gets NaN in that flag's columns.
authorized_ratio = authorized_flag.pivot(index='card_id', columns='authorized_flag', values=['ratio','count']).reset_index().rename(columns={0: 'No', 1: 'Yes'})
authorized_ratio.columns = ['_'.join(col).strip() for col in authorized_ratio.columns.values]
# Joining ('card_id', '') leaves a trailing underscore; restore the key name.
authorized_ratio.rename(columns={'card_id_': 'card_id'},inplace=True)

In [27]:
# Per-card summary of the historical transactions.
# Keys are the output column names; values are (source column, aggregation).
# The *_mode / *_mod entries take the first value returned by Series.mode().
hist_agg_spec = {
    'frequency': ('card_id', 'count'),
    'amount_total': ('purchase_amount', 'sum'),
    'amount_mean': ('purchase_amount', 'mean'),
    'merchant_category_nu': ('merchant_category_id', 'nunique'),
    'merchant_category_mode': ('merchant_category_id', lambda x: x.mode()[0]),
    'city_nu': ('city_id', 'nunique'),
    'city_mode': ('city_id', lambda x: x.mode()[0]),
    'state_nu': ('state_id', 'nunique'),
    'state_mode': ('state_id', lambda x: x.mode()[0]),
    'date_min': ('purchase_date', 'min'),
    'date_max': ('purchase_date', 'max'),
    'installments_mean': ('installments', 'mean'),
    'installments_max': ('installments', 'max'),
    'installments_min': ('installments', 'min'),
    'month_lag_mean': ('month_lag', 'mean'),
    'month_lag_max': ('month_lag', 'max'),
    'month_lag_min': ('month_lag', 'min'),
    'category_1_mod': ('category_1', lambda x: x.mode()[0]),
    'category_2_mod': ('category_2', lambda x: x.mode()[0]),
    'category_3_mod': ('category_3', lambda x: x.mode()[0]),
}
hist_id_u = hist.groupby('card_id', as_index=False).agg(**hist_agg_spec)

In [34]:
# Attach the Yes/No count and ratio columns (inner join on card_id).
hist_id_u = pd.merge(hist_id_u, authorized_ratio, how='inner', on='card_id')

In [35]:
# Earliest first-purchase date over all cards; used as day zero below.
hist_first_purchase = hist_id_u['date_min'].min()

# Days between a card's first and last purchase.
hist_id_u['duration'] = (hist_id_u['date_max'] - hist_id_u['date_min']).dt.days
# Turn both dates into day offsets from the common origin.
hist_id_u['date_max'] = (hist_id_u['date_max'] - hist_first_purchase).dt.days
hist_id_u['date_min'] = (hist_id_u['date_min'] - hist_first_purchase).dt.days

# Force a numeric dtype; anything unparseable becomes NaN.
hist_id_u['merchant_category_mode'] = pd.to_numeric(hist_id_u['merchant_category_mode'], errors='coerce')

# Prefix every feature column with 'hist_', leaving the join key untouched.
hist_id_u.columns = [name if name == 'card_id' else 'hist_' + name for name in hist_id_u.columns]

In [37]:
# Per-card summary of the new-merchant transactions.
# Keys are the output column names; values are (source column, aggregation).
# The purchase_amount sum/mean aggregations are disabled for this table.
new_agg_spec = {
    'frequency': ('card_id', 'count'),
    'merchant_category_nu': ('merchant_category_id', 'nunique'),
    'merchant_category_mode': ('merchant_category_id', lambda x: x.mode()[0]),
    'city_nu': ('city_id', 'nunique'),
    'city_mode': ('city_id', lambda x: x.mode()[0]),
    'state_nu': ('state_id', 'nunique'),
    'state_mode': ('state_id', lambda x: x.mode()[0]),
    'date_min': ('purchase_date', 'min'),
    'date_max': ('purchase_date', 'max'),
    'installments_mean': ('installments', 'mean'),
    'installments_max': ('installments', 'max'),
    'installments_min': ('installments', 'min'),
    'month_lag_mean': ('month_lag', 'mean'),
    'month_lag_max': ('month_lag', 'max'),
    'month_lag_min': ('month_lag', 'min'),
    'category_1_mod': ('category_1', lambda x: x.mode()[0]),
    'category_2_mod': ('category_2', lambda x: x.mode()[0]),
    'category_3_mod': ('category_3', lambda x: x.mode()[0]),
}
new_id_u = new.groupby('card_id', as_index=False).agg(**new_agg_spec)

In [38]:
# Earliest first-purchase date over all cards; used as day zero below.
new_first_purchase = new_id_u['date_min'].min()

# Days between a card's first and last purchase.
new_id_u['duration'] = (new_id_u['date_max'] - new_id_u['date_min']).dt.days
# Turn both dates into day offsets from the common origin.
new_id_u['date_max'] = (new_id_u['date_max'] - new_first_purchase).dt.days
new_id_u['date_min'] = (new_id_u['date_min'] - new_first_purchase).dt.days

# Force a numeric dtype; anything unparseable becomes NaN.
new_id_u['merchant_category_mode'] = pd.to_numeric(new_id_u['merchant_category_mode'], errors='coerce')

# Prefix every feature column with 'new_', leaving the join key untouched.
new_id_u.columns = [name if name == 'card_id' else 'new_' + name for name in new_id_u.columns]

In [55]:
# Stack historical and new-merchant transactions under a fresh 0..n-1 index.
total = pd.concat([hist, new], axis=0, ignore_index=True)

In [56]:
# Per-card summary over historical + new-merchant transactions combined.
# Keys are the output column names; values are (source column, aggregation).
# The *_mode / *_mod entries take the first value returned by Series.mode().
total_agg_spec = {
    'frequency': ('card_id', 'count'),
    'amount_total': ('purchase_amount', 'sum'),
    'amount_mean': ('purchase_amount', 'mean'),
    'merchant_category_nu': ('merchant_category_id', 'nunique'),
    'merchant_category_mode': ('merchant_category_id', lambda x: x.mode()[0]),
    'city_nu': ('city_id', 'nunique'),
    'city_mode': ('city_id', lambda x: x.mode()[0]),
    'state_nu': ('state_id', 'nunique'),
    'state_mode': ('state_id', lambda x: x.mode()[0]),
    'date_min': ('purchase_date', 'min'),
    'date_max': ('purchase_date', 'max'),
    'installments_mean': ('installments', 'mean'),
    'installments_max': ('installments', 'max'),
    'installments_min': ('installments', 'min'),
    'month_lag_mean': ('month_lag', 'mean'),
    'month_lag_max': ('month_lag', 'max'),
    'month_lag_min': ('month_lag', 'min'),
    'category_1_mod': ('category_1', lambda x: x.mode()[0]),
    'category_2_mod': ('category_2', lambda x: x.mode()[0]),
    'category_3_mod': ('category_3', lambda x: x.mode()[0]),
}
total_id_u = total.groupby('card_id', as_index=False).agg(**total_agg_spec)

In [57]:
# Earliest first-purchase date over all cards; used as day zero below.
total_first_purchase = total_id_u['date_min'].min()

# Days between a card's first and last purchase.
total_id_u['duration'] = (total_id_u['date_max'] - total_id_u['date_min']).dt.days
# Turn both dates into day offsets from the common origin.
total_id_u['date_max'] = (total_id_u['date_max'] - total_first_purchase).dt.days
total_id_u['date_min'] = (total_id_u['date_min'] - total_first_purchase).dt.days

# Force a numeric dtype; anything unparseable becomes NaN.
total_id_u['merchant_category_mode'] = pd.to_numeric(total_id_u['merchant_category_mode'], errors='coerce')

# Prefix every feature column with 'total_', leaving the join key untouched.
total_id_u.columns = [name if name == 'card_id' else 'total_' + name for name in total_id_u.columns]

In [62]:
# Left-join the hist_* and then the total_* per-card features onto both
# frames; cards without transactions keep NaN in the feature columns.
# NOTE(review): new_id_u is built above but not joined here — confirm intended.
train = (
    train
    .merge(hist_id_u, how='left', on='card_id')
    .merge(total_id_u, how='left', on='card_id')
)
test = (
    test
    .merge(hist_id_u, how='left', on='card_id')
    .merge(total_id_u, how='left', on='card_id')
)

In [63]:
# A card with no "No" rows has NaN in the No columns after the pivot;
# treat that as zero occurrences / zero share.
# NOTE(review): hist_count_Yes / hist_ratio_Yes can be NaN for the mirror case
# (a card with no "Yes" rows) and are not filled here — confirm intended.
for df in [train, test]:
    for col in ['hist_count_No', 'hist_ratio_No']:
        # Assign the result back: `df[col].fillna(..., inplace=True)` works on
        # an intermediate Series and leaves `df` untouched under pandas
        # Copy-on-Write.
        df[col] = df[col].fillna(0)

In [65]:
# Display the merged test frame to check the feature columns by eye.
test

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,elapsed_time,hist_frequency,hist_amount_total,hist_amount_mean,hist_merchant_category_nu,hist_merchant_category_mode,hist_city_nu,hist_city_mode,hist_state_nu,hist_state_mode,hist_date_min,hist_date_max,hist_installments_mean,hist_installments_max,hist_installments_min,hist_month_lag_mean,hist_month_lag_max,hist_month_lag_min,hist_category_1_mod,hist_category_2_mod,hist_category_3_mod,hist_ratio_No,hist_ratio_Yes,hist_count_No,hist_count_Yes,hist_duration,total_frequency,total_amount_total,total_amount_mean,total_merchant_category_nu,total_merchant_category_mode,total_city_nu,total_city_mode,total_state_nu,total_state_mode,total_date_min,total_date_max,total_installments_mean,total_installments_max,total_installments_min,total_month_lag_mean,total_month_lag_max,total_month_lag_min,total_category_1_mod,total_category_2_mod,total_category_3_mod,total_duration
0,1978,C_ID_0ab67a22ab,3,3,1,306,68,-40.733734,-0.599025,16,879,7,235,3,12,93,362,2.073529,12,1,-3.632353,0,-8,0,1.0,2.0,0.352941,0.647059,24.0,44.0,268,71,-42.511078,-0.598748,17,879,7,235,3,12,93,423,2.056338,12,1,-3.394366,2,-8,0,1.0,2.0,330
1,1888,C_ID_130fd0cbdd,2,3,0,396,78,-49.136513,-0.629955,16,705,4,117,3,13,12,413,1.064103,4,1,-10.410256,0,-13,0,4.0,2.0,0.012821,0.987179,1.0,77.0,401,87,-55.080849,-0.633113,22,705,4,117,3,13,12,474,1.080460,4,1,-9.183908,2,-13,0,4.0,2.0,462
2,2100,C_ID_b709037bc5,5,1,1,184,13,4.528841,0.348372,8,422,4,143,4,5,236,397,3.384615,10,-1,-2.076923,0,-6,0,5.0,3.0,0.307692,0.692308,4.0,9.0,161,15,4.708528,0.313902,9,422,4,143,5,5,236,436,3.666667,10,-1,-1.666667,1,-6,0,5.0,3.0,199
3,2222,C_ID_d27d835a9f,2,1,0,62,26,-13.690715,-0.526566,18,80,1,69,1,9,337,415,1.461538,6,-1,-1.230769,0,-2,0,1.0,2.0,0.000000,1.000000,0.0,26.0,77,36,-19.435101,-0.539864,25,307,3,69,3,9,337,471,1.861111,12,-1,-0.527778,2,-2,0,1.0,2.0,134
4,1491,C_ID_2b5e3df5c2,5,1,1,793,110,25.139385,0.228540,31,705,5,277,4,13,2,422,1.090909,4,1,-6.227273,0,-13,0,4.0,2.0,0.209091,0.790909,23.0,87.0,419,116,37.205791,0.320740,33,705,5,277,4,13,2,466,1.077586,4,-1,-5.844828,2,-13,0,4.0,2.0,464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123618,2161,C_ID_7a239d2eda,2,2,0,123,73,-49.404781,-0.676778,18,367,2,-1,2,-1,281,416,0.000000,0,0,-1.767123,0,-4,0,1.0,1.0,0.041096,0.958904,3.0,70.0,134,73,-49.404781,-0.676778,18,367,2,-1,2,-1,281,416,0.000000,0,0,-1.767123,0,-4,0,1.0,1.0,134
123619,2131,C_ID_75ace375ae,3,1,1,153,7,-4.914616,-0.702088,3,705,3,299,2,9,265,389,0.000000,0,0,-2.714286,-1,-5,0,1.0,1.0,0.000000,1.000000,0.0,7.0,123,11,-7.572819,-0.688438,5,705,4,299,2,9,265,472,0.000000,0,0,-1.272727,2,-5,0,1.0,1.0,206
123620,1766,C_ID_21d56d950c,5,1,1,518,37,-6.583716,-0.177938,11,705,3,143,3,5,106,412,1.594595,7,-1,-3.837838,0,-10,0,5.0,2.0,0.027027,0.972973,1.0,36.0,305,37,-6.583716,-0.177938,11,705,3,143,3,5,106,412,1.594595,7,-1,-3.837838,0,-10,0,5.0,2.0,305
123621,2039,C_ID_6c46fc5a9d,2,1,0,245,62,-39.123627,-0.631026,15,367,4,302,2,7,151,302,1.935484,5,1,-2.145161,0,-4,0,3.0,3.0,0.306452,0.693548,19.0,43.0,151,68,-42.903412,-0.630933,18,367,5,302,3,7,151,328,1.838235,5,-1,-1.867647,1,-4,0,3.0,3.0,177


In [66]:
path_train_prepro = '../../../../data/processed/train_processed_0619.csv'
path_test_preprp = '../../../../data/processed/test_processed_0619.csv'

# Write both feature tables to CSV without the row index.
for frame, destination in ((train, path_train_prepro), (test, path_test_preprp)):
    frame.to_csv(destination, index=False)