In [1]:
import warnings
import time
import sys
import datetime
import pickle
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', 500)

%matplotlib inline

In [2]:
def reduce_mem_usage(df, verbose=True, allow_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that holds their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched; every other column (strings, datetimes, int8, unsigned, ...)
    is left as it is. ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    verbose : bool, default True
        Print the resulting memory usage and the relative reduction.
    allow_float16 : bool, default True
        When True (the historical behaviour) float columns whose range fits are
        stored as float16. float16 keeps only about three significant decimal
        digits, so pass False to use float32 as the smallest float type when
        the values feed sums or means.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    numerics = ('int16', 'int32', 'int64', 'float16', 'float32', 'float64')
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type) not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        # Bounds are inclusive: a value sitting exactly on a dtype limit still fits.
        # Comparisons against NaN (empty / all-NaN column) are False, so such
        # columns fall through to the widest type.
        if str(col_type).startswith('int'):
            target = next(
                (t for t in int_candidates
                 if np.iinfo(t).min <= c_min and c_max <= np.iinfo(t).max),
                np.int64,
            )
        else:
            target = next(
                (t for t in float_candidates
                 if np.finfo(t).min <= c_min and c_max <= np.finfo(t).max),
                np.float64,
            )
        df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [43]:
def preprocessing(df):
    """Replace the letter codes of three categorical columns with numbers.

    ``authorized_flag`` and ``category_1`` are mapped Y -> 1 / N -> 0, and
    ``category_3`` is mapped A -> 1 / B -> 2 / C -> 3. Values that are not
    keys of the respective mapping (including missing values) become NaN.
    The frame is modified in place and returned.
    """
    yes_no = {'Y':1, 'N':0}
    encodings = {
        'authorized_flag': yes_no,
        'category_1': yes_no,
        'category_3': {'A':1, 'B':2,'C':3},
    }
    for column, mapping in encodings.items():
        df[column] = df[column].map(mapping)
    return df

def read_data(input_file):
    """Read a CSV and derive ``elapsed_time`` from ``first_active_month``.

    ``first_active_month`` is converted to datetimes, and ``elapsed_time`` is
    the number of whole days from that date to the fixed reference date
    2018-02-01. Returns the resulting DataFrame.
    """
    reference_date = pd.Timestamp('2018-02-01')
    df = pd.read_csv(input_file)
    first_active = pd.to_datetime(df['first_active_month'])
    df['first_active_month'] = first_active
    df['elapsed_time'] = (reference_date - first_active).dt.days
    return df

In [44]:
# Load the raw data
path_new = '../../../../data/raw/new_merchant_transactions.csv'
path_hist = '../../../../data/raw/historical_transactions.csv'
path_train = '../../../../data/raw/train.csv'
path_test = '../../../../data/raw/test.csv'

# Transaction tables: parse the purchase timestamp, encode the letter-coded
# columns as numbers, then downcast the numeric dtypes (prints the memory usage).
new = reduce_mem_usage(
    preprocessing(pd.read_csv(path_new, parse_dates=['purchase_date']))
)
hist = reduce_mem_usage(
    preprocessing(pd.read_csv(path_hist, parse_dates=['purchase_date']))
)

# Card-level tables.
train, test = (
    pd.read_csv(csv_path, parse_dates=['first_active_month'])
    for csv_path in (path_train, path_test)
)

# Keep a separate handle on the label column.
# NOTE: 'target' deliberately stays in train; the `del train['target']` step is disabled.
target = train['target']

Mem. usage decreased to 76.76 Mb (63.4% reduction)
Mem. usage decreased to 1193.84 Mb (61.6% reduction)


In [179]:
# Re-read train and test from their raw CSVs, replacing any in-memory versions.
train, test = (
    pd.read_csv(csv_path, parse_dates=['first_active_month'])
    for csv_path in (path_train, path_test)
)

## NULL処理

In [182]:
# Fill missing `first_active_month` values in test with the first day of the
# month of that card's earliest purchase in hist.
# Every affected card is handled individually, and the loop simply does nothing
# when no value is missing, so re-running this cell is safe.
null_card_ids = test.loc[test['first_active_month'].isna(), 'card_id']
for test_null_id in null_card_ids:
    # "%Y-%m-01" strings sort chronologically, so min() is the earliest month;
    # it is NaN when the card has no historical transactions.
    null_month = hist.loc[hist['card_id'] == test_null_id, 'purchase_date'].dt.strftime("%Y-%m-01").min()
    rows_to_fill = (test['card_id'] == test_null_id) & test['first_active_month'].isna()
    # to_datetime turns the string into a Timestamp (and NaN into NaT), keeping
    # the column a proper datetime column.
    test.loc[rows_to_fill, 'first_active_month'] = pd.to_datetime(null_month)

In [183]:
# Days from each card's first active month to the fixed reference date 2018-02-01.
reference_date = pd.Timestamp('2018-02-01')
for df in (test, train):
    days_active = (reference_date - df['first_active_month']).dt.days
    df['elapsed_time'] = days_active

In [45]:
# Impute missing values in both transaction tables with fixed fill values;
# the original author's notes mark each of them as the column's most frequent value.
fill_values = {
    'category_2': 1,
    'category_3': 1,
    'merchant_id': 'M_ID_00a6ca8a8a',
}
for df in [hist, new]:
    for col, value in fill_values.items():
        # Assign the result back rather than calling fillna(inplace=True) on
        # df[col]: that chained call works on a temporary object under pandas
        # Copy-on-Write and would leave df unchanged.
        df[col] = df[col].fillna(value)

## 特徴量作成

In [71]:
# Authorized flag: number and share of No (0) and Yes (1) transactions per card.
# Built from hist only (per the original note, new contains only Yes).
authorized_flag = (
    hist[['card_id','authorized_flag']]
    .groupby(['card_id','authorized_flag'])
    .agg(count=('card_id','count'))
    .reset_index()
)

# Total number of transactions per card.
total_counts = (
    authorized_flag
    .groupby('card_id')['count']
    .sum()
    .reset_index()
    .rename(columns={'count': 'total_count'})
)

# Attach the per-card total to every (card, flag) row and derive the share.
authorized_flag = authorized_flag.merge(total_counts, on='card_id')
authorized_flag['ratio'] = authorized_flag['count'].div(authorized_flag['total_count'])

# Reshape to one row per card with columns ratio_No / ratio_Yes / count_No / count_Yes.
authorized_ratio = (
    authorized_flag
    .pivot(index='card_id', columns='authorized_flag', values=['ratio','count'])
    .reset_index()
    .rename(columns={0: 'No', 1: 'Yes'})
)
# Flatten the two-level column labels; the key column comes out as 'card_id_'.
authorized_ratio.columns = ['_'.join(parts).strip() for parts in authorized_ratio.columns.values]
authorized_ratio = authorized_ratio.rename(columns={'card_id_': 'card_id'})

In [145]:
def _first_mode(values):
    """Return the first mode of ``values``, or NaN when there is none.

    ``Series.mode`` drops missing values, so a group that is entirely missing
    yields an empty result; indexing it with ``[0]`` would raise. Defined in
    this cell so the cell is self-contained.
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan


# Per-card aggregates of the historical transactions: one row per card_id.
hist_id_u = hist.groupby('card_id', as_index=False).agg(
    # transaction count and purchase amounts
    frequency = ('card_id','count'),
    amount_total = ('purchase_amount','sum'),
    amount_mean = ('purchase_amount','mean'),
    # variety and most frequent value of merchant category / city / state
    merchant_category_nu = ('merchant_category_id','nunique'),
    merchant_category_mode = ('merchant_category_id', _first_mode),
    city_nu = ('city_id','nunique'),
    city_mode = ('city_id', _first_mode),
    state_nu = ('state_id','nunique'),
    state_mode = ('state_id', _first_mode),
    # first and last purchase timestamps
    date_min  = ('purchase_date','min'),
    date_max  = ('purchase_date','max'),
    installments_mean = ('installments','mean'),
    installments_max = ('installments','max'),
    installments_min = ('installments','min'),
    month_lag_mean = ('month_lag','mean'),
    month_lag_max = ('month_lag','max'),
    month_lag_min = ('month_lag','min'),
    # most frequent value of the three category columns
    category_1_mod = ('category_1', _first_mode),
    category_2_mod = ('category_2', _first_mode),
    category_3_mod = ('category_3', _first_mode),
)

In [146]:
# Attach the authorized-flag counts and ratios to the per-card aggregates.
hist_id_u = pd.merge(hist_id_u, authorized_ratio, how='inner', on='card_id')

# NOTE: running this cell again without rebuilding hist_id_u first merges the same
# columns twice, which yields duplicated *_x / *_y columns (ratio_Yes_x, count_No_y, ...).

In [147]:
# Whole days between a card's first and last historical purchase.
hist_id_u['duration'] = hist_id_u['date_max'].sub(hist_id_u['date_min']).dt.days

In [148]:
# Prefix every feature column with 'hist_'; the join key card_id keeps its name.
hist_id_u.columns = hist_id_u.columns.map(
    lambda name: name if name == 'card_id' else 'hist_' + name
)

In [149]:
def _first_mode(values):
    """Return the first mode of ``values``, or NaN when there is none.

    ``Series.mode`` drops missing values, so a group that is entirely missing
    yields an empty result; indexing it with ``[0]`` would raise. Defined in
    this cell so the cell is self-contained.
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan


# Per-card aggregates of the new-merchant transactions: one row per card_id.
new_id_u = new.groupby('card_id', as_index=False).agg(
    # transaction count and purchase amounts
    frequency = ('card_id','count'),
    amount_total = ('purchase_amount','sum'),
    amount_mean = ('purchase_amount','mean'),
    # variety and most frequent value of merchant category / city / state
    merchant_category_nu = ('merchant_category_id','nunique'),
    merchant_category_mode = ('merchant_category_id', _first_mode),
    city_nu = ('city_id','nunique'),
    city_mode = ('city_id', _first_mode),
    state_nu = ('state_id','nunique'),
    state_mode = ('state_id', _first_mode),
    # first and last purchase timestamps
    date_min  = ('purchase_date','min'),
    date_max  = ('purchase_date','max'),
    installments_mean = ('installments','mean'),
    installments_max = ('installments','max'),
    installments_min = ('installments','min'),
    month_lag_mean = ('month_lag','mean'),
    month_lag_max = ('month_lag','max'),
    month_lag_min = ('month_lag','min'),
    # most frequent value of the three category columns
    category_1_mod = ('category_1', _first_mode),
    category_2_mod = ('category_2', _first_mode),
    category_3_mod = ('category_3', _first_mode),
)

In [150]:
# Whole days between a card's first and last new-merchant purchase.
new_id_u['duration'] = new_id_u['date_max'].sub(new_id_u['date_min']).dt.days

In [151]:
# Prefix every feature column with 'new_'; the join key card_id keeps its name.
new_id_u.columns = new_id_u.columns.map(
    lambda name: name if name == 'card_id' else 'new_' + name
)

In [152]:
# Force the most-frequent merchant category to a numeric dtype; values that
# cannot be parsed as numbers become NaN (errors='coerce').
hist_id_u['hist_merchant_category_mode'] = pd.to_numeric(
    hist_id_u['hist_merchant_category_mode'], errors='coerce'
)

In [185]:
# Attach the per-card historical features; the left join keeps every train/test card.
train = pd.merge(train, hist_id_u, how='left', on='card_id')
test = pd.merge(test, hist_id_u, how='left', on='card_id')

# The new-merchant features are left out for now, because how to fill the
# missing values they would introduce is still an open question.
#train = train.merge(new_id_u, on='card_id',how='left')
#test = test.merge(new_id_u, on='card_id',how='left')

In [187]:
# A missing "No" count/ratio is replaced by 0 (presumably cards without any
# non-authorized transaction - confirm against the pivot that built these columns).
for df in [train, test]:
    for col in ['hist_count_No', 'hist_ratio_No']:
        # Assign the result back rather than calling fillna(inplace=True) on
        # df[col]: that chained call works on a temporary object under pandas
        # Copy-on-Write and would leave df unchanged.
        df[col] = df[col].fillna(0)

In [189]:
# Write the feature-enriched train/test frames to the processed-data folder
# (without the index column).
path_train_prepro = '../../../../data/processed/train_processed_0618.csv'
path_test_preprp = '../../../../data/processed/test_processed_0618.csv'

for out_frame, out_path in ((train, path_train_prepro), (test, path_test_preprp)):
    out_frame.to_csv(out_path, index=False)