## More EDA and feature engineering
- EDA: [Extensive EDA and Modeling XGB Hyperopt](https://www.kaggle.com/kabure/extensive-eda-and-modeling-xgb-hyperopt)
- modeling: [IEEE - GB-2 (make Amount useful again)](https://www.kaggle.com/kyakovlev/ieee-gb-2-make-amount-useful-again)
- feature engineering 중점 사항: 아래 항목을 하나씩 바꿔 가면서 정확도를 비교해 보기!! submission 파일은 따로 저장하고 **커밋 시 메시지에 해당 내용 꼭 쓰기**
    - null 데이터는 일단 그대로 두기
    - P_emaildomain: boolean으로. mail.com or not
    - R_emaildomain: boolean으로? apple.com or not
    - id_23: ip_proxy or not? ip_proxy는 사기치려고 들어오는 사람들.

In [12]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys, gc, warnings, random
print(os.listdir("../input"))

import lightgbm as lgb

from tqdm import tqdm

import math
warnings.filterwarnings('ignore')

#standard plotly imports
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode

#import cufflinks
import plotly.figure_factory as ff

#using plotly + cufflinks in offline mode
init_notebook_mode(connected=True)

#preprocessing, modeling and evaluating
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

['df_id.pkl', 'df_pca.pkl', 'df_test.pkl', 'df_train.pkl', 'df_trans.pkl', 'sample_submission.csv', 'test_0823.pkl', 'test_identity.csv', 'test_transaction.csv', 'train_0823.pkl', 'train_identity.csv', 'train_transaction.csv']


In [13]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Columns are modified in place and the same DataFrame object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/int32/int64/float16/float32/float64 columns should
        be downcast. Columns of any other dtype are left untouched.
    verbose : bool, default True
        When true, print the resulting memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    Range checks are inclusive, so a column spanning exactly -128..127 is
    stored as int8. Float columns are downcast purely on range: a value that
    fits the float16 range is stored as float16 even if that rounds it.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are tried narrowest-first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (this includes all-NaN columns, whose
                # min/max are NaN and therefore fail every comparison).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid a 0/0 when the frame occupies no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, reduction))

    return df

In [16]:
# Load the frames that were saved WITHOUT PCA applied.
# NOTE: pickles execute code on load; only read files you produced yourself.
df_train = pd.read_pickle("../input/df_train.pkl")
df_test = pd.read_pickle("../input/df_test.pkl")
# Expected sizes: 645.97 Mb for train, 561.50 Mb for test.
print(f"{df_train.memory_usage().sum() / 1024**2:1.2f} Mb")
print(f"{df_test.memory_usage().sum() / 1024**2:1.2f} Mb")

645.97 Mb
561.50 Mb


In [33]:
# Additional preprocessing
# df_train.P_emaildomain.fillna("NoInf", inplace=True)
# df_test.R_emaildomain.fillna("NoInf", inplace=True)

# Raw e-mail domain -> provider group. Domains missing from this table map to NaN.
emails = {
    'gmail': 'google',
    'att.net': 'att',
    'twc.com': 'spectrum',
    'scranton.edu': 'other',
    'optonline.net': 'other',
    'hotmail.co.uk': 'microsoft',
    'comcast.net': 'other',
    'yahoo.com.mx': 'yahoo',
    'yahoo.fr': 'yahoo',
    'yahoo.es': 'yahoo',
    'charter.net': 'spectrum',
    'live.com': 'microsoft',
    'aim.com': 'aol',
    'hotmail.de': 'microsoft',
    'centurylink.net': 'centurylink',
    'gmail.com': 'google',
    'me.com': 'apple',
    'earthlink.net': 'other',
    'gmx.de': 'other',
    'web.de': 'other',
    'cfl.rr.com': 'other',
    'hotmail.com': 'microsoft',
    'protonmail.com': 'other',
    'hotmail.fr': 'microsoft',
    'windstream.net': 'other',
    'outlook.es': 'microsoft',
    'yahoo.co.jp': 'yahoo',
    'yahoo.de': 'yahoo',
    'servicios-ta.com': 'other',
    'netzero.net': 'other',
    'suddenlink.net': 'other',
    'roadrunner.com': 'other',
    'sc.rr.com': 'other',
    'live.fr': 'microsoft',
    'verizon.net': 'yahoo',
    'msn.com': 'microsoft',
    'q.com': 'centurylink',
    'prodigy.net.mx': 'att',
    'frontier.com': 'yahoo',
    'anonymous.com': 'other',
    'rocketmail.com': 'yahoo',
    'sbcglobal.net': 'att',
    'frontiernet.net': 'yahoo',
    'ymail.com': 'yahoo',
    'outlook.com': 'microsoft',
    'mail.com': 'other',
    'bellsouth.net': 'other',
    'embarqmail.com': 'centurylink',
    'cableone.net': 'other',
    'hotmail.es': 'microsoft',
    'mac.com': 'apple',
    'yahoo.co.uk': 'yahoo',
    'netzero.com': 'other',
    'yahoo.com': 'yahoo',
    'live.com.mx': 'microsoft',
    'ptd.net': 'other',
    'cox.net': 'other',
    'aol.com': 'aol',
    'juno.com': 'other',
    'icloud.com': 'apple',
}

# Suffixes that get collapsed into the single label 'us'.
us_emails = ['gmail', 'net', 'edu']


def _add_email_features(frame, column):
    """Add ``<column>_bin`` and ``<column>_suffix`` to ``frame`` in place.

    ``_bin`` is the provider group looked up in ``emails``. ``_suffix`` is the
    text after the last '.' of the stringified domain, replaced by 'us' when it
    is listed in ``us_emails``. Missing domains are stringified first, so their
    suffix is the literal text 'nan'.
    """
    frame[column + '_bin'] = frame[column].map(emails)
    last_part = frame[column].map(lambda domain: str(domain).split('.')[-1])
    frame[column + '_suffix'] = last_part.map(
        lambda suffix: 'us' if str(suffix) in us_emails else suffix)


# https://www.kaggle.com/c/ieee-fraud-detection/discussion/100499#latest-579654
for c in ['P_emaildomain', 'R_emaildomain']:
    _add_email_features(df_train, c)
    _add_email_features(df_test, c)

In [34]:
# Sanity check: raw purchaser domain next to the two features derived from it.
email_preview_cols = ['P_emaildomain', 'P_emaildomain_bin', 'P_emaildomain_suffix']
df_train[email_preview_cols].head()

Unnamed: 0,P_emaildomain,P_emaildomain_bin,P_emaildomain_suffix
0,,,
1,gmail.com,google,com
2,outlook.com,microsoft,com
3,yahoo.com,yahoo,com
4,gmail.com,google,com


In [35]:
# Label-encode every column that is object-typed in either frame, because
# object columns cannot be fed to the model.
# Open question: categorical columns that are already numeric are NOT encoded here.
for f in df_train.columns.drop('isFraud'):
    if not (df_train[f].dtype == 'object' or df_test[f].dtype == 'object'):
        continue
    # Fit on train and test values together so both frames share one code
    # table and the test set contains no label the encoder has not seen.
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(df_train[f].values) + list(df_test[f].values))
    df_train[f] = lbl.transform(list(df_train[f].values))
    df_test[f] = lbl.transform(list(df_test[f].values))
    # NOTE(review): how NaN is encoded here is unverified; presumably it ends up
    # as its own class rather than staying NaN. Confirm before relying on it.

- extensive EDA에서 TransactionAmt를 표준화, log 처리 모두 하는 이유가 이해되지 않음
- 로그 취할 경우 정규분포 꼴. 그래서 **log만 취해 보기**

In [7]:
# 표준화
# df_train['Trans_min_std'] = (df_train['TransactionAmt'] - df_train['TransactionAmt'].mean()) / df_train['TransactionAmt'].std()
# df_test['Trans_min_std'] = (df_test['TransactionAmt'] - df_test['TransactionAmt'].mean()) / df_test['TransactionAmt'].std()

In [36]:
# Log-transform TransactionAmt to dampen the influence of outliers.
# The column is overwritten, so running this cell twice applies the log twice.
df_train['TransactionAmt'] = df_train['TransactionAmt'].pipe(np.log)
df_test['TransactionAmt'] = df_test['TransactionAmt'].pipe(np.log)

In [37]:
# (rows, columns) of train, then test, one per line.
print(df_train.shape, df_test.shape, sep='\n')

(590540, 438)
(506691, 437)


In [38]:
## Turn TransactionDT into calendar features via a time offset from START_DATE
import datetime

START_DATE = "2017-12-01"
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')


def add_date_features(df, start=startdate):
    """Add ``Date``, ``_Weekdays``, ``_Hours`` and ``_Days`` columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``TransactionDT`` column.
        # assumes TransactionDT is an offset in seconds -- TODO confirm
    start : datetime.datetime
        Reference moment that the offsets are added to.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Vectorized: one timedelta conversion for the whole column instead of
    # building a Python timedelta per row.
    df['Date'] = pd.Timestamp(start) + pd.to_timedelta(df['TransactionDT'], unit='s')
    df['_Weekdays'] = df['Date'].dt.dayofweek
    df['_Hours'] = df['Date'].dt.hour
    df['_Days'] = df['Date'].dt.day
    return df


# Train frame
df_train = add_date_features(df_train)
print(df_train['Date'].head())

# Test frame
df_test = add_date_features(df_test)
print(df_test['Date'].head())

0   2017-12-02 00:00:00
1   2017-12-02 00:00:01
2   2017-12-02 00:01:09
3   2017-12-02 00:01:39
4   2017-12-02 00:01:46
Name: Date, dtype: datetime64[ns]
0   2018-07-02 00:00:24
1   2018-07-02 00:01:03
2   2018-07-02 00:01:50
3   2018-07-02 00:01:50
4   2018-07-02 00:01:57
Name: Date, dtype: datetime64[ns]


In [14]:
%%time
# 여기까지만 하고 일단 데이터 저장
df_train = pd.read_pickle("../input/train_0823.pkl")
df_test = pd.read_pickle("../input/test_0823.pkl")
#must be 533.90
print("{:1.2f} Mb".format(df_train.memory_usage().sum() / 1024**2))
#must be 465.34
print("{:1.2f} Mb".format(df_test.memory_usage().sum() / 1024**2)) 

# df_train = reduce_mem_usage(df_train)
# df_test = reduce_mem_usage(df_test)
# df_train.to_pickle("../input/train_0823.pkl")
# df_test.to_pickle("../input/test_0823.pkl")

533.90 Mb
465.34 Mb
Wall time: 3.73 s


In [15]:
# (rows, columns) of train, then test, one per line.
print(df_train.shape, df_test.shape, sep='\n')

(590540, 442)
(506691, 441)


## Modeling

In [4]:
def seed_everything(seed=0):
    """Seed Python's ``random`` and NumPy's global RNG, and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int, default 0
        Value passed to both seeders and written (as a string) to the
        ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Notebook-wide settings.
SEED = 42            # shared by the RNGs above, the CV splitter and LightGBM
LOCAL_TEST = False   # switches the training cells between local evaluation and submission mode
TARGET = 'isFraud'   # name of the label column

seed_everything(SEED)

In [5]:
# Columns that must not be used as model inputs.
rm_cols = [
    'TransactionID', 'TransactionDT', 'Date',  # Date has a dtype the model cannot consume
    TARGET,
]

# Everything else in df_train, in its original column order, is a feature.
features_columns = [name for name in list(df_train) if name not in rm_cols]

In [6]:
# Base LightGBM configuration; the training cell overrides some entries before use.
lgb_params = dict(
    objective='binary',
    boosting_type='gbdt',
    metric='auc',
    n_jobs=-1,  # TODO(review): confirm how LightGBM interprets -1 (presumably "use all cores")
    learning_rate=0.01,
    num_leaves=2**8,
    max_depth=-1,
    tree_learner='serial',
    colsample_bytree=0.7,
    subsample_freq=1,
    subsample=1,
    n_estimators=800,
    max_bin=255,
    verbose=-1,
    seed=SEED,
    early_stopping_rounds=100,
)

In [7]:
def make_predictions(tr_df, tt_df, features_columns, target, lgb_params, NFOLDS=2):
    """Train one LightGBM model per CV fold and average their predictions on ``tt_df``.

    Parameters
    ----------
    tr_df : pandas.DataFrame
        Training frame containing ``features_columns`` and ``target``.
    tt_df : pandas.DataFrame
        Frame to predict on. It must contain ``features_columns``,
        ``'TransactionID'`` and ``target``.
    features_columns : list of str
        Names of the model input columns.
    target : str
        Name of the label column.
    lgb_params : dict
        Parameters passed unchanged to ``lgb.train``.
    NFOLDS : int, default 2
        Number of shuffled K-fold splits.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``'TransactionID'``, ``target`` and ``'prediction'``
        (the mean of the per-fold predictions). ``tt_df`` itself is not modified.

    Notes
    -----
    Reads the module-level ``SEED`` and ``LOCAL_TEST``. When ``LOCAL_TEST`` is
    true, each fold is validated on ``tt_df`` instead of the held-out fold and
    feature importances are printed.
    """
    folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

    X, y = tr_df[features_columns], tr_df[target]
    P, P_y = tt_df[features_columns], tt_df[target]

    # Copy so that adding the prediction column never writes into a slice of
    # the caller's frame.
    tt_df = tt_df[['TransactionID', target]].copy()
    predictions = np.zeros(len(tt_df))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print('Fold:', fold_)
        # KFold yields POSITIONS, so labels must be selected with .iloc as
        # well; plain y[idx] is a label lookup and is only correct when the
        # index happens to be 0..n-1.
        tr_x, tr_y = X.iloc[trn_idx, :], y.iloc[trn_idx]
        vl_x, vl_y = X.iloc[val_idx, :], y.iloc[val_idx]

        print(len(tr_x), len(vl_x))
        tr_data = lgb.Dataset(tr_x, label=tr_y)

        if LOCAL_TEST:
            vl_data = lgb.Dataset(P, label=P_y)
        else:
            vl_data = lgb.Dataset(vl_x, label=vl_y)

        estimator = lgb.train(
            lgb_params,
            tr_data,
            valid_sets=[tr_data, vl_data],
            verbose_eval=200,
        )

        # Each fold contributes an equal share of the final prediction.
        pp_p = estimator.predict(P)
        predictions += pp_p / NFOLDS

        if LOCAL_TEST:
            feature_imp = pd.DataFrame(
                sorted(zip(estimator.feature_importance(), X.columns)),
                columns=['Value', 'Feature'])
            print(feature_imp)

        # Release the per-fold data before the next fold is built.
        del tr_x, tr_y, vl_x, vl_y, tr_data, vl_data
        gc.collect()

    tt_df['prediction'] = predictions

    return tt_df

In [8]:
# Attach the columns of the sample submission to df_test, matched on TransactionID.
sample_submission = pd.read_csv("../input/sample_submission.csv",
                                index_col='TransactionID')

# Join df_test's TransactionID COLUMN against sample_submission's INDEX.
# pandas rejects `on=` combined with left_index/right_index, so the key is
# given once per side. Columns from a previous run of this cell are dropped
# first, so re-running does not create suffixed duplicates.
df_test = (
    df_test
    .drop(columns=sample_submission.columns, errors='ignore')
    .merge(sample_submission, how='left',
           left_on='TransactionID', right_index=True)
)

In [9]:
%%time
if LOCAL_TEST:
    test_predictions = make_predictions(df_train, df_test, features_columns, TARGET, lgb_params)
    print(metrics.roc_auc_score(test_predictions[TARGET], test_predictions['prediction']))
else:
    lgb_params['learning_rate'] = 0.005
    lgb_params['n_estimators'] = 1800
    lgb_params['early_stopping_rounds'] = 100    
    test_predictions = make_predictions(df_train, df_test, features_columns, TARGET, lgb_params, NFOLDS=2)

Fold: 0
295270 295270
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.931643	valid_1's auc: 0.912555
[400]	training's auc: 0.957062	valid_1's auc: 0.927869
[600]	training's auc: 0.976893	valid_1's auc: 0.939547
[800]	training's auc: 0.987908	valid_1's auc: 0.947471
[1000]	training's auc: 0.99296	valid_1's auc: 0.951956
[1200]	training's auc: 0.995662	valid_1's auc: 0.954688
[1400]	training's auc: 0.997225	valid_1's auc: 0.956466
[1600]	training's auc: 0.998047	valid_1's auc: 0.957722
[1800]	training's auc: 0.998623	valid_1's auc: 0.958734
Did not meet early stopping. Best iteration is:
[1800]	training's auc: 0.998623	valid_1's auc: 0.958734
Fold: 1
295270 295270
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.935022	valid_1's auc: 0.908718
[400]	training's auc: 0.960564	valid_1's auc: 0.924263
[600]	training's auc: 0.978034	valid_1's auc: 0.935223
[800]	training's auc: 0.988573	valid_1's auc: 0.94246
[1000]	t

In [10]:
if not LOCAL_TEST:
    # Use the averaged fold predictions as the submitted isFraud value.
    test_predictions['isFraud'] = test_predictions['prediction']
    submission = test_predictions[['TransactionID', 'isFraud']]
    submission.to_csv('gb2_submission.csv', index=False)