## LightGBM을 적용한 파일
- fraud_modeling에서 전처리한 파일을 이용
- 일부 칼럼에서 빈도가 낮은 값은 Other로 대체하고, V1~V339는 PCA로 30개 칼럼으로 축소함

In [17]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys, gc, warnings, random
import lightgbm as lgb

from tqdm import tqdm

import math
warnings.filterwarnings('ignore')

#standard plotly imports
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode

#import cufflinks
import plotly.figure_factory as ff

#using plotly + cufflinks in offline mode
init_notebook_mode(connected=True)

#preprocessing, modeling and evaluating
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column is left untouched. Integer columns are
    cast to the first of int8/int16/int32/int64 whose range contains the
    column's min and max. Float columns are cast to float16 or float32 when
    the range fits, otherwise to float64.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.
        verbose: When true, print the final memory usage and the reduction.

    Returns:
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column containing exactly 127 or -128
            # still fits in int8 (strict comparisons pushed it to int16).
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN/inf extremes fail every comparison and fall back to float64.
            # NOTE: the range check says nothing about precision; float16
            # keeps only about 3 significant decimal digits.
            target_type = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target_type = candidate
                    break
            df[col] = df[col].astype(target_type)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, reduction))

    return df

In [4]:
%%time
# Load the preprocessed frame from the pickle file below (per the notebook
# header, it was produced by the fraud_modeling preprocessing step).
# The commented-out to_pickle call is the statement that would write that file.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load pickle files from a trusted source.
# df.to_pickle("../input/df_pca.pkl")
df = pd.read_pickle("../input/df_pca.pkl")
# Optional downcast of numeric columns, currently disabled.
# df = reduce_mem_usage(df)

Wall time: 2.08 s


### Setting train and test back

In [5]:
# Rows whose isFraud value is the literal marker 'test' form the test set;
# every other row is training data. The test frame has no isFraud column.
is_test_row = df['isFraud'] == 'test'
df_train = df[~is_test_row]
df_test = df[is_test_row].drop('isFraud', axis=1)

# Author's note on the train shape: there is one column fewer because one
# step was merged during normalization.
for frame in (df_train, df_test):
    print(frame.shape)

(590540, 134)
(506691, 133)


## LightGBM modeling with 4 threads

In [6]:
# Read the sample submission file, using its TransactionID column as the index.
sample_submission = pd.read_csv("../input/sample_submission.csv",
                               index_col = 'TransactionID')

In [20]:
def seed_everything(seed=0):
    """Seed Python's ``random`` and NumPy's global RNG, and set PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to both generators and written (as a
            string) to the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

SEED = 421
seed_everything(SEED)
LOCAL_TEST = False
TARGET = 'isFraud'

# Columns that must not be fed to the model as features.
rm_cols = ['TransactionID', 'TransactionDT', TARGET]

# Note: list(df_train) yields the column names.
# Start from every training column and drop the ones listed in rm_cols.
features_columns = list(df_train.columns)
for unwanted in rm_cols:
    try:
        features_columns.remove(unwanted)
    except ValueError:
        # Column is not present in df_train; nothing to remove.
        pass

In [13]:
# LightGBM training parameters shared by every fold.
lgb_params = dict(
    objective='binary',
    boosting_type='gbdt',
    metric='auc',
    n_jobs=4,  # number of CPU threads
    learning_rate=0.01,
    num_leaves=2 ** 8,
    max_depth=-1,
    tree_learner='serial',
    colsample_bytree=0.7,
    subsample_freq=1,
    subsample=1,
    n_estimators=800,
    max_bin=255,
    verbose=-1,
    seed=SEED,
    early_stopping_rounds=100,
)

In [14]:
# Fits and predicts in one call, so both the train and the test frame are required.
def make_predictions(tr_df, tt_df, features_columns, target, lgb_params, NFOLDS=2):
    """Train one LightGBM model per fold and average their test predictions.

    The training frame is split with a shuffled ``KFold`` seeded by the
    module-level ``SEED``. Each fold's model predicts on the test features
    and contributes ``1 / NFOLDS`` of the final prediction.

    When the module-level ``LOCAL_TEST`` flag is true, the second validation
    set passed to LightGBM is the test frame (``tt_df`` features and target)
    instead of the fold's held-out part, and feature importances are printed
    for each fold.

    Args:
        tr_df: Training frame containing ``features_columns`` and ``target``.
        tt_df: Test frame containing ``features_columns``, ``target`` and a
            ``TransactionID`` column.
        features_columns: Names of the columns used as model inputs.
        target: Name of the label column.
        lgb_params: Parameter dict passed to ``lgb.train``.
        NFOLDS: Number of folds (and therefore models).

    Returns:
        A new DataFrame with ``TransactionID``, ``target`` and a
        ``prediction`` column holding the fold-averaged predictions.
    """
    folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

    X, y = tr_df[features_columns], tr_df[target]
    P, P_y = tt_df[features_columns], tt_df[target]

    # Work on an explicit copy so adding the prediction column below is not
    # an assignment into a slice of the caller's frame.
    tt_df = tt_df[['TransactionID', target]].copy()
    predictions = np.zeros(len(tt_df))

    # lgb.train lost its `verbose_eval` argument in LightGBM 4.0; the
    # log_evaluation callback (available from 3.3) is the replacement.
    # Fall back to the old keyword on releases that predate the callback.
    if hasattr(lgb, 'log_evaluation'):
        log_kwargs = {'callbacks': [lgb.log_evaluation(period=200)]}
    else:
        log_kwargs = {'verbose_eval': 200}

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print('Fold:', fold_)
        # KFold yields positional indices, so select labels with .iloc too;
        # plain y[idx] is a label lookup and breaks on a non-default index.
        tr_x, tr_y = X.iloc[trn_idx, :], y.iloc[trn_idx]
        vl_x, vl_y = X.iloc[val_idx, :], y.iloc[val_idx]

        print(len(tr_x), len(vl_x))
        tr_data = lgb.Dataset(tr_x, label=tr_y)

        if LOCAL_TEST:
            vl_data = lgb.Dataset(P, label=P_y)
        else:
            vl_data = lgb.Dataset(vl_x, label=vl_y)

        estimator = lgb.train(
            lgb_params,
            tr_data,
            valid_sets=[tr_data, vl_data],
            **log_kwargs
        )

        # Each fold contributes an equal share of the averaged prediction.
        pp_p = estimator.predict(P)
        predictions += pp_p / NFOLDS

        if LOCAL_TEST:
            feature_imp = pd.DataFrame(sorted(zip(estimator.feature_importance(), X.columns)), columns=['Value', 'Feature'])
            print(feature_imp)

        # Release the per-fold data before the next fold is built.
        del tr_x, tr_y, vl_x, vl_y, tr_data, vl_data
        gc.collect()

    tt_df['prediction'] = predictions

    return tt_df

In [15]:
# Attach the sample-submission columns to the test rows by TransactionID.
# `on=` cannot be combined with left_index/right_index (current pandas raises
# MergeError), so join on the key alone. `on` may name an index level, which
# covers sample_submission being indexed by TransactionID.
# NOTE(review): assumes TransactionID is a regular column of df_test - confirm.
df_test = df_test.merge(sample_submission, how='left', on='TransactionID')

In [None]:
%%time
if LOCAL_TEST:
    # Local run: score the predictions against the target column of df_test.
    test_predictions = make_predictions(df_train, df_test,
                        features_columns, TARGET, lgb_params)
    # roc_auc_score is imported directly from sklearn.metrics; no `metrics`
    # module is imported in this notebook, so `metrics.roc_auc_score` would
    # raise NameError.
    print(roc_auc_score(test_predictions[TARGET], test_predictions['prediction']))
else:
    # Submission run: set the training schedule explicitly and use 6 folds.
    lgb_params['learning_rate'] = 0.01
    lgb_params['n_estimators'] = 800
    lgb_params['early_stopping_rounds'] = 100
    test_predictions = make_predictions(df_train, df_test, features_columns, TARGET, lgb_params, NFOLDS=6) #NFOLDS=2

Fold: 0
492116 98424
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.954775	valid_1's auc: 0.932252
[400]	training's auc: 0.980755	valid_1's auc: 0.950727
[600]	training's auc: 0.990626	valid_1's auc: 0.95852
[800]	training's auc: 0.994687	valid_1's auc: 0.961858
Did not meet early stopping. Best iteration is:
[800]	training's auc: 0.994687	valid_1's auc: 0.961858
Fold: 1
492116 98424
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.954439	valid_1's auc: 0.931079
[400]	training's auc: 0.980927	valid_1's auc: 0.949358
[600]	training's auc: 0.990623	valid_1's auc: 0.956299


In [19]:
# Author's open question: where does LOCAL_TEST ever become True?
# (In the cells visible here it is only ever assigned False.)

########################### Export
# Write the submission file only for a non-local run.
if not LOCAL_TEST:
    export_cols = ['TransactionID', 'isFraud']
    test_predictions['isFraud'] = test_predictions['prediction']
    test_predictions.loc[:, export_cols].to_csv('submission.csv', index=False)