In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import lightgbm as lgb

from DNN_preprocessing import bureau_preprocessing, previous_application_preprocessing

# Load the raw application tables and tag every row with its origin so the
# two sets can be separated again after the joint preprocessing.
app_train = pd.read_csv('application_train.csv').assign(is_test=0, is_train=1)
app_test = pd.read_csv('application_test.csv').assign(is_test=1, is_train=0)

# Separate the label column from the training features.
train_Y = app_train['TARGET']
train_X = app_train.drop(columns=['TARGET'])

# Keep the test IDs for the submission file; the test features are the
# test table itself (same object, not a copy).
test_id = app_test['SK_ID_CURR']
test_X = app_test

# Stack train on top of test so both receive the same preprocessing.
data = pd.concat([train_X, test_X], axis=0)

# Class balance of the training labels: positives, then negatives.
print(int((train_Y == 1).sum()), int((train_Y == 0).sum()))

24825 282686


In [6]:
# Left-join the frame returned by bureau_preprocessing onto the applications.
# reset_index() turns its index into regular columns before the join
# (presumably SK_ID_CURR is the index there - confirm in DNN_preprocessing).
df_bureau = bureau_preprocessing('bureau.csv')
data = pd.merge(data, df_bureau.reset_index(), how='left', on='SK_ID_CURR')

# Same left join for the frame returned by previous_application_preprocessing.
df_pre_app = previous_application_preprocessing('previous_application.csv')
data = pd.merge(data, df_pre_app.reset_index(), how='left', on='SK_ID_CURR')
data.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,pre_app_DAYS_DECISION,pre_app_SELLERPLACE_AREA,pre_app_CNT_PAYMENT,pre_app_DAYS_FIRST_DRAWING,pre_app_DAYS_FIRST_DUE,pre_app_DAYS_LAST_DUE_1ST_VERSION,pre_app_DAYS_LAST_DUE,pre_app_DAYS_TERMINATION,pre_app_NFLAG_INSURED_ON_APPROVAL,pre_app_count
0,100002,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,-606.0,500.0,24.0,365243.0,-565.0,125.0,-25.0,-17.0,0.0,1.0
1,100003,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,-1305.0,533.0,10.0,365243.0,-1274.333333,-1004.333333,-1054.333333,-1047.333333,0.666667,3.0
2,100004,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,-815.0,30.0,4.0,365243.0,-784.0,-694.0,-724.0,-714.0,0.0,1.0
3,100006,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,...,-272.444444,894.222222,23.0,365243.0,91066.5,91584.0,182477.5,182481.75,0.0,9.0
4,100007,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,-1222.833333,409.166667,20.666667,365243.0,-1263.2,-837.2,72136.2,72143.8,0.6,6.0


In [2]:
# カテゴリ変数を取得する関数
def _get_categorical_features(df):
    feats = [col for col in list(df.columns) if df[col].dtype == 'object']
    return feats

# カテゴリ変数をファクトライズ (整数に置換)する関数
def _factorize_categoricals(df, cats):
    for col in cats:
        df[col], _ = pd.factorize(df[col])
    return df 

# カテゴリ変数のダミー変数 (二値変数化)を作成する関数
def _get_dummies(df, cats):
    for col in cats:
        df = pd.concat([df, pd.get_dummies(df[col], prefix=col, drop_first=True)], axis=1)
        df = df.drop(col, axis=1)
    return df 

# Collect the names of the categorical columns.
data_cats = _get_categorical_features(df=data)

# Replace those columns with dummy (indicator) variables.
dummied_data = _get_dummies(df=data, cats=data_cats)
print('finish')

finish


In [3]:
# Split the encoded frame back into train and test rows via the marker
# columns, then drop both markers so they are not used as features.
trainX = (
    dummied_data
    .loc[dummied_data['is_train'] == 1]
    .drop(columns=['is_test', 'is_train'])
)
testX = (
    dummied_data
    .loc[dummied_data['is_test'] == 1]
    .drop(columns=['is_test', 'is_train'])
)
print('finish')

finish


In [5]:
# Replace spaces and commas in the feature names with underscores
# (presumably because LightGBM rejects certain characters in feature names -
# confirm). The same renaming is applied to testX so the frame passed to
# predict_proba carries exactly the feature names the model was fitted on.
for _frame in (trainX, testX):
    _frame.columns = [
        name.replace(' ', '_').replace(',', '_') for name in _frame.columns
    ]

import random

def get_subset(df_x, df_y, ratio=5, seed=None):
    """Under-sample the negative class to at most ``ratio`` times the positives.

    Parameters
    ----------
    df_x : DataFrame of features, indexed by the same labels as ``df_y``.
    df_y : Series of labels; rows equal to 1 are positives, rows equal to 0
        are negatives.
    ratio : number of negatives to draw per positive (default 5, the value
        that used to be hard-coded). Must be a non-negative integer.
    seed : optional seed for a private random generator, making the draw
        reproducible. With ``None`` (default) the module-level ``random``
        state is used, as before.

    Returns
    -------
    (features, labels) restricted to all positives followed by the sampled
    negatives.

    Notes
    -----
    The result depends on which negatives happen to be drawn. If fewer than
    ``ratio * n_positives`` negatives exist, all of them are used instead of
    raising ``ValueError`` from ``random.sample``.
    """
    tgt_index = list(df_y[df_y == 1].index)
    other_index = list(df_y[df_y == 0].index)

    # Never ask for more negatives than are available.
    n_sampled = min(len(tgt_index) * ratio, len(other_index))

    rng = random if seed is None else random.Random(seed)
    tgt_cnpt_index = rng.sample(other_index, n_sampled)

    train_index = tgt_index + tgt_cnpt_index
    # The collected values are index labels, so select by label on both objects.
    return df_x.loc[train_index, :], df_y.loc[train_index]

# Optional under-sampling of the majority class (currently disabled):
#trainX_resampling, train_Y_resampling = get_subset(trainX,train_Y)

# Hold out 20% of the training rows for validation, with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    trainX,
    train_Y,
    test_size=0.2,
    random_state=18,
)

# Class balance of the training part: positives, then negatives.
print(int((y_train == 1).sum()), int((y_train == 0).sum()))

19904 226104


In [7]:
from sklearn.model_selection import GridSearchCV, KFold

# No deep thought here - just go with LightGBM.
lgbc = lgb.LGBMClassifier(boosting_type='gbdt',objective='binary')
# Candidate values per hyper-parameter (2**7 = 128 combinations); the lists
# could be extended.
params_lgb = {
        'learning_rate': [0.1],
        'max_depth': [-1],
        'subsample_for_bin':[200000,50000],
        'colsample_bytree': [0.5,1.0],
        'num_leaves':[31,40],
        'n_estimators':[10,20],
        'min_child_weight':[0.1,1e-3],
        'min_child_samples':[10,20],
        'min_split_gain':[0,0.01]
        }
# Seed the shuffled folds so the fold assignment - and therefore the selected
# parameters - is reproducible from run to run.
cv_folds = KFold(n_splits=3, shuffle=True, random_state=18)
lgbcf = GridSearchCV(lgbc, params_lgb, cv=cv_folds, scoring='roc_auc', n_jobs=2, verbose=1)  # use n_jobs=-1 when in a hurry

# Search on the training part only; the hold-out part is used for the score below.
lgbcf.fit(x_train, y_train)
print('最適なパラメータ：',lgbcf.best_params_)
# With scoring='roc_auc' this is the ROC AUC of the refitted best model on the hold-out rows.
print('testデータにおけるスコア: {}'.format(lgbcf.score(x_val, y_val)))

# Column 1 of predict_proba is used as the TARGET probability in the submission file.
ans = lgbcf.predict_proba(testX)
result = pd.DataFrame({'SK_ID_CURR':test_id, 'TARGET':ans[:,1]})
result.to_csv('my_submit.csv', index=False)

Fitting 3 folds for each of 128 candidates, totalling 384 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  3.4min
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed: 13.4min
[Parallel(n_jobs=2)]: Done 384 out of 384 | elapsed: 168.7min finished


最適なパラメータ： {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 10, 'min_child_weight': 0.1, 'min_split_gain': 0.01, 'n_estimators': 20, 'num_leaves': 40, 'subsample_for_bin': 50000}
testデータにおけるスコア: 0.7419790032914163


In [8]:
from sklearn.model_selection import GridSearchCV, KFold

# No deep thought here - just go with LightGBM.
lgbc = lgb.LGBMClassifier(boosting_type='gbdt',objective='binary')
# Candidate values per hyper-parameter (2**7 = 128 combinations); the lists
# could be extended.
params_lgb = {
        'learning_rate': [0.1],
        'max_depth': [-1],
        'subsample_for_bin':[200000,50000],
        'colsample_bytree': [0.5,1.0],
        'num_leaves':[31,40],
        'n_estimators':[10,20],
        'min_child_weight':[0.1,1e-3],
        'min_child_samples':[10,20],
        'min_split_gain':[0,0.01]
        }
# Seed the shuffled folds so the fold assignment - and therefore the selected
# parameters - is reproducible from run to run.
cv_folds = KFold(n_splits=3, shuffle=True, random_state=18)
lgbcf = GridSearchCV(lgbc, params_lgb, cv=cv_folds, scoring='roc_auc', n_jobs=2, verbose=1)  # use n_jobs=-1 when in a hurry

# This run searches on the complete training set, so no hold-out score is
# computed (the score line is intentionally disabled).
lgbcf.fit(trainX, train_Y)
print('最適なパラメータ：',lgbcf.best_params_)
#print('testデータにおけるスコア: {}'.format(lgbcf.score(x_val, y_val)))

# Column 1 of predict_proba is used as the TARGET probability in the submission file.
ans = lgbcf.predict_proba(testX)
result = pd.DataFrame({'SK_ID_CURR':test_id, 'TARGET':ans[:,1]})
result.to_csv('my_submit.csv', index=False)

Fitting 3 folds for each of 128 candidates, totalling 384 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  4.1min
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed: 17.1min
[Parallel(n_jobs=2)]: Done 384 out of 384 | elapsed: 63.5min finished


最適なパラメータ： {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 10, 'min_child_weight': 0.1, 'min_split_gain': 0.01, 'n_estimators': 20, 'num_leaves': 40, 'subsample_for_bin': 200000}
