In [0]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random, datetime
import lightgbm as lgb

from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm

import math
warnings.filterwarnings('ignore')

In [0]:
#データの取得
train_tr=pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')
train_id=pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')
test_tr=pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv')
test_id=pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')

In [0]:
# 参考　https://www.kaggle.com/kartikathale/fraud-detection-eda-basic-logistic-regression
# 入力したDataFrameの占有メモリ削減
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [0]:
#train,testそれぞれ結合してまとめる
train=pd.merge(train_tr,train_id,on='TransactionID',how='left')
test=pd.merge(test_tr,test_id,on='TransactionID',how='left')
del test_tr,test_id,train_tr,train_id
gc.collect()

11

In [0]:
# メモリ使用量減少
train=reduce_mem_usage(train)
test=reduce_mem_usage(test)

Mem. usage decreased to 650.48 Mb (66.8% reduction)
Mem. usage decreased to 565.37 Mb (66.3% reduction)


In [0]:
#trainカラムの欠損率
null_list=pd.DataFrame(train.isnull().sum()).rename(columns={0: 'sum_null'})
null_list['null_percent'] = 100*null_list['sum_null']/(train.shape[0])
null_list=null_list[null_list['null_percent']>80].index

#欠損率80%以上をdrop
train.drop(list(null_list),axis=1,inplace=True)
test.drop(list(null_list),axis=1,inplace=True)

del null_list
gc.collect()

0

In [0]:
#trainの正解ラベル
y=train["isFraud"]
train.drop("isFraud",axis=1,inplace=True)

In [0]:
SEED=42
#seedをSEEDに固定

In [0]:
#主なドメイン以外はotherに入れる
#P_emaildomainの整理
def Pcolumns(df):
    remain=list(df['P_emaildomain'].value_counts().index)
    dor=['gmail.com','outlook.com','yahoo.com','mail.com','anonymous.com',
     'hotmail.com','verizon.net','aol.com','me.com','comcast.net']
    li=[i for i in remain if i not in dor]
  
    df.loc[df['P_emaildomain'].str.contains('gmail.com', na=False), 'P_emaildomain'] = 'gmail'
    df.loc[df['P_emaildomain'].str.contains('outlook.com', na=False), 'P_emaildomain'] = 'outlook'
    df.loc[df['P_emaildomain'].str.contains('yahoo.com', na=False), 'P_emaildomain'] = 'yahoo'
    df.loc[df['P_emaildomain'].str.contains('mail.com', na=False), 'P_emaildomain'] = 'mail'
    df.loc[df['P_emaildomain'].str.contains('anonymous.com', na=False), 'P_emaildomain'] = 'anonymous'
    df.loc[df['P_emaildomain'].str.contains('hotmail.com', na=False), 'P_emaildomain'] = 'hotmail'
    df.loc[df['P_emaildomain'].str.contains('verizon.net', na=False), 'P_emaildomain'] = 'verizon'
    df.loc[df['P_emaildomain'].str.contains('aol.com', na=False), 'P_emaildomain'] = 'aol'
    df.loc[df['P_emaildomain'].str.contains('me.com', na=False), 'P_emaildomain'] = 'me'
    df.loc[df['P_emaildomain'].str.contains('comcast.net', na=False), 'P_emaildomain'] = 'comcast'

    for i in li:
        df.loc[df['P_emaildomain'].str.contains(i, na=False), 'P_emaildomain'] = 'other'
    return df

    
#R_emaildomainの整理 
def Rcolumns(df):
    remain=list(df['R_emaildomain'].value_counts().index)
    dor=['gmail','hotmail.com','outlook.com',"anonymous.com",'charter.net','prodigy.net.mx',
       'comcast.net','live.com.mx','icloud.com','yahoo.com','aol.com','juno.com','att.net',
       'verizon.net','yahoo.com.mx','bellsouth.net']
    li=[i for i in remain if i not in dor]

    #整理
    df.loc[df['R_emaildomain'].str.contains('gmail.com', na=False), 'R_emaildomain'] = 'gmail'
    df.loc[df['R_emaildomain'].str.contains('hotmail.com', na=False), 'R_emaildomain'] = 'hotmail'
    df.loc[df['R_emaildomain'].str.contains('outlook.com', na=False), 'R_emaildomain'] = 'outlook'
    df.loc[df['R_emaildomain'].str.contains('anonymous.com', na=False), 'R_emaildomain'] = 'anonymous'
    df.loc[df['R_emaildomain'].str.contains('charter.net', na=False), 'R_emaildomain'] = 'charter'
    df.loc[df['R_emaildomain'].str.contains('prodigy.net.mx', na=False), 'R_emaildomain'] = 'prodigy'
    df.loc[df['R_emaildomain'].str.contains('comcast.net', na=False), 'R_emaildomain'] = 'comcast'
    df.loc[df['R_emaildomain'].str.contains('live.com.mx', na=False), 'R_emaildomain'] = 'live'
    df.loc[df['R_emaildomain'].str.contains('icloud.com', na=False), 'R_emaildomain'] = 'icloud'
    df.loc[df['R_emaildomain'].str.contains('yahoo.com', na=False), 'R_emaildomain'] = 'yahoo'
    df.loc[df['R_emaildomain'].str.contains('aol.com', na=False), 'R_emaildomain'] = 'aol'
    df.loc[df['R_emaildomain'].str.contains('juno.com', na=False), 'R_emaildomain'] = 'juno'
    df.loc[df['R_emaildomain'].str.contains('att.net', na=False), 'R_emaildomain'] = 'att'
    df.loc[df['R_emaildomain'].str.contains('verizon.net', na=False), 'R_emaildomain'] = 'verzion'
    df.loc[df['R_emaildomain'].str.contains('yahoo.com.mx', na=False), 'R_emaildomain'] = 'yahoo'
    df.loc[df['R_emaildomain'].str.contains('bellsouth.net', na=False), 'R_emaildomain'] = 'bellisouth'

    for i in li:
        df.loc[df['R_emaildomain'].str.contains(i, na=False), 'R_emaildomain'] = 'other'
    return df

train=Rcolumns(train)
test=Rcolumns(test)

In [0]:
train=Pcolumns(train)
test=Pcolumns(test)

In [0]:
#objectカラムをダミーか
def dumm(h):
    re1=[i for i in h.select_dtypes(include=object).columns]
    h=pd.get_dummies(h, columns=list(re1),drop_first=True) 
    return h

In [0]:
#DeviceInfoはダミー化するとカラムが大量になる。また、デバイス情報の整理法が不明なので今回は消す
train.drop('DeviceInfo',axis=1,inplace=True)
test.drop('DeviceInfo',axis=1,inplace=True)

#ダミー化
li_obj=[i for i in list(train.select_dtypes(include=object).columns)]
le_obj=[i for i in list(test.select_dtypes(include=object).columns)]
train = pd.concat([train.drop(li_obj,axis=1),dumm(train)],axis=1)
test=pd.concat([test.drop(le_obj,axis=1),dumm(test)],axis=1)

In [0]:
# #データの出力
train.to_pickle('../input/train.pkl')
test.to_pickle('../input/test.pkl')
y.to_pickle("../input/y.pkl")

In [0]:
#分割1回目　random_seedを1~5変える
f_train,f_test,g_train,g_test = train_test_split(train,y,test_size=0.2,
                                                 random_state=1,
                                                 stratify=y)
#kfoldするときの注意点
# https://blog.amedama.jp/entry/2018/06/21/235951

In [0]:
del LabelEncoder,tqdm
del le_obj,li_obj,Pcolumns,dumm,Rcolumns,train
gc.collect()

72

In [0]:
#訓練用データセット　検証用デーｔセット→最適なブーストラウンド数を決める
lgb_train = lgb.Dataset(f_train,g_train)
lgb_eval = lgb.Dataset(f_test, g_test, reference=lgb_train)

#パラメータ
params = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':-1,
                    'learning_rate':0.01,
                    'num_leaves': 2**8,
                    'max_depth':-1,
                    'n_estimators':800,
                    'seed': SEED,
                    'early_stopping_rounds':100, 
                } 

In [0]:
del f_train,g_train,y,test
gc.collect()

0

In [0]:
#訓練
model = lgb.train(params, lgb_train, 
                  valid_sets=lgb_eval,
                  num_boost_round=3000,
                  early_stopping_rounds=15)

[1]	valid_0's auc: 0.856194
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.857958
[3]	valid_0's auc: 0.85978
[4]	valid_0's auc: 0.860836
[5]	valid_0's auc: 0.866349
[6]	valid_0's auc: 0.867813
[7]	valid_0's auc: 0.868501
[8]	valid_0's auc: 0.872094
[9]	valid_0's auc: 0.873283
[10]	valid_0's auc: 0.874315
[11]	valid_0's auc: 0.874998
[12]	valid_0's auc: 0.875415
[13]	valid_0's auc: 0.876405
[14]	valid_0's auc: 0.877338
[15]	valid_0's auc: 0.877941
[16]	valid_0's auc: 0.878202
[17]	valid_0's auc: 0.87871
[18]	valid_0's auc: 0.87917
[19]	valid_0's auc: 0.879359
[20]	valid_0's auc: 0.879604
[21]	valid_0's auc: 0.879682
[22]	valid_0's auc: 0.883429
[23]	valid_0's auc: 0.88381
[24]	valid_0's auc: 0.88405
[25]	valid_0's auc: 0.884219
[26]	valid_0's auc: 0.884472
[27]	valid_0's auc: 0.885115
[28]	valid_0's auc: 0.885469
[29]	valid_0's auc: 0.886048
[30]	valid_0's auc: 0.886583
[31]	valid_0's auc: 0.886846
[32]	valid_0's auc: 0.887292
[33]	valid_0's auc: 0.8

In [0]:
#学習済みモデルを保存する
import pickle
with open('../input/model.pickle', mode='wb') as fp:
    pickle.dump(model, fp)

In [0]:
# test=pd.read_pickle('../input/test.pkl')
#テストデータを予測する
y_pred=model.predict(f_test,num_iteration=model.best_iteration)

In [0]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(g_test, y_pred)
auc = metrics.auc(fpr, tpr)
print(auc)

0.9642113246295697


In [0]:
sample=pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')
sample.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.5
1,3663550,0.5
2,3663551,0.5
3,3663552,0.5
4,3663553,0.5


In [0]:
y_pred2=model.predict(test,num_iteration=model.best_iteration)
sample['isFraud']=y_pred2
sample.to_csv("submission.csv",index=False)