In [0]:
import os

# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

/kaggle/input/test_identity.csv
/kaggle/input/sample_submission.csv
/kaggle/input/train_identity.csv
/kaggle/input/train_transaction.csv
/kaggle/input/test_transaction.csv


In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import gc
import lightgbm as lgb

import multiprocessing
import warnings
import seaborn as sns

warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

In [0]:
# Paths of the CSV files to load. The order matters: the frames are unpacked
# positionally right after they have been read.
files = ['../input/' + stem + '.csv'
         for stem in ('test_identity',
                      'test_transaction',
                      'train_identity',
                      'train_transaction',
                      'sample_submission')]

In [0]:
# Small helper that returns the parsed CSV so it can be mapped over a list of files.
def load_data(file):
    """Read the CSV at ``file`` with pandas and return the resulting DataFrame."""
    frame = pd.read_csv(file)
    return frame

# Read the CSV files in parallel with a process pool: https://qiita.com/taka-kawa/items/d1fc1bc0acb3a6ca3031
# The with-statement makes an explicit close() unnecessary: https://reiki4040.hatenablog.com/entry/20130331/1364723288
# The five results are unpacked positionally, i.e. in the order of `files`.
with multiprocessing.Pool() as pool:
    test_id, test_tr, train_id, train_tr, sub = pool.map(load_data, files)

In [0]:
# Left-join the identity table onto the transaction table (key: TransactionID),
# then delete the source frames to save memory.
train = train_tr.merge(train_id, how='left', on='TransactionID')
test = test_tr.merge(test_id, how='left', on='TransactionID')

del train_tr, train_id, test_tr, test_id
gc.collect();

In [0]:
col_tr = train.columns
col_te = test.columns
# train has one column more than test: the isFraud label.
print(train.shape, test.shape)

(590540, 434) (506691, 433)


In [0]:
# Ground-truth labels of the training data.
# NOTE: despite its name, X_test holds the isFraud labels, not test features.
X_test = train["isFraud"]

# Training features: every column except isFraud.
X_train = train.drop(columns="isFraud")

In [0]:
# Percentage of rows labelled 1 (fraud) among all labelled rows.
label_counts = X_test.value_counts()
print("不正利用率", 100 * label_counts[1] / sum(label_counts))

不正利用 3.499000914417313


In [0]:
# The labels and features were already extracted into X_test / X_train,
# so the merged frame is deleted and garbage collection is triggered.
del train
gc.collect()

7

In [0]:
X_test.dtypes  # dtype of the label series (not echoed: only the last expression of a cell is displayed)
X_train.dtypes  # object columns have to be removed or converted to dummy variables

TransactionID       int64
TransactionDT       int64
TransactionAmt    float64
ProductCD          object
card1               int64
                   ...   
id_36              object
id_37              object
id_38              object
DeviceType         object
DeviceInfo         object
Length: 433, dtype: object

In [0]:
# X_train.columns.values

In [0]:
cold = X_train.columns
# Pick the column groups by name prefix instead of by substring, so that a
# column which merely contains "V" or "id" somewhere in its name is never
# selected by accident.
vin = [name for name in cold if name.startswith("V")]    # the V* columns
idin = [name for name in cold if name.startswith("id")]  # the id_* columns

In [0]:
# Independent copies of the features and labels for the random-forest
# experiment, so X_train / X_test themselves are not modified by it.
XX_train = X_train.copy()
XX_test = X_test.copy()


In [0]:
XX_train.head()  # preview the first rows of the copied feature frame

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
1,2987001,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2,2987002,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
3,2987003,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
4,2987004,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [0]:

# Try dropping the V columns, the two device columns and the id columns,
# then one-hot encode whatever object columns remain.
XX_train = pd.get_dummies(
    XX_train.drop(columns=vin + ["DeviceInfo", "DeviceType"] + idin)
)



In [0]:
# The column-name lists were only needed for the drop above.
del vin,idin
gc.collect()

35

In [0]:
# Shape and first rows after dropping columns and one-hot encoding.
print(XX_train.shape)
XX_train.head()

(590540, 191)


Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,M5_F,M5_T,M6_F,M6_T,M7_F,M7_T,M8_F,M8_T,M9_F,M9_T
0,2987000,86400,68.5,13926,,150.0,142.0,315.0,87.0,19.0,...,1,0,0,1,0,0,0,0,0,0
1,2987001,86401,29.0,2755,404.0,150.0,102.0,325.0,87.0,,...,0,1,0,1,0,0,0,0,0,0
2,2987002,86469,59.0,4663,490.0,150.0,166.0,330.0,87.0,287.0,...,1,0,1,0,1,0,1,0,1,0
3,2987003,86499,50.0,18132,567.0,150.0,117.0,476.0,87.0,,...,0,1,1,0,0,0,0,0,0,0
4,2987004,86506,50.0,4497,514.0,150.0,102.0,420.0,87.0,,...,0,0,0,0,0,0,0,0,0,0


In [0]:
# Keep the column names as a plain list; they are used later to label the
# feature importances.
feature_name = XX_train.columns.tolist()

In [0]:
# Number of missing values per column (stored in a one-column frame labelled 0),
# followed by the names of the columns that contain at least one missing value.
col_na = XX_train.isnull().sum().to_frame()
nacolum = col_na.loc[col_na[0] > 0].index

In [0]:
from sklearn.impute import SimpleImputer
# Fill every missing value with the mean of its column.
imr=SimpleImputer(strategy="mean")
# NOTE: wrapping the transformed array in a new DataFrame discards the column
# names (labels become 0..n-1); the names saved in feature_name are used for
# labelling afterwards.
XX_train=pd.DataFrame(imr.fit_transform(XX_train))#mean imputation

In [0]:
XX_train.head()  # preview after imputation; the column labels are now integers

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,181,182,183,184,185,186,187,188,189,190
0,2987000.0,86400.0,68.5,13926.0,362.555488,150.0,142.0,315.0,87.0,19.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2987001.0,86401.0,29.0,2755.0,404.0,150.0,102.0,325.0,87.0,118.50218,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2987002.0,86469.0,59.0,4663.0,490.0,150.0,166.0,330.0,87.0,287.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,2987003.0,86499.0,50.0,18132.0,567.0,150.0,117.0,476.0,87.0,118.50218,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2987004.0,86506.0,50.0,4497.0,514.0,150.0,102.0,420.0,87.0,118.50218,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
"""
ランダムフォレスト
"""
from sklearn.ensemble import RandomForestRegressor

# Hold out 20% of the rows; the fixed seeds make the split and the forest reproducible.
f_train,f_val,g_train,g_val=train_test_split(XX_train,XX_test,test_size=0.2,random_state=0)
reg_model=RandomForestRegressor(random_state=0)
reg_model.fit(f_train,g_train)
reg_pred=reg_model.predict(f_val)

fti = reg_model.feature_importances_

# XX_train carries only integer column labels after imputation, so the readable
# names come from feature_name, which was saved before imputing. Both must
# describe the same columns in the same order.
feature_names=list(XX_train.columns)
assert len(feature_name) == len(fti), "feature_name is out of sync with the fitted features"

# Importances as a plain list, one entry per feature, in column order.
feature=list(fti)
print('Feature Importances:')
for name, importance in zip(feature_name, feature):
    # !s keeps the format valid for string labels as well as integer ones.
    print('\t{0!s:>20} : {1:>.6f}'.format(name, importance))

In [0]:
# Release the frames, the split parts and the model of the random-forest
# experiment; the importances collected in `feature` are not deleted.
del XX_train,XX_test,f_train,f_val,g_train,g_val,reg_model,fti
gc.collect()

In [0]:
# Importance table indexed by feature name (its single column is labelled 0).
l1 = pd.DataFrame(feature, index=feature_name)
# Names of the features with non-zero importance, most important first,
# limited to the first 50.
col_in = l1.loc[l1[0] > 0].sort_values(by=0, ascending=False).head(50).index
col_in

In [0]:
# Experiment: drop the columns whose name contains "V".
# They are removed from test as well; otherwise test would keep features the
# model is never trained on and the two feature sets would no longer line up
# at prediction time.
col_d = [name for name in X_train.columns if "V" in name]
X_train = X_train.drop(columns=col_d)
test = test.drop(columns=col_d, errors="ignore")

In [0]:
# X_train.dtypes.value_counts()#float 60/object 31/int64 3
#object型のカラムを抽出
# X_train.dtypes
# X_train.select_dtypes(include=object)
# X_train.select_dtypes(include=object).id_30.value_counts()#OS
# X_train.select_dtypes(include=object).id_31.value_counts()#ブラウザ
# X_train.select_dtypes(include=object).id_34.value_counts()#macth_status
# X_train.select_dtypes(include=object).DeviceInfo.value_counts()#Deviceinfo
# X_train.select_dtypes(include=object).DeviceType.value_counts()#DeviceType
# Object columns whose one-hot encoding should not blow up the frame:
# ProductCD, card4, card6, P_emaildomain, R_emaildomain, id_34, DeviceType.
# Encoding the others is tedious, so every object column not in `lis` is dropped.
# NOTE(review): `lis` does not match the list above — it keeps DeviceInfo and
# omits ProductCD / card4 / card6 (which are therefore dropped). Confirm intent.
lis = ["P_emaildomain", "R_emaildomain", "id_34", "DeviceInfo", "DeviceType"]
# Work with column names only, so no copy of the object columns is kept alive
# and drop() receives an explicit list of labels.
lis_obj = X_train.select_dtypes(include=object).columns
lis_d = lis_obj.drop(lis).tolist()
X_train = X_train.drop(columns=lis_d)
# presumably test uses exactly the same column names as train — verify.
test = test.drop(columns=lis_d)


In [0]:
# del lis,lis_obj,lis_d
# gc.collect()

In [0]:
"""
ProductCD, card4, card6, P_emaildomain, R_emaildomain, id_34, DeviceType

データ型がobjectなんで判定できません
https://note.nkmk.me/python-pandas-get-dummies/
"""
# One-hot encode the remaining object columns of both frames.
columns_before_encoding = set(X_train.columns)
X_train = pd.get_dummies(X_train)
test = pd.get_dummies(test)

# Columns that passed through get_dummies unchanged must exist in test too;
# filling them with a constant would silently corrupt the predictions.
missing_raw = [c for c in X_train.columns
               if c in columns_before_encoding and c not in test.columns]
assert not missing_raw, "test lacks training columns: {}".format(missing_raw[:10])

# Encoding the frames separately produces different dummy columns whenever a
# category occurs in only one of them. Force test onto the training layout:
# columns unknown to the model are discarded, and dummy columns for categories
# that never occur in test are added as all-zero columns.
test = test.reindex(columns=X_train.columns, fill_value=0)


In [0]:
# test.dtypes
# test.columns.values

In [0]:
# """
# lgbmを使う練習
# trainデータのみで学習と予測を行う
# """
# #訓練データの試験用正規ラベル→yに入れる
# # df_tr_tr.isFraud
# X_test=df_trtr["isFraud"]
# # y_test=df_te_tr["isFraud"]

# #訓練データの学習用データはisFraud以外→Xに入れる
# X_train=df_trtr.drop("isFraud",axis=1)
# # y_test=df_te_tr.drop("isFraud",axis=1)

# test=df_tetr

In [0]:
# Split the training data once more into a fitting part and a validation part.
# The seed is fixed (as in the random-forest split) so that the validation
# score is reproducible from run to run.
f_train,f_test,g_train,g_test=train_test_split(X_train,X_test,random_state=0)

In [0]:
# The unsplit features and labels are deleted after the split to save memory.
del X_train,X_test
gc.collect()

In [0]:
# Show which dtypes occur in the fitting split.
# No per-column astype(np.float64) loop is run here: astype returns a new
# object instead of converting in place, so calling it without storing the
# result only allocates and discards one copy per column. f_train is handed
# to lgb.Dataset unchanged.
f_train.dtypes.unique()

In [0]:
# Build the LightGBM datasets.
# feature_name has to be given as a plain list; passing .columns directly does not work.
f_name = list(f_train.columns)
lgb_train = lgb.Dataset(f_train, label=g_train, feature_name=f_name)
# The validation set is tied to the training set through reference=.
lgb_eval = lgb.Dataset(f_test, label=g_test, reference=lgb_train)
# NOTE(review): one-hot names built from values such as "match_status:2"
# contain ":"; some LightGBM versions reject special JSON characters in
# feature names — verify against the installed version.

In [0]:
# Binary objective, evaluated with AUC.
lgbm_params = dict(objective="binary", metric='auc')

In [0]:
# Train the booster, scoring each round on the validation dataset.
model = lgb.train(params=lgbm_params, train_set=lgb_train, valid_sets=lgb_eval)
# model = lgb.train(lgbm_params, lgb_train, valid_sets=lgb_eval, num_boost_round=40)


In [0]:
# Predict on the test data with the trained model.
# y_pred=model.predict(f_test,num_iteration=model.best_iteration)
y_pred=model.predict(test,num_iteration=model.best_iteration)
# y_pred_max=np.argmax(y_pred,axis=1)

In [0]:
y_pred  # display the predicted values

In [0]:
# y_pred_max=pd.DataFrame(y_pred_max)
# y_pred_max.head()
# # y_pred_ma
# count=y_pred_max[0].value_counts()
# shape=y_pred_max.shape
# print(count,"\n|",shape)

In [0]:
# Bracket assignment always writes the isFraud column; attribute assignment
# would only set a plain Python attribute if the column did not exist yet.
sub["isFraud"] = y_pred

In [0]:
# sub.head()

In [0]:
# Write the submission file.
# Bracket assignment always writes the isFraud column; attribute assignment
# would only set a plain Python attribute if the column did not exist yet.
sub["isFraud"] = y_pred
sub.to_csv("submission.csv", index=False)

In [0]:

# importanceを表示する
# importance = pd.DataFrame(model.feature_importance(),index=f_name,columns=['importance'])

# importance.sort_values(by="importance",ascending=True).plot.hist(bins=20)
# importance

#作成したモデルにおいて識別に寄与しないカラムを除去する
# lis=importance.query("importance < 0.5")
# lis.head()
# display(importance.sort_values(by="importance",ascending=False))
# lis.index

In [0]:
# lis_q=list(lis.index)
# lis_q

In [0]:
# f_train.drop(lis_q,axis=1).head()
# f_train.head()

In [0]:
# f_train=f_train.drop(lis_q,axis=1)

In [0]:
# #邪魔なカラムを削除して再度学習
# # f_train.columns
# # f_train=f_train.drop(lis_q,axis=1)
# # g_train=g_train.drop(lis_q,axis=1)
# #データセットを作成する
# f_name=[i for i in f_train.columns]
# lgb_train=lgb.Dataset(f_train,g_train,feature_name=f_name)
# # featue_nameは[]形式で渡さないとダメ　いきなり.columsnでは渡せない
# lgb_eval=lgb.Dataset(f_test,g_test,reference=lgb_train)

In [0]:
# # lgbm_params={"objective":"binary","metric":'auc'}
# lgbm_params={'num_leaves': 500,
#           'min_data_in_leaf': 106,
#           'objective': 'binary',
#           'max_depth': -1,
#           "boosting_type": "gbdt",
#           "bagging_seed": 11,
#           "metric": 'auc',
#           "verbosity": -1,
#           'random_state': 47
#          }

In [0]:
# #学習
# model=lgb.train(lgbm_params,lgb_train,valid_sets=lgb_eval)
# # model = lgb.train(lgbm_params, lgb_train, valid_sets=lgb_eval, num_boost_round=40)


In [0]:
# y_pred2=model.predict(test,num_iteration=model.best_iteration)

In [0]:

# pred=pd.DataFrame([y_pred,y_pred2])

In [0]:
# pred.head()

In [0]:

# sub.isFraud=y_pred2
# sub.to_csv("submission.csv",index=False)