In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# ライブラリの読み込み
%%time
from random import shuffle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

CPU times: user 667 ms, sys: 161 ms, total: 828 ms
Wall time: 1.52 s


In [3]:
#機械学習ライブラリ
%%time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

from sklearn.metrics import r2_score

from scipy.optimize import minimize

CPU times: user 88.2 ms, sys: 40.4 ms, total: 129 ms
Wall time: 373 ms


In [4]:
# データの読み込み
train = pd.read_csv("drive/MyDrive/見原/Data/train.csv")
test = pd.read_csv("drive/MyDrive/見原/Data/test.csv")
sample_sub = pd.read_csv("drive/MyDrive/見原/Data/sample_submission.csv")

In [5]:
# 5種類の特徴量に絞る
use_features = ["NAME_CONTRACT_TYPE", "AMT_INCOME_TOTAL", "EXT_SOURCE_2", "OWN_CAR_AGE", "ORGANIZATION_TYPE"]
target = train["TARGET"]

train = train[use_features]
train["TARGET"] = target
test = test[use_features]

In [None]:
#データの前処理
# EXT_SOURCE_2の欠損値を平均値で補完
train["EXT_SOURCE_2"].fillna(train["EXT_SOURCE_2"].mean(), inplace=True)
test["EXT_SOURCE_2"].fillna(train["EXT_SOURCE_2"].mean(), inplace=True)

train.isnull().sum()

In [None]:
# NAME_CONTRACT_TYPEの数値化（Label Encoding）
train["NAME_CONTRACT_TYPE"].replace({'Cash loans': 0, 'Revolving loans': 1}, inplace=True)
test["NAME_CONTRACT_TYPE"].replace({'Cash loans': 0, 'Revolving loans': 1}, inplace=True)

train.head(5)

Unnamed: 0,NAME_CONTRACT_TYPE,AMT_INCOME_TOTAL,EXT_SOURCE_2,OWN_CAR_AGE,ORGANIZATION_TYPE,TARGET
0,0,112500.0,0.372591,,School,0
1,0,225000.0,0.449567,,XNA,0
2,0,54000.0,0.569503,,Postal,0
3,0,67500.0,0.105235,,XNA,0
4,0,157500.0,0.20249,,Business Entity Type 3,1


In [None]:
# ORGANIZATION_TYPEの数値化（Count Encoding）
organization_ce = train["ORGANIZATION_TYPE"].value_counts()
train["ORGANIZATION_TYPE"] = train["ORGANIZATION_TYPE"].map(organization_ce)
test["ORGANIZATION_TYPE"] = test["ORGANIZATION_TYPE"].map(organization_ce)

train.head(5)

Unnamed: 0,NAME_CONTRACT_TYPE,AMT_INCOME_TOTAL,EXT_SOURCE_2,OWN_CAR_AGE,ORGANIZATION_TYPE,TARGET
0,0,112500.0,0.372591,,4991,0
1,0,225000.0,0.449567,,30898,0
2,0,54000.0,0.569503,,1185,0
3,0,67500.0,0.105235,,30898,0
4,0,157500.0,0.20249,,37943,1


In [None]:
# OWN_CAR_AGEの60以上の値（外れ値）を欠損値扱いする
train.loc[train["OWN_CAR_AGE"] >= 60, "OWN_CAR_AGE"] = np.nan
test.loc[test["OWN_CAR_AGE"] >= 60, "OWN_CAR_AGE"] = np.nan


In [None]:
# OWN_CAR_AGEをグループ分け
train["OWN_CAR_AGE"] = train["OWN_CAR_AGE"] // 10
test["OWN_CAR_AGE"] = test["OWN_CAR_AGE"] // 10

train["OWN_CAR_AGE"].unique()

array([nan,  0.,  2.,  1.,  3.,  4.,  5.])

In [None]:
# OWN_CAR_AGEをOne Hot Encoding
train_car_age_ohe = pd.get_dummies(train["OWN_CAR_AGE"]).add_prefix("OWN_CAR_AGE_")
test_car_age_ohe = pd.get_dummies(test["OWN_CAR_AGE"]).add_prefix("OWN_CAR_AGE_")

train = pd.concat([train, train_car_age_ohe], axis=1)
test = pd.concat([test, test_car_age_ohe], axis=1)

train.drop('OWN_CAR_AGE', axis=1, inplace=True)
test.drop('OWN_CAR_AGE', axis=1, inplace=True)

In [None]:
# 目的変数と説明変数に分割
X = train.drop("TARGET", axis=1).values
y = train["TARGET"].values
X_test = test.values

In [None]:
# 標準化
sc = StandardScaler()
sc.fit(X)
X_std = pd.DataFrame(sc.transform(X))
X_test_std = pd.DataFrame(sc.transform(X_test))

In [None]:
# 訓練データと評価データに分割
X_train, X_valid, y_train, y_valid = train_test_split(X_std, y, test_size=0.1, shuffle = True)

In [None]:
#XGBoost
import xgboost as xgb
xgb = xgb.XGBClassifier(random_state=13)
xgb.fit(X_train, y_train)

xgb_train_pred = xgb.predict_proba(X_train)[:, 1]
xgb_valid_pred = xgb.predict_proba(X_valid)[:, 1]

print(f"Train Score: {roc_auc_score(y_train, xgb_train_pred)}")
print(f"Valid Score: {roc_auc_score(y_valid, xgb_valid_pred)}")


Train Score: 0.740803917663867
Valid Score: 0.6563151002943768


In [None]:
# LightGBM
import lightgbm as lgb
lgb = lgb.LGBMClassifier(random_state=0)
lgb.fit(X_train, y_train)

lgb_train_pred = lgb.predict_proba(X_train)[:, 1]
lgb_valid_pred = lgb.predict_proba(X_valid)[:, 1]
print(f"Train Score: {roc_auc_score(y_train, lgb_train_pred)}")
print(f"Valid Score: {roc_auc_score(y_valid, lgb_valid_pred)}")

Train Score: 0.7148354620791179
Valid Score: 0.6604762036557846


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

rf_train_pred = rf.predict_proba(X_train)[:, 1]
rf_valid_pred = rf.predict_proba(X_valid)[:, 1]
print(f"Train Score: {roc_auc_score(y_train, rf_train_pred)}")
print(f"Valid Score: {roc_auc_score(y_valid, rf_valid_pred)}")


Train Score: 0.9999517975642506
Valid Score: 0.5739335660474348


In [None]:
# MLPClassifier
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(random_state=0)
mlp.fit(X_train, y_train)

mlp_train_pred = mlp.predict_proba(X_train)[:, 1]
mlp_valid_pred = mlp.predict_proba(X_valid)[:, 1]
print(f"Train Score: {roc_auc_score(y_train, mlp_train_pred)}")
print(f"Valid Score: {roc_auc_score(y_valid, mlp_valid_pred)}")

Train Score: 0.6774892356192576
Valid Score: 0.6615401242117398


In [None]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=0)
lr.fit(X_train, y_train)

lr_train_pred = lr.predict_proba(X_train)[:, 1]
lr_valid_pred = lr.predict_proba(X_valid)[:, 1]
print(f"Train Score: {roc_auc_score(y_train, lr_train_pred)}")
print(f"Valid Score: {roc_auc_score(y_valid, lr_valid_pred)}")

Train Score: 0.6627970501930767
Valid Score: 0.6560362209549184


# 以下アンサンブル

In [None]:
# モデルの数（４）で割ってる
train_pred = (lr_train_pred + mlp_train_pred + lgb_train_pred + xgb_train_pred)/4
valid_pred = (lr_valid_pred + mlp_valid_pred + lgb_valid_pred + xgb_valid_pred)/4

print(f"Train Score: {roc_auc_score(y_train, train_pred)}")
print(f"Valid Score: {roc_auc_score(y_valid, valid_pred)}")

Train Score: 0.7120913781959509
Valid Score: 0.6633933475317713


In [None]:
# 各モデルの予測を特徴量として持つDataFrameを作成
train_x_with_predictions = pd.DataFrame()
test_x_with_predictions = pd.DataFrame()

# 各モデルの予測をtrain_xとtest_xに追加
train_x_with_predictions['xgb'] = xgb.predict_proba(X_train)[:, 1]
train_x_with_predictions['lgb'] = lgb.predict_proba(X_train)[:, 1]
train_x_with_predictions['rf'] = rf.predict_proba(X_train)[:, 1]
train_x_with_predictions['mlp'] = mlp.predict_proba(X_train)[:, 1]
train_x_with_predictions['lr'] = lr.predict_proba(X_train)[:, 1]

test_x_with_predictions['xgb'] = xgb.predict_proba(X_test)[:, 1]
test_x_with_predictions['lgb'] = lgb.predict_proba(X_test)[:, 1]
test_x_with_predictions['rf'] = rf.predict_proba(X_test)[:, 1]
test_x_with_predictions['mlp'] = mlp.predict_proba(X_test)[:, 1]
test_x_with_predictions['lr'] = lr.predict_proba(X_test)[:, 1]

In [None]:
train_x_with_predictions.head(2)

Unnamed: 0,xgb,lgb,rf,mlp,lr
0,0.072249,0.038864,0.02,0.041458,0.038767
1,0.005154,0.012638,0.0,0.018628,0.038388


In [None]:
test_x_with_predictions.head(2)

Unnamed: 0,xgb,lgb,rf,mlp,lr
0,0.056094,0.076491,0.15,0.0,1.0
1,0.055701,0.06507,0.27,0.0,1.0


## Adversarial validation
---
詳細は「Kaggleで勝つデータ分析の技術」, p378~

---

In [None]:
from scipy.optimize import minimize
from sklearn.metrics import roc_auc_score

n_sampling = 50  # サンプリングの回数
frac_sampling = 0.5  # サンプリングで学習データから取り出す割合

def score(weights, data_x, data_y):
    # 評価指標はAUCとする
    y_prob = np.dot(data_x, weights)
    return -roc_auc_score(data_y, y_prob)

# 複数のモデルをサンプルベースで加重平均する
num_models = 5
results = []
for i in range(n_sampling):
    seed = i
    idx = pd.Series(np.arange(len(y_train))).sample(frac=frac_sampling, replace=False,
                                                    random_state=seed)
    x_sample = pd.DataFrame(train_x_with_predictions).iloc[idx]
    y_sample = pd.DataFrame(y_train).iloc[idx]

    # サンプリングしたデータに対して、加重平均の重みを最適化する
    init_weights = np.ones(x_sample.shape[1]) / x_sample.shape[1]  # 全てのモデルに均等な重みを初期値とする
    constraints = (
        {'type': 'ineq', 'fun': lambda weights: np.sum(weights) - 1.0},
    )
    result = minimize(score, x0=init_weights,
                      args=(x_sample.values, y_sample),
                      constraints=constraints,
                      method='COBYLA')
    results.append(result.x)

# モデルごとの重みの平均を計算する
results = np.array(results)
weights = results.mean(axis=0)

# 重みを使ってモデルの予測を組み合わせる
ensemble_pred = np.dot(test_x_with_predictions.values, weights)

## あとはがんばれ（adversarial validation の続きは任せました）

# 以下提出

In [None]:
# テストデータに対する予測値の作成
pred = xgb.predict_proba(X_test)[:, 1] # とりまXGBBoostの予測を提出してるので、適宜修正してくれ

In [None]:
# 予測結果を提出用のフォーマットに格納
sample_sub['TARGET'] = pred
sample_sub

Unnamed: 0,SK_ID_CURR,TARGET
0,171202,0.056094
1,171203,0.055701
2,171204,0.095453
3,171205,0.031494
4,171206,0.062444
...,...,...
61495,232697,0.040403
61496,232698,0.028361
61497,232699,0.027494
61498,232700,0.044139


In [None]:
# 提出用のcsvファイルを作成
sample_sub.to_csv('submission.csv',index=False)