---
> ここでは迷ったら使えといわれるLightGBMを使ってみます。  
> 他のモデルとの比較も行います。  
---

決定木のアンサンブル学習(勾配ブースティングを用いた手法)  
欠損値はnp.nanであれば自動で処理してくれるとのこと。さすがMicrosoft  
データセットは共通してKaggleの有名なTitanicを用いる

In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [54]:
import lightgbm as lgb

In [55]:
from sklearn.model_selection import train_test_split

In [56]:
csv_data = pd.read_csv("/content/drive/MyDrive/Datasets/Titanic/train.csv")

In [57]:
df = pd.DataFrame(csv_data)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [58]:
print(df.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [59]:
data = df.loc[:,["Pclass","Sex","SibSp","Parch","Survived"]]
data

Unnamed: 0,Pclass,Sex,SibSp,Parch,Survived
0,3,male,1,0,0
1,1,female,1,0,1
2,3,female,0,0,1
3,1,female,1,0,1
4,3,male,0,0,0
...,...,...,...,...,...
886,2,male,0,0,0
887,1,female,0,0,1
888,3,female,1,2,0
889,1,male,0,0,1


In [60]:
data["Sex"] = pd.factorize(data.loc[: , "Sex"])[0] # 性別を数値化

In [61]:
t = data["Survived"]
x = data.drop("Survived",axis=1)

In [62]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Pclass    891 non-null    int64
 1   Sex       891 non-null    int64
 2   SibSp     891 non-null    int64
 3   Parch     891 non-null    int64
 4   Survived  891 non-null    int64
dtypes: int64(5)
memory usage: 34.9 KB


In [63]:
x_train, x_test, t_train, t_test = train_test_split(x, t, test_size=0.3, random_state=0)

## scikit-learnのAPIを用いてsklearn風に実装できる

In [64]:
model = lgb.LGBMClassifier()

In [65]:
model.fit(x_train, t_train)

[LightGBM] [Info] Number of positive: 242, number of negative: 381
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.388443 -> initscore=-0.453862
[LightGBM] [Info] Start training from score -0.453862


In [66]:
print(model.score(x_train, t_train))
print(model.score(x_test, t_test))

0.8170144462279294
0.7798507462686567


# 他のモデルの正解率との比較

In [67]:
score_li = []
score_li.append([model.score(x_train, t_train),model.score(x_test, t_test)])

ロジスティック回帰

In [68]:
from sklearn.linear_model import LogisticRegression

In [69]:
lr = LogisticRegression()

In [70]:
lr.fit(x_train, t_train)

In [71]:
score_li.append([lr.score(x_train, t_train),lr.score(x_test, t_test)])

k近傍法

In [72]:
from sklearn.neighbors import KNeighborsClassifier

In [73]:
knc = KNeighborsClassifier()

In [74]:
knc.fit(x_train, t_train)

In [75]:
score_li.append([knc.score(x_train, t_train),knc.score(x_test, t_test)])

SVM

In [76]:
from sklearn.svm import SVC

In [77]:
svc = SVC()

In [78]:
svc.fit(x_train, t_train)

In [79]:
score_li.append([svc.score(x_train, t_train),svc.score(x_test, t_test)])

ランダムフォレスト

In [80]:
from sklearn.ensemble import RandomForestClassifier

In [81]:
rfc = RandomForestClassifier()

In [82]:
rfc.fit(x_train, t_train)

In [83]:
print("train : {}".format(rfc.score(x_train, t_train)))
print("test : {}".format(rfc.score(x_test, t_test)))

train : 0.8218298555377207
test : 0.7835820895522388


In [84]:
score_li.append([rfc.score(x_train, t_train),rfc.score(x_test, t_test)])

In [85]:
score_li

[[0.8170144462279294, 0.7798507462686567],
 [0.8025682182985554, 0.7947761194029851],
 [0.7640449438202247, 0.7761194029850746],
 [0.8073836276083467, 0.7947761194029851],
 [0.8218298555377207, 0.7835820895522388]]

In [86]:
columns = ["train","test"]
index = ["LGBM","LR","KNeighbors","SVM","RFC"]
df = pd.DataFrame(data=score_li,columns=columns, index=index)
df

Unnamed: 0,train,test
LGBM,0.817014,0.779851
LR,0.802568,0.794776
KNeighbors,0.764045,0.776119
SVM,0.807384,0.794776
RFC,0.82183,0.783582


**結果 ランダムフォレスト、SVMの方がいい。**
今回のデータセットではLightGBMの良さが分からなかった。パラメータチューニングを行えばよくなるのかなぁ。たくさんのモデルを試した方がよさそう。

# optunaを用いてハイパーパラメータを自動チューニング
## 完成版

In [87]:
! pip install optuna



In [88]:
# optunaを通してlightgbmをインポート
from optuna.integration import lightgbm as lgb_o

In [89]:
dtrain = lgb_o.Dataset(x_train, t_train)
dtest  = lgb_o.Dataset(x_test, t_test)

In [90]:
param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state' : 0,
    }

In [91]:
best = lgb_o.train(
                 param,
                 dtrain,
                 valid_sets=[dtrain,dtest]
                   )

[I 2023-10-21 11:35:06,031] A new study created in memory with name: no-name-d3a08e51-b950-4c0f-a585-53cd98036288
feature_fraction, val_score: 0.466564:  14%|#4        | 1/7 [00:00<00:02,  2.56it/s][I 2023-10-21 11:35:06,433] Trial 0 finished with value: 0.46656367148597566 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 0.46656367148597566.
feature_fraction, val_score: 0.435330:  29%|##8       | 2/7 [00:01<00:05,  1.07s/it][I 2023-10-21 11:35:07,976] Trial 1 finished with value: 0.4353299617247752 and parameters: {'feature_fraction': 0.4}. Best is trial 1 with value: 0.4353299617247752.
feature_fraction, val_score: 0.435330:  43%|####2     | 3/7 [00:04<00:06,  1.55s/it][I 2023-10-21 11:35:10,114] Trial 2 finished with value: 0.4353299617247752 and parameters: {'feature_fraction': 0.5}. Best is trial 1 with value: 0.4353299617247752.
feature_fraction, val_score: 0.435330:  57%|#####7    | 4/7 [00:04<00:03,  1.14s/it][I 2023-10-21 11:35:10,616] Tria

In [92]:
best.params

{'objective': 'binary',
 'metric': 'binary_logloss',
 'verbosity': -1,
 'boosting_type': 'gbdt',
 'random_state': 0,
 'feature_pre_filter': False,
 'lambda_l1': 2.874283819107178e-08,
 'lambda_l2': 5.887495632596935,
 'num_leaves': 31,
 'feature_fraction': 0.4,
 'bagging_fraction': 0.5442180346525405,
 'bagging_freq': 5,
 'min_child_samples': 50,
 'num_iterations': 1000}

In [93]:
model = lgb.LGBMClassifier(**best.params)

In [94]:
model.fit(x_train, t_train)



In [95]:
pred = model.predict(x_test)
pred

array([0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0])

In [96]:
from sklearn.metrics import accuracy_score

In [97]:
accuracy_score(pred, t_test)

0.7873134328358209

In [98]:
pred = best.predict(x_test)

In [99]:
pred = pred >0.5
pred

array([False, False, False,  True,  True, False,  True,  True, False,
        True, False,  True, False,  True,  True,  True, False, False,
       False, False, False,  True, False, False,  True,  True, False,
        True,  True, False, False,  True, False, False, False, False,
       False, False, False, False,  True, False, False, False,  True,
       False, False,  True, False,  True, False,  True,  True, False,
       False, False, False, False, False, False, False,  True,  True,
       False,  True, False,  True,  True,  True,  True,  True, False,
       False, False, False,  True, False, False, False, False, False,
       False,  True, False, False,  True,  True,  True,  True, False,
       False, False, False,  True, False, False,  True, False, False,
       False, False, False, False, False, False, False,  True,  True,
       False,  True, False,  True, False,  True, False,  True,  True,
        True, False,  True, False, False, False, False, False, False,
       False, False,

In [100]:
accuracy_score(t_test, pred)

0.7873134328358209

# k分割公差検証を用いて平均値から推定。

In [101]:
from sklearn.model_selection import KFold

In [102]:
kf = KFold(n_splits=5, shuffle=True, random_state=0)

In [103]:
score_list = []

In [108]:
for fold, (train_index, valid_index) in enumerate(kf.split(x_train, t_train)):
  train_x = x_train.iloc[train_index]
  valid_x = x_train.iloc[valid_index]
  train_t = t_train.iloc[train_index]
  valid_t = t_train.iloc[valid_index]

  lgb_train = lgb.Dataset(train_x, train_t)
  lgb_valid = lgb.Dataset(valid_x, valid_t)

  lgb_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state' : 0,
  }

  gbm = lgb_o.train(params=lgb_params,
                    train_set=lgb_train,
                    valid_sets=[lgb_train, lgb_valid],
                    # early_stopping_rounds=20,
                    # verbose_eval=-1
                    )

  oof = gbm.predict(x_test)
  score_list.append(oof)

[I 2023-10-21 11:37:28,896] A new study created in memory with name: no-name-d0b4f0cd-ea7e-41a7-838a-0e143d68593f
feature_fraction, val_score: 0.466956:  14%|#4        | 1/7 [00:00<00:01,  4.28it/s][I 2023-10-21 11:37:29,150] Trial 0 finished with value: 0.4669557934499378 and parameters: {'feature_fraction': 0.6}. Best is trial 0 with value: 0.4669557934499378.
feature_fraction, val_score: 0.466956:  29%|##8       | 2/7 [00:00<00:01,  3.38it/s][I 2023-10-21 11:37:29,489] Trial 1 finished with value: 0.4889437865472238 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 0.4669557934499378.
feature_fraction, val_score: 0.466956:  43%|####2     | 3/7 [00:00<00:01,  3.72it/s][I 2023-10-21 11:37:29,727] Trial 2 finished with value: 0.4669557934499378 and parameters: {'feature_fraction': 0.4}. Best is trial 0 with value: 0.4669557934499378.
feature_fraction, val_score: 0.466956:  57%|#####7    | 4/7 [00:01<00:00,  3.55it/s][I 2023-10-21 11:37:30,030] Trial 

In [119]:
score_array = np.array(score_list)

In [120]:
test = np.mean(score_array, axis=0)
test.shape

(268,)

In [123]:
df = pd.DataFrame(score_array).T
df

Unnamed: 0,0,1,2,3,4
0,0.130611,0.130456,0.115908,0.149028,0.141428
1,0.130611,0.130456,0.115908,0.149028,0.141428
2,0.055729,0.083395,0.057224,0.083753,0.113824
3,0.946765,0.965416,0.974539,0.966712,0.939089
4,0.528239,0.548459,0.470140,0.519196,0.588116
...,...,...,...,...,...
263,0.314917,0.342886,0.354659,0.358478,0.340217
264,0.130611,0.130456,0.115908,0.149028,0.141428
265,0.221766,0.213999,0.143797,0.204446,0.113767
266,0.615491,0.621921,0.664263,0.651178,0.587979


In [125]:
test[0]

0.1334863454540021

In [128]:
first_row = df.iloc[0]
np.mean(first_row)

0.1334863454540021

In [129]:
pred = test > 0.5

In [130]:
accuracy_score(t_test, pred)

0.7910447761194029