---
> ここでは「迷ったら使え」といわれるLightGBMを使ってみます。  
> 他のモデルとの比較も行います。  
---

LightGBMは、決定木のアンサンブル学習(勾配ブースティングを用いた手法)である。  
欠損値はnp.nanであれば自動で処理してくれるとのこと。さすがMicrosoft。  
データセットは、全モデル共通でKaggleの有名なTitanicを用いる。

In [104]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [105]:
import lightgbm as lgb

In [106]:
from sklearn.model_selection import train_test_split

In [107]:
# Load the Kaggle Titanic training CSV.
# NOTE(review): the path looks like a Google Drive mount in Colab -- confirm the drive is mounted before running.
csv_data = pd.read_csv("/content/drive/MyDrive/Datasets/Titanic/train.csv")

In [108]:
# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
df = csv_data
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [109]:
# List the available column names before choosing features.
print(df.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [110]:
# Keep four feature columns plus the "Survived" target.
# .copy() makes `data` an independent frame, so assigning into its columns
# later cannot raise SettingWithCopyWarning or affect `df`.
data = df.loc[:, ["Pclass", "Sex", "SibSp", "Parch", "Survived"]].copy()
data

Unnamed: 0,Pclass,Sex,SibSp,Parch,Survived
0,3,male,1,0,0
1,1,female,1,0,1
2,3,female,0,0,1
3,1,female,1,0,1
4,3,male,0,0,0
...,...,...,...,...,...
886,2,male,0,0,0
887,1,female,0,0,1
888,3,female,1,2,0
889,1,male,0,0,1


In [111]:
# Encode "Sex" as integer codes. factorize numbers the labels in order of
# first appearance, so with this data male -> 0 and female -> 1.
sex_codes, _sex_labels = pd.factorize(data["Sex"])
data["Sex"] = sex_codes

In [112]:
# Split the frame into the feature matrix x and the target vector t (NumPy arrays).
target_column = "Survived"
x = data.drop(columns=target_column).values
t = data[target_column].values

In [113]:
# Check dtypes and non-null counts of the selected columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Pclass    891 non-null    int64
 1   Sex       891 non-null    int64
 2   SibSp     891 non-null    int64
 3   Parch     891 non-null    int64
 4   Survived  891 non-null    int64
dtypes: int64(5)
memory usage: 34.9 KB


In [114]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
x_train, x_test, t_train, t_test = train_test_split(
    x,
    t,
    test_size=0.3,
    random_state=0,
)

In [115]:
# LightGBM classifier created with no arguments, i.e. the library's default hyperparameters.
model = lgb.LGBMClassifier()

In [116]:
# Train on the training split only; LightGBM prints its training log as cell output.
model.fit(x_train, t_train)

[LightGBM] [Info] Number of positive: 242, number of negative: 381
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.388443 -> initscore=-0.453862
[LightGBM] [Info] Start training from score -0.453862


In [117]:
# Mean accuracy on the training split, then on the held-out test split.
lgbm_train_acc = model.score(x_train, t_train)
lgbm_test_acc = model.score(x_test, t_test)
print(lgbm_train_acc)
print(lgbm_test_acc)

0.8170144462279294
0.7798507462686567


# 他のモデルの正解率との比較

In [118]:
# Collect one [train accuracy, test accuracy] row per model; LightGBM goes first.
score_li = [[model.score(x_train, t_train), model.score(x_test, t_test)]]

ロジスティック回帰

In [119]:
from sklearn.linear_model import LogisticRegression

In [120]:
# Logistic regression created with no arguments (scikit-learn defaults).
lr = LogisticRegression()

In [121]:
# Fit on the same training split used for LightGBM.
lr.fit(x_train, t_train)

In [122]:
# Record logistic regression's [train, test] accuracy.
lr_train_acc = lr.score(x_train, t_train)
lr_test_acc = lr.score(x_test, t_test)
score_li.append([lr_train_acc, lr_test_acc])

k近傍法

In [123]:
from sklearn.neighbors import KNeighborsClassifier

In [124]:
# k-nearest-neighbours classifier created with no arguments (scikit-learn defaults).
knc = KNeighborsClassifier()

In [125]:
# Fit on the same training split.
# NOTE(review): the features are not standardized; distance-based models can be
# sensitive to feature scale -- consider scaling before comparing models.
knc.fit(x_train, t_train)

In [126]:
# Record k-NN's [train, test] accuracy.
knc_train_acc = knc.score(x_train, t_train)
knc_test_acc = knc.score(x_test, t_test)
score_li.append([knc_train_acc, knc_test_acc])

SVM

In [127]:
from sklearn.svm import SVC

In [128]:
# Support vector classifier created with no arguments (scikit-learn defaults).
svc = SVC()

In [129]:
# Fit on the same training split.
svc.fit(x_train, t_train)

In [130]:
# Record the SVM's [train, test] accuracy.
svc_train_acc = svc.score(x_train, t_train)
svc_test_acc = svc.score(x_test, t_test)
score_li.append([svc_train_acc, svc_test_acc])

ランダムフォレスト

In [131]:
from sklearn.ensemble import RandomForestClassifier

In [132]:
# Random forest created with no arguments (scikit-learn defaults).
# NOTE(review): no random_state is passed, so the fitted forest and its scores
# may differ between runs -- confirm whether reproducibility matters here.
rfc = RandomForestClassifier()

In [133]:
# Fit on the same training split.
rfc.fit(x_train, t_train)

In [134]:
# Show the random forest's accuracy on the training and test splits.
rfc_train_acc = rfc.score(x_train, t_train)
rfc_test_acc = rfc.score(x_test, t_test)
print(f"train : {rfc_train_acc}")
print(f"test : {rfc_test_acc}")

train : 0.8218298555377207
test : 0.7910447761194029


In [135]:
# Record the random forest's [train, test] accuracy.
rfc_train_acc = rfc.score(x_train, t_train)
rfc_test_acc = rfc.score(x_test, t_test)
score_li.append([rfc_train_acc, rfc_test_acc])

In [136]:
# Display the collected [train, test] accuracy rows, in the order the models were appended.
score_li

[[0.8170144462279294, 0.7798507462686567],
 [0.8025682182985554, 0.7947761194029851],
 [0.7640449438202247, 0.7761194029850746],
 [0.8073836276083467, 0.7947761194029851],
 [0.8218298555377207, 0.7910447761194029]]

In [138]:
# Summarize the scores as a table: one row per model, in the order the rows
# were appended to score_li.
# The table gets its own name so the Titanic DataFrame `df` is not overwritten
# and the earlier cells that read `df` can still be re-run.
columns = ["train", "test"]
index = ["LGBM", "LR", "KNeighbors", "SVM", "RFC"]
score_df = pd.DataFrame(data=score_li, columns=columns, index=index)
score_df

Unnamed: 0,train,test
LGBM,0.817014,0.779851
LR,0.802568,0.794776
KNeighbors,0.764045,0.776119
SVM,0.807384,0.794776
RFC,0.82183,0.791045


**結果:テストデータの正解率では、ロジスティック回帰とSVM(ともに約0.795)が最も高く、次いでランダムフォレスト(約0.791)で、いずれもLightGBM(約0.780)を上回った。**
今回のデータセットではLightGBMの良さが分からなかった。パラメータチューニングを行えばよくなるのかなぁ。たくさんのモデルを試した方がよさそう。