# IRISのデータを利用した予測モデル実習

# 学習するデータをセットする

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the iris sample whose file name marks it as containing NaN values.
iris = pd.read_csv('data/iris_nan_sample.csv')
# Show the loaded frame as the cell output.
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,,1.4,0.2,setosa
5,5.4,,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [3]:
# Count the missing values in every column (sepal_width is the column of interest).
iris.isna().sum()

sepal_length     0
sepal_width     28
petal_length     0
petal_width      0
species          0
dtype: int64

In [8]:
# 独立変数(x,問題) : speciesとsepal_widthを除いた残りの変数
# 従属変数(y,正解) : sepal_width

In [9]:
# Split the frame into features (x) and target (y).
# Rows with a missing target are NOT separated out at this point.
iris_x = iris.loc[:, ['sepal_length', 'petal_length', 'petal_width']].copy()
iris_y = iris.loc[:, 'sepal_width'].copy()

In [10]:
# Training rows vs. rows to predict:
# collect the index labels of the rows whose sepal_width is missing.
nan_idx = iris[iris['sepal_width'].isna()].index
nan_idx

Int64Index([  4,   5,   7,  10,  17,  20,  26,  30,  34,  38,  45,  46,  78,
             82,  83,  85,  87,  91,  92,  93,  96, 104, 117, 122, 123, 129,
            131, 133],
           dtype='int64')

In [11]:
# Collect the index labels of the rows whose sepal_width is present.
not_nan_idx = iris.index[iris['sepal_width'].notna()]
not_nan_idx

Int64Index([  0,   1,   2,   3,   6,   8,   9,  11,  12,  13,
            ...
            140, 141, 142, 143, 144, 145, 146, 147, 148, 149],
           dtype='int64', length=122)

In [14]:
# Keep only the rows with a known sepal_width as the training set.
# not_nan_idx holds index *labels* taken from iris.index, so the rows are
# selected with the label-based .loc. The positional .iloc only gives the
# same rows while the index happens to be the default 0..n-1.
train_x = iris_x.loc[not_nan_idx]
train_y = iris_y.loc[not_nan_idx]

In [15]:
# Keep only the rows whose sepal_width is missing; these are the rows to predict.
# nan_idx holds index *labels* taken from iris.index, so use the label-based
# .loc rather than the positional .iloc.
pred_x = iris_x.loc[nan_idx]

## 特定属性の欠測値予測

- 使用データ: train_x, train_y, pred_x
- 予測モデル : 特定属性の欠測値予測
    - LinearRegression
    - Decision Tree Regressor
    - RandomForestRegressor
    - XGBoostRegressor

- 決定係数、MSE値を比較して最も性能の良いモデルを確認
- {'linear_reg':[0000、0000]}

In [18]:
# ライブラリのインストール : pip install xgboost
# 数値の予測にはRegressor系を、分類にはClassifier系を使用します。
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [19]:
# Create one regressor of each kind with default hyperparameters.
# dt_reg, rf_reg and xgb_reg are re-created with explicit settings in later cells.
linear_reg = LinearRegression()
dt_reg = DecisionTreeRegressor()
rf_reg = RandomForestRegressor()
xgb_reg = XGBRegressor()

In [20]:
from sklearn.metrics import mean_squared_error

In [21]:
# Load the answer file used to score the predictions and discard its
# 'Unnamed: 0' column (presumably an index saved into the CSV).
answer = pd.read_csv('data/iris_answer.csv').drop('Unnamed: 0', axis=1)

In [22]:
# Show the answer frame as the cell output.
answer

Unnamed: 0,answer
0,3.6
1,3.9
2,3.4
3,3.7
4,3.5
5,3.4
6,3.4
7,3.1
8,3.1
9,3.0


In [23]:
# Linear regression: training and prediction
# Fit on the rows whose sepal_width is known.
linear_reg.fit(train_x, train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [24]:
# R^2 of the linear model on the training rows (in-sample fit).
linear_reg.score(train_x, train_y)

0.5327479906532779

In [25]:
# In-sample sepal_width predictions for the training rows.
linear_reg.predict(train_x)

array([3.42107017, 3.31337246, 3.26423694, 3.09326373, 3.21232264,
       3.0441282 , 3.19431355, 3.14239925, 3.19902688, 3.10546916,
       3.91513649, 3.80659456, 3.76217236, 3.62897347, 3.42300472,
       3.48350145, 3.38607462, 3.42687382, 2.96671272, 3.25009696,
       3.41635684, 3.47491902, 3.0885504 , 3.64504801, 3.35586011,
       3.63646557, 3.48434567, 3.69502775, 3.25287573, 3.36250799,
       3.48628022, 3.21703596, 3.10269038, 3.49208388, 3.24925273,
       3.15182591, 3.47020569, 3.36722131, 3.23760723, 3.09213521,
       3.12713075, 2.77931297, 3.08742188, 2.59419978, 2.98165873,
       2.68466492, 3.02027727, 2.73682532, 2.62138941, 2.99857748,
       2.86706704, 2.75296756, 3.06741054, 3.25174721, 2.66134439,
       2.70080716, 2.9844375 , 2.77073054, 2.8286946 , 3.10240608,
       2.80403764, 2.6319741 , 3.0882661 , 3.19789836, 3.07134735,
       3.08186433, 2.99833137, 2.77544387, 2.77350931, 2.55364669,
       3.13655741, 2.77459964, 2.77931297, 2.48456753, 2.71603

In [26]:
# Predicted sepal_width for the rows where it is missing.
linear_reg.predict(pred_x)

array([3.36722131, 3.52792365, 3.30865914, 3.52405454, 3.4815669 ,
       3.40693019, 3.37109042, 3.14239925, 3.25481028, 3.10269038,
       3.32002034, 3.30394581, 2.8767398 , 2.93892497, 2.58586346,
       2.93723653, 2.97585507, 2.81152974, 2.8803628 , 2.73851377,
       2.76988632, 2.80815285, 2.92727947, 2.80628601, 2.98552783,
       2.82211443, 3.08967025, 2.68691328])

In [27]:
# R^2 on the rows with missing sepal_width, scored against the answer file.
# Assumes the answer rows are in the same order as pred_x — TODO confirm.
# NOTE(review): mean_squared_error is imported above but is not called in the
# visible cells, so only R^2 is actually compared.
linear_reg.score(pred_x, answer)

0.4379978332553496

In [29]:
# Decision tree: training and prediction
# Also check whether changing max_depth makes a difference.
# NOTE(review): no random_state is passed, so results may differ between runs.
dt_reg = DecisionTreeRegressor(max_depth=6)

In [30]:
# Fit the decision tree on the rows whose sepal_width is known.
dt_reg.fit(train_x, train_y)

DecisionTreeRegressor(criterion='mse', max_depth=6, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [31]:
# R^2 of the decision tree on the training rows (in-sample fit).
dt_reg.score(train_x, train_y)

0.8709386937439612

In [32]:
# In-sample sepal_width predictions of the decision tree for the training rows.
dt_reg.predict(train_x)

array([3.57777778, 3.19166667, 3.19166667, 3.19166667, 3.4       ,
       2.9       , 3.19166667, 3.19166667, 3.19166667, 3.        ,
       4.        , 4.4       , 3.9       , 3.8       , 3.57777778,
       3.57777778, 3.6       , 3.3       , 3.4       , 3.19166667,
       3.57777778, 3.57777778, 3.19166667, 3.57777778, 4.1       ,
       4.2       , 3.19166667, 3.5       , 3.19166667, 3.57777778,
       3.5       , 2.3       , 3.2       , 3.5       , 3.57777778,
       3.19166667, 3.57777778, 3.19166667, 3.15      , 3.        ,
       3.15      , 2.4       , 3.        , 2.85714286, 2.85714286,
       2.4       , 3.        , 2.7       , 2.        , 2.85714286,
       2.2       , 2.85714286, 2.9       , 3.        , 2.85714286,
       2.85714286, 2.85714286, 2.45      , 2.85714286, 2.8       ,
       2.85714286, 2.85714286, 3.        , 3.        , 3.        ,
       3.        , 2.6       , 2.45      , 2.45      , 2.85714286,
       3.        , 2.85714286, 2.4       , 2.85714286, 2.85714

In [33]:
# R^2 of the decision tree on the rows with missing sepal_width, scored against the answer file.
# Assumes the answer rows are in the same order as pred_x — TODO confirm.
dt_reg.score(pred_x, answer)

0.43255492125375167

In [34]:
# Random forest: training and prediction
# NOTE(review): no random_state is passed, so scores can vary between runs.
rf_reg = RandomForestRegressor(n_estimators=100)

In [35]:
# Fit the random forest on the rows whose sepal_width is known.
rf_reg.fit(train_x, train_y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [36]:
# R^2 of the random forest on the training rows (in-sample fit).
rf_reg.score(train_x, train_y)

0.9317918369231016

In [37]:
# In-sample sepal_width predictions of the random forest for the training rows.
rf_reg.predict(train_x)

array([3.502     , 3.153     , 3.219     , 3.16      , 3.319     ,
       2.961     , 3.187     , 3.32      , 3.222     , 3.058     ,
       3.981     , 4.156     , 3.795     , 3.893     , 3.703     ,
       3.664     , 3.423     , 3.461     , 3.387     , 3.145     ,
       3.52      , 3.498     , 3.249     , 3.62      , 3.841     ,
       3.992     , 3.248     , 3.712     , 3.416     , 3.464     ,
       3.436     , 2.653     , 3.059     , 3.432     , 3.67      ,
       3.189     , 3.642     , 3.253     , 3.107     , 3.07      ,
       3.046     , 2.45013333, 2.927     , 2.815     , 3.028     ,
       2.313     , 2.923     , 2.609     , 2.175     , 2.927     ,
       2.347     , 2.855     , 2.77633333, 3.059     , 2.933     ,
       2.677     , 2.48      , 2.475     , 3.11      , 2.72133333,
       2.65      , 2.801     , 2.965     , 2.99      , 2.911     ,
       2.984     , 2.501     , 2.41833333, 2.38866667, 2.84833333,
       3.076     , 2.905     , 2.45013333, 2.64533333, 2.788  

In [38]:
# R^2 of the random forest on the rows with missing sepal_width, scored against the answer file.
# Assumes the answer rows are in the same order as pred_x — TODO confirm.
rf_reg.score(pred_x, answer)

0.47630811546841006

In [39]:
# XGBoost: training and prediction
xgb_reg = XGBRegressor(n_estimators=50)

In [40]:
# Fit the XGBoost regressor on the rows whose sepal_width is known.
xgb_reg.fit(train_x, train_y)



  if getattr(data, 'base', None) is not None and \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=50,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [41]:
# R^2 of the XGBoost regressor on the training rows (in-sample fit).
xgb_reg.score(train_x, train_y)

0.7940332428085368

In [42]:
# In-sample sepal_width predictions of the XGBoost regressor for the training rows.
xgb_reg.predict(train_x)

array([3.5736427, 3.239203 , 3.239203 , 3.239203 , 3.239203 , 2.9163852,
       3.2737336, 3.239203 , 3.2286863, 2.9397724, 3.8126009, 3.9026337,
       3.6188598, 3.8765488, 3.5736427, 3.580441 , 3.3700516, 3.4330142,
       3.239203 , 3.2450435, 3.6120615, 3.6120615, 3.239203 , 3.6188598,
       3.667947 , 3.8101673, 3.291727 , 3.7246234, 3.2737336, 3.5736427,
       3.2450435, 2.7736974, 2.9163852, 3.2518418, 3.580441 , 3.239203 ,
       3.6120615, 3.2450435, 3.015273 , 2.9189823, 2.9135945, 2.5984936,
       2.9540374, 2.8313534, 2.9415452, 2.329463 , 2.9540374, 2.6255677,
       2.2172203, 2.8313534, 2.5163324, 2.8407452, 2.705115 , 2.9540374,
       2.8313534, 2.6586895, 2.8313534, 2.5502994, 2.8770406, 2.705115 ,
       2.7707202, 2.8407452, 2.9189823, 2.9540374, 2.949764 , 2.872831 ,
       2.5324516, 2.4436781, 2.4258301, 2.751806 , 2.9634292, 2.8313534,
       2.5984936, 2.7247322, 2.8313534, 2.8313534, 2.8313534, 2.5041957,
       2.8313534, 3.2520895, 2.7974558, 3.0176709, 

In [43]:
# R^2 of the XGBoost regressor on the rows with missing sepal_width, scored against the answer file.
# Assumes the answer rows are in the same order as pred_x — TODO confirm.
xgb_reg.score(pred_x, answer)

0.5071772613826141