# Sprint 機械学習スクラッチ入門

## 【問題1】train_test_splitのスクラッチ

スクラッチの練習として、scikit-learnのtrain_test_splitを自作してみます。以下の雛形をベースとして関数を完成させてください。

In [235]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def scratch_train_test_split(X, y, train_size=0.8,):
    """
    検証データを分割する。

    Parameters
    ----------
    X : 次の形のndarray, shape (n_samples, n_features)
      訓練データ
    y : 次の形のndarray, shape (n_samples, )
      正解値
    train_size : float (0<train_size<1)
      何割をtrainとするか指定

    Returns
    ----------
    X_train : 次の形のndarray, shape (n_samples, n_features)
      訓練データ
    X_test : 次の形のndarray, shape (n_samples, n_features)
      検証データ
    y_train : 次の形のndarray, shape (n_samples, )
      訓練データの正解値
    y_test : 次の形のndarray, shape (n_samples, )
      検証データの正解値
    """
    class ScrachTrainTestSplitError(Exception):
        pass
            
    # Xとyの桁数チェック
    if(len(X) != len(y)):
        raise ScrachTrainTestSplitError("errr check length X or y")
    
    # seed設定
    np.random.seed(0)
    # シャッフルのための乱数
    permutation = np.random.permutation(len(X))
    # シャッフル
    X = X[permutation]
    y = y[permutation]

    # testとtrainの分割割合
    if(train_size != "" and 0 < train_size < 1):
        train_size_length = round(len(X) * train_size)
    elif(test_size != "" and 0 < test_size < 1):
        test_size_length = round(len(X) * test_size)
        train_size_length = len(X) - test_size_length
    else:
        raise ScrachTrainTestSplitError("error check train size or test size")
    
    # testとtrainに分割する
    X_train = X[:train_size_length]
    X_test = X[train_size_length:]
    y_train = y[:train_size_length]
    y_test = y[train_size_length:]
    
    pass
    return X_train, X_test, y_train, y_test

In [236]:
# 検証用データ生成
X = np.arange(100)
y = np.random.randint(0, 2, size=100)
X,y

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
        85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]),
 array([0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,
        0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
        1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
        0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1,
        0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0]))

In [237]:
#データの分割（今回作成したScratch）
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.80)
#X_train.shape, X_test.shape, y_train.shape, y_test.shape

# 検証用変数
scratch_X_train, scratch_X_test, scratch_y_train, scratch_y_test = X_train, X_test, y_train, y_test
scratch_X_train,scratch_X_test, scratch_y_train, scratch_y_test

(array([26, 86,  2, 55, 75, 93, 16, 73, 54, 95, 53, 92, 78, 13,  7, 30, 22,
        24, 33,  8, 43, 62,  3, 71, 45, 48,  6, 99, 82, 76, 60, 80, 90, 68,
        51, 27, 18, 56, 63, 74,  1, 61, 42, 41,  4, 15, 17, 40, 38,  5, 91,
        59,  0, 34, 28, 50, 11, 35, 23, 52, 10, 31, 66, 57, 79, 85, 32, 84,
        14, 89, 19, 29, 49, 97, 98, 69, 20, 94, 72, 77]),
 array([25, 37, 81, 46, 39, 65, 58, 12, 88, 70, 87, 36, 21, 83,  9, 96, 67,
        64, 47, 44]),
 array([1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
        1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
        1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0]),
 array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1]))

In [238]:
#データの分割（train_test_splitオリジナル）
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# 検証用変数
origin_X_train, origin_X_test, origin_y_train, origin_y_test = X_train, X_test, y_train, y_test
origin_X_train, origin_X_test, origin_y_train, origin_y_test

(array([79,  1, 45, 28, 60, 52, 25, 39, 97, 44, 16, 55, 83, 49, 22, 70, 47,
         4, 82, 94, 53, 66, 26, 84, 31, 63,  8, 75, 98, 57, 71, 99, 86, 96,
        69, 24, 30, 13, 40, 56, 68, 95, 81, 19, 38, 91, 54, 32, 51, 85, 11,
        89, 90, 36, 65, 88, 41, 14, 27, 50, 20, 46, 67, 35, 62,  2, 59, 23,
        58, 43, 10,  0, 73, 21, 77, 42,  3, 93, 48, 34]),
 array([18, 29, 64, 92, 72, 87,  5, 15, 12, 17, 61, 76,  9, 78, 80,  7, 33,
         6, 37, 74]),
 array([1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
        1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
        1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1]),
 array([1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0]))

In [239]:
# listの結合・並び替えを行い、等しいことを確認
scrach_X = np.sort(np.concatenate([scratch_X_train, scratch_X_test]))
scrach_y = np.sort(np.concatenate([scratch_y_train, scratch_y_test]))
origin_X = np.sort(np.concatenate([origin_X_train, origin_X_test]))
origin_y = np.sort(np.concatenate([origin_y_train, origin_y_test]))
print(scrach_X == origin_X)
print(scrach_y == origin_y)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True]
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True 

# 【問題2】 分類問題を解くコードの作成

上記3種類の手法で3種類のデータセットを学習・推定するコードを作成してください。

### irisデータセット

In [240]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, r2_score
dataset = load_iris()
X = pd.DataFrame(dataset.data, columns=["sepal_length", "sepal_width", "petal_length", "petal_width"])
y = pd.DataFrame(dataset.target, columns=["Species"])
df = pd.concat([X, y], axis=1) # Xとyを結合
df = df[(df['Species'] == 1) | (df['Species'] == 2)] # 目的変数はvirgicolorとvirginicaのみ使用
df_ndarray = df.values # ndarrayに変換

# 特徴量と目的変数を格納
X = df_ndarray[:, 0:4]
y = df_ndarray[:, -1]

#データの分割
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.80)

In [241]:
#ロジスティック回帰
from sklearn import linear_model
logistic = linear_model.SGDClassifier(loss='log')
logistic.fit(X_train, y_train)
y_predict_logistic = logistic.predict(X_test)
y_pred = y_predict_logistic

# 評価
print('confusion matrix = \n', confusion_matrix(y_true=y_test, y_pred=y_pred))
print('accuracy = ', accuracy_score(y_true=y_test, y_pred=y_pred))
print('precision = ', precision_score(y_true=y_test, y_pred=y_pred))
print('recall = ', recall_score(y_true=y_test, y_pred=y_pred))
print('f1 score = ', f1_score(y_true=y_test, y_pred=y_pred))

confusion matrix = 
 [[10  0]
 [ 0 10]]
accuracy =  1.0
precision =  1.0
recall =  1.0
f1 score =  1.0


In [242]:
# SVM
from sklearn.svm import SVC
svc = SVC(gamma='auto')
svc.fit(X_train, y_train)
y_predict_svc = svc.predict(X_test)
y_pred = y_predict_svc

# 評価
print('confusion matrix = \n', confusion_matrix(y_true=y_test, y_pred=y_pred))
print('accuracy = ', accuracy_score(y_true=y_test, y_pred=y_pred))
print('precision = ', precision_score(y_true=y_test, y_pred=y_pred))
print('recall = ', recall_score(y_true=y_test, y_pred=y_pred))
print('f1 score = ', f1_score(y_true=y_test, y_pred=y_pred))

confusion matrix = 
 [[10  0]
 [ 0 10]]
accuracy =  1.0
precision =  1.0
recall =  1.0
f1 score =  1.0


In [243]:
# 決定木
from sklearn.tree import DecisionTreeRegressor
# 検証用データの学習
dtr = DecisionTreeRegressor().fit(X_train, y_train)
y_predict_dtr = dtr.predict(X_test)
y_pred = y_predict_dtr

# 評価
print('confusion matrix = \n', confusion_matrix(y_true=y_test, y_pred=y_pred))
print('accuracy = ', accuracy_score(y_true=y_test, y_pred=y_pred))
print('precision = ', precision_score(y_true=y_test, y_pred=y_pred))
print('recall = ', recall_score(y_true=y_test, y_pred=y_pred))
print('f1 score = ', f1_score(y_true=y_test, y_pred=y_pred))

confusion matrix = 
 [[10  0]
 [ 1  9]]
accuracy =  0.95
precision =  0.9090909090909091
recall =  1.0
f1 score =  0.9523809523809523


### シンプルデータセット1作成コード

In [244]:
import numpy as np
np.random.seed(seed=0)
n_samples = 500
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0,0.8], [0.8, 1.0]]
f0 = np.random.multivariate_normal(f0, cov, int(n_samples/2))
f1 = np.random.multivariate_normal(f1, cov, int(n_samples/2))
X = np.concatenate((f0, f1))
y = np.concatenate((np.ones((int(n_samples/2))), np.ones((int(n_samples/2))) *(-1))).astype(np.int)
random_index = np.random.permutation(np.arange(n_samples))
X = X[random_index]
y = y[random_index]

#データの分割
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.80)

In [245]:
#ロジスティック回帰
from sklearn import linear_model
logistic = linear_model.SGDClassifier(loss='log')
logistic.fit(X_train, y_train)
y_predict_logistic = logistic.predict(X_test)
y_pred = y_predict_logistic

# 評価
print('confusion matrix = \n', confusion_matrix(y_true=y_test, y_pred=y_pred))
print('accuracy = ', accuracy_score(y_true=y_test, y_pred=y_pred))
print('precision = ', precision_score(y_true=y_test, y_pred=y_pred))
print('recall = ', recall_score(y_true=y_test, y_pred=y_pred))
print('f1 score = ', f1_score(y_true=y_test, y_pred=y_pred))

confusion matrix = 
 [[56  0]
 [ 0 44]]
accuracy =  1.0
precision =  1.0
recall =  1.0
f1 score =  1.0


In [246]:
# SVM
from sklearn.svm import SVC
svc = SVC(gamma='auto')
svc.fit(X_train, y_train)
y_predict_svc = svc.predict(X_test)
y_pred = y_predict_svc

# 評価
print('confusion matrix = \n', confusion_matrix(y_true=y_test, y_pred=y_pred))
print('accuracy = ', accuracy_score(y_true=y_test, y_pred=y_pred))
print('precision = ', precision_score(y_true=y_test, y_pred=y_pred))
print('recall = ', recall_score(y_true=y_test, y_pred=y_pred))
print('f1 score = ', f1_score(y_true=y_test, y_pred=y_pred))

confusion matrix = 
 [[56  0]
 [ 0 44]]
accuracy =  1.0
precision =  1.0
recall =  1.0
f1 score =  1.0


In [247]:
# 決定木
from sklearn.tree import DecisionTreeRegressor
# 検証用データの学習
dtr = DecisionTreeRegressor().fit(X_train, y_train)
y_predict_dtr = dtr.predict(X_test)
y_pred = y_predict_dtr

# 評価
print('confusion matrix = \n', confusion_matrix(y_true=y_test, y_pred=y_pred))
print('accuracy = ', accuracy_score(y_true=y_test, y_pred=y_pred))
print('precision = ', precision_score(y_true=y_test, y_pred=y_pred))
print('recall = ', recall_score(y_true=y_test, y_pred=y_pred))
print('f1 score = ', f1_score(y_true=y_test, y_pred=y_pred))

confusion matrix = 
 [[56  0]
 [ 0 44]]
accuracy =  1.0
precision =  1.0
recall =  1.0
f1 score =  1.0


### シンプルデータセット2作成コード

In [248]:
X = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

#データの分割
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.80)

In [249]:
#ロジスティック回帰
from sklearn import linear_model
logistic = linear_model.SGDClassifier(loss='log')
logistic.fit(X_train, y_train)
y_predict_logistic = logistic.predict(X_test)
y_pred = y_predict_logistic

# 評価
print('confusion matrix = \n', confusion_matrix(y_true=y_test, y_pred=y_pred))
print('accuracy = ', accuracy_score(y_true=y_test, y_pred=y_pred))
print('precision = ', precision_score(y_true=y_test, y_pred=y_pred))
print('recall = ', recall_score(y_true=y_test, y_pred=y_pred))
print('f1 score = ', f1_score(y_true=y_test, y_pred=y_pred))

confusion matrix = 
 [[4 1]
 [1 2]]
accuracy =  0.75
precision =  0.6666666666666666
recall =  0.6666666666666666
f1 score =  0.6666666666666666


In [250]:
# SVM
from sklearn.svm import SVC
svc = SVC(gamma='auto')
svc.fit(X_train, y_train)
y_predict_svc = svc.predict(X_test)
y_pred = y_predict_svc

# 評価
print('confusion matrix = \n', confusion_matrix(y_true=y_test, y_pred=y_pred))
print('accuracy = ', accuracy_score(y_true=y_test, y_pred=y_pred))
print('precision = ', precision_score(y_true=y_test, y_pred=y_pred))
print('recall = ', recall_score(y_true=y_test, y_pred=y_pred))
print('f1 score = ', f1_score(y_true=y_test, y_pred=y_pred))

confusion matrix = 
 [[3 2]
 [0 3]]
accuracy =  0.75
precision =  0.6
recall =  1.0
f1 score =  0.7499999999999999


In [251]:
# 決定木
from sklearn.tree import DecisionTreeRegressor
# 検証用データの学習
dtr = DecisionTreeRegressor().fit(X_train, y_train)
y_predict_dtr = dtr.predict(X_test)
y_pred = y_predict_dtr

# 評価
print('confusion matrix = \n', confusion_matrix(y_true=y_test, y_pred=y_pred))
print('accuracy = ', accuracy_score(y_true=y_test, y_pred=y_pred))
print('precision = ', precision_score(y_true=y_test, y_pred=y_pred))
print('recall = ', recall_score(y_true=y_test, y_pred=y_pred))
print('f1 score = ', f1_score(y_true=y_test, y_pred=y_pred))

confusion matrix = 
 [[5 0]
 [0 3]]
accuracy =  1.0
precision =  1.0
recall =  1.0
f1 score =  1.0


# 【問題3】 回帰問題を解くコードの作成

線形回帰でHouse Pricesデータセットを学習・推定するコードを作成してください。

### House Pricesデータセット

In [252]:
df_original = pd.read_csv("dataset/train.csv")
df = df_original[["GrLivArea", "YearBuilt", "SalePrice"]]

# 説明変数と目的変数に分割し、ndarrayに変換
X = df[["GrLivArea", "YearBuilt"]].values
y = df[["SalePrice"]].values

#データの分割
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.80)

In [253]:
#線形回帰(SGDRegressor)
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

regress = linear_model.SGDRegressor(max_iter=1000, tol=1e-3)
regress.fit(X_train, y_train.ravel())
y_predict_regress = regress.predict(X_test)
y_pred = y_predict_regress

In [254]:
# 評価（平均二乗誤差と決定係数）
print('mse_linear = ', mean_squared_error(y_true=y_test, y_pred=y_pred))
print('r2_score = ', r2_score(y_true=y_test, y_pred=y_pred))

mse_linear =  1.3728556049076472e+29
r2_score =  -2.787295989227454e+19
