In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.datasets import load_iris
import pandas as pd

**【問題1】train_test_splitのスクラッチ**

スクラッチの練習として、scikit-learnのtrain_test_splitを自作してみます。以下の雛形をベースとして関数を完成させてください。

In [2]:
def scratch_train_test_split(X, y, train_size=0.8,):
    """
    Randomly split a feature matrix and its labels into train and test parts.

    One random permutation of the sample indices is drawn from NumPy's global
    ``np.random`` state and applied to both ``X`` and ``y``, so every sample
    keeps its own label. The first ``round(n_samples * train_size)`` shuffled
    samples become the training set and the remainder the test set. The input
    arrays are not modified, and each output keeps the dtype of its input.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
      Feature matrix.
    y : ndarray, shape (n_samples, )
      Target values.
    train_size : float, 0 <= train_size <= 1
      Fraction of the samples assigned to the training set. The bounds 0 and 1
      are accepted and yield an empty training set / empty test set.

    Returns
    ----------
    X_train : ndarray, shape (n_train, n_features)
      Training features.
    X_test : ndarray, shape (n_samples - n_train, n_features)
      Test features.
    y_train : ndarray, shape (n_train, )
      Training targets.
    y_test : ndarray, shape (n_samples - n_train, )
      Test targets.

    Raises
    ------
    ValueError
      If ``X`` or ``y`` is not an ndarray, if ``X`` is not 2-D, if ``y`` is not
      1-D with one entry per row of ``X``, or if ``train_size`` lies outside
      [0, 1].
    """
    # Validate the types before touching .shape so that non-array input is
    # reported with the intended ValueError rather than an AttributeError.
    if not isinstance(X, np.ndarray) or not isinstance(y, np.ndarray):
        raise ValueError("ndarrayじゃない")
    if X.ndim != 2 or y.shape != (X.shape[0],):
        raise ValueError("シェイプが違う")
    if train_size < 0 or 1 < train_size:
        raise ValueError("トレインサイズがおかしい")

    n_samples = X.shape[0]
    n_train = round(n_samples * train_size)

    # Shuffle indices instead of stacking X and y into one array: this works
    # for any number of feature columns and avoids coercing X and y to a
    # common dtype.
    shuffled = np.random.permutation(n_samples)
    train_idx, test_idx = shuffled[:n_train], shuffled[n_train:]

    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

In [3]:
# Toy data: 10 samples with 2 features each (columns 0..9 and 10..19),
# and labels 0..9 so that every row can be traced back after shuffling.
X = np.transpose(np.arange(20).reshape(2, 10))
y = np.arange(10)

print(X, y, sep="\n")

[[ 0 10]
 [ 1 11]
 [ 2 12]
 [ 3 13]
 [ 4 14]
 [ 5 15]
 [ 6 16]
 [ 7 17]
 [ 8 18]
 [ 9 19]]
[0 1 2 3 4 5 6 7 8 9]


In [4]:
# Split the toy data 70 / 30 with the scratch implementation and show each part.
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.7)

print(X_train, X_test, y_train, y_test, sep="\n")

[[ 5 15]
 [ 3 13]
 [ 8 18]
 [ 0 10]
 [ 4 14]
 [ 9 19]
 [ 1 11]]
[[ 7 17]
 [ 2 12]
 [ 6 16]]
[5 3 8 0 4 9 1]
[7 2 6]


In [5]:
# Same 70 / 30 split with scikit-learn's reference implementation, for comparison.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print(X_train, X_test, y_train, y_test, sep="\n")

[[ 5 15]
 [ 1 11]
 [ 8 18]
 [ 3 13]
 [ 2 12]
 [ 4 14]
 [ 6 16]]
[[ 9 19]
 [ 7 17]
 [ 0 10]]
[5 1 8 3 2 4 6]
[9 7 0]


**【問題2】分類問題を解くコードの作成**

*ロジスティック回帰*

In [6]:
# Load the iris dataset once; both names are kept because this cell defined both.
iris = load_iris()
iris_dataset = iris

# Assemble the four features and the integer class code into one frame.
X = pd.DataFrame(iris_dataset.data, columns=["sepal_length", "sepal_width", "petal_length", "petal_width"])
y = pd.DataFrame(iris_dataset.target, columns=["Species"])

df = pd.concat([X, y], axis=1)

# Keep only the versicolor (1) and virginica (2) rows.
# "Species" holds integer class codes, so filter with integers: matching against
# the strings '1' / '2' depends on implicit string-to-int coercion inside isin,
# which is version-dependent and can silently select no rows.
df = df[df["Species"].isin([1, 2])]

print(df)

     sepal_length  sepal_width  petal_length  petal_width  Species
50            7.0          3.2           4.7          1.4        1
51            6.4          3.2           4.5          1.5        1
52            6.9          3.1           4.9          1.5        1
53            5.5          2.3           4.0          1.3        1
54            6.5          2.8           4.6          1.5        1
..            ...          ...           ...          ...      ...
145           6.7          3.0           5.2          2.3        2
146           6.3          2.5           5.0          1.9        2
147           6.5          3.0           5.2          2.0        2
148           6.2          3.4           5.4          2.3        2
149           5.9          3.0           5.1          1.8        2

[100 rows x 5 columns]


In [7]:
# Split the two-class iris frame into training and test sets.
from sklearn.model_selection import train_test_split

# A seed could be fixed through random_state (the RNG seed used for the split);
# none is passed here, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="Species"),  # every column except the label
    df["Species"],
    test_size=0.25,  # scikit-learn's default share, written out explicitly
)

print("X_train.shape : {}, X_test.shape : {}".format(X_train.shape, X_test.shape))

X_train.shape : (75, 4), X_test.shape : (25, 4)


In [8]:
# Standardization
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training data only, so that nothing
# about the test set leaks into the preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted scaler to both the training and the test features.
X_train_transformed, X_test_transformed = map(scaler.transform, (X_train, X_test))

In [9]:
from sklearn.linear_model import SGDClassifier

# SGDClassifier defaults to loss="hinge", which trains a linear SVM. Logistic
# regression requires the logistic loss, so request it explicitly.
# NOTE: the name is "log_loss" in scikit-learn >= 1.1; older releases spell it "log".
clf = SGDClassifier(loss="log_loss")
clf.fit(X_train_transformed, y_train)  # learn the logistic-regression weights by SGD

# Predict the class (1 = versicolor, 2 = virginica) of each test sample.
Y_pred = clf.predict(X_test_transformed)
print(Y_pred)


[1 1 2 2 2 1 1 2 2 2 2 2 2 1 2 2 1 1 2 2 1 1 1 2 2]


*SVM*

In [10]:
# Build sample data set 1: two Gaussian clusters with a shared covariance,
# labelled +1 and -1, then shuffled.
import numpy as np

np.random.seed(seed=0)  # fixed seed so the generated data is reproducible
n_samples = 500
n_per_class = n_samples // 2

# Cluster means get their own names instead of being overwritten by the samples.
mean0 = [-1, 2]
mean1 = [2, -1]
cov = [[1.0, 0.8], [0.8, 1.0]]

f0 = np.random.multivariate_normal(mean0, cov, n_per_class)
f1 = np.random.multivariate_normal(mean1, cov, n_per_class)
X = np.concatenate((f0, f1))

# The np.int alias was removed in NumPy 1.24; the builtin int gives the same dtype.
y = np.concatenate((np.ones(n_per_class), -np.ones(n_per_class))).astype(int)

# Shuffle features and labels with one shared permutation so pairs stay aligned.
random_index = np.random.permutation(np.arange(n_samples))
X = X[random_index]
y = y[random_index]

In [11]:
# Split sample data set 1 into training and test sets.
from sklearn.model_selection import train_test_split

# A seed could be fixed through random_state (the RNG seed used for the split);
# none is passed here, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,  # scikit-learn's default share, written out explicitly
)

print("X_train.shape : {}, X_test.shape : {}".format(X_train.shape, X_test.shape))

X_train.shape : (375, 2), X_test.shape : (125, 2)


In [12]:
# Standardization
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training data only, so that nothing
# about the test set leaks into the preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted scaler to both the training and the test features.
X_train_transformed, X_test_transformed = map(scaler.transform, (X_train, X_test))

In [13]:
from sklearn.svm import SVC

# Create an SVM classifier with default settings and train it on the
# standardized training data (fit returns the fitted estimator itself).
svm = SVC().fit(X_train_transformed, y_train)

# Predict the label (+1 / -1) of every standardized test sample.
Y_pred = svm.predict(X_test_transformed)
print(Y_pred)

[-1  1 -1  1 -1 -1 -1 -1  1 -1 -1 -1  1  1 -1 -1 -1  1 -1 -1 -1  1 -1 -1
 -1  1 -1  1  1  1  1  1 -1  1 -1  1 -1  1  1  1 -1 -1  1  1 -1  1 -1 -1
 -1 -1 -1  1  1  1 -1  1 -1  1  1 -1 -1 -1 -1  1 -1  1  1 -1 -1  1  1  1
  1 -1  1  1  1  1 -1  1 -1 -1 -1 -1 -1 -1  1 -1  1 -1  1  1 -1 -1  1 -1
  1 -1  1  1  1 -1  1  1  1 -1  1  1  1  1 -1 -1 -1 -1  1  1  1  1 -1 -1
  1  1  1  1 -1]


*決定木*

In [14]:
# Prepare sample data set 2: a hard-coded 2-feature matrix X (one sample per
# [x0, x1] pair) and the binary class labels y (0 or 1) for its rows.
X = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [15]:
# Split sample data set 2 into training and test sets.
from sklearn.model_selection import train_test_split

# A seed could be fixed through random_state (the RNG seed used for the split);
# none is passed here, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,  # scikit-learn's default share, written out explicitly
)

print("X_train.shape : {}, X_test.shape : {}".format(X_train.shape, X_test.shape))

X_train.shape : (30, 2), X_test.shape : (10, 2)


In [16]:
# Standardization
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training data only, so that nothing
# about the test set leaks into the preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted scaler to both the training and the test features.
X_train_transformed, X_test_transformed = map(scaler.transform, (X_train, X_test))

In [17]:
from sklearn import tree

# Create a decision-tree classifier with default settings and train it on the
# standardized training data (fit returns the fitted estimator itself).
clf = tree.DecisionTreeClassifier().fit(X_train_transformed, y_train)

# Predict the class (0 / 1) of every standardized test sample.
Y_pred = clf.predict(X_test_transformed)
print(Y_pred)

[1 0 1 1 0 1 1 1 1 0]


*線形回帰*

In [18]:
from google.colab import drive

# Mount Google Drive (Colab only) so the data set stored there can be read.
drive.mount('/content/drive')
csv_path = "/content/drive/My Drive/Data/house-prices-advanced-regression-techniques/train.csv"  # path of the training CSV

# Read the CSV and keep only the two explanatory columns plus the target.
df = pd.read_csv(csv_path)[["GrLivArea", "YearBuilt", "SalePrice"]]
print(df.head())

# Features and target are both kept as DataFrames (the target as one column).
X = df[["GrLivArea", "YearBuilt"]]
y = df[["SalePrice"]]


Mounted at /content/drive
   GrLivArea  YearBuilt  SalePrice
0       1710       2003     208500
1       1262       1976     181500
2       1786       2001     223500
3       1717       1915     140000
4       2198       2000     250000


In [19]:
# Import what the regression workflow needs from scikit-learn.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Preprocessing: hold out 25% of the rows for testing, with a fixed seed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Check the resulting sizes.
print("X_train.shape : {}, X_test.shape : {}".format(X_train.shape, X_test.shape))

# Create the linear-regression model and fit it in one step
# (fit returns the fitted estimator itself).
linear_reg = LinearRegression().fit(X_train, y_train)

# Estimate the sale price of every test sample.
y_pred = linear_reg.predict(X_test)

print(y_pred)

X_train.shape : (1095, 2), X_test.shape : (365, 2)
[[130445.81129484]
 [306448.48211546]
 [ 88741.8324304 ]
 [168496.12883044]
 [227800.17562669]
 [ 99569.77660678]
 [186783.75135889]
 [182056.44293199]
 [ 97457.76006969]
 [151960.15487048]
 [131192.06213303]
 [117840.59883308]
 [ 84836.16656948]
 [216795.82827786]
 [226861.55984128]
 [114354.20681418]
 [233851.35381306]
 [133362.46046392]
 [ 79874.70207186]
 [210577.06633379]
 [191870.90367478]
 [226090.42299764]
 [212412.50678112]
 [110884.40546558]
 [201927.81582345]
 [174203.21527073]
 [201986.40724909]
 [ 98966.53796861]
 [215765.02046252]
 [202036.70333958]
 [ 89328.69003012]
 [262849.94729844]
 [298537.18710817]
 [112359.4779443 ]
 [228219.23983687]
 [138826.88586649]
 [193790.24081689]
 [251971.28776788]
 [295988.88679508]
 [105906.0358499 ]
 [109778.31073838]
 [259246.16279641]
 [128015.21791264]
 [267761.22052149]
 [120958.43236415]
 [148080.42317423]
 [117119.86289586]
 [123196.13671951]
 [291177.99612118]
 [167205.33559564]