In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split


# 【問題1】train_test_splitのスクラッチ

In [2]:
def scratch_train_test_split(X, y, train_size=0.8):
    """
    Split features and targets into a training part and a test part.

    ``int(n_samples * train_size)`` distinct row indices are drawn with
    ``random.sample`` and become the training rows; every other row goes to
    the test part.  Rows keep their original relative order inside both
    parts (the data is partitioned, not shuffled).  Seed the ``random``
    module beforehand to get a reproducible split.

    Parameters
    ----------
    X : array-like or pandas.DataFrame, shape (n_samples, n_features)
      Feature data.
    y : array-like, pandas.Series or pandas.DataFrame, shape (n_samples, ...)
      Target values, one entry per row of X.
    train_size : float (0 < train_size < 1)
      Fraction of the rows assigned to the training part.

    Returns
    ----------
    X_train : ndarray
      Training rows of X.
    X_test : ndarray
      Test rows of X.
    y_train : ndarray
      Targets belonging to X_train.
    y_test : ndarray
      Targets belonging to X_test.

    Raises
    ----------
    ValueError
      If X and y do not have the same number of rows, or (raised by
      ``random.sample``) if the requested number of training rows is
      negative or larger than the number of rows.
    """
    # pandas containers are reduced to their underlying ndarray
    if isinstance(X, (pd.DataFrame, pd.Series)):
        X = X.values
    if isinstance(y, (pd.DataFrame, pd.Series)):
        y = y.values
    X = np.asarray(X)
    y = np.asarray(y)

    rows = X.shape[0]
    if y.shape[0] != rows:
        raise ValueError(
            "X and y must have the same number of rows "
            "(got {} and {})".format(rows, y.shape[0])
        )

    # Draw the training row indices without replacement
    train_index = random.sample(range(rows), k=int(rows * train_size))

    # A boolean mask gives O(n) selection and keeps the original row order,
    # instead of testing `row in list` once per row (O(n^2)).
    is_train = np.zeros(rows, dtype=bool)
    is_train[train_index] = True

    return X[is_train], X[~is_train], y[is_train], y[~is_train]

In [3]:
# Small toy dataset used to check the split by eye
X = pd.DataFrame({
    'ID': ['001', '002', '003', '004', '005', '006', '007', '008', '009', '010'],
    'AGE': [18, 34, 23, 42, 31, 52, 28, 49, 38, 35],
})
y = pd.DataFrame({'TARGET': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]})
print(X, y, sep='\n')

    ID  AGE
0  001   18
1  002   34
2  003   23
3  004   42
4  005   31
5  006   52
6  007   28
7  008   49
8  009   38
9  010   35
   TARGET
0       0
1       0
2       0
3       0
4       0
5       1
6       1
7       1
8       1
9       1


In [4]:
# Run the scratch splitter on the toy data (60% of the rows go to train)
# and print the four resulting arrays one after another.
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.6)
print(X_train, X_test, y_train, y_test, sep='\n')

[['002' 34]
 ['003' 23]
 ['006' 52]
 ['007' 28]
 ['009' 38]
 ['010' 35]]
[['001' 18]
 ['004' 42]
 ['005' 31]
 ['008' 49]]
[[0]
 [0]
 [1]
 [1]
 [1]
 [1]]
[[0]
 [0]
 [0]
 [1]]


# 【問題2】 分類問題を解くコードの作成

## irisデータセットの作成

In [5]:
from sklearn.datasets import load_iris
iris = load_iris()  # load the iris dataset

# Features as a DataFrame with readable column names
X0 = pd.DataFrame(
    iris.data,
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)

# Target as a one-column DataFrame named 'Species'
y0 = pd.DataFrame(iris.target, columns=['Species'])

In [6]:
Xy0 = pd.concat([X0, y0], axis=1)  # join features and target side by side
# Keep only the two classes used for the binary problem
# (Species 1 = versicolor, Species 2 = virginica).
Xy0 = Xy0.loc[Xy0['Species'].isin([1, 2])]

# Split the *filtered* frame back into features and target.  Selecting from
# the unfiltered X0 / y0 here would silently discard the class filter above.
X0 = Xy0[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y0 = Xy0['Species']

## シンプルデータセット1の作成

In [7]:
import numpy as np
np.random.seed(seed=0)  # fixed seed so the generated data is reproducible
n_samples = 500
n_per_class = n_samples // 2

# Class means and the shared covariance of the two Gaussian blobs
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0, 0.8], [0.8, 1.0]]

# Draw the samples of each class (f0 / f1 are rebound from mean to samples)
f0 = np.random.multivariate_normal(f0, cov, n_per_class)
f1 = np.random.multivariate_normal(f1, cov, n_per_class)
X1 = np.concatenate((f0, f1))

# Labels: +1 for the first blob, -1 for the second.
# `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
y1 = np.concatenate((np.ones(n_per_class), -np.ones(n_per_class))).astype(int)

# Shuffle features and labels with the same permutation
random_index = np.random.permutation(np.arange(n_samples))
X1 = X1[random_index]
y1 = y1[random_index]

## シンプルデータセット2の作成

In [8]:
# Simple dataset 2 features: a hard-coded table of 40 two-dimensional points
# (two rows of the array per source line, 20 lines).
X2 = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
# Simple dataset 2 labels: the first 20 samples are class 0, the last 20 class 1
y2 = np.repeat([0, 1], 20)

## 標準化のスクラッチ

In [9]:
# 標準化
from sklearn.preprocessing import StandardScaler

def scratch_std(X):
    """
    Standardize X with a StandardScaler fitted on X itself.

    A fresh scaler is created on every call, so the statistics used for
    scaling always come from the array that is passed in.  Calling this
    separately on two arrays therefore scales each with its own statistics.

    Parameters
    ----------
    X : array-like accepted by StandardScaler
      Data to standardize.

    Returns
    ----------
    X_std : result of StandardScaler.transform applied to X
    """
    return StandardScaler().fit_transform(X)

## ロジスティック回帰のスクラッチ

In [10]:
# ロジスティック回帰
from sklearn import linear_model

def scratch_sgd(X_train, X_test, y_train):
    """
    Fit logistic regression via SGDClassifier and predict labels for X_test.

    Parameters
    ----------
    X_train : training features passed to SGDClassifier.fit
    X_test : features passed to SGDClassifier.predict
    y_train : training labels passed to SGDClassifier.fit

    Returns
    ----------
    y_test_pred : predictions returned by SGDClassifier.predict for X_test
    """
    # Local imports: only needed for the version check below.
    import re
    import sklearn

    # scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' and
    # 1.3 removed the old spelling, so pick the name the installed release
    # understands instead of hard-coding 'log'.
    version = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
    is_modern = version is not None and (int(version.group(1)), int(version.group(2))) >= (1, 1)
    logistic_loss = 'log_loss' if is_modern else 'log'

    sgd = linear_model.SGDClassifier(loss=logistic_loss, max_iter=1000, tol=1e-3)
    sgd.fit(X_train, y_train)  # train
    y_test_pred = sgd.predict(X_test)  # predict

    return y_test_pred

## SVMの関数

In [11]:
# SVM
from sklearn.svm import SVC

def scratch_svc(X_train, X_test, y_train):
    """
    Fit an SVC (gamma='auto') on the training data and predict for X_test.

    Parameters
    ----------
    X_train : training features passed to SVC.fit
    X_test : features passed to SVC.predict
    y_train : training labels passed to SVC.fit

    Returns
    ----------
    y_test_pred : predictions returned by SVC.predict for X_test
    """
    classifier = SVC(gamma='auto')
    return classifier.fit(X_train, y_train).predict(X_test)

## 決定木の関数

In [12]:
# 決定木
from sklearn.tree import DecisionTreeClassifier

def scratch_tree(X_train, X_test, y_train):
    """
    Fit a DecisionTreeClassifier (random_state=0) and predict for X_test.

    Parameters
    ----------
    X_train : training features passed to DecisionTreeClassifier.fit
    X_test : features passed to DecisionTreeClassifier.predict
    y_train : training labels passed to DecisionTreeClassifier.fit

    Returns
    ----------
    y_test_pred : predictions returned by DecisionTreeClassifier.predict
    """
    model = DecisionTreeClassifier(random_state=0)
    model.fit(X_train, y_train)
    return model.predict(X_test)

## irisデータセットの学習・推定

In [13]:
# Split into training and test data
X0_train, X0_test, y0_train, y0_test = scratch_train_test_split(X0, y0, train_size=0.8)

# Standardize: learn mean/std from the training data only and apply that same
# transform to the test data.  Scaling the test split with its own statistics
# would put train and test on different scales.
scaler0 = StandardScaler().fit(X0_train)
X0_train_std = scaler0.transform(X0_train)
X0_test_std = scaler0.transform(X0_test)

# Logistic regression: train and predict
y0_sgd_pred = scratch_sgd(X0_train_std, X0_test_std, y0_train)

# SVM: train and predict
y0_svm_pred = scratch_svc(X0_train_std, X0_test_std, y0_train)

# Decision tree: train and predict
y0_tree_pred = scratch_tree(X0_train_std, X0_test_std, y0_train)


## シンプルデータセット1の学習・推定

In [14]:
# Split into training and test data
X1_train, X1_test, y1_train, y1_test = scratch_train_test_split(X1, y1, train_size=0.8)

# Standardize: learn mean/std from the training data only and apply that same
# transform to the test data.  Scaling the test split with its own statistics
# would put train and test on different scales.
scaler1 = StandardScaler().fit(X1_train)
X1_train_std = scaler1.transform(X1_train)
X1_test_std = scaler1.transform(X1_test)

# Logistic regression: train and predict
y1_sgd_pred = scratch_sgd(X1_train_std, X1_test_std, y1_train)

# SVM: train and predict
y1_svm_pred = scratch_svc(X1_train_std, X1_test_std, y1_train)

# Decision tree: train and predict
y1_tree_pred = scratch_tree(X1_train_std, X1_test_std, y1_train)

## シンプルデータセット2の学習・推定

In [15]:
# Split into training and test data
X2_train, X2_test, y2_train, y2_test = scratch_train_test_split(X2, y2, train_size=0.8)

# Standardize: learn mean/std from the training data only and apply that same
# transform to the test data.  Scaling the test split with its own statistics
# would put train and test on different scales.
scaler2 = StandardScaler().fit(X2_train)
X2_train_std = scaler2.transform(X2_train)
X2_test_std = scaler2.transform(X2_test)

# Logistic regression: train and predict
y2_sgd_pred = scratch_sgd(X2_train_std, X2_test_std, y2_train)

# SVM: train and predict
y2_svm_pred = scratch_svc(X2_train_std, X2_test_std, y2_train)

# Decision tree: train and predict
y2_tree_pred = scratch_tree(X2_train_std, X2_test_std, y2_train)

# 【問題3】 回帰問題を解くコードの作成

## House Pricesデータセットの作成

In [16]:
# Read train.csv, then take two feature columns and the target column from it
Xy3 = pd.read_csv('train.csv')
X3 = Xy3.loc[:, ['GrLivArea', 'YearBuilt']]
y3 = Xy3.loc[:, 'SalePrice']

In [17]:
# Show the first rows of the features, then of the target
print(X3.head(), y3.head(), sep='\n')

   GrLivArea  YearBuilt
0       1710       2003
1       1262       1976
2       1786       2001
3       1717       1915
4       2198       2000
0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64


## 線形回帰のスクラッチ

In [18]:
from sklearn import linear_model

def scratch_linear(X_train, X_test, y_train):
    """
    Fit a linear model with SGDRegressor and predict values for X_test.

    Parameters
    ----------
    X_train : training features passed to SGDRegressor.fit
    X_test : features passed to SGDRegressor.predict
    y_train : training targets passed to SGDRegressor.fit

    Returns
    ----------
    y_test_pred : predictions returned by SGDRegressor.predict for X_test
    """
    regressor = linear_model.SGDRegressor(max_iter=1000, tol=1e-3)
    return regressor.fit(X_train, y_train).predict(X_test)

## House Pricesデータセットの学習・推定

In [19]:
# Split into training and test data
X3_train, X3_test, y3_train, y3_test = scratch_train_test_split(X3, y3, train_size=0.8)

# Standardize: learn mean/std from the training data only and apply that same
# transform to the test data.  Scaling the test split with its own statistics
# would put train and test on different scales.
scaler3 = StandardScaler().fit(X3_train)
X3_train_std = scaler3.transform(X3_train)
X3_test_std = scaler3.transform(X3_test)

# Linear regression: train and predict
y3_linear_pred = scratch_linear(X3_train_std, X3_test_std, y3_train)

