# Data split

In [28]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import numpy as np

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
iris = load_iris()
X = iris.data
y = iris.target
print('Size of X: ', X.shape)
print('Size of y: ', y.shape)

Size of X:  (150, 4)
Size of y:  (150,)


## 1. Holdout method
- 데이터를 training set과 test set으로 구분
- `sklearn.model_selection.train_test_split` 활용

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234, stratify=y)
print('Size of X_train: ', X_train.shape)
print('Size of X_test: ', X_test.shape)

Size of X_train:  (105, 4)
Size of X_test:  (45, 4)


In [18]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_test_hat = model.predict(X_test)

print(metrics.classification_report(y_test, y_test_hat))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      0.73      0.85        15
           2       0.79      1.00      0.88        15

    accuracy                           0.91        45
   macro avg       0.93      0.91      0.91        45
weighted avg       0.93      0.91      0.91        45



## 2. 3-way holdout method
- 데이터를 training set, validation set (또는 development set), test set으로 구분
- 아래 함수를 활용

In [24]:
def train_val_test_split(X, y, val_size=0.3, test_size=0.2, random_state=123, stratify_bool=True):
    if stratify_bool:
        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                            test_size=test_size, 
                                                            random_state=random_state,
                                                            stratify=y)
        val_size_rev = val_size / (1 - test_size)
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                          test_size=val_size_rev,
                                                          random_state=random_state,
                                                          stratify=y_train)
        return X_train, X_val, X_test, y_train, y_val, y_test
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                            test_size=test_size, 
                                                            random_state=random_state)
        val_size_rev = val_size / (1 - test_size)
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                          test_size=val_size_rev,
                                                          random_state=random_state)
        return X_train, X_val, X_test, y_train, y_val, y_test

In [25]:
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, y, val_size=0.3, test_size=0.2, random_state=1234, stratify_bool=True)
print('Size of X_train: ', X_train.shape)
print('Size of X_val: ', X_val.shape)
print('Size of X_test: ', X_test.shape)

Size of X_train:  (75, 4)
Size of X_val:  (45, 4)
Size of X_test:  (30, 4)


In [26]:
C_set = [0.1, 1, 10]
models = []
for C in C_set:
    print('== Logistic Regression (C = %f) ==' % C)
    model = LogisticRegression(C=C)
    model.fit(X_train, y_train)
    
    y_val_hat = model.predict(X_val)
    print(metrics.classification_report(y_val, y_val_hat))
    print()
    
    models.append(model)

== Logistic Regression (C = 0.100000) ==
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      0.40      0.57        15
           2       0.62      1.00      0.77        15

    accuracy                           0.80        45
   macro avg       0.88      0.80      0.78        45
weighted avg       0.88      0.80      0.78        45


== Logistic Regression (C = 1.000000) ==
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      0.93      0.97        15
           2       0.94      1.00      0.97        15

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45


== Logistic Regression (C = 10.000000) ==
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
   

### 1) 최적의 하이퍼파라미터로 training set에 학습된 모델을 그대로 test set에 적용

In [27]:
best_model = models[2]
y_test_hat = best_model.predict(X_test)
print(metrics.classification_report(y_test, y_test_hat))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.90      0.95        10
           2       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



### 2) 최적의 하이퍼파라미터로 training set + validation set에 다시 재학습한 후, 이를 test set에 적용

In [34]:
X_concat = np.concatenate((X_train, X_val), axis=0)
y_concat = np.concatenate((y_train, y_val), axis=0)
print(X_concat.shape)
print(y_concat.shape)

(120, 4)
(120,)


In [36]:
# 재학습
best_model = models[2]
best_model.fit(X_concat, y_concat)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
y_test_hat = best_model.predict(X_test)
print(metrics.classification_report(y_test, y_test_hat))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.80      0.89        10
           2       0.83      1.00      0.91        10

    accuracy                           0.93        30
   macro avg       0.94      0.93      0.93        30
weighted avg       0.94      0.93      0.93        30



## 3. Cross-validation
- 데이터를 먼저 training set, test set으로 구분
- Training set을 cross-validation하여 최적의 모델 (하이퍼)파라미터를 찾고 이를 test set에 적용

In [38]:
from IPython.display import Image
Image(url='https://scikit-learn.org/stable/_images/grid_search_cross_validation.png')

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234, stratify=y)
print('Size of X_train: ', X_train.shape)
print('Size of X_test: ', X_test.shape)

Size of X_train:  (105, 4)
Size of X_test:  (45, 4)


In [40]:
C_set = [0.1, 1, 10]

for C in C_set:
    print('== Logistic Regression (C = %f) ==' % C)
    model = LogisticRegression(C=C)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    
    print('Accuracy: %.4f (+/- %.4f)' % (scores.mean(), scores.std()*1.96))

== Logistic Regression (C = 0.100000) ==
Accuracy: 0.8095 (+/- 0.1562)
== Logistic Regression (C = 1.000000) ==
Accuracy: 0.9429 (+/- 0.0373)
== Logistic Regression (C = 10.000000) ==
Accuracy: 0.9619 (+/- 0.0698)


### 1) 최적의 하이퍼파라미터로 training set에 학습된 모델을 그대로 test set에 적용

In [43]:
best_C = C_set[2]
best_model = LogisticRegression(C=best_C)

best_model.fit(X_train, y_train)
y_test_hat = best_model.predict(X_test)
print(metrics.classification_report(y_test, y_test_hat))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      0.80      0.89        15
           2       0.83      1.00      0.91        15

    accuracy                           0.93        45
   macro avg       0.94      0.93      0.93        45
weighted avg       0.94      0.93      0.93        45



### 2) 최적의 하이퍼파라미터로 training set + validation set에 다시 재학습한 후, 이를 test set에 적용

In [44]:
X_concat = np.concatenate((X_train, X_val), axis=0)
y_concat = np.concatenate((y_train, y_val), axis=0)
print(X_concat.shape)
print(y_concat.shape)

(150, 4)
(150,)


In [45]:
# 재학습
best_C = C_set[2]
best_model = LogisticRegression(C=best_C)

best_model.fit(X_concat, y_concat)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
y_test_hat = best_model.predict(X_test)
print(metrics.classification_report(y_test, y_test_hat))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.92      0.80      0.86        15
           2       0.82      0.93      0.87        15

    accuracy                           0.91        45
   macro avg       0.92      0.91      0.91        45
weighted avg       0.92      0.91      0.91        45



## 4. Cross-validation with hyperparameter search
- `sklearn.model_selection.GridSearchCV` 또는 `sklearn.model_selection.RandomizedSearchCV` 적용
- 다음 시간에...