In [6]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

raw_iris = datasets.load_iris()

X = raw_iris.data
y = raw_iris.target

X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

std_scale = StandardScaler()
std_scale.fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)

clf_knn = KNeighborsClassifier(n_neighbors=2)
clf_knn.fit(X_tn_std, y_tn)

knn_pred = clf_knn.predict(X_te_std)
print(knn_pred)

[2 1 0 2 0 2 0 1 1 1 1 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]


In [5]:
pip install matplotlib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_te, knn_pred)
print(accuracy)

0.9473684210526315


In [16]:
from sklearn.pipeline import Pipeline
# 파이프라인 사용 후 코드

# 트레이닝 / 테스트 데이터 분할
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

# 파이프라인
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=2))
])

# 학습
knn_pipeline.fit(X_tn, y_tn)

# 예측
pred_knn = knn_pipeline.predict(X_te)
print(pred_knn)

[2 1 0 2 0 2 0 1 1 1 1 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]


In [17]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_te, knn_pred)
print(accuracy)

0.9473684210526315


In [18]:
from sklearn.metrics import confusion_matrix

conf_matrix = confusion_matrix(y_te, knn_pred)
print(conf_matrix)

[[13  0  0]
 [ 0 15  1]
 [ 0  1  8]]


In [19]:
from sklearn.metrics import classification_report
class_report = classification_report(y_te, knn_pred)
print(class_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.94      0.94      0.94        16
           2       0.89      0.89      0.89         9

    accuracy                           0.95        38
   macro avg       0.94      0.94      0.94        38
weighted avg       0.95      0.95      0.95        38



In [25]:
# 선형 회귀
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

X, y = datasets.fetch_openml('boston', return_X_y=True)
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=1)

#파이프라인
linear_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('linear_regression', LinearRegression())
])

linear_pipeline.fit(X_tn, y_tn)

# 분석 계수
print(linear_pipeline.named_steps['linear_regression'].coef_)

[-1.07145146  1.34036243  0.26298069  0.66554537 -2.49842551  1.97524314
  0.19516605 -3.14274974  2.66736136 -1.80685572 -2.13034748  0.56172933
 -4.03223518]


  warn(
  warn(


In [26]:
# 상수항 확인
print(linear_pipeline.named_steps['linear_regression'].intercept_)

22.344591029023746


In [28]:
# L2 제약식 Ridge 회귀 분석
from sklearn.linear_model import Ridge
clf_ridge = Ridge(alpha=1)  # 알파값이 클 수록 강한 제약식
clf_ridge.fit(X_tn, y_tn)

print(clf_ridge.coef_)

[-1.09313326e-01  5.81466109e-02 -6.57912472e-03  2.17625586e+00
 -1.14648161e+01  2.96948364e+00 -1.23491124e-03 -1.33558053e+00
  2.86708621e-01 -1.17634405e-02 -8.80356116e-01  6.85811788e-03
 -5.68412252e-01]


In [29]:
print(clf_ridge.intercept_)

38.15376459785541


In [34]:
# L1 제약식 Lasso 회귀 분석
from sklearn.linear_model import Lasso
clf_lasso = Lasso(alpha=0.01)
clf_lasso.fit(X_tn, y_tn)

print(clf_lasso.coef_)

[-1.12603260e-01  5.74119132e-02  2.27694520e-02  2.23755561e+00
 -1.78882034e+01  2.88880009e+00  4.46892800e-03 -1.42175988e+00
  2.99434691e-01 -1.10547556e-02 -9.57257589e-01  6.47523310e-03
 -5.62702027e-01]


In [35]:
print(clf_lasso.intercept_)

42.917590841334736


In [36]:
# elasticnet

from sklearn.linear_model import ElasticNet
clf_elastic = ElasticNet(alpha=0.01, l1_ratio=0.01)
clf_elastic.fit(X_tn, y_tn)

print(clf_elastic.coef_)

[-0.10618455  0.05908425 -0.0357108   1.88718361 -5.04362051  2.97071514
 -0.00599727 -1.24505464  0.27678943 -0.01259106 -0.80843434  0.0072289
 -0.5799861 ]


In [37]:
print(clf_elastic.intercept_)

34.01496030557843


In [47]:
import numpy as np
X_te = np.array(X_te)

In [48]:
from sklearn.metrics import r2_score

pred_lr = linear_pipeline.predict(X_te)
pred_ridge = clf_ridge.predict(X_te)
pred_lasso = clf_lasso.predict(X_te)
pred_elastic = clf_elastic.predict(X_te)

print(r2_score(y_te, pred_lr))
print(r2_score(y_te, pred_ridge))
print(r2_score(y_te, pred_lasso))
print(r2_score(y_te, pred_elastic))

0.7789410172622857
0.7847774316882408
0.7808911360973492
0.7839553424529528




In [49]:
from sklearn.metrics import mean_squared_error

print(mean_squared_error(y_te, pred_lr))
print(mean_squared_error(y_te, pred_ridge))
print(mean_squared_error(y_te, pred_lasso))
print(mean_squared_error(y_te, pred_elastic))

21.8977653960495
21.31961909196864
21.704589600993465
21.40105399674387


In [50]:
# 로지스틱 회귀분석

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

raw_cancer = datasets.load_breast_cancer()

X = raw_cancer.data
y = raw_cancer.target

X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

lr = Pipeline([
    ('sclaer', StandardScaler()),
    ('logistic_regression', LogisticRegression(penalty='l2'))
])

lr.fit(X_tn, y_tn)

print(lr.named_steps['logistic_regression'].coef_)

[[-0.29792942 -0.58056355 -0.3109406  -0.377129   -0.11984232  0.42855478
  -0.71131106 -0.85371164 -0.46688191  0.11762548 -1.38262136  0.0899184
  -0.94778563 -0.94686238  0.18575731  0.99305313  0.11090349 -0.3458275
   0.20290919  0.80470317 -0.91626377 -0.91726667 -0.8159834  -0.86539197
  -0.45539191  0.10347391 -0.83009341 -0.98445173 -0.5920036  -0.61086989]]


In [51]:
print(lr.named_steps['logistic_regression'].intercept_)

[0.02713751]


In [52]:
pred_logistic = lr.predict(X_te)
print(pred_logistic)

[0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 0 1
 0 1 0 0 1 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 1 0 1 0
 0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 0 1 0 0 0 1 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1 1
 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 0 0 1 1 1 0]


In [53]:
pred_proba = lr.predict_proba(X_te)
print(pred_proba)

[[9.98638613e-01 1.36138656e-03]
 [3.95544804e-02 9.60445520e-01]
 [1.30896362e-03 9.98691036e-01]
 [1.24473354e-02 9.87552665e-01]
 [2.44132101e-04 9.99755868e-01]
 [4.50491513e-03 9.95495085e-01]
 [1.13985968e-04 9.99886014e-01]
 [1.82475894e-03 9.98175241e-01]
 [9.67965506e-05 9.99903203e-01]
 [1.75222878e-06 9.99998248e-01]
 [1.76572612e-01 8.23427388e-01]
 [8.24119135e-02 9.17588087e-01]
 [9.66067493e-06 9.99990339e-01]
 [5.39343196e-01 4.60656804e-01]
 [3.98187854e-01 6.01812146e-01]
 [9.95762760e-01 4.23724017e-03]
 [2.75612083e-03 9.97243879e-01]
 [9.99997097e-01 2.90271401e-06]
 [9.99926506e-01 7.34935682e-05]
 [9.99999997e-01 2.78313939e-09]
 [9.98738365e-01 1.26163489e-03]
 [9.81405399e-01 1.85946008e-02]
 [1.77902039e-02 9.82209796e-01]
 [9.65876713e-04 9.99034123e-01]
 [9.99464578e-01 5.35421808e-04]
 [6.73385015e-04 9.99326615e-01]
 [5.50833875e-05 9.99944917e-01]
 [9.69828919e-01 3.01710813e-02]
 [1.62119075e-03 9.98378809e-01]
 [9.99997821e-01 2.17867101e-06]
 [6.005712

In [54]:
from sklearn.metrics import precision_score

precision = precision_score(y_te, pred_logistic)
print(precision)

0.9666666666666667


In [55]:
from sklearn.metrics import confusion_matrix

conf_matrix = confusion_matrix(y_te, pred_logistic)
print(conf_matrix)

[[50  3]
 [ 3 87]]


In [56]:
from sklearn.metrics import classification_report
class_report = classification_report(y_te, pred_logistic)
print(class_report)

              precision    recall  f1-score   support

           0       0.94      0.94      0.94        53
           1       0.97      0.97      0.97        90

    accuracy                           0.96       143
   macro avg       0.96      0.96      0.96       143
weighted avg       0.96      0.96      0.96       143



In [57]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

raw_wine = datasets.load_wine()

X = raw_wine.data
y = raw_wine.target

X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

nb = Pipeline([
    ('scaler', StandardScaler()),
    ('gaussiannb', GaussianNB())
])

nb.fit(X_tn, y_tn)

pred_nb = nb.predict(X_te)
print(pred_nb)

[0 2 1 0 1 1 0 2 1 1 2 2 0 0 2 1 0 0 2 0 0 0 0 1 1 1 1 1 1 2 0 0 1 0 0 0 2
 1 1 2 0 0 1 1 1]


In [59]:
from sklearn.metrics import recall_score

recall = recall_score(y_te, pred_nb, average='macro')
print(recall)

0.9523809523809524


In [60]:
from sklearn.metrics import confusion_matrix

conf_matrix = confusion_matrix(y_te, pred_nb)
print(conf_matrix)

[[16  0  0]
 [ 2 18  1]
 [ 0  0  8]]


In [61]:
from sklearn.metrics import classification_report

class_report = classification_report(y_te, pred_nb)
print(class_report)

              precision    recall  f1-score   support

           0       0.89      1.00      0.94        16
           1       1.00      0.86      0.92        21
           2       0.89      1.00      0.94         8

    accuracy                           0.93        45
   macro avg       0.93      0.95      0.94        45
weighted avg       0.94      0.93      0.93        45



In [1]:
# Decision tree
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

raw_wine = datasets.load_wine()
X = raw_wine.data
y = raw_wine.target

X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

dt = Pipeline([
    ('scaler', StandardScaler()),
    ('tree', DecisionTreeClassifier(random_state=0))
])

dt.fit(X_tn, y_tn)
pred_tree = dt.predict(X_te)
print(pred_tree)

[0 2 1 0 1 1 0 2 1 1 2 2 0 1 2 1 0 0 2 0 1 0 1 1 1 1 1 1 1 2 0 0 1 0 0 0 2
 1 1 2 1 0 1 1 1]


In [2]:
from sklearn.metrics import f1_score
f1 = f1_score(y_te, pred_tree, average='macro')
print(f1)

0.9349141206870346


In [4]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_te, pred_tree)
print(conf_matrix)

[[14  2  0]
 [ 0 20  1]
 [ 0  0  8]]


In [5]:
from sklearn.metrics import classification_report
class_report = classification_report(y_te, pred_tree)
print(class_report)

              precision    recall  f1-score   support

           0       1.00      0.88      0.93        16
           1       0.91      0.95      0.93        21
           2       0.89      1.00      0.94         8

    accuracy                           0.93        45
   macro avg       0.93      0.94      0.93        45
weighted avg       0.94      0.93      0.93        45



In [7]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm

raw_wine = datasets.load_wine()
X = raw_wine.data
y = raw_wine.target

X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

sm = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', svm.SVC(kernel='linear', random_state=0))
])

sm.fit(X_tn, y_tn)

pred_svm = sm.predict(X_te)
print(pred_svm)

[0 2 1 0 1 1 0 2 1 1 2 2 0 1 2 1 0 0 1 0 1 0 0 1 1 1 1 1 1 2 0 0 1 0 0 0 2
 1 1 2 0 0 1 1 1]


In [8]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_te, pred_svm)
print(accuracy)

1.0


In [10]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_te, pred_svm)
print(conf_matrix)

[[16  0  0]
 [ 0 21  0]
 [ 0  0  8]]


In [11]:
from sklearn.metrics import classification_report
class_report = classification_report(y_te, pred_svm)
print(class_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00         8

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



In [13]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

raw_wine = datasets.load_wine()

X = raw_wine.data
y = raw_wine.target

X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)

param_grid = {'kernel': ('linear', 'rbf'),
              'C': [0.5, 1, 10, 1000]}

crova = Pipeline([
    ('scaler', StandardScaler()),
    ('kfold', StratifiedKFold(n_splits=5, shuffle=True, random_state=0)),
    ('svc', svm.SVC(random_state=0)),
])

grid_cv = GridSearchCV(crova.named_steps['svc'], param_grid, cv=crova.named_steps['kfold'], scoring='accuracy')
grid_cv.fit(X_tn, y_tn)

print(grid_cv.cv_results_)

{'mean_fit_time': array([0.06601086, 0.00046906, 0.03350306, 0.0004374 , 0.05156312,
       0.00042129, 0.05145812, 0.00067644]), 'std_fit_time': array([6.40103545e-02, 3.03860153e-05, 1.89726392e-02, 1.82165525e-05,
       3.35027688e-02, 1.60716052e-05, 3.34111549e-02, 5.90595882e-05]), 'mean_score_time': array([0.00031195, 0.00023656, 0.00020466, 0.00022864, 0.00019412,
       0.00021987, 0.00019584, 0.0002152 ]), 'std_score_time': array([8.43394864e-05, 8.75018922e-06, 2.95248851e-05, 5.01246358e-06,
       1.27212790e-05, 4.86140101e-06, 1.69961513e-05, 4.26389483e-06]), 'param_C': masked_array(data=[0.5, 0.5, 1, 1, 10, 10, 1000, 1000],
             mask=[False, False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf',
                   'linear', 'rbf'],
             mask=[False, False, False, False, False, False, False, False],
       fill_value='?',
  

In [14]:
import numpy as np
import pandas as pd

np.transpose(pd.DataFrame(grid_cv.cv_results_))

Unnamed: 0,0,1,2,3,4,5,6,7
mean_fit_time,0.066011,0.000469,0.033503,0.000437,0.051563,0.000421,0.051458,0.000676
std_fit_time,0.06401,0.00003,0.018973,0.000018,0.033503,0.000016,0.033411,0.000059
mean_score_time,0.000312,0.000237,0.000205,0.000229,0.000194,0.00022,0.000196,0.000215
std_score_time,0.000084,0.000009,0.00003,0.000005,0.000013,0.000005,0.000017,0.000004
param_C,0.5,0.5,1,1,10,10,1000,1000
param_kernel,linear,rbf,linear,rbf,linear,rbf,linear,rbf
params,"{'C': 0.5, 'kernel': 'linear'}","{'C': 0.5, 'kernel': 'rbf'}","{'C': 1, 'kernel': 'linear'}","{'C': 1, 'kernel': 'rbf'}","{'C': 10, 'kernel': 'linear'}","{'C': 10, 'kernel': 'rbf'}","{'C': 1000, 'kernel': 'linear'}","{'C': 1000, 'kernel': 'rbf'}"
split0_test_score,0.925926,0.666667,0.925926,0.666667,0.925926,0.777778,0.925926,0.962963
split1_test_score,0.962963,0.62963,0.962963,0.777778,0.962963,0.666667,0.962963,0.851852
split2_test_score,0.962963,0.62963,0.962963,0.666667,0.962963,0.740741,0.962963,0.740741


In [15]:
grid_cv.best_score_

0.9549857549857549

In [16]:
grid_cv.best_params_

{'C': 0.5, 'kernel': 'linear'}

In [17]:
clf = grid_cv.best_estimator_
print(clf)

SVC(C=0.5, kernel='linear', random_state=0)


In [18]:
from sklearn.model_selection import cross_validate

metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv_scores = cross_validate(clf, X_tn, y_tn, cv=crova.named_steps['kfold'], scoring=metrics)

cv_scores

{'fit_time': array([0.02500486, 0.18827415, 0.02175522, 0.05174708, 0.03310323]),
 'score_time': array([0.00221705, 0.00177789, 0.0014348 , 0.00138593, 0.00133681]),
 'test_accuracy': array([0.92592593, 0.96296296, 0.96296296, 1.        , 0.92307692]),
 'test_precision_macro': array([0.925     , 0.96296296, 0.96969697, 1.        , 0.93333333]),
 'test_recall_macro': array([0.925     , 0.96666667, 0.96296296, 1.        , 0.93333333]),
 'test_f1_macro': array([0.925     , 0.9628483 , 0.96451914, 1.        , 0.92592593])}

In [19]:
from sklearn.model_selection import cross_val_score
cv_score = cross_val_score(clf, X_tn, y_tn, cv=crova.named_steps['kfold'], scoring='accuracy')
print(cv_score)

print(cv_score.mean())

print(cv_score.std())

[0.92592593 0.96296296 0.96296296 1.         0.92307692]
0.9549857549857549
0.028341493140617388


In [20]:
pred_svm = clf.predict(X_te)
print(pred_svm)

[0 2 1 0 1 1 0 2 1 1 2 2 0 1 2 1 0 0 2 0 1 0 0 1 1 1 1 1 1 2 0 0 1 0 0 0 2
 1 1 2 0 0 1 1 1]


In [21]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_te, pred_svm)
print(accuracy)

0.9777777777777777


In [22]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_te, pred_svm)
print(conf_matrix)

[[16  0  0]
 [ 0 20  1]
 [ 0  0  8]]


In [23]:
from sklearn.metrics import classification_report
class_report = classification_report(y_te, pred_svm)
print(class_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.95      0.98        21
           2       0.89      1.00      0.94         8

    accuracy                           0.98        45
   macro avg       0.96      0.98      0.97        45
weighted avg       0.98      0.98      0.98        45

