In [181]:
import numpy as np
import pandas as pd

# 트레이닝/테스트 데이터 분할
from sklearn.model_selection import train_test_split

# 데이터 표준화
from sklearn.preprocessing import StandardScaler

# 모델
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# 성능 평가 지표
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# confusion matrix
from sklearn.metrics import confusion_matrix

# 분류 레포트
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('c:/reposit/data/철원이형/iris.csv')
df.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


In [3]:
df.shape

(150, 5)

In [4]:
df['class'].unique()

array([0, 1, 2], dtype=int64)

In [5]:
features = df.columns[:-1]

# 피처
X = df[features] # 행렬

# 타겟
y = df['class'] # 벡터

In [6]:
X, y

(     sepal_length  sepal_width  petal_length  petal_width
 0             5.1          3.5           1.4          0.2
 1             4.9          3.0           1.4          0.2
 2             4.7          3.2           1.3          0.2
 3             4.6          3.1           1.5          0.2
 4             5.0          3.6           1.4          0.2
 ..            ...          ...           ...          ...
 145           6.7          3.0           5.2          2.3
 146           6.3          2.5           5.0          1.9
 147           6.5          3.0           5.2          2.0
 148           6.2          3.4           5.4          2.3
 149           5.9          3.0           5.1          1.8
 
 [150 rows x 4 columns],
 0      0
 1      0
 2      0
 3      0
 4      0
       ..
 145    2
 146    2
 147    2
 148    2
 149    2
 Name: class, Length: 150, dtype: int64)

In [7]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=808)
X_tr

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
60,5.0,2.0,3.5,1.0
36,5.5,3.5,1.3,0.2
70,5.9,3.2,4.8,1.8
35,5.0,3.2,1.2,0.2
113,5.7,2.5,5.0,2.0
...,...,...,...,...
142,5.8,2.7,5.1,1.9
147,6.5,3.0,5.2,2.0
84,5.4,3.0,4.5,1.5
121,5.6,2.8,4.9,2.0


In [8]:
# 데이터 standardization
std_scale = StandardScaler()
std_scale.fit(X_tr)
X_tr_std = std_scale.transform(X_tr)
X_te_std = std_scale.transform(X_te)

X_tr, X_te

(     sepal_length  sepal_width  petal_length  petal_width
 60            5.0          2.0           3.5          1.0
 36            5.5          3.5           1.3          0.2
 70            5.9          3.2           4.8          1.8
 35            5.0          3.2           1.2          0.2
 113           5.7          2.5           5.0          2.0
 ..            ...          ...           ...          ...
 142           5.8          2.7           5.1          1.9
 147           6.5          3.0           5.2          2.0
 84            5.4          3.0           4.5          1.5
 121           5.6          2.8           4.9          2.0
 38            4.4          3.0           1.3          0.2
 
 [112 rows x 4 columns],
      sepal_length  sepal_width  petal_length  petal_width
 6             4.6          3.4           1.4          0.3
 32            5.2          4.1           1.5          0.1
 136           6.3          3.4           5.6          2.4
 47            4.6          3

In [10]:
# 학습
clf_knn = KNeighborsClassifier(n_neighbors=2)
clf_knn.fit(X_tr_std, y_tr)

In [11]:
# 예측
knn_pred = clf_knn.predict(X_te_std)
knn_pred

array([0, 0, 2, 0, 1, 2, 0, 0, 1, 2, 2, 1, 2, 1, 0, 0, 1, 1, 2, 0, 1, 0,
       2, 1, 2, 2, 2, 2, 0, 0, 1, 1, 1, 1, 1, 2, 1, 0], dtype=int64)

In [13]:
# 정확도
accuracy = accuracy_score(y_te, knn_pred)
print('정확도: ', accuracy)

# 정밀도
precision = precision_score(y_te, knn_pred, average='macro')
print('정밀도: ', precision)

# 리콜
recall = recall_score(y_te, knn_pred, average='macro')
print('리콜: ', recall)

# F1 스코어
f1 = f1_score(y_te, knn_pred, average='macro')
print('f1 스코어:', f1)

정확도:  0.9210526315789473
정밀도:  0.9246031746031745
리콜:  0.923076923076923
f1 스코어: 0.9229629629629629


In [15]:
# confusion matrix 확인
conf_matrix = confusion_matrix(y_te, knn_pred)
print(conf_matrix)

[[12  0  0]
 [ 0 12  1]
 [ 0  2 11]]


In [18]:
# 분류 레포트 확인
class_report = classification_report(y_te, knn_pred)
print(class_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.86      0.92      0.89        13
           2       0.92      0.85      0.88        13

    accuracy                           0.92        38
   macro avg       0.92      0.92      0.92        38
weighted avg       0.92      0.92      0.92        38



## 선형 회귀 분석

In [129]:
df = pd.read_csv('c:/reposit/data/철원이형/house_prices.csv')
df.head(3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7


In [130]:
df.shape

(506, 14)

In [131]:
features = df.columns[:-1]

X = df[features]
y = df['MEDV']

In [132]:
# 트레이닝 / 테스트 데이터 분할
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

In [133]:
# 데이터 standardization
std_scale = StandardScaler()
std_scale.fit(X_tr)
X_tr_std = std_scale.transform(X_tr)
X_te_std = std_scale.transform(X_te)

In [134]:
# 선형 회귀 분석 학습
clf_lr = LinearRegression()
clf_lr.fit(X_tr_std, y_tr)

In [135]:
# 선형 회귀 분석 모형 추정 계수 확인
print(clf_lr.coef_) # 계수, 가중치 w
print(clf_lr.intercept_) # 상수, 바이어스 b

[-0.97100092  1.04667838 -0.04044753  0.59408776 -1.80876877  2.60991991
 -0.19823317 -3.00216551  2.08021582 -1.93289037 -2.15743759  0.75199122
 -3.59027047]
22.608707124010557


In [136]:
# 릿지 회귀 분석(L2 제약식 적용)
clf_ridge = Ridge(alpha=1)
clf_ridge.fit(X_tr_std, y_tr)

In [137]:
# 라쏘 회귀 분석(L1 제약식 적용)
clf_lasso = Lasso(alpha=.01)
clf_lasso.fit(X_tr_std, y_tr)

In [138]:
# 엘라스틱넷
clf_elastic = ElasticNet(alpha=.01, l1_ratio=.01)
clf_elastic.fit(X_tr_std, y_tr)

In [139]:
# 예측
pred_lr = clf_lr.predict(X_te_std)
pred_ridge = clf_ridge.predict(X_te_std)
pred_lasso = clf_lasso.predict(X_te_std)
pred_elastic = clf_elastic.predict(X_te_std)

In [140]:
# 모형 평가 - R제곱값
print(r2_score(y_te, pred_lr))
print(r2_score(y_te, pred_ridge))
print(r2_score(y_te, pred_lasso))
print(r2_score(y_te, pred_elastic))

0.6354638433202132
0.6345884564889056
0.6343061000666705
0.6322273400977831


In [141]:
# 모형 평가 - MSE
print(mean_squared_error(y_te, pred_lr))
print(mean_squared_error(y_te, pred_ridge))
print(mean_squared_error(y_te, pred_lasso))
print(mean_squared_error(y_te, pred_elastic))

29.78224509230234
29.85376333454759
29.876831576246804
30.046664219036884


In [142]:
# 릿지 회귀 분석(L2 제약식 적용)
clf_ridge = Ridge(alpha=0.01, solver='auto', random_state=0)
clf_ridge.fit(X_tr_std, y_tr)

pred_ridge = clf_ridge.predict(X_te_std)

print(mean_squared_error(y_te, pred_ridge))

29.7829643001264


## 로지스틱 회귀 분석

In [143]:
df = pd.read_csv('c:/reposit/data/철원이형/wine_data.csv')
df.head(3)

Unnamed: 0,Alcohol,Malic,Ash,Alcalinity,Magesium,Phenols,Flavanoids,Nonflavanoids,Proanthocyanins,Color,Hue,Dilution,Proline,class
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,0
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,0
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,0


In [144]:
df.shape

(178, 14)

In [145]:
df.columns

Index(['Alcohol', 'Malic', 'Ash', 'Alcalinity', 'Magesium', 'Phenols',
       'Flavanoids', 'Nonflavanoids', 'Proanthocyanins', 'Color', 'Hue',
       'Dilution', 'Proline', 'class'],
      dtype='object')

In [146]:
features = df.columns[:-1]

X = df[features]
y = df['class']

In [147]:
# 트레이닝 / 테스트 데이터 분할
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

In [148]:
# 데이터 표준화
std_scale = StandardScaler()
std_scale.fit(X_tr)
X_tr_std = std_scale.transform(X_tr)
X_te_std = std_scale.transform(X_te)

In [161]:
# 로지스틱 회귀 분석(L2 제약식 적용)
clf_logi_l2 = LogisticRegression(penalty='l2', max_iter=10000)
clf_logi_l2.fit(X_tr, y_tr)

In [162]:
# 로지스틱 회귀분석 모형(L2 제약식 적용) 추정 계수
print(clf_logi_l2.coef_)
print(clf_logi_l2.intercept_)

[[ 0.4606876   0.46997885  0.59341787 -0.19144331 -0.02013054  0.19082452
   0.76664885  0.08896556  0.07084868  0.25261258 -0.00793739  0.5955938
   0.00947839]
 [-0.52143325 -0.68457314 -0.69603822  0.12500067 -0.0214034   0.10789957
   0.30640058 -0.04651889  0.51938374 -1.1823636   0.23276953  0.10446698
  -0.00716379]
 [ 0.06074565  0.21459429  0.10262035  0.06644264  0.04153393 -0.29872409
  -1.07304943 -0.04244667 -0.59023241  0.92975102 -0.22483214 -0.70006079
  -0.0023146 ]]
[-14.34628528  18.69165316  -4.34536788]


In [163]:
# 예측
pred_logistic = clf_logi_l2.predict(X_te_std)
pred_logistic



array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1], dtype=int64)

In [164]:
# 확률값으로 예측
pred_proba = clf_logi_l2.predict_proba(X_te_std)
print(pred_proba)

[[1.37082023e-14 1.00000000e+00 1.62751714e-11]
 [1.11456345e-13 9.99998551e-01 1.44932992e-06]
 [1.85232845e-16 1.00000000e+00 1.50210441e-13]
 [1.23576480e-14 1.00000000e+00 7.88073572e-12]
 [1.70492010e-14 1.00000000e+00 9.56748706e-12]
 [2.39961693e-13 1.00000000e+00 8.62821966e-13]
 [1.42459098e-13 1.00000000e+00 7.36611881e-11]
 [3.14964925e-14 9.99999859e-01 1.41062680e-07]
 [1.38227150e-16 1.00000000e+00 2.03820979e-12]
 [8.32596818e-17 1.00000000e+00 7.37932305e-12]
 [1.10400538e-14 9.99999969e-01 3.13390242e-08]
 [4.50424787e-14 9.99999156e-01 8.43947725e-07]
 [1.12083946e-13 1.00000000e+00 1.88012515e-11]
 [2.17395047e-16 1.00000000e+00 6.93574811e-14]
 [1.17550834e-13 9.99999690e-01 3.10101546e-07]
 [1.24805699e-17 1.00000000e+00 9.02504869e-14]
 [6.07628010e-14 1.00000000e+00 4.07873824e-11]
 [3.46887730e-13 1.00000000e+00 3.90528224e-11]
 [3.25732529e-16 9.99999999e-01 9.91155477e-10]
 [3.23155601e-14 1.00000000e+00 4.25643016e-12]
 [1.14208874e-16 1.00000000e+00 6.691241



In [165]:
# 정확도 accuracy
accuracy = accuracy_score(y_te, pred_logistic)
accuracy

0.4666666666666667

In [166]:
# 정밀도
precision = precision_score(y_te, pred_logistic, average='macro')
precision

  _warn_prf(average, modifier, msg_start, len(result))


0.15555555555555556

In [167]:
# 리콜
recall = recall_score(y_te, pred_logistic, average='macro')
recall

0.3333333333333333

In [168]:
# F1 스코어
f1 = f1_score(y_te, pred_logistic, average='macro')
f1

0.21212121212121213

In [169]:
conf_matrix = confusion_matrix(y_te, pred_logistic)
print(conf_matrix)

[[ 0 16  0]
 [ 0 21  0]
 [ 0  8  0]]


In [170]:
# 분류 레포트 확인
class_report = classification_report(y_te, pred_logistic)
print(class_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.47      1.00      0.64        21
           2       0.00      0.00      0.00         8

    accuracy                           0.47        45
   macro avg       0.16      0.33      0.21        45
weighted avg       0.22      0.47      0.30        45



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 나이브 베이즈 실습

In [171]:
df = pd.read_csv('c:/reposit/data/철원이형/wine_data.csv')
df.head(3)

Unnamed: 0,Alcohol,Malic,Ash,Alcalinity,Magesium,Phenols,Flavanoids,Nonflavanoids,Proanthocyanins,Color,Hue,Dilution,Proline,class
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,0
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,0
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,0


In [173]:
features = df.columns[:-1]
features

Index(['Alcohol', 'Malic', 'Ash', 'Alcalinity', 'Magesium', 'Phenols',
       'Flavanoids', 'Nonflavanoids', 'Proanthocyanins', 'Color', 'Hue',
       'Dilution', 'Proline'],
      dtype='object')

In [174]:
X = df[features]
y = df['class']

In [179]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0, stratify=y)

std_scale = StandardScaler()
std_scale.fit(X_tr)
X_tr_std = std_scale.transform(X_tr)
X_te_std = std_scale.transform(X_te)

clf_gnb = GaussianNB()
clf_gnb.fit(X_tr_std, y_tr)

pred_gnb = clf_gnb.predict(X_te_std)
print(pred_gnb)

conf_matrix = confusion_matrix(y_te, pred_gnb)
print(conf_matrix)

class_report = classification_report(y_te, pred_gnb)
print(class_report)

[2 0 1 1 2 2 2 2 0 2 0 0 1 2 0 0 0 1 2 0 0 2 2 2 1 0 1 0 1 0 1 1 1 2 1 0 1
 1 2 1 1 1 0 1 1]
[[14  1  0]
 [ 0 17  1]
 [ 0  0 12]]
              precision    recall  f1-score   support

           0       1.00      0.93      0.97        15
           1       0.94      0.94      0.94        18
           2       0.92      1.00      0.96        12

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45



## 의사결정나무

In [182]:
df = pd.read_csv('c:/reposit/data/철원이형/wine_data.csv')

features = df.columns[:-1]

X = df[features]
y = df['class']

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0, stratify=y)

std_scale = StandardScaler()
std_scale.fit(X_tr)
X_tr_std = std_scale.transform(X_tr)
X_te_std = std_scale.transform(X_te)

clf_tree = DecisionTreeClassifier()
clf_tree.fit(X_tr_std, y_tr)

pred_tree = clf_tree.predict(X_te_std)
print(pred_tree)

conf_matrix = confusion_matrix(y_te, pred_tree)
print(conf_matrix)

class_report = classification_report(y_te, pred_tree)
print(class_report)

[2 1 1 1 2 2 2 2 0 2 0 0 1 0 0 0 0 1 2 0 0 2 2 2 1 1 1 0 1 1 1 1 1 1 1 0 1
 1 2 1 1 1 0 1 1]
[[11  4  0]
 [ 0 18  0]
 [ 1  0 11]]
              precision    recall  f1-score   support

           0       0.92      0.73      0.81        15
           1       0.82      1.00      0.90        18
           2       1.00      0.92      0.96        12

    accuracy                           0.89        45
   macro avg       0.91      0.88      0.89        45
weighted avg       0.90      0.89      0.89        45



In [185]:
df = pd.read_csv('c:/reposit/data/철원이형/wine_data.csv')

features = df.columns[:-1]

X = df[features]
y = df['class']

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0, stratify=y)

std_scale = StandardScaler()
std_scale.fit(X_tr)
X_tr_std = std_scale.transform(X_tr)
X_te_std = std_scale.transform(X_te)

clf_tree = DecisionTreeClassifier(splitter='random')
clf_tree.fit(X_tr_std, y_tr)

pred_tree = clf_tree.predict(X_te_std)
print(pred_tree)

conf_matrix = confusion_matrix(y_te, pred_tree)
print(conf_matrix)

class_report = classification_report(y_te, pred_tree)
print(class_report)

[2 1 1 1 2 2 2 2 0 2 0 0 1 1 0 0 0 1 1 0 0 2 2 2 1 0 1 0 1 1 0 1 1 2 1 0 1
 1 2 1 1 1 0 1 1]
[[12  3  0]
 [ 1 16  1]
 [ 0  2 10]]
              precision    recall  f1-score   support

           0       0.92      0.80      0.86        15
           1       0.76      0.89      0.82        18
           2       0.91      0.83      0.87        12

    accuracy                           0.84        45
   macro avg       0.86      0.84      0.85        45
weighted avg       0.85      0.84      0.85        45



In [193]:
df = pd.read_csv('c:/reposit/data/철원이형/wine_data.csv')

features = df.columns[:-1]

X = df[features]
y = df['class']

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0, stratify=y)

std_scale = StandardScaler()
std_scale.fit(X_tr)
X_tr_std = std_scale.transform(X_tr)
X_te_std = std_scale.transform(X_te)

clf_tree = DecisionTreeClassifier(criterion='log_loss', splitter='random')
clf_tree.fit(X_tr_std, y_tr)

pred_tree = clf_tree.predict(X_te_std)
print(pred_tree)

conf_matrix = confusion_matrix(y_te, pred_tree)
print(conf_matrix)

class_report = classification_report(y_te, pred_tree)
print(class_report)

[2 0 1 0 2 2 2 2 0 2 0 0 1 2 0 0 0 1 2 0 0 2 2 2 1 0 1 0 1 0 1 1 1 2 1 0 1
 1 2 1 1 1 0 1 1]
[[15  0  0]
 [ 0 17  1]
 [ 0  0 12]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        12

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

