In [41]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns

In [42]:
# Load the car-evaluation data set.
# The CSV has no header row, so header=None keeps pandas from consuming the
# first record as column labels, and names= supplies the labels explicitly.
# (index_col=0 would instead promote the first column to the DataFrame index;
# it is not used here.)
df = pd.read_csv(
    "/content/drive/MyDrive/car_evaluation.csv",
    header=None,
    names=[
        'price', 'maint', 'doors', 'persons',
        'lug_capacity', 'safety', 'output',
    ],
)
print(df)

      price  maint  doors persons lug_capacity safety output
0     vhigh  vhigh      2       2        small    low  unacc
1     vhigh  vhigh      2       2        small    med  unacc
2     vhigh  vhigh      2       2        small   high  unacc
3     vhigh  vhigh      2       2          med    low  unacc
4     vhigh  vhigh      2       2          med    med  unacc
...     ...    ...    ...     ...          ...    ...    ...
1723    low    low  5more    more          med    med   good
1724    low    low  5more    more          med   high  vgood
1725    low    low  5more    more          big    low  unacc
1726    low    low  5more    more          big    med   good
1727    low    low  5more    more          big   high  vgood

[1728 rows x 7 columns]


In [43]:
# Column labels can also be applied by passing the data through the
# pd.DataFrame constructor; dtype='object' is requested for every column.
df = pd.DataFrame(
    df,
    dtype='object',
    columns=['price', 'maint', 'doors', 'persons', 'lug_capacity', 'safety', 'output'],
)
print(df)

      price  maint  doors persons lug_capacity safety output
0     vhigh  vhigh      2       2        small    low  unacc
1     vhigh  vhigh      2       2        small    med  unacc
2     vhigh  vhigh      2       2        small   high  unacc
3     vhigh  vhigh      2       2          med    low  unacc
4     vhigh  vhigh      2       2          med    med  unacc
...     ...    ...    ...     ...          ...    ...    ...
1723    low    low  5more    more          med    med   good
1724    low    low  5more    more          med   high  vgood
1725    low    low  5more    more          big    low  unacc
1726    low    low  5more    more          big    med   good
1727    low    low  5more    more          big   high  vgood

[1728 rows x 7 columns]


In [44]:
# Missing-value check: count the null entries in each column.
null_counts = df.isnull().sum()
print(null_counts)

price           0
maint           0
doors           0
persons         0
lug_capacity    0
safety          0
output          0
dtype: int64


In [45]:
# Encoding: replace every object-typed column with integer codes.
#
# A separate LabelEncoder is fitted for each column and stored in `encoders`
# (keyed by column name). Refitting one shared encoder would overwrite the
# mapping of every column except the last, making it impossible to translate
# codes back to the original categories. With the dict, use
# encoders[col].classes_ or encoders[col].inverse_transform(...) to decode.
#
# Note: LabelEncoder assigns codes by sorted label order (e.g. 'high' < 'low'
# < 'med' < 'vhigh'), so the integers do not follow the categories' natural
# ranking.
encoder = LabelEncoder()
encoders = {}

for column in df.select_dtypes(include=['object']).columns:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    encoders[column] = encoder

# After the loop `encoder` is still the encoder fitted on the last processed
# column, exactly as before this change.
print(df)

      price  maint  doors  persons  lug_capacity  safety  output
0         3      3      0        0             2       1       2
1         3      3      0        0             2       2       2
2         3      3      0        0             2       0       2
3         3      3      0        0             1       1       2
4         3      3      0        0             1       2       2
...     ...    ...    ...      ...           ...     ...     ...
1723      1      1      3        2             1       2       1
1724      1      1      3        2             1       0       3
1725      1      1      3        2             0       1       2
1726      1      1      3        2             0       2       1
1727      1      1      3        2             0       0       3

[1728 rows x 7 columns]


In [46]:
# Inspect the labels:
# count how many times each distinct value of 'output' occurs.
df['output'].value_counts()

Unnamed: 0_level_0,count
output,Unnamed: 1_level_1
2,1210
0,384
1,69
3,65


In [47]:
# Separate the features from the target.
# A DataFrame is a labelled table (rows and columns), which is what made the
# column-wise preprocessing above convenient; a NumPy array is a plain
# n-dimensional array. Preprocessing is finished, so the data is converted to
# NumPy arrays before being handed to the estimators.
#   X: every column except 'output'
#   y: the 'output' column
X = df.drop(columns=['output']).to_numpy()
y = df['output'].to_numpy()

# Show the first five rows of each as a quick check.
print(X[:5], y[:5], sep="\n")

[[3 3 0 0 2 1]
 [3 3 0 0 2 2]
 [3 3 0 0 2 0]
 [3 3 0 0 1 1]
 [3 3 0 0 1 2]]
[2 2 2 2 2]


In [48]:
# Split into training and test data (80% train, 20% test; fixed seed so the
# split is reproducible). The split is done BEFORE scaling so that test rows
# never contribute to the scaler's statistics (avoids train/test leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize: learn the per-feature mean and standard deviation from the
# training split only, then apply that same transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The previous version of this cell left `X` standardized. Keep that for any
# later cell that reads `X`, but reuse the train-fitted scaler rather than
# refitting on all rows.
X = scaler.transform(X)

In [49]:
# Sanity-check the split by displaying the shape of each of the four arrays,
# in the order: X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))



((1382, 6), (346, 6), (1382,), (346,))

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [55]:
# Support-vector classifier: fit on the training split, predict the test
# split, then report accuracy and the confusion matrix.
clf_svm = SVC(random_state=0).fit(X_train, y_train)
pred_svm = clf_svm.predict(X_test)

print(
    "\n--- SVM Classifier ---",
    accuracy_score(y_test, pred_svm),
    confusion_matrix(y_test, pred_svm),
    sep="\n",
)


--- SVM Classifier ---
0.8988439306358381
[[ 67   5  11   0]
 [  6   4   0   1]
 [ 10   0 225   0]
 [  2   0   0  15]]


In [56]:
# Logistic regression: fit on the training split, predict the test split,
# then report accuracy and the confusion matrix.
clf_lr = LogisticRegression(random_state=0).fit(X_train, y_train)
pred_lr = clf_lr.predict(X_test)

print(
    "\n--- Logistic Regression Classifier ---",
    accuracy_score(y_test, pred_lr),
    confusion_matrix(y_test, pred_lr),
    sep="\n",
)


--- Logistic Regression Classifier ---
0.661849710982659
[[ 12   0  68   3]
 [  2   0   9   0]
 [ 17   0 217   1]
 [ 12   0   5   0]]


In [57]:
# Multi-layer perceptron: fit on the training split, predict the test split,
# then report accuracy and the confusion matrix.
clf_nn = MLPClassifier(random_state=0).fit(X_train, y_train)
pred_nn = clf_nn.predict(X_test)

print(
    "\n--- Neural Network Classifier ---",
    accuracy_score(y_test, pred_nn),
    confusion_matrix(y_test, pred_nn),
    sep="\n",
)


--- Neural Network Classifier ---
0.9739884393063584
[[ 78   0   5   0]
 [  1  10   0   0]
 [  0   0 235   0]
 [  3   0   0  14]]


In [58]:
# Decision tree: fit on the training split, predict the test split, then
# report accuracy and the confusion matrix.
clf_dt = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
pred_dt = clf_dt.predict(X_test)

print(
    "\n--- Decision Tree Classifier ---",
    accuracy_score(y_test, pred_dt),
    confusion_matrix(y_test, pred_dt),
    sep="\n",
)


--- Decision Tree Classifier ---
0.9682080924855492
[[ 76   6   1   0]
 [  1  10   0   0]
 [  0   0 235   0]
 [  1   2   0  14]]


In [59]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import warnings

# NOTE(review): this installs a blanket "ignore" filter, silencing every
# warning from here on in the session; consider narrowing it to a specific
# warning category.
warnings.filterwarnings('ignore')

# Random forest: fit on the training split, predict the test split, then
# report accuracy and the confusion matrix.
# (Header text corrected from the misspelled "Radom Forest".)
print("\n--- Random Forest ---")
rf_clf = RandomForestClassifier(random_state=0)
rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)
print(accuracy_score(y_test, pred))
print(confusion_matrix(y_test, pred))


--- Radom Forest ---
0.9739884393063584
[[ 76   7   0   0]
 [  0  10   0   1]
 [  0   0 235   0]
 [  1   0   0  16]]
