### import modules

In [41]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

### Load the data
data from [here -> archive.ics.uci.edu](https://archive.ics.uci.edu/ml/datasets/car+evaluation)

In [127]:
file_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data'
df = pd.read_csv(file_path, header = None)
df.columns = ['Buying', 'Maint', 'Doors', 'Persons', 'Lug_boot', 'Safety', 'Class']
#df = df[(df.Persons != 'more')&(df.Doors != '5more')]
df.index = range(len(df))
df.tail()

Unnamed: 0,Buying,Maint,Doors,Persons,Lug_boot,Safety,Class
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good
1727,low,low,5more,more,big,high,vgood


In [128]:
# 특징과 라벨 분리
X = df.drop('Class', axis = 1)
Y = df['Class']

In [129]:
# 학습 데이터와 평가 데이터 분리
from sklearn.model_selection import train_test_split
Train_X, Test_X, Train_Y, Test_Y = train_test_split(X, Y, random_state = 42)

In [130]:
Train_Y.value_counts()

unacc    916
acc      281
good      54
vgood     45
Name: Class, dtype: int64

In [131]:
# 문자 라벨을 숫자로 치환 -> 여기선 binary로 바꾼다.
Train_Y.replace({"unacc":-1, "acc":-1, "good":1, "vgood":1 }, inplace = True)
Test_Y.replace({"unacc":-1, "acc":-1, "good":1, "vgood":1 }, inplace = True)

In [132]:
Train_X.head() # Buying, Maint, Lug_boot, safety 변수가 범주형 변수로 판단됨

Unnamed: 0,Buying,Maint,Doors,Persons,Lug_boot,Safety
1036,med,high,4,4,small,med
757,high,low,2,2,small,med
589,high,high,3,more,med,med
907,med,vhigh,3,4,big,med
1159,med,med,4,more,big,med


In [133]:
Train_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1296 entries, 1036 to 1126
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Buying    1296 non-null   object
 1   Maint     1296 non-null   object
 2   Doors     1296 non-null   object
 3   Persons   1296 non-null   object
 4   Lug_boot  1296 non-null   object
 5   Safety    1296 non-null   object
dtypes: object(6)
memory usage: 70.9+ KB


In [134]:
# 자세한 범주형 변수 판별 => 모든 변수가 범주형임을 확인
for col in Train_X.columns:
    print(col, Train_X[col].nunique())

Buying 4
Maint 4
Doors 4
Persons 3
Lug_boot 3
Safety 3


### 더미화를 이용한 Categorical variables 처리

In [135]:
# 모든 변수가 범주이므로, 더미화를 위해 전부 string 타입으로 변환
Train_X = Train_X.astype(str) 

In [136]:
from feature_engine.categorical_encoders import OneHotCategoricalEncoder as OHE
dummy_model = OHE(variables = Train_X.columns.tolist(),
                 drop_last = True)

dummy_model.fit(Train_X)

d_Train_X = dummy_model.transform(Train_X)
d_Test_X = dummy_model.transform(Test_X)

In [137]:
print(d_Train_X.shape)
print(d_Test_X.shape)

(1296, 15)
(432, 15)


In [138]:
# 더미화를 한 뒤의 모델 테스트
from sklearn.neighbors import KNeighborsClassifier as KNN
model = KNN().fit(d_Train_X, Train_Y)
pred_Y = model.predict(d_Test_X)

from sklearn.metrics import f1_score
f1_score(Test_Y, pred_Y)

0.7999999999999999

### 연속형 변수로 치환

In [139]:
Train_df = pd.concat([Train_X, Train_Y], axis = 1)
f_Test_X = Test_X

for col in Train_X.columns: # 보통은 범주 변수만 순회
    
    # col에 따른 Class의 평균을 나타내는 사전 (replace를 쓰기 위해, 사전으로 만듦)
    temp_dict = Train_df.groupby(col)['Class'].mean().to_dict() 
    print(temp_dict)
    print()
    
    # 변수 치환  
    Train_df[col] = Train_df[col].replace(temp_dict)   
    
    # 테스트 데이터도 같이 치환해줘야 함 (나중에 활용하기 위해서는 저장도 필요)
    f_Test_X[col] = Test_X[col].astype(str).replace(temp_dict) 

{'high': -1.0, 'low': -0.6109422492401215, 'med': -0.7891566265060241, 'vhigh': -1.0}

{'high': -0.9335347432024169, 'low': -0.638095238095238, 'med': -0.8115501519756839, 'vhigh': -1.0}

{'2': -0.8731117824773413, '3': -0.8691588785046729, '4': -0.8142414860681114, '5more': -0.8317757009345794}

{'2': -1.0, '4': -0.7617977528089888, 'more': -0.7835294117647059}

{'big': -0.7679814385150812, 'med': -0.8425925925925926, 'small': -0.930715935334873}

{'high': -0.6869158878504673, 'low': -1.0, 'med': -0.8504672897196262}



In [140]:
Train_df.head()

Unnamed: 0,Buying,Maint,Doors,Persons,Lug_boot,Safety,Class
1036,-0.789157,-0.933535,-0.814241,-0.761798,-0.930716,-0.850467,-1
757,-1.0,-0.638095,-0.873112,-1.0,-0.930716,-0.850467,-1
589,-1.0,-0.933535,-0.869159,-0.783529,-0.842593,-0.850467,-1
907,-0.789157,-1.0,-0.869159,-0.761798,-0.767981,-0.850467,-1
1159,-0.789157,-0.81155,-0.814241,-0.783529,-0.767981,-0.850467,-1


In [141]:
f_Train_X = Train_df.drop('Class', axis = 1)
f_Train_Y = Train_df['Class']

In [142]:
# 치환한 뒤의 모델 테스트
model = KNN().fit(f_Train_X, f_Train_Y)
pred_Y = model.predict(Test_X)

f1_score(Test_Y, pred_Y)


# 라벨을 고려한 전처리이므로 더미화보다 좋은 결과가 나왔음 => 차원도 줄고 성능 상에 이점이 있음

0.8607594936708862

Doors 특징같은 경우에는 값간에 큰 차이가 나지 않는다 -> 연속형 대신에 더미화를 진행해본다.


In [143]:
Train_df = pd.concat([Train_X, Train_Y], axis = 1)
mix_Test_X = Test_X

In [144]:
for col in Train_X.columns: 
    if col == 'Doors' : continue
    
    # col에 따른 Class의 평균을 나타내는 사전 (replace를 쓰기 위해, 사전으로 만듦)
    temp_dict = Train_df.groupby(col)['Class'].mean().to_dict() 
    print(temp_dict)
    print()
    
    # 변수 치환  
    Train_df[col] = Train_df[col].replace(temp_dict)   
    
    # 테스트 데이터도 같이 치환해줘야 함 (나중에 활용하기 위해서는 저장도 필요)
    mix_Test_X[col] = Test_X[col].astype(str).replace(temp_dict) 

{'high': -1.0, 'low': -0.6109422492401215, 'med': -0.7891566265060241, 'vhigh': -1.0}

{'high': -0.9335347432024169, 'low': -0.638095238095238, 'med': -0.8115501519756839, 'vhigh': -1.0}

{'2': -1.0, '4': -0.7617977528089888, 'more': -0.7835294117647059}

{'big': -0.7679814385150812, 'med': -0.8425925925925926, 'small': -0.930715935334873}

{'high': -0.6869158878504673, 'low': -1.0, 'med': -0.8504672897196262}



In [145]:
Train_df.head()

Unnamed: 0,Buying,Maint,Doors,Persons,Lug_boot,Safety,Class
1036,-0.789157,-0.933535,4,-0.761798,-0.930716,-0.850467,-1
757,-1.0,-0.638095,2,-1.0,-0.930716,-0.850467,-1
589,-1.0,-0.933535,3,-0.783529,-0.842593,-0.850467,-1
907,-0.789157,-1.0,3,-0.761798,-0.767981,-0.850467,-1
1159,-0.789157,-0.81155,4,-0.783529,-0.767981,-0.850467,-1


In [146]:
mix_Train_X = Train_df.drop('Class', axis = 1)
mix_Train_Y = Train_df['Class']

In [150]:
dummy_model = OHE(variables = ['Doors'],
                 drop_last = True)

dummy_model.fit(Train_X)

mix_Train_X = dummy_model.transform(mix_Train_X)
mix_Test_X = dummy_model.transform(mix_Test_X)

In [153]:
mix_Train_X

Unnamed: 0,Buying,Maint,Persons,Lug_boot,Safety,Doors_4,Doors_2,Doors_3
1036,-0.789157,-0.933535,-0.761798,-0.930716,-0.850467,1,0,0
757,-1.000000,-0.638095,-1.000000,-0.930716,-0.850467,0,1,0
589,-1.000000,-0.933535,-0.783529,-0.842593,-0.850467,0,0,1
907,-0.789157,-1.000000,-0.761798,-0.767981,-0.850467,0,0,1
1159,-0.789157,-0.811550,-0.783529,-0.767981,-0.850467,1,0,0
...,...,...,...,...,...,...,...,...
1130,-0.789157,-0.811550,-0.783529,-0.842593,-0.686916,0,0,1
1294,-0.789157,-0.638095,-0.783529,-0.767981,-0.850467,0,0,0
860,-1.000000,-0.638095,-0.783529,-0.842593,-0.686916,0,0,0
1459,-0.610942,-0.933535,-1.000000,-0.930716,-0.850467,1,0,0


In [152]:
# 치환한 뒤의 모델 테스트
model = KNN().fit(mix_Train_X, mix_Train_Y)
pred_Y = model.predict(mix_Test_X)

f1_score(Test_Y, pred_Y)

0.7948717948717948