In [13]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np

In [14]:
# Load the raw train/test tables from CSV files located next to the notebook.
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')

In [15]:
# Impute missing battery capacity ('배터리용량') with the TRAINING-set mean.
# The mean is computed once, before any filling, and the same value is applied
# to both frames so that train and test are imputed with an identical statistic
# (Series.mean() skips NaN by default, so this is the mean of observed values).
battery_col = '배터리용량'
battery_mean = df_train[battery_col].mean()
df_train[battery_col] = df_train[battery_col].fillna(battery_mean)
df_test[battery_col] = df_test[battery_col].fillna(battery_mean)

In [16]:
def preprocess(df, encoder=None, scaler=None, fit=True):
    """Rename, one-hot encode and scale a raw vehicle DataFrame.

    Steps:
      1. Rename the Korean column headers to English names.
      2. One-hot encode 'Vehicle_Model', 'Driving_Method', 'Warranty' and
         'Status', then drop those source columns.
      3. Map 'Accident' from 'No'/'Yes' to 1/-1 (any other value becomes NaN,
         as with any ``Series.map`` on a dict).
      4. Drop the 'ID' and 'Company' columns.
      5. Standardize 'Battery' and 'Distance'.

    The input frame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the Korean-named columns listed in the rename map
        below plus 'ID'. A price column ('가격(백만원)') is renamed if present
        and otherwise ignored.
    encoder : optional
        One-hot encoder exposing ``fit_transform`` / ``transform`` /
        ``get_feature_names_out`` and returning a dense array. When omitted, a
        fresh ``OneHotEncoder(sparse_output=False)`` is created.
    scaler : optional
        Scaler exposing ``fit_transform`` / ``transform``. When omitted, a
        fresh ``StandardScaler()`` is created.
    fit : bool, default True
        If True, the encoder and scaler are fitted on ``df`` (original
        behaviour). If False, the supplied, already-fitted ``encoder`` and
        ``scaler`` are only applied, which lets a test frame reuse the
        categories and statistics learned from the training frame.

    Returns
    -------
    pandas.DataFrame
        Remaining original columns followed by the one-hot columns, carrying
        the same index as ``df``.

    Raises
    ------
    ValueError
        If ``fit`` is False and ``encoder`` or ``scaler`` is not supplied.
    KeyError
        If an expected column is missing from ``df``.
    """
    categorical_cols = ['Vehicle_Model', 'Driving_Method', 'Warranty', 'Status']
    numeric_cols = ['Battery', 'Distance']

    if not fit and (encoder is None or scaler is None):
        raise ValueError("fit=False requires already-fitted `encoder` and `scaler`")
    if encoder is None:
        encoder = OneHotEncoder(sparse_output=False)
    if scaler is None:
        scaler = StandardScaler()

    # rename() returns a new frame, so the caller's DataFrame is left untouched.
    df = df.rename(columns={'제조사':'Company', '모델': 'Vehicle_Model','차량상태':'Status',
    '배터리용량':'Battery', '구동방식': 'Driving_Method',
    '주행거리(km)':'Distance', '보증기간(년)':'Warranty',
    '사고이력':'Accident', '연식(년)':'Year', '가격(백만원)':'Price'})

    categorical_values = df[categorical_cols]
    if fit:
        encoded_data = encoder.fit_transform(categorical_values)
    else:
        encoded_data = encoder.transform(categorical_values)

    # Reuse df's index: concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows (and introduce NaN) whenever df has
    # been filtered, shuffled or otherwise carries a non-default index.
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=df.index,
    )

    df = pd.concat(
        [df.drop(columns=categorical_cols + ['ID', 'Company']), encoded_df],
        axis=1,
    )

    mapping = {'No' : 1, 'Yes' : -1}
    df['Accident'] = df['Accident'].map(mapping)

    numeric_values = df[numeric_cols]
    if fit:
        df[numeric_cols] = scaler.fit_transform(numeric_values)
    else:
        df[numeric_cols] = scaler.transform(numeric_values)

    return df

In [17]:
# Replace the raw training frame with its preprocessed version.
# Not idempotent: preprocess() drops the categorical source columns and 'ID',
# so running this cell again on the already-processed frame raises KeyError.
df_train = preprocess(df_train)

In [18]:
# Inspect the column names produced by preprocess() for the training frame.
df_train.columns

Index(['Battery', 'Distance', 'Accident', 'Year', 'Price', 'Vehicle_Model_EV6',
       'Vehicle_Model_ID4', 'Vehicle_Model_ION5', 'Vehicle_Model_ION6',
       'Vehicle_Model_IONIQ', 'Vehicle_Model_KNE', 'Vehicle_Model_M3',
       'Vehicle_Model_MS', 'Vehicle_Model_MX', 'Vehicle_Model_MY',
       'Vehicle_Model_Niro', 'Vehicle_Model_Q4eT', 'Vehicle_Model_RSeTGT',
       'Vehicle_Model_Soul', 'Vehicle_Model_Tay', 'Vehicle_Model_TayCT',
       'Vehicle_Model_TayGTS', 'Vehicle_Model_eT', 'Vehicle_Model_i3',
       'Vehicle_Model_i5', 'Vehicle_Model_iX', 'Driving_Method_AWD',
       'Driving_Method_FWD', 'Driving_Method_RWD', 'Warranty_0', 'Warranty_1',
       'Warranty_2', 'Warranty_3', 'Warranty_4', 'Warranty_5', 'Warranty_6',
       'Warranty_7', 'Warranty_8', 'Warranty_9', 'Warranty_10',
       'Status_Brand New', 'Status_Nearly New', 'Status_Pre-Owned'],
      dtype='object')

In [19]:
# Show one randomly chosen preprocessed row as a spot check.
# No random_state is passed, so the row shown changes from run to run.
df_train.sample(1)

Unnamed: 0,Battery,Distance,Accident,Year,Price,Vehicle_Model_EV6,Vehicle_Model_ID4,Vehicle_Model_ION5,Vehicle_Model_ION6,Vehicle_Model_IONIQ,...,Warranty_4,Warranty_5,Warranty_6,Warranty_7,Warranty_8,Warranty_9,Warranty_10,Status_Brand New,Status_Nearly New,Status_Pre-Owned
1543,2.178731,-0.714669,1,0,69.64,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [20]:
# Count missing values per column after preprocessing.
df_train.isnull().sum()

Battery                 0
Distance                0
Accident                0
Year                    0
Price                   0
Vehicle_Model_EV6       0
Vehicle_Model_ID4       0
Vehicle_Model_ION5      0
Vehicle_Model_ION6      0
Vehicle_Model_IONIQ     0
Vehicle_Model_KNE       0
Vehicle_Model_M3        0
Vehicle_Model_MS        0
Vehicle_Model_MX        0
Vehicle_Model_MY        0
Vehicle_Model_Niro      0
Vehicle_Model_Q4eT      0
Vehicle_Model_RSeTGT    0
Vehicle_Model_Soul      0
Vehicle_Model_Tay       0
Vehicle_Model_TayCT     0
Vehicle_Model_TayGTS    0
Vehicle_Model_eT        0
Vehicle_Model_i3        0
Vehicle_Model_i5        0
Vehicle_Model_iX        0
Driving_Method_AWD      0
Driving_Method_FWD      0
Driving_Method_RWD      0
Warranty_0              0
Warranty_1              0
Warranty_2              0
Warranty_3              0
Warranty_4              0
Warranty_5              0
Warranty_6              0
Warranty_7              0
Warranty_8              0
Warranty_9  

In [21]:
# Apply the same preprocessing to the test frame and spot-check two random rows.
# preprocess() creates and fits a fresh OneHotEncoder and StandardScaler on
# every call, so the test frame is encoded and scaled from its own categories
# and statistics rather than those learned from the training frame.
# NOTE(review): confirm the resulting one-hot columns match the training ones.
df_test = preprocess(df_test)
df_test.sample(2)

Unnamed: 0,Battery,Distance,Accident,Year,Vehicle_Model_EV6,Vehicle_Model_ID4,Vehicle_Model_ION5,Vehicle_Model_ION6,Vehicle_Model_IONIQ,Vehicle_Model_KNE,...,Warranty_4,Warranty_5,Warranty_6,Warranty_7,Warranty_8,Warranty_9,Warranty_10,Status_Brand New,Status_Nearly New,Status_Pre-Owned
614,-1.127768,-0.357719,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
507,-0.611131,-0.081502,1,0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [37]:
# Persist the preprocessed frames next to the notebook.
# to_csv() is called without index=..., so pandas' default applies and the row
# index is written as an extra leading column with an empty header.
# NOTE(review): this cell's execution count (37) does not follow the previous
# cell (21) — confirm the notebook still reproduces under Restart & Run All.
df_train.to_csv('./train_data.csv')
df_test.to_csv('./test_data.csv')