In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder

```
Car_ID			Unique id of each observation (Interger)		
Symboling 			Its assigned insurance risk rating, A value of +3 indicates that the auto is risky, -3 that it is probably pretty safe.(Categorical) 		
carCompany			Name of car company (Categorical)		
fueltype			Car fuel type i.e gas or diesel (Categorical)		
aspiration			Aspiration used in a car (Categorical)		
doornumber			Number of doors in a car (Categorical)		
carbody			body of car (Categorical)		
drivewheel			type of drive wheel (Categorical)		
enginelocation			Location of car engine (Categorical)		
wheelbase			Weelbase of car (Numeric)		
carlength			Length of car (Numeric)		
carwidth			Width of car (Numeric)		
carheight			height of car (Numeric)		
curbweight			The weight of a car without occupants or baggage. (Numeric)		
enginetype			Type of engine. (Categorical)		
cylindernumber			cylinder placed in the car (Categorical)		
enginesize			Size of car (Numeric)		
fuelsystem			Fuel system of car (Categorical)		
boreratio			Boreratio of car (Numeric)		
stroke			Stroke or volume inside the engine (Numeric)		
compressionratio			compression ratio of car (Numeric)		
horsepower			Horsepower (Numeric)		
peakrpm			car peak rpm (Numeric)		
citympg			Mileage in city (Numeric)		
highwaympg			Mileage on highway (Numeric)		
price(Dependent variable)			Price of car (Numeric)		

```

In [2]:
raw_car_data = pd.read_csv('../data/CarPrice_Assignment.csv')

# CarName에서 브랜드와 차종 분리
car_data = raw_car_data.copy()
car_data['car_brand'] = car_data['CarName'].apply(lambda x: x.split(' ')[0].lower())
car_data['car_model'] = car_data['CarName'].apply(lambda x: ' '.join(x.split(' ')[1:]).lower())

# car_data.drop(columns=['car_ID', 'CarName', 'drivewheel', 'enginelocation', 'wheelbase',
#        'carlength', 'carwidth', 'carheight', 'curbweight', 'enginetype',
#        'cylindernumber', 'enginesize', 'fuelsystem', 'boreratio', 'stroke',
#        'compressionratio'], inplace=True)
car_data

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,car_brand,car_model
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,3.47,2.68,9.0,111,5000,21,27,13495.0,alfa-romero,giulia
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,3.47,2.68,9.0,111,5000,21,27,16500.0,alfa-romero,stelvio
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,2.68,3.47,9.0,154,5000,19,26,16500.0,alfa-romero,quadrifoglio
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,3.19,3.40,10.0,102,5500,24,30,13950.0,audi,100 ls
4,5,2,audi 100 ls,gas,std,four,sedan,4wd,front,99.4,...,3.19,3.40,8.0,115,5500,18,22,17450.0,audi,100 ls
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,-1,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,...,3.78,3.15,9.5,114,5400,23,28,16845.0,volvo,145e (sw)
201,202,-1,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,...,3.78,3.15,8.7,160,5300,19,25,19045.0,volvo,144ea
202,203,-1,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,...,3.58,2.87,8.8,134,5500,18,23,21485.0,volvo,244dl
203,204,-1,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,...,3.01,3.40,23.0,106,4800,26,27,22470.0,volvo,246


In [3]:
X = car_data.drop(columns=['price'])
y = car_data['price']

# Label Encoding: object 타입 컬럼만 인코딩
LE = LabelEncoder()
for col in X.select_dtypes(include=['object']).columns:
    X[col] = LE.fit_transform(X[col])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=5000, random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 3401227.493705848
Mean Absolute Error: 1291.5747609756097
R^2 Score: 0.9569159582219573


In [4]:
model =  DecisionTreeRegressor(max_depth=7, random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 8696847.601051522
Mean Absolute Error: 1869.026821457038
R^2 Score: 0.8898352591602978


In [5]:
y_test

15     30760.000
9      17859.167
100     9549.000
132    11850.000
68     28248.000
95      7799.000
159     7788.000
162     9258.000
147    10198.000
182     7775.000
191    13295.000
164     8238.000
65     18280.000
175     9988.000
73     40960.000
152     6488.000
18      5151.000
82     12629.000
86      8189.000
143     9960.000
60      8495.000
101    13499.000
98      8249.000
30      6479.000
25      6692.000
16     41315.000
168     9639.000
195    13415.000
97      7999.000
194    12940.000
67     25552.000
120     6229.000
154     7898.000
202    21485.000
79      7689.000
69     28176.000
145    11259.000
55     10945.000
45      8916.500
84     14489.000
146     7463.000
Name: price, dtype: float64

In [6]:
y_pred

array([36880.        , 17849.6       ,  9117.        , 12007.5       ,
       35056.        ,  6111.66666667,  7802.        ,  8160.88888889,
        9117.        ,  7802.        , 13645.        ,  8160.88888889,
       12007.5       , 10260.52941176, 45400.        ,  6111.66666667,
        5456.        , 12864.        ,  9117.        ,  9117.        ,
       10260.52941176, 15537.55555556,  6111.66666667,  5456.        ,
        7131.73333333, 36880.        , 10260.52941176, 15537.55555556,
        7131.73333333, 15537.55555556, 35056.        ,  6111.66666667,
        9117.        , 17866.66666667,  8160.88888889, 35056.        ,
        9117.        , 12007.5       ,  8916.5       , 14869.        ,
        9117.        ])