# Dataset do Kaggle - Car Price Prediction

In [1]:
## para tratar os dados

import pandas as pd
import numpy as np

## prepocessamento

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer  # pipeline com colunas de tipos diferentes
from sklearn.impute import SimpleImputer # missing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler  # escala das features
from category_encoders import TargetEncoder, OneHotEncoder  # tratamento de categóricas
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_regression  # seleção de features

## modelagem

import lightgbm as lgb

## configoracoes gerais

import warnings

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('car_price.csv')

df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


## Verificações Padrão do Dataset

In [3]:
df.shape

(205, 26)

In [4]:
df.describe()

Unnamed: 0,car_ID,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,103.0,0.834146,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,3.329756,3.255415,10.142537,104.117073,5125.121951,25.219512,30.75122,13276.710571
std,59.322565,1.245307,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,0.270844,0.313597,3.97204,39.544167,476.985643,6.542142,6.886443,7988.852332
min,1.0,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,2.54,2.07,7.0,48.0,4150.0,13.0,16.0,5118.0
25%,52.0,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,3.15,3.11,8.6,70.0,4800.0,19.0,25.0,7788.0
50%,103.0,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,3.31,3.29,9.0,95.0,5200.0,24.0,30.0,10295.0
75%,154.0,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,3.58,3.41,9.4,116.0,5500.0,30.0,34.0,16503.0
max,205.0,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,3.94,4.17,23.0,288.0,6600.0,49.0,54.0,45400.0


In [5]:
df.info

<bound method DataFrame.info of      car_ID  symboling                   CarName fueltype aspiration  \
0         1          3        alfa-romero giulia      gas        std   
1         2          3       alfa-romero stelvio      gas        std   
2         3          1  alfa-romero Quadrifoglio      gas        std   
3         4          2               audi 100 ls      gas        std   
4         5          2                audi 100ls      gas        std   
..      ...        ...                       ...      ...        ...   
200     201         -1           volvo 145e (sw)      gas        std   
201     202         -1               volvo 144ea      gas      turbo   
202     203         -1               volvo 244dl      gas        std   
203     204         -1                 volvo 246   diesel      turbo   
204     205         -1               volvo 264gl      gas      turbo   

    doornumber      carbody drivewheel enginelocation  wheelbase  ...  \
0          two  convertible   

In [6]:
df.nunique().sort_values()

fueltype              2
aspiration            2
doornumber            2
enginelocation        2
drivewheel            3
carbody               5
symboling             6
enginetype            7
cylindernumber        7
fuelsystem            8
peakrpm              23
citympg              29
highwaympg           30
compressionratio     32
stroke               37
boreratio            38
carwidth             44
enginesize           44
carheight            49
wheelbase            53
horsepower           59
carlength            75
CarName             147
curbweight          171
price               189
car_ID              205
dtype: int64

In [7]:
df.isna().mean()

car_ID              0.0
symboling           0.0
CarName             0.0
fueltype            0.0
aspiration          0.0
doornumber          0.0
carbody             0.0
drivewheel          0.0
enginelocation      0.0
wheelbase           0.0
carlength           0.0
carwidth            0.0
carheight           0.0
curbweight          0.0
enginetype          0.0
cylindernumber      0.0
enginesize          0.0
fuelsystem          0.0
boreratio           0.0
stroke              0.0
compressionratio    0.0
horsepower          0.0
peakrpm             0.0
citympg             0.0
highwaympg          0.0
price               0.0
dtype: float64

In [8]:
## separando as colunas de forma automática entre numéricas e categóricas

numerical = df.select_dtypes(include='number').columns.to_list()

categorical = df.select_dtypes(exclude='number').columns.to_list()

In [9]:
numerical

['car_ID',
 'symboling',
 'wheelbase',
 'carlength',
 'carwidth',
 'carheight',
 'curbweight',
 'enginesize',
 'boreratio',
 'stroke',
 'compressionratio',
 'horsepower',
 'peakrpm',
 'citympg',
 'highwaympg',
 'price']

In [10]:
categorical

['CarName',
 'fueltype',
 'aspiration',
 'doornumber',
 'carbody',
 'drivewheel',
 'enginelocation',
 'enginetype',
 'cylindernumber',
 'fuelsystem']

In [11]:
## retirando o nosso target das features numéricas

numerical = [feature for feature in numerical if feature !='price']

target = 'price'

In [12]:
numerical = ['wheelbase',
 'carlength',
 'carwidth',
 'carheight',
 'curbweight',
 'enginesize',
 'boreratio',
 'stroke',
 'compressionratio',
 'horsepower',
 'peakrpm',
 'citympg',
 'highwaympg']

In [13]:
X  = df[numerical + categorical]

y = df[target]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (143, 23)
X_test shape: (62, 23)
y_train shape: (143,)
y_test shape: (62,)


In [15]:
lgb_model = lgb.LGBMRegressor()

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[('encoder', OneHotEncoder())])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical),
    ('cat', categorical_transformer, categorical)
    ]
)

# SelectKBest seleciona as features fazendo testes estatísticos
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func = f_regression, k=20)),
    ('model', lgb_model)   
])

# treina o modelo
pipeline.fit(X_train, y_train)

In [16]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse:.4f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')
print(f'Mean Absolute Error (MAE): {mae:.4f}')
print(f'R-Squared (R2): {r2:.4f}')

Mean Squared Error (MSE): 10563579.6962
Root Mean Squared Error (RMSE): 3250.1661
Mean Absolute Error (MAE): 2141.9980
R-Squared (R2): 0.8475


## Discretização - Assunto no PDF

In [18]:
from feature_engine.discretisation import EqualWidthDiscretiser, EqualFrequencyDiscretiser, DecisionTreeDiscretiser

In [19]:
ewd = EqualWidthDiscretiser()
ewd.fit(df[['carheight']])

In [20]:
df['carheight_ewd'] = ewd.transform(df[['carheight']])

df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,carheight_ewd
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0,0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0,0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0,3
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0,5
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0,5


In [23]:
df.groupby('carheight_ewd').agg({'carheight': ['mean', 'median', 'min', 'max', 'size']})

Unnamed: 0_level_0,carheight,carheight,carheight,carheight,carheight
Unnamed: 0_level_1,mean,median,min,max,size
carheight_ewd,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,48.466667,48.8,47.8,48.8,3
1,49.588889,49.6,49.4,49.7,9
2,50.673333,50.8,50.2,51.4,30
3,51.96087,52.0,51.6,52.5,23
4,53.075758,53.0,52.6,53.7,33
5,54.404878,54.4,53.9,54.9,41
6,55.636842,55.7,55.1,56.1,38
7,56.52,56.7,56.2,56.7,15
8,57.7,57.5,57.5,58.3,4
9,59.077778,59.1,58.7,59.8,9


In [25]:
efd = EqualFrequencyDiscretiser()
efd.fit(df[['carheight']])

df['carheight_efd'] = efd.transform(df[['carheight']])

df.groupby('carheight_efd').agg({'carheight': ['mean', 'median', 'min', 'max', 'size']})

Unnamed: 0_level_0,carheight,carheight,carheight,carheight,carheight
Unnamed: 0_level_1,mean,median,min,max,size
carheight_efd,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,49.876,50.2,47.8,50.6,25
1,50.882353,50.8,50.8,51.4,17
2,51.88,52.0,51.6,52.4,20
3,52.75,52.8,52.5,53.0,22
4,53.742308,53.7,53.1,54.1,26
5,54.41,54.45,54.3,54.5,20
6,54.935714,54.9,54.7,55.1,14
7,55.5875,55.65,55.2,55.7,24
8,56.175,56.1,55.9,56.5,16
9,57.909524,57.5,56.7,59.8,21


In [28]:
td = DecisionTreeDiscretiser()
td.fit(df[['carheight']], df[['price']])

df['carheight_td'] = td.transform(df[['carheight']])

df.groupby('carheight_td').agg({'carheight': ['mean', 'median', 'min', 'max', 'size']})

Unnamed: 0_level_0,carheight,carheight,carheight,carheight,carheight
Unnamed: 0_level_1,mean,median,min,max,size
carheight_td,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
11859.473566,52.550345,52.8,47.8,55.2,145
16701.7,56.563333,56.1,55.4,59.8,60
