In [37]:
from pathlib import Path

import pandas as pd
import plotly.graph_objects as go

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# Import the dataset
# Prefer a cars.csv sitting in the current working directory so the notebook
# is not tied to one machine; fall back to the original absolute location
# when no such file is present.
local_cars_csv = Path('cars.csv')
fallback_cars_csv = Path('/home/jean/GitHub/predict-price-car/cars.csv')
cars_file_path = local_cars_csv if local_cars_csv.is_file() else fallback_cars_csv
cars_data = pd.read_csv(cars_file_path)

cars_data.columns

Index(['car_ID', 'symboling', 'CarName', 'fueltype', 'aspiration',
       'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'wheelbase',
       'carlength', 'carwidth', 'carheight', 'curbweight', 'enginetype',
       'cylindernumber', 'enginesize', 'fuelsystem', 'boreratio', 'stroke',
       'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
       'price'],
      dtype='object')

In [38]:
# Choose Target / Predictions
# `price` is the value to predict; every other column stays in the feature frame.
TARGET_COLUMN = 'price'
y = cars_data[TARGET_COLUMN]
X = cars_data.drop(columns=[TARGET_COLUMN])

In [39]:
# Divide Data

# Training / Validation: 80% / 20% split with a fixed seed.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Numerical columns, listed by hand.
numerical_columns = [
    'wheelbase',
    'carlength',
    'carwidth',
    'carheight',
    'curbweight',
    'enginesize',
    'boreratio',
    'stroke',
    'compressionratio',
    'horsepower',
    'peakrpm',
    'citympg',
    'highwaympg',
]

# Alternative: select the numerical columns automatically by dtype.
# numerical_columns = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Categorical columns, listed by hand.
categorical_columns = [
    'fueltype',
    'aspiration',
    'doornumber',
    'carbody',
    'drivewheel',
    'enginelocation',
    'enginetype',
    'cylindernumber',
    'fuelsystem',
]

# Alternative: select low-cardinality object columns automatically.
# categorical_columns = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]

# Keep only the selected columns (categoricals first, then numericals).
selected_columns = [*categorical_columns, *numerical_columns]
X_train = X_train_full.loc[:, selected_columns].copy()
X_valid = X_valid_full.loc[:, selected_columns].copy()

X_train.describe()

Unnamed: 0,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
count,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0
mean,98.906098,174.534146,65.920122,53.795122,2575.969512,127.304878,3.335366,3.257835,10.344634,104.158537,5114.02439,25.140244,30.536585
std,6.181277,12.129154,2.082595,2.495463,522.542938,41.171254,0.272641,0.303516,4.297356,38.007093,486.837446,6.433127,6.851976
min,86.6,144.6,61.8,47.8,1713.0,70.0,2.54,2.07,7.0,52.0,4150.0,13.0,16.0
25%,94.5,167.3,64.075,51.9,2190.75,98.0,3.15,3.1175,8.5,70.0,4800.0,19.0,25.0
50%,97.0,173.2,65.45,54.1,2417.0,120.0,3.33,3.28,9.0,96.0,5100.0,24.0,30.0
75%,102.4,184.6,66.9,55.7,2957.75,141.0,3.59,3.41,9.4,117.0,5500.0,30.0,34.0
max,120.9,208.1,71.7,59.8,4066.0,326.0,3.8,4.17,23.0,262.0,6600.0,49.0,54.0


In [40]:
# Preview the first five rows of the feature frame.

X.head(5)

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22


In [41]:
# Create Preprocessing

# Numerical features: constant-value imputation only.
numerical_transformer = SimpleImputer(strategy='constant')

# Alternative for categorical features - One-Hot Encoding (kept for reference):
# categorical_transformer_ohe = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant')),
#     ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
# ])

# Categorical features - Ordinal Encoding: constant-value imputation followed
# by ordinal codes; categories not seen during fit are encoded as -1.
categorical_imputer = SimpleImputer(strategy='constant')
ordinal_encoder = OrdinalEncoder(
    categories='auto',
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
categorical_transformer_ordinal = Pipeline(steps=[
    ('imputer', categorical_imputer),
    ('ordinalencoder', ordinal_encoder),
])

# Route each group of columns to its transformer.
column_transformers = [
    ('numericals', numerical_transformer, numerical_columns),
    ('categoricals', categorical_transformer_ordinal, categorical_columns),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [42]:
def get_score_random_forest(n_estimators):
    """Fit a random-forest price model and return its validation MAE.

    A pipeline is built from a fresh copy of the module-level ``preprocessor``
    and a ``RandomForestRegressor(random_state=0)``. It is fitted on
    ``X_train`` / ``y_train`` and evaluated on ``X_valid`` / ``y_valid``.

    Parameters
    ----------
    n_estimators : int
        Number of trees in the forest.

    Returns
    -------
    float
        Mean absolute error between ``y_valid`` and the predicted prices
        (lower is better).
    """
    # Imported locally because this helper is the only code that needs it.
    from sklearn.base import clone

    # Create the model with Random Forest Regressor
    cars_model = RandomForestRegressor(random_state=0, n_estimators=n_estimators)

    # Pipeline fits the step objects it is given, so pass an unfitted clone of
    # the shared preprocessor. Repeated calls then never refit (and thereby
    # mutate) the module-level `preprocessor` object.
    cars_predict_pipeline = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('model', cars_model),
    ])

    cars_predict_pipeline.fit(X_train, y_train)

    # Predict the price of cars with the model
    cars_predict = cars_predict_pipeline.predict(X_valid)

    # Evaluate the model with MAE
    return mean_absolute_error(y_valid, cars_predict)

# Baseline: validation MAE of a forest with 150 trees.
baseline_n_estimators = 150
score = get_score_random_forest(n_estimators=baseline_n_estimators)

score

1790.6047641114983

In [67]:
# Search for the best approach to predict the car price:
# evaluate the validation MAE for 20, 40, ..., 500 trees.
estimator_grid = range(20, 520, 20)
scores_multiply_n_estimators = {
    n_estimator: get_score_random_forest(n_estimator)
    for n_estimator in estimator_grid
}

# Less MAE is better
estimator_counts = list(scores_multiply_n_estimators.keys())
mae_values = list(scores_multiply_n_estimators.values())

# NOTE(review): the `line` styling is kept from the original; with
# mode='markers' it presumably has no visible effect - confirm.
mae_trace = go.Scatter(
    x=estimator_counts,
    y=mae_values,
    mode='markers',
    line=dict(color='royalblue', width=3, dash='dash'),
)
fig = go.Figure(data=mae_trace)

# One x-axis tick per evaluated tree count.
x_axis_ticks = dict(tickmode='linear', tick0=20, dtick=20)
fig.update_layout(
    title='Variação do MAE x Nº Estimators',
    xaxis_title='Nº Estimators',
    yaxis_title='MAE',
    xaxis=x_axis_ticks,
)

fig.show()

In [68]:
# The best Nº Estimators approach
# Take the tree count with the lowest MAE from the search results instead of a
# value read off the chart, so this cell stays correct if the data or the
# search grid changes. Ties resolve to the first (smallest) tree count.
best_n_estimators = min(scores_multiply_n_estimators, key=scores_multiply_n_estimators.get)
get_score_random_forest(best_n_estimators)

1794.6205280487802