In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import pickle

# Pipeline for usage
# ________________________________________________________________

##
In this notebook, we build on the data-exploration notebook and create a machine learning pipeline that takes the features of a used car and predicts its price. The predictor is a random forest regressor (RFR) configured with the best parameters found during hyperparameter tuning. The preprocessing is the same as in the model-selection phase.
# ________________________________________________________________

In [3]:
# Load the car listings. The first CSV column is discarded
# (presumably an index column written out when the file was saved -- TODO confirm).
car_data = pd.read_csv("datasets/car_data.csv")
car_data = car_data.drop(columns=car_data.columns[0])

# Echo the tuned hyperparameters so they can be compared with the values
# passed to the regressor. Iterating the handle stops at end-of-file on its
# own, so no exception is needed to leave the loop.
with open('pipeline/best_params.txt', 'r') as file:
    for line in file:
        # Each line keeps its trailing newline, so print() leaves a blank
        # line between entries.
        print(line)
print('End of file reached')

n_estimators: 300

min_samples_split: 2

min_samples_leaf: 1

max_features: sqrt

max_depth: 40

bootstrap: True

End of file reached


In [4]:
# Random forest configured with the tuned values listed in pipeline/best_params.txt.
# random_state pins the bootstrap sampling and the per-split feature
# subsampling, so the reported metrics and the pickled model can be reproduced
# on a fresh kernel; 42 matches the seed used for the train/test split.
rf = RandomForestRegressor(
    n_estimators=300,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    max_depth=40,
    bootstrap=True,
    random_state=42,
)

In [5]:
# Target is the listing price; every other column is a candidate feature.
y = car_data.price
X = car_data.drop(columns='price')

# Only these columns are routed into the model; the ColumnTransformer below
# is built without a `remainder` argument, so any other column is left out.
num_cols = ['year', 'odometer', 'lat', 'long']
cat_cols = ['manufacturer', 'type', 'paint_color', 'fuel']

# Hold out 20% of the rows for evaluation with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.20)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# and 1.4 removed the old name. Try the current keyword first and fall back to
# the legacy one so the cell runs on both old and new releases.
try:
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')
except TypeError:
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse=False, drop='first')
# NOTE(review): with drop='first' and handle_unknown='ignore', a category that
# was not seen during fit is encoded as all zeros, i.e. the same as the dropped
# first category -- confirm this is acceptable for the app.

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
cat_pre = Pipeline([
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("ohe", one_hot),
])
# Numeric columns: fill gaps with the column mean, then standardise.
num_pre = Pipeline([
    ("imputer", SimpleImputer(strategy='mean')),
    ("scale", StandardScaler()),
])
preprocess = ColumnTransformer(
    transformers=[
        ("cat_process", cat_pre, cat_cols),
        ("num_process", num_pre, num_cols)
    ]
)
# Full pipeline: preprocessing followed by the random forest regressor.
pipe = Pipeline(steps=[
    ('preprocess', preprocess),
    ('regressor', rf)
])

In [6]:
# Fit the preprocessing steps and the regressor on the training split only.
pipe.fit(X=x_train, y=y_train)



In [7]:
# Predicted prices for the held-out rows; compared with y_test in the next cells.
prediction = pipe.predict(X=x_test)

In [8]:
# scikit-learn metrics take (y_true, y_pred). MAE is symmetric, so the value is
# the same as before, but the documented order keeps the call correct if
# options such as sample_weight are added later.
MAE = mean_absolute_error(y_test, prediction)

In [9]:
# First line: mean absolute error on the test split.
# Second line: the pipeline's score on the same split (for a scikit-learn
# regressor, score() is the R^2 coefficient of determination).
print(MAE, pipe.score(x_test, y_test), sep="\n")

1711.5207951326042
0.8310742851377091


### Test in Practice:

- Toyota
- SUV
- black
- gas
- 2014
- 40000 on the odometer
- Washington (38.895794345288074, -77.03621265903129)

These are the specs of a car priced at $19,990.
Let's see what price our model predicts for it.

In [10]:
# One example listing expressed as a single record.
# 'is_fwd' is not among the num_cols/cat_cols handed to the ColumnTransformer,
# so it is presumably ignored by the pipeline (default remainder) -- kept here
# unchanged.
example_listing = {
    'manufacturer': 'toyota',
    'type': 'SUV',
    'paint_color': 'black',
    'fuel': 'gas',
    'year': 2014,
    'odometer': 40000,
    'lat': 38.895794345288074,
    'long': -77.03621265903129,
    'is_fwd': 1,
}
# Wrap the record in a list to get a one-row DataFrame with the same columns.
new_car = pd.DataFrame([example_listing])
predicted_price = pipe.predict(new_car)

In [11]:
# Listed price of the example car described in the markdown above.
actual_price = 19990
# Use the absolute difference so an over-prediction is not reported as a
# negative distance from the real price.
price_gap = abs(actual_price - predicted_price[0])
print(f"Model predicted the price to ${predicted_price[0]} which is ${price_gap} away from the real price.")

Model predicted the price to $18549.62693711435 which is $1440.3730628856501 away from the real price.


The pipeline is now capable of taking in a dataset containing information on used cars, cleaning and preprocessing the data, and training a machine learning model to predict the price of a used car based on its features. Specifically, the pipeline is able to handle missing values, scale numeric features, and one-hot-encode categorical features. The trained model can then be used to predict the price of a new used car by providing its features as input to the pipeline.

In [12]:
# Persist the fitted pipeline (preprocessing + regressor) as one pickle file.
# NOTE(review): pickles should only be loaded from trusted sources, and
# presumably with the same scikit-learn version that produced them -- confirm
# for the consuming app.
with open('app_pipeline.pkl', mode='wb') as pkl_file:
    pickle.dump(pipe, file=pkl_file)