In [12]:
# 1. IMPORTS
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn

# 2. IMPORTS TO GET THE DATA READY
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# 3. IMPORTS FOR MODELLING
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# 4. SETUP RANDOM SEED
# Seed NumPy's global random number generator so a fresh "Restart & Run All"
# reproduces the same numbers. None of the scikit-learn calls in the cells
# below pass random_state, so they fall back to this global generator; this
# cell therefore has to run before them, and re-running a later cell on its
# own will not reproduce the earlier result.
np.random.seed(42)

In [13]:
# 1. IMPORT DATA AND DROP ROWS WITH MISSING LABELS (PRICE)
# A row without a price can neither be trained on nor scored, so it is dropped
# rather than imputed. Loading and filtering in one expression (instead of
# mutating with inplace=True) means re-running this cell always rebuilds
# `data` from the CSV.
data = pd.read_csv("data/car-sales-extended-missing-data.csv").dropna(subset=["Price"])

# 2. THE FEATURE COLUMNS STILL NEED TWO FIXES BEFORE MODELLING:
#    - NaN FIELDS HAVE TO BE FILLED
#    - NON NUMERIC DATA HAS TO BE TRANSFORMED TO NUMERIC
# (Inspect them in a separate cell with data.dtypes / data.isna().sum();
#  a bare expression in the middle of a cell is evaluated but never shown.)

# 2.1 DEFINE DIFFERENT FEATURES AND TRANSFORMER PIPELINE
# Categorical columns: fill gaps with the literal "missing", then one-hot
# encode. handle_unknown="ignore" keeps transform() from failing on a category
# that was not present when the encoder was fitted.
categoricalFeatures = ["Make", "Colour"]
categoricalTransformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("oneHot", OneHotEncoder(handle_unknown="ignore")),
])

# Door count: fill gaps with the constant 4.
doorFeatures = ["Doors"]
doorTransformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=4)),
])

# Odometer reading: fill gaps with the column mean. The step name "imputer"
# is addressed from the grid search cell as
# "preprocessor__num__imputer__strategy", so it must not be renamed.
numericFeatures = ["Odometer (KM)"]
numericTransformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
])

# 2.2 FILL MISSING VALUES THEN CONVERT TO NUMBERS
# Each transformer is applied only to its own column list.
preprocessor = ColumnTransformer(transformers=[
    ("cat", categoricalTransformer, categoricalFeatures),
    ("door", doorTransformer, doorFeatures),
    ("num", numericTransformer, numericFeatures),
])

# 3. CREATE A PREPROCESSING AND MODELLING PIPELINE
# Keeping the preprocessing inside the pipeline means the imputers and the
# encoder are fitted on whatever data fit() receives (the training split here).
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor()),
])

# 4. DEFINE FEATURES AND LABELS
X = data.drop("Price", axis=1)
y = data["Price"]

# 5. SPLIT THE DATA (80% TRAIN / 20% TEST)
# NOTE: neither this split nor the RandomForestRegressor above is given a
# random_state, so both rely on NumPy's global RNG seeded by
# np.random.seed(42) in the imports cell. Run the notebook top to bottom to
# reproduce the recorded score.
xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.2)

# 6. FIT THE MODEL INTO THE DATA
model.fit(xTrain, yTrain)

# 7. YOU CAN PREDICT HERE USING THE PREDICT() METHOD, e.g. model.predict(xTest).

# 8. EVALUATE THE MODEL
# For a regressor, score() reports the R^2 on the held-out test split; it is
# the last expression so the notebook displays it.
model.score(xTest, yTest)

0.22188417408787875

In [14]:
# TUNE THE REGRESSION PIPELINE WITH AN EXHAUSTIVE GRID SEARCH

# 1. DEFINE THE GRID
# Keys use the "<step>__<parameter>" convention to reach into the nested
# pipeline: "preprocessor__num__imputer__strategy" targets the imputer of the
# numeric transformer, "model__*" targets the random forest.
# 2 x 2 x 2 x 1 x 2 = 16 candidate settings.
pipeGrid = dict(
    preprocessor__num__imputer__strategy=["mean", "median"],
    model__n_estimators=[100, 1000],
    model__max_depth=[None, 5],
    model__max_features=["sqrt"],
    model__min_samples_split=[2, 4],
)

# 2. WRAP THE PIPELINE IN A 5-FOLD GRID SEARCH
# 16 candidates x 5 folds = 80 fits; verbose=2 prints one line per fit.
gsModel = GridSearchCV(estimator=model, param_grid=pipeGrid, cv=5, verbose=2)

# 3. RUN THE SEARCH ON THE TRAINING SPLIT ONLY
gsModel.fit(xTrain, yTrain)

# 4. EVALUATE THE TUNED MODEL ON THE HELD-OUT TEST SPLIT
gsModel.score(xTest, yTest)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.4s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.4s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_sampl

0.2848784564026805

In [17]:
# SAVE THE MODEL
# Serialise the fitted grid search object to disk. The file is opened in a
# `with` block so the handle is flushed and closed even if pickling fails;
# passing a bare open(...) to pickle.dump leaves it open.
# NOTE: only ever pickle.load() this file from a trusted source, because
# unpickling can execute arbitrary code.
import pickle

with open("GitHubFiles/car_sales_model.pkl", "wb") as modelFile:
    pickle.dump(gsModel, modelFile)