In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [7]:
# Read the car-sales CSV into a DataFrame and preview its first five rows.
csv_path = 'car-sales-extended-missing-data.csv'
data = pd.read_csv(csv_path)
data.head(n=5)

(     Make Colour  Odometer (KM)  Doors    Price
 0   Honda  White        35431.0    4.0  15323.0
 1     BMW   Blue       192714.0    5.0  19943.0
 2   Honda  White        84714.0    4.0  28343.0
 3  Toyota  White       154365.0    4.0  13434.0
 4  Nissan   Blue       181577.0    3.0  14043.0,
 Make              object
 Colour            object
 Odometer (KM)    float64
 Doors            float64
 Price            float64
 dtype: object,
 Make             49
 Colour           50
 Odometer (KM)    50
 Doors            50
 Price            50
 dtype: int64)

In [8]:
# Show each column's dtype next to its count of missing values.
column_dtypes = data.dtypes
missing_per_column = data.isna().sum()
column_dtypes, missing_per_column

(Make              object
 Colour            object
 Odometer (KM)    float64
 Doors            float64
 Price            float64
 dtype: object,
 Make             49
 Colour           50
 Odometer (KM)    50
 Doors            50
 Price            50
 dtype: int64)

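Steps
1. Fill missing data (impute each feature column; drop rows that have no `Price` label)
2. Convert non-numeric data to numbers (one-hot encode `Make` and `Colour`)
3. Build a model on the data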


In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV

# Seed NumPy's global RNG. train_test_split and RandomForestRegressor below
# are created without random_state, so (per scikit-learn's random_state
# semantics) they draw from this global stream.
np.random.seed(42)

# Rows with no Price have no target to train on or score against, so drop
# them before splitting. Rebinding `data` instead of using inplace=True keeps
# this cell from mutating the DataFrame object that earlier cells loaded and
# displayed; `data` still ends up holding only the labelled rows.
data = data.dropna(subset=['Price'])

# NOTE: the step names used below ('preprocessor', 'numeric', 'imputer',
# 'model', ...) are part of the parameter paths used when tuning this
# pipeline (e.g. 'preprocessor__numeric__imputer__strategy'), so renaming a
# step means renaming the matching grid key as well.

# Categorical columns: fill gaps with the literal label 'missing', then
# one-hot encode. handle_unknown='ignore' encodes categories not seen during
# fit as all zeros instead of raising.
category_features = ['Make', 'Colour']
category_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Doors: fill gaps with the constant 4 (presumably the most common door
# count -- confirm against the data). The column is left numeric.
door_feature = ['Doors']
door_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=4)),
])

# Odometer: fill gaps with the column mean.
numeric_features = ['Odometer (KM)']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('category', category_transformer, category_features),
    ('door', door_transformer, door_feature),
    ('numeric', numeric_transformer, numeric_features),
])

# Full pipeline: preprocessing followed by the regressor, so imputation and
# encoding are fitted only on whatever data is passed to .fit().
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor()),
])

# Features / target, then an 80/20 train/test split.
X = data.drop('Price', axis=1)
y = data['Price']
assert y.notna().all(), "target still contains missing values"
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit on the training split and display the pipeline's score on the
# held-out test split.
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.22188417408787875

# Use GridSearchCV with Pipeline

In [11]:
# Hyperparameter candidates. Each key is a "<step>__<param>" path into the
# `model` pipeline; nested steps are chained with double underscores.
grid = {
    'preprocessor__numeric__imputer__strategy': ['mean', 'median'],
    'model__n_estimators': [100, 1000],
    'model__max_depth': [None, 5],
    'model__min_samples_split': [2, 4],
}

# Search every combination (2 x 2 x 2 x 2 = 16 candidates) with 5-fold
# cross-validation on the training split; verbose=2 logs one line per fit.
gs_model = GridSearchCV(estimator=model, param_grid=grid, cv=5, verbose=2)
gs_model.fit(X_train, y_train)


Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__numeric__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__numeric__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__numeric__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__numeric__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__numeric__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__numeric__imputer__strategy=median; total time=   0.2s
[CV] END model__max_depth=None,

In [12]:
# Score the fitted grid search on the held-out test split and display it.
best_test_score = gs_model.score(X_test, y_test)
best_test_score

0.3339554263158365