In [1]:
import pandas as pd
import numpy as np


In [2]:
data = pd.read_csv("../dataSets/car-sales-extended-missing-data.csv")
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [3]:
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [4]:
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

### Steps we want to do (In one cell):
1. Fill missing values
2. Convert data to numbers
3. Build the model on the data.

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer   # Fill missing values
from sklearn.preprocessing import OneHotEncoder    # Convert our objects to integer(numbers)


# Modelling
from sklearn.ensemble import RandomForestRegressor       
from sklearn.model_selection import train_test_split, GridSearchCV   # GridSearchCV for hyperParameter tuning.


np.random.seed(42)


# import data and drop rows with missing labels.
data = pd.read_csv("../dataSets/car-sales-extended-missing-data.csv")
data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           951 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


In [6]:
data["Doors"].value_counts()

Doors
4.0    811
5.0     75
3.0     64
Name: count, dtype: int64

In [7]:
# Droping the Rules in the Price having no value:
data = data.dropna(subset=['Price'])
data.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [8]:
# Define different features and transformer pipeline. The aim is to use different Pipelines to handle Missing values, convert data to numbers etc.
categorical_features = ["Make", "Colour"]
categorical_transformer = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]
)


door_features = ["Doors"]
door_transformer = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy="constant", fill_value=4))   # We are filling with 4 because 4 is the majority value in the door feature.
    ]
)

numeric_features = ["Odometer (KM)"]      # No price here because Price is our Target vector
numeric_transformer = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy="mean"))   # Fill missing values in the Odomerter (KM) column with the mean.
    ]
)

In [9]:
# SetUp preprocessing steps (fill missing values, then convert to numbers).
preprocessor = ColumnTransformer(
                transformers=[ 
                    ("cat", categorical_transformer, categorical_features),
                    ("door", door_transformer, door_features),
                    ("num", numeric_transformer, numeric_features)
                ]
)

# Creating a preprocessing and modelling Pipeline.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor()),
    ]
)

In [10]:
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [11]:
# Split the data
y = data["Price"]
X = data.drop("Price", axis=1)


In [14]:
# Splitting our model and training it and scoring the model.
X_train, X_test, y_train,y_test = train_test_split(X,y, test_size=0.2)
model.fit(X_train, y_train)


In [15]:
model.score(X_test, y_test)

0.07796223536784908

In [None]:
### Improving our model using HyperParameter tuning.
