In [1]:
import pandas as pd
import numpy as np


In [2]:
data = pd.read_csv("../dataSets/car-sales-extended-missing-data.csv")
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [3]:
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [4]:
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

### Steps we want to do (In one cell):
1. Fill missing values
2. Convert data to numbers
3. Build the model on the data.

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer   # Fill missing values
from sklearn.preprocessing import OneHotEncoder    # Convert our objects to integer(numbers)


# Modelling
from sklearn.ensemble import RandomForestRegressor       
from sklearn.model_selection import train_test_split, GridSearchCV   # GridSearchCV for hyperParameter tuning.


np.random.seed(42)


# import data and drop rows with missing labels.
data = pd.read_csv("../dataSets/car-sales-extended-missing-data.csv")
data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           951 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


In [6]:
data["Doors"].value_counts()

Doors
4.0    811
5.0     75
3.0     64
Name: count, dtype: int64

In [7]:
# Droping the Rules in the Price having no value:
data = data.dropna(subset=['Price'])
data.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [8]:
# Define different features and transformer pipeline. The aim is to use different Pipelines to handle Missing values, convert data to numbers etc.
categorical_features = ["Make", "Colour"]
categorical_transformer = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]
)


door_features = ["Doors"]
door_transformer = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy="constant", fill_value=4))   # We are filling with 4 because 4 is the majority value in the door feature.
    ]
)

numeric_features = ["Odometer (KM)"]      # No price here because Price is our Target vector
numeric_transformer = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy="mean"))   # Fill missing values in the Odomerter (KM) column with the mean.
    ]
)

In [9]:
# SetUp preprocessing steps (fill missing values, then convert to numbers).
preprocessor = ColumnTransformer(
                transformers=[ 
                    ("cat", categorical_transformer, categorical_features),
                    ("door", door_transformer, door_features),
                    ("num", numeric_transformer, numeric_features)
                ]
)

# Creating a preprocessing and modelling Pipeline.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor()),
    ]
)

In [10]:
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [11]:
# Split the data
y = data["Price"]
X = data.drop("Price", axis=1)


In [12]:
# Splitting our model and training it and scoring the model.
X_train, X_test, y_train,y_test = train_test_split(X,y, test_size=0.2)
model.fit(X_train, y_train)


In [13]:
model.score(X_test, y_test)

0.22188417408787875

In [16]:
### Improving our model using HyperParameter tuning. Using GridSearchCV.

pipe_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],    # We are reverting back to preprocessor and getting to num then imputer and setting that we want to have the mean and median.
    "model__n_estimators": [100, 1000],  # it's like going to our RandomForestRegressor() and inside the func we but n_estimators.
    "model__max_depth": [None,5],
    "model__max_features": ["auto", "sqrt"],
    "model__min_samples_split": [2,4]
    
}

gs_model = GridSearchCV(model, pipe_grid, cv=5, verbose=2)
gs_model.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samp

80 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Livingston\Desktop\ml-course\env\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Livingston\Desktop\ml-course\env\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Livingston\Desktop\ml-course\env\lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Livingston\Desktop\ml-course\env\lib\site-packages\skle

In [17]:
gs_model.score(X_test, y_test)

0.2848784564026805

In [19]:
house = pd.read_csv("../dataSets/housing_price_dataset.csv")
house

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065
...,...,...,...,...,...,...
49995,1282,5,3,Rural,1975,100080.865895
49996,2854,2,2,Suburb,1988,374507.656727
49997,2979,5,3,Suburb,1962,384110.555590
49998,2596,5,2,Rural,1984,380512.685957


In [20]:
house.isnull().sum()

SquareFeet      0
Bedrooms        0
Bathrooms       0
Neighborhood    0
YearBuilt       0
Price           0
dtype: int64

### Putting it all together again

In [16]:
# Getting the data ready
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

#Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer

# Setup random seed
import numpy as np
np.random.seed(42)

In [8]:
data = pd.read_csv("../dataSets/car-sales-extended-missing-data.csv")
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [9]:
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [10]:
data.isnull().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [11]:
#Dropping columns of the "Price" having missing values
data.dropna(subset=["Price"],inplace=True)
data.isnull().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [13]:
data["Doors"].value_counts()

Doors
4.0    768
5.0     71
3.0     64
Name: count, dtype: int64

In [29]:
# Define different featuers and transformer pipeline
categorical_features = ["Make", "Colour"]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]
)

door_feature= ["Doors"]
door_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value=4))   
    ]
)

numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy='mean'))
    ]
)


# SetUp preprocessing steps (Fill missing values, then convert to numbers)
preprocessor = ColumnTransformer(
    transformers = [
        ("cat", categorical_transformer, categorical_features),
        ("door", door_transformer, door_feature),
        ("num", numeric_transformer, numeric_features)
    ]
)

In [30]:
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [31]:
# Creating a preprocessing and modelling pipeline
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(random_state=42))
    ]
)

In [34]:
# Split the data
X = data.drop("Price", axis=1)
y = data["Price"]

#Creating a X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

# fit the model
model.fit(X_train, y_train)

In [35]:
model.score(X_test, y_test)

0.272353803284346

# Using a HyperParameter(GridsSearchCV or RandomizedSearchCV) with our Pipeline.

In [39]:
# Using GridSearchCV with our regression 
from sklearn.model_selection import GridSearchCV

pipe_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "model__n_estimators":[100,1000],
    "model__max_depth": [None, 5],
    "model__max_features": ["auto", "sqrt"],
    "model__min_samples_split": [2,4]
}


gs_model = GridSearchCV(model,pipe_grid, cv=5, n_jobs=1,verbose=2)
gs_model.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samp

80 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Livingston\Desktop\ml-course\env\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Livingston\Desktop\ml-course\env\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Livingston\Desktop\ml-course\env\lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Livingston\Desktop\ml-course\env\lib\site-packages\skle

In [41]:
gs_model.score(X_test, y_test)

0.31526411055670844