In [38]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [39]:
def read_data(path="data/car-sales-extended-missing-data.csv"):
    """Load the car-sales dataset (with missing values) into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the dataset path this notebook
        has always used, so existing ``read_data()`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The CSV contents as parsed by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(path)

# Note: to conform a DataFrame to a new set of index labels, use df.reindex(new_labels)

In [40]:
# Load the car-sales CSV into `df` and display it
df = read_data()
df

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [41]:
# Data type of each column
df.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [42]:
# Number of rows in the dataframe
len(df)

1000

In [43]:
# Number of missing values in each column of the dataframe
df.isnull().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

#### Next, split the data into train and test sets, then fill in the missing values of each set separately

## Handling Missing Values

### 4. Filling rows with missing data - SimpleImputer()

* `SimpleImputer` is provided by the Scikit-Learn library (`sklearn.impute`)

In [44]:
# Remove rows with a missing Price --> Price is the target (y), so a row
# without it has no label to learn from.
# Rebind `df` rather than using inplace=True, so the frame object that was
# loaded earlier is not mutated in place; the resulting `df` is the same.
df = df.dropna(subset=["Price"])

# Missing values left in each column (Price should now be 0)
df.isnull().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [45]:
# Features: every column except the target
X = df.drop(columns=["Price"])
# Target: the Price column
y = df["Price"]

In [46]:
from sklearn.model_selection import train_test_split

SPLIT_SEED = 42

# Keep seeding NumPy's global RNG, in case later cells rely on it.
np.random.seed(SPLIT_SEED)

# Also hand the seed to the splitter itself, so the partition does not depend
# on the state of the global RNG at the moment this line runs. This is expected
# to give the same partition as seeding the global RNG right before the call.
# Note: the split therefore no longer consumes numbers from the global RNG.
# 80% of the rows go to the training set, 20% to the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)

In [47]:
# Shapes of the four pieces produced by the split, in the order
# X_train, X_test, y_train, y_test
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((760, 4), (190, 4), (760,), (190,))

In [48]:
# Count missing values per column in X (the full feature set, i.e. the rows
# of both the train and the test split)
X.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
dtype: int64

Let's fill the missing values. We'll fill the training and test values separately to ensure training data stays with the training data and test data stays with the test data.

Note: We use fit_transform() on the training data and transform() on the testing data. 

In essence, we learn the patterns in the training set and transform it via imputation (fit, then transform). Then we take those same patterns and fill the test set (transform only).

In [49]:
# Impute (find and fill) missing values with Scikit-Learn

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Column groups -- one group per imputation rule
cat_features = ["Make", "Colour"]   # nominal categorical columns
door_features = ["Doors"]           # door count, treated as a category
num_features = ["Odometer (KM)"]    # numerical column

# One imputer per rule:
#   strategy="constant" -> replace every missing entry with `fill_value`
#                          (the string "missing" for categories, 4 for Doors)
#   strategy="mean"     -> replace every missing entry with the column mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# ColumnTransformer receives a list of (name, transformer, columns) tuples and
# applies each transformer to its own columns only. The output columns come out
# in the order of this list: Make, Colour, Doors, Odometer (KM).
imputer = ColumnTransformer(
    transformers=[
        ("cat_imputer", cat_imputer, cat_features),
        ("door_imputer", door_imputer, door_features),
        ("num_imputer", num_imputer, num_features),
    ]
)

# Fill the train and test sets separately:
# fit_transform() learns the fill values from the training set and fills it;
# transform() reuses those training-set fill values to fill the test set.
filled_X_train = imputer.fit_transform(X_train)
filled_X_test = imputer.transform(X_test)

In [50]:
# Inspect the imputed training features (a NumPy array with the columns
# Make, Colour, Doors, Odometer (KM))
filled_X_train

array([['Honda', 'White', 4.0, 71934.0],
       ['Toyota', 'Red', 4.0, 162665.0],
       ['Honda', 'White', 4.0, 42844.0],
       ...,
       ['Toyota', 'White', 4.0, 196225.0],
       ['Honda', 'Blue', 4.0, 133117.0],
       ['Honda', 'missing', 4.0, 150582.0]], dtype=object)

In [51]:
# Inspect the imputed test features (same column order as filled_X_train:
# Make, Colour, Doors, Odometer (KM))
filled_X_test

array([['Toyota', 'Blue', 4.0, 99761.0],
       ['Toyota', 'Black', 4.0, 17975.0],
       ['Honda', 'Blue', 4.0, 197664.0],
       ['Nissan', 'Green', 4.0, 235589.0],
       ['Honda', 'Black', 4.0, 231659.0],
       ['Toyota', 'Blue', 4.0, 247601.0],
       ['Toyota', 'Green', 4.0, 110078.0],
       ['missing', 'White', 4.0, 155383.0],
       ['Nissan', 'White', 4.0, 26634.0],
       ['Honda', 'White', 4.0, 130319.03314917127],
       ['Honda', 'Green', 4.0, 238825.0],
       ['Honda', 'Green', 4.0, 37606.0],
       ['Toyota', 'Blue', 4.0, 230908.0],
       ['Toyota', 'Red', 4.0, 159925.0],
       ['Toyota', 'Blue', 4.0, 181466.0],
       ['Toyota', 'Blue', 4.0, 140465.0],
       ['Toyota', 'White', 4.0, 146307.0],
       ['Toyota', 'Green', 4.0, 214179.0],
       ['Honda', 'White', 4.0, 184869.0],
       ['Toyota', 'Black', 4.0, 224986.0],
       ['Nissan', 'White', 3.0, 176135.0],
       ['Nissan', 'Red', 4.0, 130319.03314917127],
       ['Toyota', 'Blue', 4.0, 112223.0],
       ['BM

In [52]:
# Get our transformed data arrays back into DataFrames.
# The columns follow the imputer's transformer order, not the column order of X.
imputed_columns = ["Make", "Colour", "Doors", "Odometer (KM)"]

# The imputed arrays have dtype=object, so without an explicit cast every
# column of the new frames would be object-typed. Restore the numeric columns
# to the float64 dtype they had in the original dataframe.
numeric_dtypes = {"Doors": "float64", "Odometer (KM)": "float64"}

# NOTE: the new frames get a fresh 0..n-1 index, so their index labels no
# longer match those of y_train / y_test -- pair rows by position, not by label.
train_df = pd.DataFrame(filled_X_train, columns=imputed_columns).astype(numeric_dtypes)

test_df = pd.DataFrame(filled_X_test, columns=imputed_columns).astype(numeric_dtypes)

In [53]:
# Display the imputed training features as a DataFrame
train_df

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,4.0,71934.0
1,Toyota,Red,4.0,162665.0
2,Honda,White,4.0,42844.0
3,Honda,White,4.0,195829.0
4,Honda,Blue,4.0,219217.0
...,...,...,...,...
755,Toyota,missing,4.0,218803.0
756,BMW,Blue,5.0,245427.0
757,Toyota,White,4.0,196225.0
758,Honda,Blue,4.0,133117.0


In [54]:
# Count the missing values left in each column of the training set
train_df.isnull().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [55]:
# Count the missing values left in each column of the test set
test_df.isnull().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [56]:
# No need to fill y_train and y_test as they don't have any missing values

#### Now that there are no missing values

#### Convert the data into numbers (numerical data) by One Hot Encoding