### Scikit Learn-02 (sklearn): Dat Cleansing - Cars
 - Python library design for Machine Learning
 - Study based from Udemy

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df_car_sales = pd.read_csv( "data/car-sales-extended.csv" )
df_car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [3]:
len( df_car_sales )

1000

In [4]:
# Print Data-types of the Car-Sales DataFrame
df_car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [5]:
X = df_car_sales.drop( "Price", axis = 1 )
Y = df_car_sales[ "Price" ]

In [6]:
# 3. Fit the model to training & testing data
from sklearn.model_selection import train_test_split

In [7]:
# Split data into trainning & testing sets (important)
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = 0.2 )

In [8]:
# 2. Choose Right Model and hyper-parametres
# Regressor predicts number
from sklearn.ensemble import RandomForestRegressor

In [9]:
model = RandomForestRegressor()
model.fit( X_train, Y_train )
model.score( X_test, Y_test )

ValueError: could not convert string to float: 'Honda'

### DATA CLEANSING
 - Converting STRING into NUMERICALs
 - Note: Machine Learning cannot handle String. Hence, we pre-process the data

In [10]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [11]:
# Defines categories
categorical_features = [ "Make", "Colour", "Doors" ]

In [12]:
# Instantiate OneHotEncoder
one_hot = OneHotEncoder()

In [13]:
# passthrough - remainder of the columns, we do nothing; in this case,  Odometer
transformer = ColumnTransformer( [ ("one_hot", 
                                    one_hot,
                                    categorical_features )],
                                    remainder = "passthrough" )

In [14]:
X_transformed = transformer.fit_transform( X )
X_transformed

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [15]:
# Columns we defined have been converted into float - numerical values where Machine Learning can be applied
# Numerical columns represents faltten out categories: Colour --> Red, Green Blue etc
pd.DataFrame( X_transformed ).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0


In [16]:
# Pandas aslo has built in function get_dummies
dummies = pd.get_dummies( df_car_sales[ [ "Make", "Colour", "Doors" ]])
dummies.head()

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0


In [17]:
# Let's refit the model:
np.random.seed( 42 )

# Split data into trainning & testing sets
X_train, X_test, Y_train, Y_test = train_test_split( X_transformed, Y, test_size = 0.2 )

In [18]:
model = RandomForestRegressor()
model.fit( X_train, Y_train )
model.score( X_test, Y_test )

0.3235867221569877

**Note:** This worked now as the STRING has been converted into NUMERICAL values for Machine Learning to process

### DATA CLEANSING
 - Handling MISSING values
 
 
***NOTE***print(sklearn.__version__)
* "In a newer version of Scikit-Learn (0.23+), the OneHotEncoder class was upgraded to be able to handle None & NaN values."

In [19]:
import sklearn as sklearn

In [20]:
print(sklearn.__version__)

0.24.2


In [21]:
df_car_sales = pd.read_csv( "data/car-sales-extended-missing-data.csv" )
df_car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [22]:
# Get Count of NULL values in the DataFrame
df_car_sales.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [23]:
X = df_car_sales.drop( "Price", axis = 1 )
Y = df_car_sales[ "Price" ]

In [24]:
# Converting to numbers
transformer = ColumnTransformer( [ ("one_hot", 
                                    one_hot,
                                    categorical_features )],
                                    remainder = "passthrough" )

In [25]:
X_transformed = transformer.fit_transform( X )
X_transformed

<1000x16 sparse matrix of type '<class 'numpy.float64'>'
	with 4000 stored elements in Compressed Sparse Row format>

In [26]:
pd.DataFrame( X_transformed ).head()

Unnamed: 0,0
0,"(0, 1)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."
1,"(0, 0)\t1.0\n (0, 6)\t1.0\n (0, 13)\t1.0\n..."
2,"(0, 1)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."
3,"(0, 3)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."
4,"(0, 2)\t1.0\n (0, 6)\t1.0\n (0, 11)\t1.0\n..."


In [27]:
# Let's refit the model:
np.random.seed( 42 )

# Split data into trainning & testing sets
X_train, X_test, Y_train, Y_test = train_test_split( X_transformed, Y, test_size = 0.2 )

In [28]:
model = RandomForestRegressor()
model.fit( X_train, Y_train )
model.score( X_test, Y_test )

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

#### Option-1: Fill missing data with Pandas

In [29]:
df_car_sales[ "Doors" ].value_counts()

4.0    811
5.0     75
3.0     64
Name: Doors, dtype: int64

In [30]:
df_car_sales[ "Make" ].fillna( "missing", inplace = True )

df_car_sales[ "Colour" ].fillna( "missing", inplace = True )

df_car_sales[ "Odometer (KM)" ].fillna( df_car_sales[ "Odometer (KM)" ].mean(), inplace = True )

df_car_sales[ "Doors" ].fillna( 4, inplace = True )

In [31]:
df_car_sales.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [32]:
# Remove rows with missing Price value
df_car_sales.dropna( inplace = True )

In [33]:
df_car_sales.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [34]:
# Lost some data - 50 rows
len( df_car_sales )

950

In [35]:
df_car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [36]:
X = df_car_sales.drop( "Price", axis = 1 )
Y = df_car_sales[ "Price" ]

In [37]:
# Converting to numbers
transformer = ColumnTransformer( [ ("one_hot", 
                                    one_hot,
                                    categorical_features )],
                                    remainder = "passthrough" )

In [38]:
X_transformed = transformer.fit_transform( df_car_sales )
X_transformed

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

In [39]:
# Let's refit the model:
np.random.seed( 42 )

# Split data into trainning & testing sets
X_train, X_test, Y_train, Y_test = train_test_split( X_transformed, Y, test_size = 0.2 )

In [40]:
model = RandomForestRegressor()
model.fit( X_train, Y_train )
model.score( X_test, Y_test )

0.9998421058539825

#### Option-2: Fill missing data with Scikit-Learn

In [41]:
df_car_sales = pd.read_csv( "data/car-sales-extended-missing-data.csv" )
df_car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [42]:
df_car_sales.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [43]:
df_car_sales.dropna( subset = [ "Price" ], inplace = True )
df_car_sales.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [44]:
X = df_car_sales.drop( "Price", axis = 1 )
Y = df_car_sales[ "Price" ]

In [45]:
# Fill missing values with Scikit-Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [46]:
# Fill Categorical values with 'missing' & numerical values with mean
cat_imputer = SimpleImputer( strategy = "constant", fill_value = "missing" )
door_imputer = SimpleImputer( strategy = "constant", fill_value = 4 )
num_imputer = SimpleImputer( strategy = "mean" )

In [47]:
# Define Columns
cat_features = [ "Make", "Colour" ]
door_features = [ "Doors" ]
num_features = [ "Odometer (KM)" ]

In [48]:
# Create an imputer (something that fills missing data)
imputer = ColumnTransformer( [ 
    ( "cat_imputer", cat_imputer, cat_features ),
    ( "door_imputer", door_imputer, door_features ),
    ( "num_imputer", num_imputer, num_features )
])

In [49]:
X.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
dtype: int64

In [50]:
# Transform Data:
X_filled = imputer.fit_transform( X )
X_filled

array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)

In [51]:
df_car_sales_filled = pd.DataFrame( X_filled,
                                    columns = [ "Make", "Colour", "Doors", "Odometer (KM)" ])
df_car_sales_filled.head()

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,4.0,35431.0
1,BMW,Blue,5.0,192714.0
2,Honda,White,4.0,84714.0
3,Toyota,White,4.0,154365.0
4,Nissan,Blue,3.0,181577.0


In [52]:
df_car_sales_filled.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [53]:
# Converting to numbers
transformer = ColumnTransformer( [ ("one_hot", 
                                    one_hot,
                                    categorical_features )],
                                    remainder = "passthrough" )
X_transformed = transformer.fit_transform( df_car_sales_filled )
X_transformed

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [54]:
np.random.seed( 42 )
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split( X_transformed, Y, test_size = 0.2 )

model = RandomForestRegressor()
model.fit( X_train, Y_train )
model.score( X_test, Y_test )

0.21990196728583944