In [1]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### 1. Getting Data Ready to use with SciKit Learn (sklearn)

3 main things to do here
1. Split the data into features and labels (x,y). Features and Lables are training inputs.
- On testing phase Features-like (same data structure like Features) data are used for input and Lables-like (same data structure like Lables) data are returned as output/prediction
- sklearn.model_selection's train_test_split is a good option for spliting all the data into x_train, x_test, y_train, y_test
2. Filling/imputing/disregarding missing values of the DataSet
3. Feature Encoding : Convert non-numerical values into numerical values

In [72]:
### 1.1 Feature Encoding: non-numerical to numerical conversion
car_sales = pd.read_csv("car-sales.csv")
car_sales_1k = car_sales[:1000].dropna().copy()
# check how nan rows
# car_sales_1k.isna().sum()
# len(car_sales_1k)
car_sales_1k.tail()

Unnamed: 0,Brand,Price,Body,Mileage,EngineV,Engine Type,Registration,Year,Model
995,Mercedes-Benz,9150.0,sedan,273,2.7,Diesel,yes,2000,E-Class
996,Renault,7900.0,van,268,1.5,Diesel,yes,2008,Grand Scenic
997,Toyota,72000.0,crossover,40,4.5,Diesel,yes,2014,Land Cruiser 200
998,Mercedes-Benz,49500.0,crossover,52,3.0,Diesel,yes,2013,ML 350
999,Volkswagen,45000.0,sedan,90,3.0,Diesel,yes,2011,Phaeton


In [67]:
# Split into X/y
X = car_sales_1k.drop(["Price", "Model"], axis=1)
y = car_sales_1k["Price"]

In [68]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features =  ['Brand', 'Body', 'Engine Type', 'Registration']
# X.columns.to_list

one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                  one_hot,
                                  categorical_features)],
                               remainder="passthrough")
transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.000e+00, 1.000e+00, 0.000e+00, ..., 2.770e+02, 2.000e+00,
        1.991e+03],
       [0.000e+00, 0.000e+00, 1.000e+00, ..., 4.270e+02, 2.900e+00,
        1.999e+03],
       [0.000e+00, 0.000e+00, 1.000e+00, ..., 3.580e+02, 5.000e+00,
        2.003e+03],
       ...,
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 4.000e+01, 4.500e+00,
        2.014e+03],
       [0.000e+00, 0.000e+00, 1.000e+00, ..., 5.200e+01, 3.000e+00,
        2.013e+03],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 9.000e+01, 3.000e+00,
        2.011e+03]])

In [69]:
pd.DataFrame(transformed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,277.0,2.0,1991.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,427.0,2.9,1999.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,358.0,5.0,2003.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,240.0,4.2,2007.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,120.0,2.0,2011.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
923,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,273.0,2.7,2000.0
924,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,268.0,1.5,2008.0
925,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,40.0,4.5,2014.0
926,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,52.0,3.0,2013.0


In [63]:
# Feature Encoding With Dummies
dummies = pd.get_dummies(car_sales_1k[['Brand', 'Body', 'Mileage', 'EngineV', 'Engine Type', 'Registration','Year']])
# X.columns.to_list
dummies

Unnamed: 0,Mileage,EngineV,Year,Brand_Audi,Brand_BMW,Brand_Mercedes-Benz,Brand_Mitsubishi,Brand_Renault,Brand_Toyota,Brand_Volkswagen,...,Body_other,Body_sedan,Body_vagon,Body_van,Engine Type_Diesel,Engine Type_Gas,Engine Type_Other,Engine Type_Petrol,Registration_no,Registration_yes
0,277,2.0,1991,False,True,False,False,False,False,False,...,False,True,False,False,False,False,False,True,False,True
1,427,2.9,1999,False,False,True,False,False,False,False,...,False,False,False,True,True,False,False,False,False,True
2,358,5.0,2003,False,False,True,False,False,False,False,...,False,True,False,False,False,True,False,False,False,True
3,240,4.2,2007,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
4,120,2.0,2011,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,273,2.7,2000,False,False,True,False,False,False,False,...,False,True,False,False,True,False,False,False,False,True
996,268,1.5,2008,False,False,False,False,True,False,False,...,False,False,False,True,True,False,False,False,False,True
997,40,4.5,2014,False,False,False,False,False,True,False,...,False,False,False,False,True,False,False,False,False,True
998,52,3.0,2013,False,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True


In [70]:
# Split into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)
len(X_train), len(X_test), len(y_train), len(y_test)

(742, 186, 742, 186)

In [71]:
# Build the model by fitting data
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.7728930538826875

### Fill missing data (nan rows) with SciKit Learn:

In [73]:
# Drop the row with no lable
car_sales_20 = car_sales[:20].dropna(subset=["Price"]).copy()
car_sales_20

Unnamed: 0,Brand,Price,Body,Mileage,EngineV,Engine Type,Registration,Year,Model
0,BMW,4200.0,sedan,277,2.0,Petrol,yes,1991,320
1,Mercedes-Benz,7900.0,van,427,2.9,Diesel,yes,1999,Sprinter 212
2,Mercedes-Benz,13300.0,sedan,358,5.0,Gas,yes,2003,S 500
3,Audi,23000.0,crossover,240,4.2,Petrol,yes,2007,Q7
4,Toyota,18300.0,crossover,120,2.0,Petrol,yes,2011,Rav 4
5,Mercedes-Benz,199999.0,crossover,0,5.5,Petrol,yes,2016,GLS 63
6,BMW,6100.0,sedan,438,2.0,Gas,yes,1997,320
7,Audi,14200.0,vagon,200,2.7,Diesel,yes,2006,A6
8,Renault,10799.0,vagon,193,1.5,Diesel,yes,2012,Megane
9,Volkswagen,1400.0,other,212,1.8,Gas,no,1999,Golf IV


In [76]:
# now delete some cells to test fill with sklearn
car_sales_20.at[7, "Mileage"] = None
car_sales_20

Unnamed: 0,Brand,Price,Body,Mileage,EngineV,Engine Type,Registration,Year,Model
0,BMW,4200.0,sedan,277.0,2.0,Petrol,yes,1991,320
1,Mercedes-Benz,7900.0,van,427.0,2.9,Diesel,yes,1999,Sprinter 212
2,Mercedes-Benz,13300.0,sedan,358.0,5.0,Gas,yes,2003,S 500
3,Audi,23000.0,crossover,240.0,4.2,Petrol,yes,2007,Q7
4,Toyota,18300.0,crossover,120.0,2.0,Petrol,yes,2011,Rav 4
5,Mercedes-Benz,199999.0,crossover,0.0,5.5,Petrol,yes,2016,GLS 63
6,BMW,6100.0,sedan,438.0,2.0,Gas,yes,1997,320
7,Audi,14200.0,vagon,,2.7,Diesel,yes,2006,A6
8,Renault,10799.0,vagon,193.0,1.5,Diesel,yes,2012,Megane
9,Volkswagen,1400.0,other,212.0,1.8,Gas,no,1999,Golf IV


In [None]:
# Implementing of the sklearn's Imputer to fill missing rows data
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with 'missing' & numerical values with mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4) 
num_imputer = SimpleImputer(strategy="mean")

# Define columns
cat_features = ["Col1", "Col2"]
door_feature = ["Col3"]
num_feature = ["Mileage"]

# Create an imputer
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", num_imputer, num_feature)
])

# Transform the data
filled_X = imputer.fit_transform(X)