##Neural Networks
Car Sales. Consider the data on used cars (ToyotaCorolla.csv) with 1436 records and details on 38 attributes, including Price, Age, KM, HP, and other specifications. The goal is to predict the price of a used Toyota Corolla based on its specifications.



In [1]:
%matplotlib inline

from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
!pip install dmba
from dmba import classificationSummary, regressionSummary

Collecting dmba
  Downloading dmba-0.2.4-py3-none-any.whl.metadata (1.9 kB)
Downloading dmba-0.2.4-py3-none-any.whl (11.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m70.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dmba
Successfully installed dmba-0.2.4
Colab environment detected.


In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [35]:
cars = pd.read_csv("ToyotaCorolla.csv")
cars.head()

Unnamed: 0,Id,Model,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,Color,Automatic,CC,Doors,Cylinders,Gears,Quarterly_Tax,Weight,Mfr_Guarantee,BOVAG_Guarantee,Guarantee_Period,ABS,Airbag_1,Airbag_2,Airco,Automatic_airco,Boardcomputer,CD_Player,Central_Lock,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Parking_Assistant,Tow_Bar
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,Diesel,90,1,Blue,0,2000,3,4,5,210,1165,0,1,3,1,1,1,0,0,1,0,1,1,1,0,0,0,1,0,0,0,0
1,2,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13750,23,10,2002,72937,Diesel,90,1,Silver,0,2000,3,4,5,210,1165,0,1,3,1,1,1,1,0,1,1,1,0,1,0,0,0,1,0,0,0,0
2,3,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13950,24,9,2002,41711,Diesel,90,1,Blue,0,2000,3,4,5,210,1165,1,1,3,1,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0
3,4,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,14950,26,7,2002,48000,Diesel,90,0,Black,0,2000,3,4,5,210,1165,1,1,3,1,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0
4,5,TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors,13750,30,3,2002,38500,Diesel,90,0,Black,0,2000,3,4,5,210,1170,1,1,3,1,1,1,1,0,1,0,1,1,1,0,1,0,1,0,0,0,0


In [36]:
cars.columns

Index(['Id', 'Model', 'Price', 'Age_08_04', 'Mfg_Month', 'Mfg_Year', 'KM',
       'Fuel_Type', 'HP', 'Met_Color', 'Color', 'Automatic', 'CC', 'Doors',
       'Cylinders', 'Gears', 'Quarterly_Tax', 'Weight', 'Mfr_Guarantee',
       'BOVAG_Guarantee', 'Guarantee_Period', 'ABS', 'Airbag_1', 'Airbag_2',
       'Airco', 'Automatic_airco', 'Boardcomputer', 'CD_Player',
       'Central_Lock', 'Powered_Windows', 'Power_Steering', 'Radio',
       'Mistlamps', 'Sport_Model', 'Backseat_Divider', 'Metallic_Rim',
       'Radio_cassette', 'Parking_Assistant', 'Tow_Bar'],
      dtype='object')

1.	Preprocess the data

a.	Only use the following as predictors: Age_08_04, KM, Fuel_Type, HP, Automatic, Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, Airco, Automatic_airco, CD_Player, Powered_Windows, Sport_Model, and Tow_Bar



b.	Create categorical and dummy variables where appropriate.


In [37]:
cars.Fuel_Type=cars.Fuel_Type.astype("category")
cars.Color=cars.Color.astype("category") #not used in model but changed to catergorical
print(cars.dtypes)

Id                      int64
Model                  object
Price                   int64
Age_08_04               int64
Mfg_Month               int64
Mfg_Year                int64
KM                      int64
Fuel_Type            category
HP                      int64
Met_Color               int64
Color                category
Automatic               int64
CC                      int64
Doors                   int64
Cylinders               int64
Gears                   int64
Quarterly_Tax           int64
Weight                  int64
Mfr_Guarantee           int64
BOVAG_Guarantee         int64
Guarantee_Period        int64
ABS                     int64
Airbag_1                int64
Airbag_2                int64
Airco                   int64
Automatic_airco         int64
Boardcomputer           int64
CD_Player               int64
Central_Lock            int64
Powered_Windows         int64
Power_Steering          int64
Radio                   int64
Mistlamps               int64
Sport_Mode

In [38]:
cars_2 = cars.drop(columns=["Id","Model", "Color"])
cars_2.head()

Unnamed: 0,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,Automatic,CC,Doors,Cylinders,Gears,Quarterly_Tax,Weight,Mfr_Guarantee,BOVAG_Guarantee,Guarantee_Period,ABS,Airbag_1,Airbag_2,Airco,Automatic_airco,Boardcomputer,CD_Player,Central_Lock,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Parking_Assistant,Tow_Bar
0,13500,23,10,2002,46986,Diesel,90,1,0,2000,3,4,5,210,1165,0,1,3,1,1,1,0,0,1,0,1,1,1,0,0,0,1,0,0,0,0
1,13750,23,10,2002,72937,Diesel,90,1,0,2000,3,4,5,210,1165,0,1,3,1,1,1,1,0,1,1,1,0,1,0,0,0,1,0,0,0,0
2,13950,24,9,2002,41711,Diesel,90,1,0,2000,3,4,5,210,1165,1,1,3,1,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0
3,14950,26,7,2002,48000,Diesel,90,0,0,2000,3,4,5,210,1165,1,1,3,1,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0
4,13750,30,3,2002,38500,Diesel,90,0,0,2000,3,4,5,210,1170,1,1,3,1,1,1,1,0,1,0,1,1,1,0,1,0,1,0,0,0,0


In [39]:
cars_2=pd.get_dummies(cars_2, columns=["Fuel_Type"], drop_first=True)
cars_2.head()

Unnamed: 0,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,HP,Met_Color,Automatic,CC,Doors,Cylinders,Gears,Quarterly_Tax,Weight,Mfr_Guarantee,BOVAG_Guarantee,Guarantee_Period,ABS,Airbag_1,Airbag_2,Airco,Automatic_airco,Boardcomputer,CD_Player,Central_Lock,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Parking_Assistant,Tow_Bar,Fuel_Type_Diesel,Fuel_Type_Petrol
0,13500,23,10,2002,46986,90,1,0,2000,3,4,5,210,1165,0,1,3,1,1,1,0,0,1,0,1,1,1,0,0,0,1,0,0,0,0,True,False
1,13750,23,10,2002,72937,90,1,0,2000,3,4,5,210,1165,0,1,3,1,1,1,1,0,1,1,1,0,1,0,0,0,1,0,0,0,0,True,False
2,13950,24,9,2002,41711,90,1,0,2000,3,4,5,210,1165,1,1,3,1,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,True,False
3,14950,26,7,2002,48000,90,0,0,2000,3,4,5,210,1165,1,1,3,1,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,True,False
4,13750,30,3,2002,38500,90,0,0,2000,3,4,5,210,1170,1,1,3,1,1,1,1,0,1,0,1,1,1,0,1,0,1,0,0,0,0,True,False


In [40]:
cars_3=cars_2[["Price", "Age_08_04", "KM", "Fuel_Type_Diesel", "Fuel_Type_Petrol", "HP", "Automatic", "Doors",
              "Quarterly_Tax", "Mfr_Guarantee", "Guarantee_Period", "Airco", "Automatic_airco", "CD_Player", "Powered_Windows", "Sport_Model", "Tow_Bar"]]
cars_3.head()

Unnamed: 0,Price,Age_08_04,KM,Fuel_Type_Diesel,Fuel_Type_Petrol,HP,Automatic,Doors,Quarterly_Tax,Mfr_Guarantee,Guarantee_Period,Airco,Automatic_airco,CD_Player,Powered_Windows,Sport_Model,Tow_Bar
0,13500,23,46986,True,False,90,0,3,210,0,3,0,0,0,1,0,0
1,13750,23,72937,True,False,90,0,3,210,0,3,1,0,1,0,0,0
2,13950,24,41711,True,False,90,0,3,210,1,3,0,0,0,0,0,0
3,14950,26,48000,True,False,90,0,3,210,1,3,0,0,0,0,0,0
4,13750,30,38500,True,False,90,0,3,210,1,3,1,0,0,1,0,0


d.	Partition the data into X & y data frames and train_test_split.

In [41]:
predictors = ["Age_08_04", "KM", "Fuel_Type_Diesel", "Fuel_Type_Petrol", "HP", "Automatic", "Doors", "Quarterly_Tax", "Mfr_Guarantee", "Guarantee_Period", "Airco", "Automatic_airco", "CD_Player", "Powered_Windows", "Sport_Model", "Tow_Bar"]
outcome = ["Price"]
X = cars_3[predictors]
y = cars_3[outcome]

In [42]:
mms = MinMaxScaler()
og_columns = X.columns
X = pd.DataFrame(mms.fit_transform(X), columns=og_columns)

In [43]:
train_X, valid_X, train_y, valid_y = train_test_split(X,y, test_size=0.4, random_state=1)

c.	Scale the data using MinMaxScaler().

2.	Fit a neural network model to the data. Use two hidden layers.

In [44]:
cars_NN = MLPRegressor(random_state=1, activation="relu", hidden_layer_sizes=2, solver="lbfgs", max_iter=1000)
cars_NN.fit(train_X,train_y)

  y = column_or_1d(y, warn=True)


In [45]:
cars_NN.coefs_

[array([[-318.30906793, -545.3657215 ],
        [-204.95837756, -214.6695562 ],
        [  75.0811413 ,  -94.59904538],
        [  76.01650544, -797.54208434],
        [ 194.35946262, -253.83098754],
        [  27.29625008,  -63.34504506],
        [  26.56471997, -634.14357878],
        [ 184.56530914, -243.13656983],
        [   3.4629131 , -412.03064023],
        [  79.76632808,  -22.2319237 ],
        [   8.59581807, -567.38862311],
        [  85.98063183, -112.2884819 ],
        [   9.85648259, -294.99918936],
        [  16.8210061 , -537.31296451],
        [  21.2612589 , -336.96247865],
        [  -9.88033014, -208.29869471]]),
 array([[  28.92488867],
        [1156.87965452]])]

In [46]:
cars_NN.predict(valid_X)

array([12029.86525207, 10399.82712446, 14077.43377701,  7757.51902773,
       11721.95252496, 17619.79119297,  7757.51902773,  9284.73565925,
        7757.51902773, 11824.76532041,  9512.69513468,  9010.08242352,
        7757.51902773,  8610.6712002 ,  7757.51902773,  7757.51902773,
        7757.51902773,  7757.51902773, 16827.16846871,  7757.51902773,
       15594.16777531,  7757.51902773,  8677.74459794,  7757.51902773,
       12839.42229675, 20859.47523528,  7757.51902773,  9707.5791454 ,
       19621.51097758,  9983.99941287,  7757.51902773,  7757.51902773,
       13359.55037442, 17140.33840711, 26394.11583297, 10485.31033211,
       10012.71140917,  7757.51902773,  9045.6691922 , 16682.68094033,
       12564.61074561,  8764.11044666,  7889.65885951,  9727.74427956,
        8034.70091094,  9840.39980501, 11311.07134542,  9059.24986841,
        7757.51902773,  7757.51902773,  7757.51902773, 11775.3422745 ,
        8762.28105005,  7757.51902773,  8537.3608596 , 18815.30491859,
      

3. 	Calculate the RMSE for the training and validation data.

In [47]:
print("Training Data", regressionSummary(train_y, cars_NN.predict(train_X)))
print("Valid Data", regressionSummary(valid_y, cars_NN.predict(valid_X)))


Regression statistics

                      Mean Error (ME) : 0.1893
       Root Mean Squared Error (RMSE) : 1169.0846
            Mean Absolute Error (MAE) : 874.8954
          Mean Percentage Error (MPE) : -1.0815
Mean Absolute Percentage Error (MAPE) : 8.6033
Training Data None

Regression statistics

                      Mean Error (ME) : 11.1230
       Root Mean Squared Error (RMSE) : 1131.8648
            Mean Absolute Error (MAE) : 906.3423
          Mean Percentage Error (MPE) : -1.0372
Mean Absolute Percentage Error (MAPE) : 9.2755
Valid Data None


4.	Complete a grid search to find the best number of hidden layers.

In [48]:
param_grid={"hidden_layer_sizes":[1,2,3,4,5,6,7,8]}

In [49]:
cars_NN_CV = GridSearchCV(MLPRegressor(random_state=1, activation="relu", solver="lbfgs",
                    max_iter=1000), param_grid=param_grid, cv=5, n_jobs=-1)
cars_NN_CV.fit(train_X,train_y)

  y = column_or_1d(y, warn=True)


In [50]:
cars_NN_CV.best_params_

{'hidden_layer_sizes': 8}

In [None]:
#Scaling after partioning not part of Assignment but to show the (slight) difference

In [51]:
predictors = ["Age_08_04", "KM", "Fuel_Type_Diesel", "Fuel_Type_Petrol", "HP", "Automatic", "Doors", "Quarterly_Tax", "Mfr_Guarantee", "Guarantee_Period", "Airco", "Automatic_airco", "CD_Player", "Powered_Windows", "Sport_Model", "Tow_Bar"]
outcome = ["Price"]
X2 = cars_3[predictors]
y2 = cars_3[outcome]
train_X2, valid_X2, train_y2, valid_y2 = train_test_split(X2,y2, test_size=0.4, random_state=1)

In [52]:
mms = MinMaxScaler()
train_X2 = mms.fit_transform(train_X2)
valid_X2 = mms.transform(valid_X2) #fit data only on train, not valid

In [53]:
cars_NN2 = MLPRegressor(random_state=1, activation="relu", hidden_layer_sizes=2, solver="lbfgs", max_iter=1000)
cars_NN2.fit(train_X2,train_y2)

  y = column_or_1d(y, warn=True)


In [54]:
cars_NN2.coefs_

[array([[-586.8231132 , -677.29317855],
        [-360.32338007, -277.94852955],
        [ 135.46067882, -116.57889433],
        [ 134.05573517, -989.77448592],
        [ 362.51977485, -314.84721474],
        [  51.00835474,  -78.86815137],
        [  49.40129542, -786.87649141],
        [ 336.71130395, -301.27741808],
        [   6.89863222, -512.47073018],
        [ 144.07789171,  -27.56368423],
        [  16.3408114 , -702.76185948],
        [ 156.55764271, -138.32387075],
        [  18.07185857, -364.69324195],
        [  31.05603782, -666.71901805],
        [  39.68947361, -417.11941658],
        [ -17.6857679 , -259.04615432]]),
 array([[  15.72743485],
        [1433.31244387]])]

In [55]:
cars_NN.predict(valid_X)

array([12029.86525207, 10399.82712446, 14077.43377701,  7757.51902773,
       11721.95252496, 17619.79119297,  7757.51902773,  9284.73565925,
        7757.51902773, 11824.76532041,  9512.69513468,  9010.08242352,
        7757.51902773,  8610.6712002 ,  7757.51902773,  7757.51902773,
        7757.51902773,  7757.51902773, 16827.16846871,  7757.51902773,
       15594.16777531,  7757.51902773,  8677.74459794,  7757.51902773,
       12839.42229675, 20859.47523528,  7757.51902773,  9707.5791454 ,
       19621.51097758,  9983.99941287,  7757.51902773,  7757.51902773,
       13359.55037442, 17140.33840711, 26394.11583297, 10485.31033211,
       10012.71140917,  7757.51902773,  9045.6691922 , 16682.68094033,
       12564.61074561,  8764.11044666,  7889.65885951,  9727.74427956,
        8034.70091094,  9840.39980501, 11311.07134542,  9059.24986841,
        7757.51902773,  7757.51902773,  7757.51902773, 11775.3422745 ,
        8762.28105005,  7757.51902773,  8537.3608596 , 18815.30491859,
      

In [56]:
print("Training Data", regressionSummary(train_y2, cars_NN2.predict(train_X2)))
print("Valid Data", regressionSummary(valid_y2, cars_NN2.predict(valid_X2)))


Regression statistics

                      Mean Error (ME) : -0.3633
       Root Mean Squared Error (RMSE) : 1169.0969
            Mean Absolute Error (MAE) : 875.4463
          Mean Percentage Error (MPE) : -1.0936
Mean Absolute Percentage Error (MAPE) : 8.6168
Training Data None

Regression statistics

                      Mean Error (ME) : 8.9122
       Root Mean Squared Error (RMSE) : 1131.5335
            Mean Absolute Error (MAE) : 905.6579
          Mean Percentage Error (MPE) : -1.0702
Mean Absolute Percentage Error (MAPE) : 9.2772
Valid Data None


In [None]:
param_grid={"hidden_layer_sizes":[1,2,3,4,5,6,7,8]}

In [57]:
cars_NN2_CV2 = GridSearchCV(MLPRegressor(random_state=1, activation="relu", solver="lbfgs",
                    max_iter=1000), param_grid=param_grid, cv=5, n_jobs=-1)
cars_NN2_CV2.fit(train_X2,train_y2)

  y = column_or_1d(y, warn=True)


In [58]:
cars_NN2_CV2.best_params_

{'hidden_layer_sizes': 8}