In [51]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import keras

from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras import layers

In [52]:
df = pd.read_csv('car_price_prediction.csv')

In [53]:
df.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


Quick check for NaN values and duplicates.

In [54]:
# print out NaN values in each column
df.isna().sum()

ID                  0
Price               0
Levy                0
Manufacturer        0
Model               0
Prod. year          0
Category            0
Leather interior    0
Fuel type           0
Engine volume       0
Mileage             0
Cylinders           0
Gear box type       0
Drive wheels        0
Doors               0
Wheel               0
Color               0
Airbags             0
dtype: int64

In [55]:
# print amount of duplicate rows
df.duplicated().sum()

313

In [56]:
# detele all duplicate rows
df.drop_duplicates(inplace=True)

In [57]:
# delete ID column, there is no any information there, only identificator
df.drop("ID", inplace=True,axis=1)

<h2>Levy</h2>
Levy in auto dealeship language could mean any additional fees, such as taxes and etc.

This column has "-" value in 30% of dataset, so I assume these mean not the NaN values, but the fact that there are no fees. So, I'll change them to zeroes.

In [58]:
df["Levy"].replace("-", "0", inplace=True)
# change data type of column to numeric
df["Levy"] = pd.to_numeric(df["Levy"])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Levy"].replace("-", "0", inplace=True)


<h2>Manufacturer, model</h2>
Manufacturer column has 65 distinct values. My first thought was to leave only let's say 10 mostly popular ones and bucket up everything else in "other" category. But then I had an idea to divide them into countries of origin, let's see if it works. 

I believe that it might work because most of the car brands in average consumer mind are related to country of origin and e.g. all korean or japanese cars are consider as reliable, or german cars well engineered. However I might loose some brands from premium or "very poor" segment. On the other hand they would possibly be outliers anyways, so...

ChatGPT was very usefull in this. Also I joined minority countries such as Georgia, Spain and Czech Republic into temporary "other" category which should be deleted afterwards. Model will automatically understand that if the car is not in any category, then it is "other".

In [59]:
df["Manufacturer"].unique()

array(['LEXUS', 'CHEVROLET', 'HONDA', 'FORD', 'HYUNDAI', 'TOYOTA',
       'MERCEDES-BENZ', 'OPEL', 'PORSCHE', 'BMW', 'JEEP', 'VOLKSWAGEN',
       'AUDI', 'RENAULT', 'NISSAN', 'SUBARU', 'DAEWOO', 'KIA',
       'MITSUBISHI', 'SSANGYONG', 'MAZDA', 'GMC', 'FIAT', 'INFINITI',
       'ALFA ROMEO', 'SUZUKI', 'ACURA', 'LINCOLN', 'VAZ', 'GAZ',
       'CITROEN', 'LAND ROVER', 'MINI', 'DODGE', 'CHRYSLER', 'JAGUAR',
       'ISUZU', 'SKODA', 'DAIHATSU', 'BUICK', 'TESLA', 'CADILLAC',
       'PEUGEOT', 'BENTLEY', 'VOLVO', 'სხვა', 'HAVAL', 'HUMMER', 'SCION',
       'UAZ', 'MERCURY', 'ZAZ', 'ROVER', 'SEAT', 'LANCIA', 'MOSKVICH',
       'MASERATI', 'FERRARI', 'SAAB', 'LAMBORGHINI', 'ROLLS-ROYCE',
       'PONTIAC', 'SATURN', 'ASTON MARTIN', 'GREATWALL'], dtype=object)

In [60]:
# dictionary with manufacters and origin countries according to ChatGPT
car_manufacturers = {
    "Japan": ['LEXUS', 'HONDA', 'TOYOTA', 'NISSAN', 'SUBARU', 'MITSUBISHI', 'MAZDA', 'SUZUKI', 'ACURA', 'ISUZU', 'DAIHATSU', 'INFINITI', 'SCION'],
    "USA": ['CHEVROLET', 'FORD', 'JEEP', 'GMC', 'LINCOLN', 'DODGE', 'CHRYSLER', 'TESLA', 'BUICK', 'CADILLAC', 'HUMMER', 'MERCURY', 'PONTIAC', 'SATURN'],
    "South Korea": ['HYUNDAI', 'KIA', 'DAEWOO', 'SSANGYONG'],
    "Germany": ['MERCEDES-BENZ', 'OPEL', 'PORSCHE', 'BMW', 'VOLKSWAGEN', 'AUDI'],
    "France": ['RENAULT', 'CITROEN', 'PEUGEOT'],
    "UK": ['LAND ROVER', 'MINI', 'JAGUAR', 'BENTLEY', 'ROLLS-ROYCE', 'ROVER', 'ASTON MARTIN', 'MG'],
    "Italy": ['FIAT', 'ALFA ROMEO', 'LANCIA', 'MASERATI', 'FERRARI', 'LAMBORGHINI'],
    "Sweden": ['VOLVO', 'SAAB'],
    "Russia": ['VAZ', 'GAZ', 'UAZ', 'MOSKVICH', 'ZAZ'],
    "China": ['HAVAL', 'GREATWALL'],
    "Other": ['SKODA', 'სხვა', 'SEAT']
}

In [61]:
# Now map a country to each manufacturer

# empty dictionary to store the mappings
manufacturer_to_country = {}

# Loop over each country and its list of manufacturers
for country, manufacturers in car_manufacturers.items():
    # For each manufacturer in the list, assign the country as the value
    for manufacturer in manufacturers:
        manufacturer_to_country[manufacturer] = country

# Now, 'manufacturer_to_country' maps each manufacturer to its corresponding country
print(manufacturer_to_country)

{'LEXUS': 'Japan', 'HONDA': 'Japan', 'TOYOTA': 'Japan', 'NISSAN': 'Japan', 'SUBARU': 'Japan', 'MITSUBISHI': 'Japan', 'MAZDA': 'Japan', 'SUZUKI': 'Japan', 'ACURA': 'Japan', 'ISUZU': 'Japan', 'DAIHATSU': 'Japan', 'INFINITI': 'Japan', 'SCION': 'Japan', 'CHEVROLET': 'USA', 'FORD': 'USA', 'JEEP': 'USA', 'GMC': 'USA', 'LINCOLN': 'USA', 'DODGE': 'USA', 'CHRYSLER': 'USA', 'TESLA': 'USA', 'BUICK': 'USA', 'CADILLAC': 'USA', 'HUMMER': 'USA', 'MERCURY': 'USA', 'PONTIAC': 'USA', 'SATURN': 'USA', 'HYUNDAI': 'South Korea', 'KIA': 'South Korea', 'DAEWOO': 'South Korea', 'SSANGYONG': 'South Korea', 'MERCEDES-BENZ': 'Germany', 'OPEL': 'Germany', 'PORSCHE': 'Germany', 'BMW': 'Germany', 'VOLKSWAGEN': 'Germany', 'AUDI': 'Germany', 'RENAULT': 'France', 'CITROEN': 'France', 'PEUGEOT': 'France', 'LAND ROVER': 'UK', 'MINI': 'UK', 'JAGUAR': 'UK', 'BENTLEY': 'UK', 'ROLLS-ROYCE': 'UK', 'ROVER': 'UK', 'ASTON MARTIN': 'UK', 'MG': 'UK', 'FIAT': 'Italy', 'ALFA ROMEO': 'Italy', 'LANCIA': 'Italy', 'MASERATI': 'Italy', 

In [62]:
# Create new column for country of origin and map it based on manufacturer
df['Country'] = df['Manufacturer'].map(manufacturer_to_country)

In [63]:
# encode categorical column into seperate features
variable = 'Country'
dummies = pd.get_dummies(df[variable]).astype(int)
df = pd.concat([df, dummies], axis=1).drop(columns=variable)

In [64]:
# Finally drop the manufacturer column, model, "other".
# Model column has almost 1600 distinct values and is basically just a name for the car, pretty similar to ID column
# I think there is no way to get any information out of model

df.drop(["Manufacturer", "Model", "Other"], axis=1, inplace=True)

<h2>Category</h2>

Here I would like to join the cabriolet and coupe categories because cabriolet is basically coupe without roof.
Limousine category will be deleted before encoding for the same reason as "other" column previously.

In [65]:
df["Category"].value_counts()

Category
Sedan          8600
Jeep           5378
Hatchback      2799
Minivan         633
Coupe           528
Universal       361
Microbus        299
Goods wagon     229
Pickup           51
Cabriolet        35
Limousine        11
Name: count, dtype: int64

In [66]:
# Replace cabriolet to coupe
df["Category"].replace("Cabriolet", "Coupe", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Category"].replace("Cabriolet", "Coupe", inplace=True)


In [67]:
# encode categorical column into seperate features
variable = 'Category'
dummies = pd.get_dummies(df[variable]).astype(int)
df = pd.concat([df, dummies], axis=1).drop(columns=variable)

In [68]:
# Drop limousine to decrease multidimensionality
df.drop("Limousine", axis=1, inplace=True)

<h2>Leather interior</h2>
This one is simple, it's yes or no column so just encode it to binary. 

In [69]:
variable = 'Leather interior'

# NOTE: factorize can mix up the order of the values
values1, names1 = pd.factorize(df[variable], sort=False)
df[variable] = values1

<h2>Fuel type</h2>
I think about joining the plug in hybrid and hybrid into one category, they are slightly different but however
Also I think I'll delete row with hydrogen fuel type.

In [70]:
df["Fuel type"].value_counts()

Fuel type
Petrol            9944
Diesel            4001
Hybrid            3539
LPG                885
CNG                469
Plug-in Hybrid      85
Hydrogen             1
Name: count, dtype: int64

In [71]:
# search for the index of the row with fuel type hydrogen
i = df[df["Fuel type"] == "Hydrogen"].index
# Drop the hydrogen
df.drop(i, inplace=True)


In [72]:
# join hybrids
df.replace("Plug-in Hybrid", "Hybrid", inplace=True)

In [73]:
# Encode column and drop CNG to decrease column amount
variable = 'Fuel type'
dummies = pd.get_dummies(df[variable]).astype(int)
df = pd.concat([df, dummies], axis=1).drop(columns=variable)

In [74]:
df.drop("CNG", axis=1, inplace=True)

<h2>Engine volume</h2>

Engine volume consist of float number indicating volume and sometimes also mention if it is turbo or not. 

I can extract volume and seperate turbo feature to different column

In [75]:
# separate number and Turbo
def process_engine(engine_value):
    # Extract the numeric part using regex
    # \d means any number between 0-9
    number = re.search(r'\d+\.\d+|\d+', engine_value)
    # save our extracted float
    number = float(number.group())

    # Check if "Turbo" is in the string
    turbo = 1 if "Turbo" in engine_value else 0

    return number, turbo

# Apply the function to the DataFrame
df[['Engine volume int', 'Turbo']] = df['Engine volume'].apply(lambda x: pd.Series(process_engine(x)))

# Drop the old Engine column
df.drop('Engine volume', axis=1, inplace=True)


<h2>Mileage</h2>
Mileage values are strings and contain "km".

In [76]:
# using regex exctract only numeric values
df['Mileage'].replace(r'[^0-9]+', '', regex=True, inplace=True)
# change data type of column to numeric
df["Mileage"] = pd.to_numeric(df["Mileage"])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Mileage'].replace(r'[^0-9]+', '', regex=True, inplace=True)


<h2>Gear box type</h2>
Same as other categorical variables

In [77]:
# Encode column and drop last feature to decrease column amount
variable = 'Gear box type'
dummies = pd.get_dummies(df[variable]).astype(int)
df = pd.concat([df, dummies], axis=1).drop(columns=variable)

df.drop("Tiptronic", axis=1, inplace=True)


<h2>Drive wheels</h2>
Same as previous

In [78]:
# Encode column and drop last feature to decrease column amount
variable = 'Drive wheels'
dummies = pd.get_dummies(df[variable]).astype(int)
df = pd.concat([df, dummies], axis=1).drop(columns=variable)

df.drop("Rear", axis=1, inplace=True)

<h2>Doors</h2>
Column is in wrong format, I assume the real meaning is 4-5 (May is 5th month) for the most part of the cars, 2-3 (March is 3rd) for coupes and >5 for cars with more than 5 doors. 

However, basically amount of doors is same thing as type of car, most of the cars are sedans, universals or hatchabacks and they all have 4 to 5 doors. Cabriolets, coupes, pickups and good wagons have 2 to 3 doors. Buses have more than 5.

Another problem is that this column is very unbalanced, since owerhelming amount of cars have 4-5 doors.


So in this case I assume that is better to drop this column.

In [79]:
df["Doors"].value_counts()

Doors
04-May    18031
02-Mar      768
>5          124
Name: count, dtype: int64

In [80]:
df.drop("Doors", axis=1, inplace=True)

<h2>Wheel</h2>
Defines if the wheel is on right or left side. Binary caregory

In [81]:
variable = 'Wheel'

# NOTE: factorize can mix up the order of the values
values1, names1 = pd.factorize(df[variable], sort=False)
df[variable] = values1

<h2>Color</h2>

Color could be meaningfull for example for luxury cars, may be some kind of rare color combinations and so on. 

Also, when buying a new car in case you want any specific and dealership doesn't have it at the moment you have to wait for it.

But in general I don't think that the color has some significant influence on price, especially on used car market.

In [82]:
df.drop("Color", axis=1, inplace=True)

<h2>Optimizing dataset</h2>


In [83]:
from ydata_profiling import ProfileReport
import pandas as pd

# Generate the data profiling report, title is just 
report = ProfileReport(df, title='Car price dataset')
report.to_file("my_report.html")

  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_colum

<h2>Splitting dataset</h2>

In [84]:
# if you  have more than one independent variables, list them all here
# leave out the target variable! (dependent variable)
X = df.drop("Price", axis=1)

# have only the target variable here (dependent variable)
y = df['Price']

In [85]:
# Split 70% for training and 30% for testing
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
# Split the testing data, half for validation and half for testing
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [96]:

# pip install skfeature-chappers
from skfeature.function.similarity_based import fisher_score

# get the fisher's score rankings 
ranks = fisher_score.fisher_score(X.values, y.values)

# create a pandas DataFrame for easier interpretation
feat_importances = pd.Series(ranks, X.columns)
feat_importances.plot(kind='barh')

# how to interpret -> low score means the effect of this field is not large in the dataset
# => typically means other columns in the dataset have similar correlations, 
# therefore making this particular column not so useful since other columns 
# already fill this role for this correlation

# Fisher's score studies the variance of the data -> statistical significance

KeyboardInterrupt: 

In [87]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# convert all continuous variables to integer,
# and convert all negative numbers to 0
X_cat = X.astype(int)
X_cat = X_cat.clip(lower=0)

# initialize chi2 and SelectKBest
# Note: chi2 -test is a very common test
# in statistics and quantitative analysis
# basically it studies the data whether variables are related
# or independent of each other
chi_2_features = SelectKBest(chi2, k=len(X_cat.columns))

# fit our data to the SelectKBest
best_features = chi_2_features.fit(X_cat,y.astype(int))

# use decimal format in table print later
pd.options.display.float_format = '{:.2f}'.format

# wrap it up, and show the results
# the higher the score, the more effect that column has on price
df_features = pd.DataFrame(best_features.scores_)
df_columns = pd.DataFrame(X_cat.columns)
f_scores = pd.concat([df_columns,df_features],axis=1)
f_scores.columns = ['Features','Score']
f_scores.sort_values(by='Score',ascending=False)

Unnamed: 0,Features,Score
3,Mileage,1743236078567.59
0,Levy,2454171.21
6,Airbags,13906.85
13,South Korea,6830.25
22,Minivan,6277.54
25,Universal,6150.55
26,Diesel,5706.6
2,Leather interior,4555.59
5,Wheel,4226.19
28,LPG,4034.58


In [88]:
# pip install statsmodels
from statsmodels.stats.outliers_influence import variance_inflation_factor 

# VIF dataframe 
# VIF = Variance Inflation Factor
vif_data = pd.DataFrame() 
vif_data["feature"] = X.columns 
  
# calculating VIF for each feature 
vif_data["VIF"] = [variance_inflation_factor(X.values, i) 
                          for i in range(len(X.columns))] 
  

# variables with high VIF-value 
# can mean multlicollinearity (variables providing same linear
# relationships in the data, potentially confusing the ML algorithm
# this might be good info when deciding if some variable needs to be removed
print(vif_data)

              feature     VIF
0                Levy    3.28
1          Prod. year 2710.50
2    Leather interior    2.30
3             Mileage    1.01
4           Cylinders   45.86
5               Wheel    1.53
6             Airbags    4.39
7               China    1.08
8              France    3.58
9             Germany  183.66
10              Italy    4.72
11              Japan  297.70
12             Russia    4.19
13        South Korea  199.60
14             Sweden    1.88
15                 UK    7.09
16                USA  104.88
17              Coupe   53.43
18        Goods wagon   22.55
19          Hatchback  263.78
20               Jeep  504.57
21           Microbus   29.11
22            Minivan   60.28
23             Pickup    5.79
24              Sedan  805.01
25          Universal   34.84
26             Diesel   11.09
27             Hybrid   10.46
28                LPG    3.23
29             Petrol   24.65
30  Engine volume int   27.51
31              Turbo    1.54
32        

In [91]:
# can process the whole dataset 
# and compares variables to each other instead
# of focusing on outliers on a single variable
from sklearn.ensemble import IsolationForest

# adjust the contamination rate as you see fit
# for example, if you expect 5% of the data to be outliers
# you can use 0.05 etc. 
iso = IsolationForest(contamination=0.05) 

# fit isolation forest
y_pred = iso.fit_predict(df)

# filter outliers only
outliers = df[y_pred != 1]

In [95]:
df.duplicated().sum()

3236

In [92]:
outliers

Unnamed: 0,Price,Levy,Prod. year,Leather interior,Mileage,Cylinders,Wheel,Airbags,China,France,...,Hybrid,LPG,Petrol,Engine volume int,Turbo,Automatic,Manual,Variator,4x4,Front
11,8781,0,1999,1,0,8.00,0,0,0,0,...,0,0,0,4.00,0.00,0,1,0,0,0
23,7840,0,2001,1,230000,4.00,0,0,0,0,...,0,0,0,2.00,1.00,0,1,0,0,1
30,15681,1288,2007,1,180000,6.00,0,4,0,0,...,0,0,0,2.00,1.00,0,1,0,0,0
34,24462,0,2007,0,250000,6.00,0,10,0,0,...,0,0,0,3.00,1.00,0,0,0,1,0
49,17249,0,2008,1,147000,4.00,0,8,0,0,...,1,0,0,2.30,0.00,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19167,18817,1995,2003,1,2147483647,4.00,0,2,0,0,...,0,0,0,2.40,1.00,0,1,0,0,1
19194,500,0,1998,1,760000,4.00,0,2,0,0,...,0,0,0,2.50,0.00,0,1,0,0,0
19199,3763,0,1995,1,1111111111,5.00,1,2,0,0,...,0,0,1,1.80,0.00,0,1,0,0,0
19205,7500,0,2005,1,18500,4.00,1,2,0,0,...,0,0,1,2.00,0.00,0,0,1,1,0


<h2>Building model</h2>

Within first tens tries of building a ANN result were quite bad, so I decided to also train optuna optimized linear regression model just to see if the data is so bad. 

And it was really bad, R2 is 0.62,so I need to optimize the data before training.

<h2>Training</h2>

In [35]:
# pip install keras-tuner
import keras_tuner

def build_model(hp):
    # iniatlize sequential test neural network
    model = keras.Sequential()

    # first layer, batch normalization + input shape, same as in typical neural network
    #model.add(layers.BatchNormalization(input_shape=(len(X.columns),)),)
    model.add(
        layers.Dense(input_shape=(len(X.columns),),
            # Tune number of units separately.
            units=hp.Int(f"imput", min_value=32, max_value=128, step=4),
            activation=hp.Choice("activation", ["relu"]),
        )
    )
    # add the first actual layer including the regularizer
    model.add(
        layers.Dense(
            # Tune number of units separately.
            units=hp.Int(f"units_0", min_value=32, max_value=86, step=4),
            activation=hp.Choice("activation", ["relu"]),
            kernel_regularizer=keras.regularizers.l1(l1=0.1)
        )
    )
        
    # automate a dropout layer
    if hp.Boolean("dropout"):
        model.add(layers.Dropout(rate=0.1))

    # try additional layers, 1 or 2 extra layers
    for i in range(hp.Int("num_layers", 1, 2)):
        model.add(
            layers.Dense(
                # Tune number of units separately.
                units=hp.Int(f"units_{i + 1}", min_value=4, max_value=64, step=4),
                activation=hp.Choice("activation", ["relu"]),
            )
        )

    # output layer, only one node since this is regression
    model.add(layers.Dense(1))

    # automate learning rate tests
    learning_rate = hp.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log")

    # compile the test neural network
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss="mse"
    )

    return model


# build the model + use RandomSearch to actually search the best options for our neural network
build_model(keras_tuner.HyperParameters())

# use val_loss as the objective, because regression tasks do not have accuracy
tuner = keras_tuner.RandomSearch(
    hypermodel=build_model,
    objective="val_loss",
    max_trials=10,
    executions_per_trial=3,
    overwrite=True,
    directory="optimizations",
    project_name="regression1test",
)

# start searching
#tuner.search(X_train, y_train, epochs=250, validation_data=(X_val, y_val))

Trial 10 Complete [00h 05m 01s]
val_loss: 243634623829.33334

Best val_loss So Far: 243576968533.33334
Total elapsed time: 01h 00m 34s


In [36]:
#tuner.results_summary()

Results summary
Results in optimizations\regression1test
Showing 10 best trials
Objective(name="val_loss", direction="min")

Trial 02 summary
Hyperparameters:
imput: 108
activation: relu
units_0: 84
dropout: False
num_layers: 2
units_1: 20
lr: 0.0014601330934395126
units_2: 56
Score: 243576968533.33334

Trial 04 summary
Hyperparameters:
imput: 36
activation: relu
units_0: 60
dropout: False
num_layers: 1
units_1: 44
lr: 0.002828685755041001
units_2: 52
Score: 243584548864.0

Trial 01 summary
Hyperparameters:
imput: 84
activation: relu
units_0: 84
dropout: False
num_layers: 1
units_1: 16
lr: 0.0004591527984583266
units_2: 4
Score: 243624340138.66666

Trial 09 summary
Hyperparameters:
imput: 72
activation: relu
units_0: 56
dropout: False
num_layers: 2
units_1: 44
lr: 0.0031956353244411316
units_2: 56
Score: 243634623829.33334

Trial 06 summary
Hyperparameters:
imput: 64
activation: relu
units_0: 76
dropout: True
num_layers: 1
units_1: 48
lr: 0.0003072922963104916
units_2: 16
Score: 243676

In [None]:
#models = tuner.get_best_models(num_models=2)
#best_model = models[0]
#best_model.summary()

In [37]:
model = keras.Sequential(
    [
        
        layers.Dense(108, activation="relu", input_shape=(len(X.columns),)),
        layers.Dense(84, activation="relu", kernel_regularizer=keras.regularizers.l2(l2=0.1)),
        layers.Dense(20, activation="relu"),
        layers.Dense(56, activation="relu"),
        layers.Dense(1)
    ]
)

# optimal learning rate from keras tuner
optimal_lr = 0.0014601330934395126

# select the optimizer and loss function
# you can try rmsprop also as optimizer, or stochastic gradient descent
model.compile(optimizer=keras.optimizers.Adam(learning_rate=optimal_lr), loss='mse')
# print out the summary of your model
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [38]:
model.fit(x=X_train, y=y_train, epochs=800, validation_data=(X_val, y_val))

Epoch 1/800
[1m414/414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 12689805934592.0000 - val_loss: 1065412198400.0000
Epoch 2/800
[1m414/414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 6171310161920.0000 - val_loss: 244309557248.0000
Epoch 3/800
[1m414/414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 107244003328.0000 - val_loss: 244300382208.0000
Epoch 4/800
[1m414/414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 684407744.0000 - val_loss: 244035043328.0000
Epoch 5/800
[1m414/414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 417582496.0000 - val_loss: 243885015040.0000
Epoch 6/800
[1m414/414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 439994080.0000 - val_loss: 243804618752.0000
Epoch 7/800
[1m414/414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 398365952.0000 - val_loss: 243831373824.0000
Epoch 8

<keras.src.callbacks.history.History at 0x1885595c850>

<h2>Metrics</h2>

In [43]:
# try to look if the model is actually training 
# => the error is going downwards
# if using validation data, you get two lines
# in this case, see if the lines follow a similar trend 
# (they don't always overlap with complex data, the trend is more important)
loss_df = pd.DataFrame(model.history.history)
loss_df.plot()

# other notes:
# if your validation loss is fluctuating a lot, the test data set
# might not be a good sample / representation of the whole dataset
# try to get more data or try shuffling the dataset for a better sample
# if your validation loss FLUCTUATES EXTREMELY: remember to create 
# the neural network again completely (Sequential etc.), otherwise you might fit your 
# model to your previous model version weights

# we'll also study later methods on how to select the best epoch from 
# the training history

# a common reason why a numeric neural network for regression might overfit
# is that you might have too many variables but too little data

<Axes: >

In [44]:
# compare test error values to training error values
# the model is often good when these error values are similar
# even if you training metrics above didn't overlap
# you might still get very close values in evaluation => more important

# compare the final model loss/evaluation values
print("Test data evaluation:")
print(model.evaluate(X_test, y_test, verbose=0))
print("\nTrain data evaluation:")
print(model.evaluate(X_train, y_train, verbose=0))

Test data evaluation:
308976448.0

Train data evaluation:
276280416.0


In [45]:
test_predictions = model.predict(X_test)

# reshape the data for easier comparison table
test_predictions = pd.Series(test_predictions.reshape(len(y_test),))
pred_df = pd.DataFrame(np.asarray(y_test), columns=['Test True Y'])
pred_df = pd.concat([pred_df, test_predictions], axis=1)
pred_df.columns = ['Test True Y', 'Model Predictions']

# print the comparison table - true values vs. model predicted values
# we can nicely see here how far off our model is in some cases
pred_df

[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


Unnamed: 0,Test True Y,Model Predictions
0,30784,27997.824219
1,17876,19008.248047
2,16308,17558.759766
3,7200,16031.405273
4,11604,13250.590820
...,...,...
2834,44724,29348.628906
2835,19416,19629.886719
2836,8500,16719.722656
2837,6743,12061.385742


In [46]:
# these values follow a linear line = good predictions
# we basically compare the predicted values 
# to true test values and see the differences
sns.scatterplot(x='Test True Y', y='Model Predictions', data=pred_df)

<Axes: xlabel='Test True Y', ylabel='Model Predictions'>

In [47]:
# MAE - Mean average error
print("MAE")
print(round(metrics.mean_absolute_error(y_test, test_predictions), 2), "$")

# MSE - Mean square error
print("\nMSE")
print(round(metrics.mean_squared_error(y_test, test_predictions), 2), "$^2")

# RMSE - Root mean square error
print('\nRMSE:')
print(round(np.sqrt(metrics.mean_squared_error(y_test, test_predictions)), 2), "$")

# R-squared. 0 = the model descibes the dataset poorly
# 1 = model describes the dataset perfectly
print('\nR-squared:')
print(round(metrics.r2_score(y_test, test_predictions), 2))

# Explained Variance Score => 0 = the model descibes the dataset poorly
# 1 = model describes the dataset perfectly
# high variance score = model is a good fit for the data 
# low variance score = model is not a good fit for the data
# the higher the score, the model is more able to explain the variation in the data
# if score is low, we might need more and better data
print("\nExplained variance score:")
print(round(metrics.explained_variance_score(y_test, test_predictions), 2))

MAE
11951.75 $

MSE
308975798.58 $^2

RMSE:
17577.71 $

R-squared:
0.17

Explained variance score:
0.23


In [48]:
# if the prediction distribution are far from normal distribution
# then the model is not probably good enough
# distplot is deprecating in future pandas-version
# unfortunately, there's no exact alternative to do this plot at the moment
sns.distplot((y_test - test_predictions))
plt.show()
plt.close()


`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot((y_test - test_predictions))
  plt.show()
