In [1]:
#Import library
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import feature_column
import pandas as pd
import joblib
from tensorflow.keras.models import save_model
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder





In [2]:
# Read the cleaned car dataset from disk into a DataFrame.
dataset_path = 'Dataset/dataset_cleaned1.csv'
df = pd.read_csv(dataset_path)

In [3]:
# Print a summary of the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18595 entries, 0 to 18594
Data columns (total 14 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Maker                                18595 non-null  object 
 1   Model                                18595 non-null  object 
 2   Number_of_Cylinders                  18595 non-null  int64  
 3   Engine_Type                          18595 non-null  int64  
 4   Engine_Horse_Power                   18595 non-null  float64
 5   Engine_Horse_Power_RPM               18595 non-null  int64  
 6   Transmission                         18595 non-null  int64  
 7   Mixed_Fuel_Consumption_per_100_km_l  18595 non-null  float64
 8   Fuel_Tank_Capacity                   18595 non-null  int64  
 9   Acceleration_0_to_100_Km             18595 non-null  float64
 10  Max_Speed_Km_per_Hour                18595 non-null  int64  
 11  Fuel_Grade                  

In [4]:
# Name of the column the model is trained to predict.
target = 'Mixed_Fuel_Consumption_per_100_km_l'

# Columns kept for training (tentative selection). The target column is
# included here on purpose; it is split off from the features later.
req_feat = [
    'Number_of_Cylinders',
    'Engine_Type',
    'Engine_Horse_Power',
    'Engine_Horse_Power_RPM',
    'Transmission',
    'Acceleration_0_to_100_Km',
    'Fuel_Grade',
    target,
]

# Training data: only the selected columns, every row.
data = df.loc[:, req_feat]

In [5]:
# Confirm the selected columns and their dtypes before splitting.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18595 entries, 0 to 18594
Data columns (total 8 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Number_of_Cylinders                  18595 non-null  int64  
 1   Engine_Type                          18595 non-null  int64  
 2   Engine_Horse_Power                   18595 non-null  float64
 3   Engine_Horse_Power_RPM               18595 non-null  int64  
 4   Transmission                         18595 non-null  int64  
 5   Acceleration_0_to_100_Km             18595 non-null  float64
 6   Fuel_Grade                           18595 non-null  int64  
 7   Mixed_Fuel_Consumption_per_100_km_l  18595 non-null  float64
dtypes: float64(3), int64(5)
memory usage: 1.1 MB


In [6]:
# Split the data into a training set (90%) and a validation set (10%).
# A fixed random_state makes the split -- and therefore the scaler/encoder
# fitted on it and the reported metrics -- repeatable across
# "Restart Kernel -> Run All".
train_df, val_df = train_test_split(data, test_size=0.1, random_state=42)

# Separate the regression label from the features. pop() removes the column
# from the frame in place and returns it as a Series. `target` is the label
# column name defined together with the feature list.
train_label = train_df.pop(target)
val_label = val_df.pop(target)

print(f'Number of training datasets: {len(train_df)}')
print(f'Number of validation datasets: {len(val_df)}')

Number of training datasets: 16735
Number of validation datasets: 1860


In [7]:
# Check the training features after the label column has been popped off.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16735 entries, 12602 to 15564
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Number_of_Cylinders       16735 non-null  int64  
 1   Engine_Type               16735 non-null  int64  
 2   Engine_Horse_Power        16735 non-null  float64
 3   Engine_Horse_Power_RPM    16735 non-null  int64  
 4   Transmission              16735 non-null  int64  
 5   Acceleration_0_to_100_Km  16735 non-null  float64
 6   Fuel_Grade                16735 non-null  int64  
dtypes: float64(2), int64(5)
memory usage: 1.0 MB


In [8]:
# Standardize the continuous columns.
# The scaler is fitted on the training split only and then applied to both
# splits, so no validation statistics are used when fitting it.
columns_to_standardize = [
    'Engine_Horse_Power',
    'Engine_Horse_Power_RPM',
    'Acceleration_0_to_100_Km',
]
scaler = StandardScaler().fit(train_df[columns_to_standardize])
train_df[columns_to_standardize] = scaler.transform(train_df[columns_to_standardize])
val_df[columns_to_standardize] = scaler.transform(val_df[columns_to_standardize])

In [9]:
# One-hot encode the integer-coded categorical columns; every other column
# is passed through unchanged (remainder='passthrough').
categorical_columns = ['Number_of_Cylinders',
                       'Engine_Type',
                       'Transmission',
                       'Fuel_Grade']
transformer = make_column_transformer(
    # handle_unknown='ignore': a category value that was not present in the
    # training split is encoded as all zeros instead of raising an error when
    # the validation split or later inference input is transformed.
    (OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    remainder='passthrough',
    # sparse_threshold=0 forces a dense array, so building a DataFrame from
    # the result does not depend on the density heuristic.
    sparse_threshold=0)

# Fit on the training split only, then reuse the fitted encoder for validation.
train_data = transformer.fit_transform(train_df)
encoded_columns = transformer.get_feature_names_out()
# Keep the original row index so the encoded frames stay aligned with
# train_label / val_label.
train_df = pd.DataFrame(
    train_data,
    columns=encoded_columns,
    index=train_df.index)
val_data = transformer.transform(val_df)
val_df = pd.DataFrame(
    val_data,
    columns=encoded_columns,
    index=val_df.index)


In [12]:
# Wrap the (features, label) pairs of each split in a tf.data.Dataset.
# Rows are paired by position, so features and labels must be in the same order.
train_features, train_targets = train_df.to_numpy(), train_label.to_numpy()
val_features, val_targets = val_df.to_numpy(), val_label.to_numpy()

train_data = tf.data.Dataset.from_tensor_slices((train_features, train_targets))
val_data = tf.data.Dataset.from_tensor_slices((val_features, val_targets))

In [13]:
# Shuffle (training set only) and batch the data.
# The shuffle buffer spans the whole training set: a buffer smaller than the
# dataset only mixes neighbouring elements, so the previous size of 10 left
# the sample order almost unchanged from one epoch to the next.
train_data = train_data.shuffle(buffer_size=len(train_label)).batch(32)
val_data = val_data.batch(32)

In [15]:
# Create the model and train it.

# Stop once val_loss has not improved for 5 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss instead of keeping the weights from up to 5 epochs after the best.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                              patience=5,
                                              restore_best_weights=True)

# Input width follows the encoded feature frame rather than a hard-coded
# number, so it stays correct if the set of one-hot categories changes.
n_features = train_df.shape[1]

# One hidden layer followed by a single linear unit for the regression output.
model = tf.keras.Sequential([
  tf.keras.layers.Dense(160, activation='relu', input_shape=(n_features,)),
  tf.keras.layers.Dense(1)
])

# Optimise mean squared error; mean absolute error is reported as a metric.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='mse',
              metrics=['mae'])

model.fit(train_data,
          validation_data=val_data,
          epochs=50,
          callbacks=[stop_early])


Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x1f427f7c8e0>

In [17]:
# Persist everything needed to reproduce predictions outside this notebook.

# Trained network, written as an HDF5 (.h5) file.
save_model(model, filepath="car_regress.h5")

# Fitted StandardScaler for the continuous columns.
joblib.dump(value=scaler, filename='scaler.joblib')

# Fitted one-hot column transformer.
joblib.dump(value=transformer, filename='transformer.joblib')

  save_model(model, "car_regress.h5")


['transformer.joblib']

In [21]:
# Sanity-check the trained pipeline on one hand-written sample (Toyota Sequoia).
# The values are listed in the same order as the column names below.
sample_columns = [
    'Number_of_Cylinders',
    'Engine_Type',
    'Transmission',
    'Fuel_Grade',
    'Engine_Horse_Power',
    'Engine_Horse_Power_RPM',
    'Acceleration_0_to_100_Km',
]
data_test = pd.DataFrame([[8, 0, 0, 2, 500, 5600, 6.7]], columns=sample_columns)

# Apply the same preprocessing as for training: scale the continuous columns,
# then one-hot encode with the fitted transformer.
data_test[columns_to_standardize] = scaler.transform(data_test[columns_to_standardize])
data_test.info()
data_test = tf.data.Dataset.from_tensor_slices(transformer.transform(data_test)).batch(1)

# "hasil" = "result": the model's prediction for the sample.
hasil = model.predict(data_test)
print(hasil)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Number_of_Cylinders       1 non-null      int64  
 1   Engine_Type               1 non-null      int64  
 2   Transmission              1 non-null      int64  
 3   Fuel_Grade                1 non-null      int64  
 4   Engine_Horse_Power        1 non-null      float64
 5   Engine_Horse_Power_RPM    1 non-null      float64
 6   Acceleration_0_to_100_Km  1 non-null      float64
dtypes: float64(3), int64(4)
memory usage: 184.0 bytes


[[15.88987]]
