In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt

from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras import layers, models

In [3]:
cars = pd.read_excel('cars.xlsx')
cars.head(5)

Unnamed: 0,num_id,brand,model,age,funding_amount,fundingPercent,month_payment,payment_duration,car_price,is_male,isNew,isWorkCar,isSUV,seatsAmount,ind_sold
0,12517404,Mercedes,Arocs 4148,53.0,405389.64,23.92,7875.07,51.48,1695080.0,0,1,1,0,2,0
1,66073366,Mercedes,Arocs 2636,36.0,411937.0,26.57,8156.85,50.5,1550335.0,1,0,1,0,2,1
2,53618572,Mercedes,S63,34.0,1200000.0,83.33,23620.15,50.8,1440000.0,1,0,0,0,5,0
3,57061044,Mercedes,S500,51.0,600000.0,43.03,20784.35,28.87,1394430.61,1,1,0,0,5,0
4,88422859,Mercedes,S500,50.0,810000.0,60.0,15848.58,51.11,1350000.0,0,0,0,0,5,0


In [4]:
# create X with dummy variables and y

X = cars.iloc[:, 1:-1]
X = pd.get_dummies(X)
y = cars.iloc[:, -1]

In [4]:
# remove brand and model
cars_no_brand = cars.drop(['brand', 'model'], axis=1)
X_no_brand = cars_no_brand.iloc[:, 1:-1]
y_no_brand = cars_no_brand.iloc[:, -1]


In [8]:
# scaling numerical data

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(X[['age', 'funding_amount', 'fundingPercent', 'month_payment', 'payment_duration', 'car_price']]),
                         columns=['age', 'funding_amount', 'fundingPercent', 'month_payment', 'payment_duration', 'car_price'])

X = pd.merge(left=scaled_df,
             right=X.drop(['age', 'funding_amount', 'fundingPercent',
                          'month_payment', 'payment_duration', 'car_price'], axis=1),
             left_index=True, right_index=True)
X.head(5)

Unnamed: 0,age,funding_amount,fundingPercent,month_payment,payment_duration,car_price,is_male,isNew,isWorkCar,isSUV,...,model_Tucson,model_V220,model_V250,model_Veloster,model_Viano,model_Vitara,model_Vito,model_X5,model_X6,model_XV
0,0.721096,2.491169,-2.894785,1.411323,0.161126,9.937905,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.728723,2.552694,-2.726033,1.499894,0.105644,8.972815,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.89929,9.958006,0.888429,6.360443,0.122628,8.237155,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.550529,4.319894,-1.677865,5.469072,-1.118928,7.933321,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.465245,6.293233,-0.59722,3.917621,0.140178,7.637079,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# scale

scaler_no_brand = StandardScaler()
scaled_no_brand_df = pd.DataFrame(scaler.fit_transform(X_no_brand[['age', 'funding_amount', 'fundingPercent', 'month_payment', 'payment_duration', 'car_price']]),
                         columns=['age', 'funding_amount', 'fundingPercent', 'month_payment', 'payment_duration', 'car_price'])

X_no_brand = pd.merge(left=scaled_df,
             right=X_no_brand.drop(['age', 'funding_amount', 'fundingPercent',
                          'month_payment', 'payment_duration', 'car_price'], axis=1),
             left_index=True, right_index=True)
X_no_brand.head(5)

Unnamed: 0,age,funding_amount,fundingPercent,month_payment,payment_duration,car_price,is_male,isNew,isWorkCar,isSUV,seatsAmount
0,0.721096,2.491169,-2.894785,1.411323,0.161126,9.937905,0,1,1,0,2
1,-0.728723,2.552694,-2.726033,1.499894,0.105644,8.972815,1,0,1,0,2
2,-0.89929,9.958006,0.888429,6.360443,0.122628,8.237155,1,0,0,0,5
3,0.550529,4.319894,-1.677865,5.469072,-1.118928,7.933321,1,1,0,0,5
4,0.465245,6.293233,-0.59722,3.917621,0.140178,7.637079,0,0,0,0,5


In [10]:
# split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)
train_df = pd.merge(left=X_train, right=y_train, left_index=True, right_index=True)
test_df = pd.merge(left=X_test, right=y_test, left_index=True, right_index=True)

In [11]:
X_no_brand_train, X_no_brand_test, y_no_brand_train, y_no_brand_test = train_test_split(X_no_brand, y_no_brand, test_size=0.2, random_state=1337)
train_no_brand_df = pd.merge(left=X_no_brand_train, right=y_no_brand_train, left_index=True, right_index=True)
test_no_brand_df = pd.merge(left=X_no_brand_test, right=y_no_brand_test, left_index=True, right_index=True)

Build the model

In [1]:
def build_model(hp):
    model = keras.Sequential()
    model.add(layers.Dense(
        hp.Choice('layer1', [128, 256, 512]),
        activation='relu',
        input_shape=(X_train.shape[1],)
    ))
    model.add(layers.Dense(
        hp.Choice('layer2', [128, 96, 64]),
        activation='relu')
    )
    model.add(layers.Dense(
        hp.Choice('layer3', [96, 64, 32, 16]),
        activation='relu')
    )
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy']
                  )

    return model


In [8]:
# use a tuner to choose the best hyperparameters

tuner = kt.RandomSearch(build_model, objective='val_accuracy', max_trials=3*3*4)
tuner.search(X_train, y_train, epochs=10, validation_split=0.2)
best_model = tuner.get_best_models()[0]

INFO:tensorflow:Reloading Oracle from existing project .\untitled_project\oracle.json
INFO:tensorflow:Reloading Tuner from .\untitled_project\tuner0.json
INFO:tensorflow:Oracle triggered exit


In [16]:
tuner_no_brand = kt.RandomSearch(build_model, objective='val_accuracy', max_trials=3*3*4, )
tuner_no_brand.search(X_no_brand_train, y_no_brand_train, epochs=10, validation_split=0.2)
best_model_no_brand = tuner_no_brand.get_best_models()[0]

Trial 26 Complete [00h 00m 13s]
val_accuracy: 0.6863169074058533

Best val_accuracy So Far: 0.6917252540588379
Total elapsed time: 00h 05m 29s
INFO:tensorflow:Oracle triggered exit




In [9]:
# predict and get accuracy of test
y_pred = best_model.predict(X_test)
y_pred = np.where(y_pred > 0.5, 1, 0)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)


0.7009952401557767

In [14]:
best_model.predict(X_train.iloc[1:4])

array([[0.0147491 ],
       [0.04588164],
       [0.49959096]], dtype=float32)

In [20]:
# predict and get accuracy of test
y_no_brand_pred = best_model_no_brand.predict(X_no_brand_test)
y_no_brand_pred = np.where(y_no_brand_pred > 0.5, 1, 0)

from sklearn.metrics import accuracy_score
accuracy_score(y_no_brand_test, y_no_brand_pred)

from sklearn.metrics import classification_report
print(classification_report(y_no_brand_test, y_no_brand_pred))

              precision    recall  f1-score   support

           0       0.75      0.62      0.68      1324
           1       0.59      0.72      0.65       987

    accuracy                           0.67      2311
   macro avg       0.67      0.67      0.66      2311
weighted avg       0.68      0.67      0.67      2311



In [11]:
# save beset_model to h5 file
best_model.save('nn.h5')

In [13]:
loaded_model = keras.models.load_model('./server/models/nn.h5')
loaded_model.evaluate(X_test,y_test)

# predict and get accuracy of test
y_pred = loaded_model.predict(X_test)
y_pred = np.where(y_pred > 0.5, 1, 0)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)




0.6901774123755949

In [16]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.74      0.71      0.72      1324
           1       0.63      0.67      0.65       987

    accuracy                           0.69      2311
   macro avg       0.69      0.69      0.69      2311
weighted avg       0.69      0.69      0.69      2311

