# Classic Machine learning vs. Deep Neural Network (regression)
## Open required libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.impute import SimpleImputer 
from sklearn.impute import MissingIndicator
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.metrics import MAPE
from tensorflow.keras.layers import Input, Dense, Flatten, BatchNormalization, Activation, add, Dropout
from tensorflow.keras import regularizers


## Open and read data files

In [2]:
train_df = pd.read_csv("train.csv", index_col=0)
test_df = pd.read_csv("test_no_target.csv", index_col=0)
zipcodes_df = pd.read_csv("zipcodes.csv", index_col=0)

train_df = pd.merge(train_df.reset_index(), zipcodes_df.drop_duplicates("zipcode"), on="zipcode", how="left")
test_df = pd.merge(test_df.reset_index(), zipcodes_df.drop_duplicates("zipcode"), on="zipcode", how="left")

In [3]:
train_df

Unnamed: 0,index,engine_capacity,type,registration_year,gearbox,power,model,mileage,fuel,brand,damage,zipcode,insurance_price,price,city,latitude,longitude
0,48298,2.0,bus,2006,auto,140,c4,150000,gasoline,citroen,0.0,49191,380.0,4267,Belm,52.30476,8.12846
1,81047,,,2016,,0,vito,150000,,mercedes_benz,,45896,,2457,Gelsenkirchen,51.51750,7.08575
2,92754,2.2,limousine,2010,manual,175,mondeo,125000,diesel,ford,0.0,59229,930.0,10374,"Ahlen, Westfalen",51.75972,7.89694
3,46007,,,2000,auto,265,andere,150000,gasoline,ford,0.0,39365,680.0,7098,Druxberge,52.15648,11.30968
4,76981,,convertible,3,manual,109,2_reihe,150000,gasoline,peugeot,0.0,55271,,2365,Stadecken-Elsheim,49.91220,8.12528
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,50429,1.4,limousine,2006,manual,75,golf,90000,gasoline,volkswagen,0.0,35745,500.0,4686,"Herborn, Hessen",50.68330,8.31667
49996,64425,1.3,small car,4,manual,60,fiesta,150000,gasoline,ford,0.0,60386,,864,Frankfurt am Main Fechenheim,50.11670,8.68333
49997,90761,,limousine,1996,manual,150,5er,150000,gasoline,bmw,0.0,28309,130.0,2275,Bremen,53.07516,8.80777
49998,39709,,limousine,2007,manual,122,1er,100000,diesel,bmw,0.0,83623,500.0,8144,Dietramszell,47.85000,11.60000


Features in use

In [4]:
cat_features = ["type", "gearbox", "model", "fuel", "brand", "city"]
cont_missing_features = ["engine_capacity", "damage", "insurance_price", "latitude", "longitude"]
cat_missing_features = ["type", "gearbox", "model", "fuel", "city"]

## Functions for dataframe manipulations
* mape - calculated mean absolute percentage error
* concatenate dataframes
* split dataframes
* create submit-file for test set
* common preprocessing

In [5]:
def mape(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def zip_dataframes(*dataframes):
    for idx, dataframe in enumerate(dataframes):
        dataframe["df_order"] = idx
    return pd.concat(dataframes)

def unzip_dataframes(dataframe):
    dataframes = []
    for n in dataframe["df_order"].unique().tolist():
        dataframes.append(dataframe[dataframe["df_order"] == n].drop(columns="df_order"))
    return dataframes
    

def create_submit_df(test_df, preds):
    submit_df = pd.DataFrame({
        "Id": test_df["index"],
        "Predicted": preds,
    })
    return submit_df

def preprocessing(train_df, test_df, funcs):
    train_df = train_df.copy()
    test_df = test_df.copy()
    for func in funcs:
        train_df, test_df = func(train_df, test_df)
    return train_df, test_df

## Preprocessing functions 
* fill NaNs with zeros
* fill NaNs as per strategy
* manual preprocessing
* drop useless columns
* drop outlayers
* cat. features encoding

In [6]:
def impute_nan_with_zero(train_df, test_df):
    for cat_feature in cat_features:
        train_df[cat_feature] = train_df[cat_feature].fillna("nan")
        test_df[cat_feature] = test_df[cat_feature].fillna("nan")
    train_df = train_df.fillna(0)
    test_df = test_df.fillna(0)
    return train_df, test_df

def impute_nan(train_df, test_df):
    for cont_missing_feature in cont_missing_features:
        imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
        imp.fit(pd.concat([train_df, test_df])[[cont_missing_feature]])
        train_df[cont_missing_feature] = imp.transform(train_df[[cont_missing_feature]])
        test_df[cont_missing_feature] = imp.transform(test_df[[cont_missing_feature]])

    for cat_missing_feature in cat_missing_features:
        imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value="nan")

        imp.fit(pd.concat([train_df, test_df])[[cat_missing_feature]])
        train_df[cat_missing_feature] = imp.transform(train_df[[cat_missing_feature]])
        test_df[cat_missing_feature] = imp.transform(test_df[[cat_missing_feature]])
    return train_df, test_df

def drop_columns(train_df, test_df):
    drop_columns = ["index"]
    train_df = train_df.drop(columns=drop_columns)
    test_df = test_df.drop(columns=drop_columns)
    return train_df, test_df

def drop_price_outliers(train_df, test_df):
    upper_bound = np.quantile(train_df.price, 0.95)
    train_df = train_df[train_df.price <= upper_bound]
    return train_df, test_df


def drop_insurance_price_outliers(train_df, test_df):
    upper_bound = np.quantile(train_df.insurance_price, 0.99)
    train_df = train_df[train_df.insurance_price <= upper_bound]
    return train_df, test_df

def fill_insurance_price(train_df, test_df):
    train_df.loc[train_df.insurance_price.isna(), "insurance_price"] = train_df.insurance_price.mean()
    return train_df, test_df
    
def fix_registration_year(train_df, test_df):
    train_df.loc[train_df.registration_year < 100, "is_fixed_reg_year"] = 1.0
    train_df.registration_year = train_df.registration_year.apply(lambda y : 2000 + y if y < 21 else y)
    train_df.registration_year = train_df.registration_year.apply(lambda y : 1900 + y if y < 100 else y)
    
    test_df.loc[test_df.registration_year < 100, "is_fixed_reg_year"] = 1.0
    test_df.registration_year = test_df.registration_year.apply(lambda y : 2000 + y if y < 21 else y)
    test_df.registration_year = test_df.registration_year.apply(lambda y : 1900 + y if y < 100 else y)
    return train_df, test_df

def cat_encode(train_df, test_df):
    for cat_feature in cat_features:
        le = LabelEncoder()
        le.fit(pd.concat([train_df, test_df])[cat_feature])
        train_df[cat_feature] = le.transform(train_df[cat_feature])
        test_df[cat_feature] = le.transform(test_df[cat_feature])
        
    return train_df, test_df

def indicate_missing(train_df, test_df):
    for missing_feature in cont_missing_features+cat_missing_features:
        imp = MissingIndicator(missing_values=np.nan)
        imp.fit(pd.concat([train_df, test_df])[[missing_feature]])
        train_df["is_missing_" + missing_feature] = imp.transform(train_df[[missing_feature]])
        test_df["is_missing_" + missing_feature] = imp.transform(test_df[[missing_feature]])
    return train_df, test_df

## Deep Neural Networks Functions
* Function to plot DNN logs (MAPE)
* Function to break DNN training when reach some accuracy (MAPE) value:
* Identity block for ResNet
* Dense block for ResNet


In [7]:
def hs_plot(history):
    ''' history plot '''
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    mape = history.history['mape']
    val_mape = history.history['val_mape']
    epochs = range(1, len(loss) + 1)
    plt.plot(epochs, loss, color='red', label='Training loss')
    plt.plot(epochs, val_loss, color='deeppink', label='Validation loss')
    plt.plot(epochs, mape, color='lime', label='mape')
    plt.plot(epochs, val_mape, color='green', label='Validation mape')
    plt.title('Training and validation loss & Metrics(accuracy)')
    plt.xlabel('Epochs')
    plt.ylabel('Loss / acc')
    plt.grid()
    plt.legend()
    plt.savefig('hist.png')
    plt.show()

class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        if(logs.get('loss')<1):
            print("\nMAPE reached 1.5 so cancelling training!")
            self.model.stop_training = True
    
def identity_block(input_tensor,units):
    '''he identity block is the block that has no conv layer at shortcut.
    # Arguments
    input_tensor: input tensor
    units:output shape
	# Returns
	Output tensor for the block.
	'''
    x = Dense(units, activation='relu')(input_tensor)
    x = BatchNormalization()(x)
    x = Dropout(0.75)(x)
    x = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.25))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.75)(x)
    x = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.25))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.75)(x)
    x = add([x, input_tensor])
    x = Activation('relu')(x)
    return x

def dens_block(input_tensor,units):
    ''' A block that has a dense layer at shortcut.
    # Arguments
    input_tensor: input tensor
    unit: output tensor shape
    # Returns
    Output tensor for the block.
    '''
    x = Dense(units, activation='relu')(input_tensor)
    x = BatchNormalization()(x)
    x = Dropout(0.75)(x)
    x = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.25))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.75)(x)
    x = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.25))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.75)(x)
    shortcut = Dense(units)(input_tensor)
    shortcut = BatchNormalization()(shortcut)
    x = add([x, shortcut])
    x = Activation('relu')(x)
    return x


## Cross Validation Function

In [8]:
def cross_validate(
    DNN,
    model,
    train_df,
    kfold,
    metric,
    preproc_funcs,
    target="price",
    test_df=None,
    log_target=False,
    *args,
    **kwargs
):
    val_scores = []
    test_preds = []
    train_df = train_df.drop_duplicates()
    
    if isinstance(kfold, GroupKFold):
        splits = kfold.split(train_df, groups=kwargs["groups"])
    elif isinstance(kfold, StratifiedKFold):
        target_values = train_df[[target]]
        est = KBinsDiscretizer(n_bins=50, encode='ordinal', strategy='quantile')
        stratify_on = est.fit_transform(target_values).T[0]
        splits = kfold.split(train_df, stratify_on)
    else:
        splits = kfold.split(train_df)

    for idx, (tr_idx, val_idx) in enumerate(splits):
        tr_df = train_df.iloc[tr_idx]
        val_df = train_df.iloc[val_idx]
        
        if test_df is not None:
            tr_df, zip_df = preprocessing(tr_df, zip_dataframes(val_df, test_df), preproc_funcs)
            val_df, ts_df = unzip_dataframes(zip_df)
        else:
            tr_df, val_df = preprocessing(tr_df, val_df, preproc_funcs)
        
        x_tr = tr_df.drop(columns=target).values
        y_tr = tr_df[target].values
        x_val = val_df.drop(columns=target).values
        y_val = val_df[target].values
        
        if log_target:
            y_tr = np.log(y_tr)
            y_val = np.log(y_val)
        
        x_tr = np.asarray(x_tr).astype(np.float32)
        y_tr = np.asarray(y_tr).astype(np.float32)
        x_val = np.asarray(x_val).astype(np.float32)
        
        if DNN: model.fit(x_tr, y_tr, epochs=5,
                           verbose=1,
                           batch_size=32,
                           callbacks=[callbacks],
                           validation_split=0.05)
        else:
            model.fit(x_tr, y_tr)
        preds = model.predict(x_val)
        
        preds = np.exp(preds) if log_target else preds
        y_val = np.exp(y_val) if log_target else y_val
        
        fold_score = metric(y_val, preds)
        val_scores.append(fold_score)
        
        print(f"fold {idx+1} score: {fold_score}")

        if test_df is not None:
            x_ts = ts_df.drop(columns=target).values
            x_ts = np.asarray(x_ts).astype(np.float32)
            test_fold_preds = model.predict(x_ts)
            test_fold_preds = np.exp(test_fold_preds) if log_target else test_fold_preds
            test_preds.append(test_fold_preds)
            
    print(f"mean score: {np.mean(val_scores)}")
    print(f"score variance: {np.var(val_scores)}")

    if test_df is not None:
        return val_scores, test_preds
    
    return val_scores

## Run LGBM Regression model

In [9]:
%%time
model = LGBMRegressor(
    random_state=42,
    objective='mape',
    num_leaves=100,
    max_depth=-1,
    learning_rate=0.03,
    num_iterations=100,
    subsample=0.5)

kfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
preproc_funcs = [indicate_missing, impute_nan_with_zero, drop_columns, cat_encode]

val_scores, test_preds = cross_validate(False,
    model, 
    train_df,
    kfold,
    mape,
    preproc_funcs,
    test_df=test_df,
    log_target=True)

fold 1 score: 25.342916622216453
fold 2 score: 25.96884289647295
fold 3 score: 25.082276540154382
fold 4 score: 25.092093123317742
fold 5 score: 25.46615069228981
mean score: 25.390455974890266
score variance: 0.10530321330907041
Wall time: 14.7 s


## Run XGBoost Regression model

In [10]:
%%time
model = XGBRegressor(colsample_bytree=0.6,
                         gamma=0.6,
                         learning_rate=0.1,
                         max_depth=20,
                         min_child_weight=6,
                         n_estimators=200,
                         nthread=-1,
                         reg_alpha=0.8,
                         subsample=1,
                         random_state=42,
                         objective='reg:squarederror')
kfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
preproc_funcs = [indicate_missing, impute_nan_with_zero, drop_columns, cat_encode]

val_scores, test_preds = cross_validate(False,
    model, 
    train_df,
    kfold,
    mape,
    preproc_funcs,
    test_df=test_df,
    log_target=True)

fold 1 score: 22.882582724327257
fold 2 score: 23.19899623333632
fold 3 score: 22.51334350172197
fold 4 score: 22.675473423983387
fold 5 score: 23.065459344645276
mean score: 22.86717104560284
score variance: 0.06232112670718617
Wall time: 1min 30s


## Deep Neural Network (ResNet50) assembling 

In [11]:
callbacks = myCallback()
width = 16
net_input = Input(shape=(25,))
x = dens_block(net_input, width)
x = identity_block(x, width)
x = identity_block(x, width)
x = dens_block(x, width)
x = identity_block(x, width)
x = identity_block(x, width)
x = dens_block(x, width)
x = identity_block(x, width)
x = identity_block(x, width)
x = BatchNormalization()(x)
x = Dense(1, activation='relu')(x)

## Run ResNet50 model

In [12]:
%%time
model = Model(inputs=net_input, outputs=x)
model.compile(loss='mean_absolute_percentage_error',
              optimizer='adam')
kfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
preproc_funcs = [indicate_missing, impute_nan_with_zero, drop_columns, cat_encode]


val_scores, test_preds = cross_validate(True,
    model, 
    train_df,
    kfold,
    mape,
    preproc_funcs,
    test_df=test_df,
    log_target=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fold 1 score: 97.96941124258271
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fold 2 score: 94.3550771178399
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fold 3 score: 95.79938993209376
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fold 4 score: 103.05141043999602
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fold 5 score: 101.17995847771704
mean score: 98.47104944204588
score variance: 10.529705725798296
Wall time: 4min 38s


## Conclusion
Running the classic model through 5 cross validation iteration we received more or less stable results where XGBoost showed a bit better results. Meanwhile **DNN** results are not so good. When **XGBoost** gives around 23 MAPE (Mean Absolut percentage error), **DNN** shows only around 100. 

The problem is in overfitting. Yes, taking into account a lot of problems with the date, as it usually happens in a real world, overfiting here is a real problem. Even regularization L2 with lambda up to 0.5 or dropout of layers up to 0.5 didn’t give any significant results. The best stabile results where reached with dropout 0.75 and L2 regularization lambda 0.75. 

So, the conclusion is that **DNN** is to complicate for such simple tasks like Regression and classic models like **XGBoost** or **LightGBM** demonstrate really great results.   