In [None]:
import optuna
import pickle
import numpy as np
import pandas as pd

# for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# for neural networks
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.backend import clear_session
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.layers import Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# train_test_split was moved from cross_validation to model_selection in 0.18
from sklearn.model_selection import train_test_split

# import MinMaxScaler for normalization of training data
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import confusion_matrix, classification_report

## Define parameters batchsize and epochs

In [None]:
# Mini-batch size handed to model.fit in objective()
BATCHSIZE = 1000
# Upper bound on the number of epochs per trial (objective() also uses early stopping)
EPOCHS = 300

# Best validation accuracy seen so far. objective() reads and updates this name to decide
# whether the current model should be saved. A bare `global` statement has no effect at
# module level, so the name is bound explicitly here; -inf guarantees that the first
# finished trial counts as an improvement.
best_accuracy_so_far = float("-inf")

## The objective function for Optuna to optimize the hyperparameters, as well as the preparation of the training data

In [None]:
# Fixed seed for the train/validation split. With a fixed split every trial is scored on
# the same validation rows and the scaler written to "scaler.p" is identical across trials,
# so it always matches whichever model ends up being the best one.
SPLIT_SEED = 42


def one_hot_to_index(frame, columns):
    """Collapse a group of one-hot indicator columns into 1-based position codes.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing every column listed in ``columns``.
    columns : list of str
        Indicator columns, in the order that defines the codes (must not be empty).

    Returns
    -------
    numpy.ndarray
        One entry per row: the 1-based position of the first column whose value
        equals 1, or the string ``'N/A'`` when no column of the group equals 1.
        The array is integer-typed when every row has a match and object-typed
        otherwise.
    """
    hits = frame[columns].eq(1).to_numpy()
    matched = hits.any(axis=1)
    # argmax yields the first True per row, i.e. the same result as scanning the
    # columns left to right and stopping at the first 1.
    codes = hits.argmax(axis=1) + 1
    if matched.all():
        return codes
    mixed = codes.astype(object)
    mixed[~matched] = 'N/A'
    return mixed


def objective(trial):
    """Optuna objective: prepare the data, train one network and return its validation accuracy.

    The training CSV is re-read and preprocessed on every call. The one-hot
    wilderness and soil columns are collapsed into the single columns
    ``Wilderness_Areas`` and ``Soil_Types``; the feature matrix consists of every
    remaining column except the label ``Cover_Type``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample ``units``, ``activation1``, ``activation2``,
        ``rate`` and ``learning_rate``.

    Returns
    -------
    float
        Accuracy of the trained model on the validation split.

    Side effects
    ------------
    * Writes the fitted ``MinMaxScaler`` to ``scaler.p``.
    * Saves the model to ``<trial.number>.mdl`` when its accuracy is at least as
      good as ``best_accuracy_so_far`` and then updates that global.
    """
    global best_accuracy_so_far

    # Clear clutter from previous Keras session graphs.
    clear_session()

    # Import DataFrame
    trees = pd.read_csv('../train.csv', header=0, index_col="Id")

    # Convert the one-hot Wilderness_Area* / Soil_Type* columns to a single column each
    wilderness_columns = ['Wilderness_Area{0}'.format(k) for k in range(1, 5)]
    soil_columns = ['Soil_Type{0}'.format(k) for k in range(1, 41)]
    trees['Wilderness_Areas'] = one_hot_to_index(trees, wilderness_columns)
    trees['Soil_Types'] = one_hot_to_index(trees, soil_columns)
    trees = trees.drop(columns=wilderness_columns + soil_columns)

    # Features X are all remaining columns except the label. Selecting by name is
    # required here: the two engineered columns were appended after Cover_Type, so a
    # positional "all but the last column" slice would keep the label as a feature
    # and silently discard Soil_Types.
    X = trees.drop(columns=["Cover_Type"]).values
    # Labels y are shifted to start at 0, as needed by the sparse categorical loss
    y = trees["Cover_Type"] - 1
    # split dataset into training and validation datasets (same split for every trial)
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=SPLIT_SEED)

    scaler = MinMaxScaler()
    # Fit only to the training data
    scaler.fit(X_train)
    # save fitted scaler, because it is needed later for the test dataset
    with open("scaler.p", "wb") as scaler_file:
        pickle.dump(scaler, scaler_file)

    # Now apply the transformations to the data:
    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)

    # Both tunable hidden layers share the single "units" hyperparameter.
    hidden_units = trial.suggest_int("units", 64, 128, step=16)

    # create neural network; the input width follows the data instead of a literal
    model = Sequential()

    model.add(Input(shape=(X_train.shape[1],)))
    model.add(Dense(128, kernel_initializer='random_uniform', activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(256, kernel_initializer='he_uniform', activation='selu'))
    model.add(Dense(units=hidden_units, kernel_initializer='he_uniform',
                    activation=trial.suggest_categorical("activation1", ["selu", "relu", "linear"])))
    model.add(Dropout(0.3))
    model.add(Dense(units=hidden_units, kernel_initializer='uniform',
                    activation=trial.suggest_categorical("activation2", ["relu", "linear"])))
    model.add(Dropout(0.2))
    model.add(tf.keras.layers.LayerNormalization(axis=1, center=True, scale=True))
    model.add(tf.keras.layers.Flatten())
    # Adding dropout to prevent overfitting
    model.add(Dropout(rate=trial.suggest_float("rate", 0.2, 0.5, step=0.1)))
    # The seven classes are mutually exclusive and the loss below expects one probability
    # distribution per sample, so the output layer uses softmax rather than sigmoid.
    model.add(Dense(7, kernel_initializer='uniform', activation='softmax'))

    # We compile our model with a sampled learning rate.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=Adam(learning_rate=learning_rate),
        metrics=["accuracy"],
    )

    # Stop a trial early once the monitored quantity has not improved for 2 epochs
    my_callbacks = [tf.keras.callbacks.EarlyStopping(patience=2)]

    # train neural network
    model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        shuffle=True,
        batch_size=BATCHSIZE,
        epochs=EPOCHS,
        verbose=False,
        callbacks=my_callbacks
    )

    # Evaluate the model on the validation set; score is [loss, accuracy].
    score = model.evaluate(X_val, y_val, verbose=0)

    # save best model so far to be able to use the best model later to predict with test data
    if score[1] >= best_accuracy_so_far:
        tf.keras.models.save_model(model, '{0}.mdl'.format(trial.number))
        best_accuracy_so_far = score[1]

    # return accuracy
    return score[1]

In [None]:
# Start below any reachable accuracy so that the first finished trial is saved by `objective`.
best_accuracy_so_far = -100

# The study maximizes the value returned by `objective`.
study = optuna.create_study(direction="maximize")

# Launch the hyperparameter search: `objective` is called once per trial, each time with a
# different hyperparameter combination. Be cautious with n_trials: do not go above 50.
study.optimize(
    objective,
    timeout=2000,
    n_trials=5,
)

Analyse the best model and use it to predict accuracy on test data

In [None]:
# Report the trial with the highest objective value and the hyperparameters it used.
print("Best trial:")
trial = study.best_trial
print(trial)

print(f"  Value: {trial.value}")

print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")

Load the best model. This model was saved in the function "objective".

In [None]:
# `objective` stored each best-so-far model under "<trial number>.mdl"; reload the best trial's model.
best_model_path = f'{trial.number}.mdl'
best_model = tf.keras.models.load_model(best_model_path)

### Return performance of final model on new data (test data)
TODO: only load the test data here; you will receive it a few days before the deadline.

In [None]:
trees = pd.read_csv('../test_file.csv', header=0, index_col="Id")

# Do the same as with the training data at the beginning.
# In this file the indicator columns carry a 0-based suffix (Wilderness_Area_0 ... _3,
# Soil_Type_0 ... _39); the resulting codes are still 1-based positions within each group.
wilderness_columns = ['Wilderness_Area_{0}'.format(k) for k in range(4)]
soil_columns = ['Soil_Type_{0}'.format(k) for k in range(40)]

for target, group in (('Wilderness_Areas', wilderness_columns), ('Soil_Types', soil_columns)):
    hits = trees[group].eq(1).to_numpy()
    matched = hits.any(axis=1)
    # first column equal to 1 in each row, counted from 1
    codes = hits.argmax(axis=1) + 1
    if not matched.all():
        # rows without any 1 in the group are marked 'N/A'
        codes = codes.astype(object)
        codes[~matched] = 'N/A'
    trees[target] = codes

# Drop every indicator column. Building the drop list from the same names used above
# guarantees that none is forgotten (a hand-written list can easily repeat one name and
# miss another, which leaves a stray indicator column among the features).
trees = trees.drop(columns=wilderness_columns + soil_columns)

# Features are all remaining columns except the label; the label itself must never be
# part of X_test. Labels are shifted to start at 0.
# NOTE(review): this must be the same column layout the scaler and the model were fitted
# on - confirm against the training preprocessing.
X_test = trees.drop(columns=["Cover_Type"]).values
y_test = trees["Cover_Type"] - 1

# scaler.p is produced by this notebook's own training step; do not unpickle files from
# untrusted sources.
with open("scaler.p", "rb") as scaler_file:
    scaler = pickle.load(scaler_file)

# Fail early with a readable message if the test features do not line up with the scaler.
if X_test.shape[1] != scaler.n_features_in_:
    raise ValueError(
        "Test data has {0} feature columns but the scaler was fitted on {1}".format(
            X_test.shape[1], scaler.n_features_in_))

# important: preprocessing of test dataset has to be the same as for the training dataset
X_test = scaler.transform(X_test)

Predicting the Test set results

In [None]:
# Model output for every test row: one score per class
y_pred = best_model.predict(X_test)
print(y_pred)
# The predicted label of a row is the index of its highest score
y_pred_labels = y_pred.argmax(axis=1)

Making the Confusion Matrix

In [None]:
# Rows of cm are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred_labels)

# Accuracy is the share of correctly classified samples, i.e. the sum of the whole
# diagonal divided by the number of samples. Summing only the first two diagonal
# entries would ignore the correct predictions of all other classes.
accuracy = np.trace(cm) / cm.sum()
print("Our accuracy is {}%".format(accuracy * 100))

Plot heatmap

In [None]:
# Draw the confusion matrix on an explicit Axes. fmt="d" prints the integer counts in
# full; the default annotation format rounds them to two significant digits.
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", ax=ax)
ax.set(xlabel="Predicted label", ylabel="True label", title="Confusion matrix")
fig.savefig('confmat.png')

# Per-class precision, recall and F1 on the test data
print(classification_report(y_test, y_pred_labels))