## Case Study 6

Identifying a new particle using a large dataset.

Client wants a high level of accuracy.

Binary - 1 for detection and 0 for nondetection

Build a neural network and a classifier.

"Build a dense neural network to accurately detect the particle. The goal is to maximize your accuracy. Include a discussion of how you know your model has finished training as well as what design decisions you made while building the network."

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras_tuner as kt


In [4]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection. No parameters necessary if TPU_NAME environment variable is set. On Kaggle this is always the case.
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy() # default distribution strategy in Tensorflow. Works on CPU and single GPU.

print("REPLICAS: ", strategy.num_replicas_in_sync)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

REPLICAS:  1
Num GPUs Available:  1


In [3]:
df = pd.read_csv('all_train.csv', dtype='float16')

In [6]:
df.describe()

Unnamed: 0,# label,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f18,f19,f20,f21,f22,f23,f24,f25,f26,mass
count,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,...,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0
mean,,,0.0,0.0,,-0.0,0.0,,0.0,0.0,...,,-0.0,0.0,0.0,,,0.0,-0.0,,
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
min,0.0,-1.960938,-2.365234,-1.732422,-9.976562,-1.732422,-1.054688,-3.035156,-2.757812,-1.732422,...,-1.728516,-2.28125,-1.731445,-0.5737305,-3.630859,-4.730469,-20.625,-3.453125,-2.632812,500.0
25%,0.0,-0.7290039,-0.7333984,-0.8657227,-0.609375,-0.8657227,-1.054688,-0.7568359,-0.7011719,-0.8657227,...,-0.7421875,-0.7207031,-0.8657227,-0.5737305,-0.5419922,-0.5117188,-0.3544922,-0.6923828,-0.7944336,750.0
50%,1.0,-0.03930664,0.0008523464,0.0003198385,0.01963806,-0.0005071163,-0.00598526,-0.1499023,-0.0001067519,0.001384735,...,-0.08990479,-6.735325e-05,-0.0004425049,-0.5737305,-0.1602783,-0.3144531,-0.326416,-0.3569336,-0.08828735,1000.0
75%,1.0,0.6899414,0.7348633,0.8657227,0.6796875,0.8657227,0.8505859,0.7685547,0.7011719,0.8666992,...,0.6420898,0.7207031,0.8657227,-0.5737305,0.4812012,0.1634521,-0.2337646,0.4753418,0.7612305,1250.0
max,1.0,4.378906,2.365234,1.732422,4.148438,1.732422,4.484375,3.720703,2.757812,1.731445,...,5.867188,2.28125,1.732422,1.743164,7.292969,9.335938,14.99219,5.277344,4.445312,1500.0


In [7]:
count_na = df.isna().sum()
print(count_na)
class_counts = df['# label'].value_counts()

# label    0
f0         0
f1         0
f2         0
f3         0
f4         0
f5         0
f6         0
f7         0
f8         0
f9         0
f10        0
f11        0
f12        0
f13        0
f14        0
f15        0
f16        0
f17        0
f18        0
f19        0
f20        0
f21        0
f22        0
f23        0
f24        0
f25        0
f26        0
mass       0
dtype: int64


In [8]:
color_list = ['#92B2F7', '#FA7D7A']
fig = go.Figure()
#create a horizontal bar for each class in the particle data.
for idx, (label, count) in enumerate(class_counts.items()):
    fig.add_trace(
        go.Bar(
            y=[str(label)],
            x=[count],
            name=str(label),
            orientation='h',
            text=[count],
            textposition='auto',
            marker_color=color_list[idx % len(color_list)]
        )
    )

fig.update_layout(
    title_text='Frequency of Particle Existing or Not Existing',
    title_x=0.5, 
    xaxis_title='Frequency',
    yaxis_title='Label',
    showlegend = False,
    autosize=False,
    width=500,
    height=500,
    margin=dict(l=50, r=50, b=100, t=100, pad=4),
    plot_bgcolor='white', 
    xaxis=dict(gridcolor='lightgrey')
)

fig.show()

##### Standardization of Data

In [9]:
#split the label values into y and the features into X
y = df['# label'].values
X = df.drop('# label', axis=1).values

scaler = StandardScaler()
X = scaler.fit_transform(X)

#split into train and test set
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=1234)

#split test set into validation and test set
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1234)

##### Dense Neural Network

In [35]:
def build_model(hp):
    model = keras.Sequential()

    #Add hidden layers
    for i in range(hp.Int('num_layers', 2, 6)):
        model.add(layers.Dense(units=hp.Int('units_' + str(i), min_value=16, max_value=512, step=8), activation='relu'))

    #Add output layer
    model.add(layers.Dense(1, activation='sigmoid'))

    #Compile the model
    model.compile(
        optimizer=keras.optimizers.Adam(hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [38]:
#build a keras_tuner RandomSearch to tune the training process and optimize the Neural Network
# https://www.tensorflow.org/tutorials/keras/keras_tuner
tuner = kt.Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=15,
    factor=3,
    directory='hp_tuning',
    project_name='Particle_CS6',
    overwrite=True
)
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [39]:
#train the DNN with a hyper parameter search
tuner.search(X_train, y_train,
             epochs=50,
             validation_data=(X_val, y_val),
             callbacks=[es_callback])

#get the best hyperparameters and store them in a var
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]




Trial 30 Complete [00h 58m 13s]
val_accuracy: 0.8819666504859924

Best val_accuracy So Far: 0.8858504891395569
Total elapsed time: 12h 44m 33s
INFO:tensorflow:Oracle triggered exit


KeyError: 'units does not exist.'

Error occured not in the code to tune the hyperparameters but in a print statement that was referencing a bad variable name.  Moved the code after to a new cell and ran from there given that the extensive training time had completed successfully.

##### Train using the best hyperparameters

After using the Hyperband to tune the hyperparameters, the identified 'best' hyperparameters will be used to train a final model.

In [40]:
denseNN_model = tuner.hypermodel.build(best_hps)
model = denseNN_model.fit(X_train, y_train, epochs=500, validation_data=(X_val, y_val), callbacks=[es_callback])


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500


In [46]:
best_hps.values

{'num_layers': 4,
 'units_0': 384,
 'units_1': 312,
 'learning_rate': 0.0001,
 'units_2': 448,
 'units_3': 472,
 'units_4': 296,
 'units_5': 360,
 'tuner/epochs': 15,
 'tuner/initial_epoch': 5,
 'tuner/bracket': 2,
 'tuner/round': 2,
 'tuner/trial_id': '0014'}

In [57]:
tuner.results_summary()

Results summary
Results in hp_tuning\Particle_CS6
Showing 10 best trials
Objective(name="val_accuracy", direction="max")

Trial 0016 summary
Hyperparameters:
num_layers: 4
units_0: 384
units_1: 312
learning_rate: 0.0001
units_2: 448
units_3: 472
units_4: 296
units_5: 360
tuner/epochs: 15
tuner/initial_epoch: 5
tuner/bracket: 2
tuner/round: 2
tuner/trial_id: 0014
Score: 0.8858504891395569

Trial 0017 summary
Hyperparameters:
num_layers: 4
units_0: 432
units_1: 232
learning_rate: 0.0001
units_2: 184
units_3: 48
units_4: 488
units_5: 216
tuner/epochs: 15
tuner/initial_epoch: 5
tuner/bracket: 2
tuner/round: 2
tuner/trial_id: 0012
Score: 0.8855599761009216

Trial 0024 summary
Hyperparameters:
num_layers: 6
units_0: 128
units_1: 440
learning_rate: 0.0001
units_2: 488
units_3: 344
units_4: 320
units_5: 472
tuner/epochs: 15
tuner/initial_epoch: 5
tuner/bracket: 1
tuner/round: 1
tuner/trial_id: 0022
Score: 0.8847533464431763

Trial 0027 summary
Hyperparameters:
num_layers: 5
units_0: 88
units_1

In [48]:
#Model performance
test_loss, test_acc = denseNN_model.evaluate(X_test, y_test, verbose=2)

32813/32813 - 36s - loss: 0.2572 - accuracy: 0.8854 - 36s/epoch - 1ms/step


Plotting the model convergence.

In [66]:
import plotly.graph_objects as go
import plotly.subplots as sp

#bring in the final_model.csv training and validation scores
df = pd.read_csv('final_model.csv')
df.columns = df.columns.str.strip()


In [63]:
df

Unnamed: 0,Epoch,Training Loss,Training Accuracy,Validation Loss,Validation Accuracy
0,1,0.2754,0.8748,0.2664,0.8801
1,2,0.2638,0.8813,0.2616,0.8825
2,3,0.2609,0.8828,0.2604,0.8835
3,4,0.2592,0.8839,0.2591,0.8841
4,5,0.258,0.8846,0.2593,0.884
5,6,0.2571,0.8851,0.2588,0.8843
6,7,0.2563,0.8855,0.2581,0.8847
7,8,0.2557,0.8858,0.2587,0.8846
8,9,0.255,0.8862,0.2566,0.8852
9,10,0.2543,0.8866,0.2563,0.8853


In [77]:

fig = sp.make_subplots(rows=2, cols=1, subplot_titles=('Loss over Epochs', 'Accuracy over Epochs'))
#add in each trace
fig.add_trace(go.Scatter(x=df['Epoch'], y=df['Training Loss'], mode='lines', name='Training Loss', line=dict(color='blue')), row=1, col=1)
fig.add_trace(go.Scatter(x=df['Epoch'], y=df['Validation Loss'], mode='lines', name='Validation Loss', line=dict(color='red')), row=1, col=1)
fig.add_trace(go.Scatter(x=df['Epoch'], y=df['Training Accuracy'], mode='lines', name='Training Accuracy', line=dict(color='green')), row=2, col=1)
fig.add_trace(go.Scatter(x=df['Epoch'], y=df['Validation Accuracy'], mode='lines', name='Validation Accuracy', line=dict(color='orange')), row=2, col=1)

#get location of min, max, and values for those plus final values
min_loss_epoch = df['Validation Loss'].idxmin() + 1
min_loss = df.loc[min_loss_epoch-1, 'Validation Loss']
max_acc_epoch = df['Validation Accuracy'].idxmax() + 1
max_acc = df.loc[max_acc_epoch-1, 'Validation Accuracy']
final_loss = df.loc[df['Epoch'].idxmax(), 'Validation Loss']
final_acc = df.loc[df['Epoch'].idxmax(), 'Validation Accuracy']

#add in the annotaions of the above metrics
fig.add_annotation(dict(x=min_loss_epoch, y=min_loss, text=f"Lowest Validation Loss: {min_loss:.4f}", showarrow=False, arrowhead=1, yshift=15), row=1, col=1)
fig.add_annotation(dict(x=max_acc_epoch, y=max_acc, text=f"Highest Validation Accuracy: {max_acc:.4f}", showarrow=False, arrowhead=1, yshift=-30), row=2, col=1)
fig.add_annotation(dict(x=df['Epoch'].max(), y=final_loss, text=f"Final Validation Loss: {final_loss:.4f}", showarrow=False, arrowhead=1, yshift=-10), row=1, col=1)
fig.add_annotation(dict(x=df['Epoch'].max(), y=final_acc, text=f"Final Validation Accuracy: {final_acc:.4f}", showarrow=False, arrowhead=1, yshift=-10), row=2, col=1)

#update layout
fig.update_layout(height=700, width=900, title_text="Training and Validation Metrics over Epochs", title_x=0.5)
fig.update_xaxes(title_text='Epoch', row=1, col=1)
fig.update_xaxes(title_text='Epoch', row=2, col=1)
fig.update_yaxes(title_text='Loss', row=1, col=1, range=[0.24, 0.3]) # you can adjust the range as per your requirements
fig.update_yaxes(title_text='Accuracy', row=2, col=1, range=[0.87, 0.9]) # you can adjust the range as per your requirements

fig.show()