## Model Training
#### (Use this notebook to train the neural network)

In [32]:
import pandas as pd 
import numpy as np
from datetime import datetime, timedelta
from glob import glob 
import tensorflow as tf
from keras import Input, Model
from keras.layers import Dense
import plotly.express as px

In [33]:
#get pressure data
df = []
for f in glob('data/tank_header_pressure_*.csv'):
    df.append(pd.read_csv(f))

df = pd.concat(df)
df.drop(columns=["TagType"], inplace=True)
df.timestamp = pd.to_datetime(df.timestamp)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2246256 entries, 0 to 246255
Data columns (total 3 columns):
 #   Column        Dtype         
---  ------        -----         
 0   timestamp     datetime64[ns]
 1   FACILITY_ID   int64         
 2   pressure_osi  float64       
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 68.6 MB


In [34]:
# known data to build training set from
knowns = [
    (10085460,"2022-07-05 15:14:00","2022-08-25 21:14:00"),
    (10085941,"2021-10-19 22:42:00","2021-10-28 14:57:00"),
    (10085510,"2022-02-14 16:43:21","2022-05-13 15:28:13"),
    (10086098,"2022-07-24 21:12:00","2022-09-21 16:57:54"),
    (10085544,"2022-10-13 20:40:25","2022-11-27 16:55:24"),
    (10111756,"2022-11-12 12:00:00","2023-02-23 12:00:00"),
    (10085464,"2022-10-13 00:00:01","2022-10-13 00:00:00"),
    (10085694,"2022-10-13 00:00:01","2022-11-27 00:00:00"),
    (10086083,"2022-11-23 18:20:01","2022-12-09 06:13:49")
]

In [35]:
#build training dataframe
COLUMNS = ["isOpen", "facilityMean", "facilitySTD", "localMean", "localSTD"]

all_data = []
for (facility_id, open, closed) in knowns:
    #filter dataframe by specific facility id
    df_facility = df[df.FACILITY_ID == facility_id].copy()
    #time the hatch was opened and closed
    timeOpen = pd.to_datetime(pd.Timestamp(open))
    timeClosed = pd.to_datetime(pd.Timestamp(closed))
    #get facility info
    facilityMean = df_facility["pressure_osi"].mean()
    facilitySTD = df_facility["pressure_osi"].std()
    #time to start getting data
    tStart = df_facility["timestamp"].min().round("4H") + timedelta(days=2)
    tEnd = df_facility["timestamp"].max().round("4H")
    #how often to add a new datapoint
    tFreq = timedelta(hours=4)

    t = tStart
    while t < tEnd:
        #get local info
        isOpen = timeOpen < t and t < timeClosed
        dfLocal = df_facility[(df_facility.timestamp > t-timedelta(days=2))&
                              (df_facility.timestamp < t)]
        localMean = dfLocal["pressure_osi"].mean()
        localSTD = dfLocal["pressure_osi"].std()

        #append datapoint
        all_data.append((isOpen, facilityMean, facilitySTD, localMean, localSTD))

        t += tFreq

#convert the list of datapoints to a dataframe
df_all_data = pd.DataFrame(all_data, columns=COLUMNS)
#drop any rows with a NaN
df_all_data.dropna(inplace=True)

df_all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15721 entries, 0 to 15803
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   isOpen        15721 non-null  bool   
 1   facilityMean  15721 non-null  float64
 2   facilitySTD   15721 non-null  float64
 3   localMean     15721 non-null  float64
 4   localSTD      15721 non-null  float64
dtypes: bool(1), float64(4)
memory usage: 629.5 KB


In [36]:
#convert the dataframe to a tensorflow dataset
text = df_all_data[COLUMNS[1:]]
targets = df_all_data[COLUMNS[0]]
dataset_all = tf.data.Dataset.from_tensor_slices((text, targets))
dataset_all

<TensorSliceDataset element_spec=(TensorSpec(shape=(4,), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.bool, name=None))>

In [37]:
VALIDATION_SET_SIZE = 0.2
BATCH_SIZE = 8
LEN = len(dataset_all)

#shuffle dataset
dataset_all = dataset_all.shuffle(LEN)

#split dataset into training and validation datasets
split = int(LEN * VALIDATION_SET_SIZE)
dataset_training = dataset_all.skip(split).take(LEN-split)
dataset_validation = dataset_all.take(split)

#batch datasets
dataset_training = dataset_training.batch(BATCH_SIZE)
dataset_validation = dataset_validation.batch(BATCH_SIZE)

print("Training set:", dataset_training)
print("Training set size:", len(dataset_training))
print("Validation set:", dataset_validation)
print("Validation set size:", len(dataset_validation))

Training set: <BatchDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name=None), TensorSpec(shape=(None,), dtype=tf.bool, name=None))>
Training set size: 1573
Validation set: <BatchDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name=None), TensorSpec(shape=(None,), dtype=tf.bool, name=None))>
Validation set size: 393


In [38]:
#function used to determine model accuracy (lower is better)
def test(true, pred):
    #diff is the exact difference between true and predicted
    true = tf.cast(true, float)
    diff = tf.subtract(true, pred)
    diff = tf.abs(diff)
    #incentive punishes the model for false negative
    incentive = tf.subtract(true, pred)
    incentive = tf.multiply(incentive, true)
    incentive = tf.multiply(incentive, 5)
    diff = tf.add(diff, incentive)
    return diff

In [39]:
tf.keras.backend.clear_session()

#set up the neural network
inputs = Input(shape=(4,))
x = Dense(8)(inputs)
outputs = Dense(1, activation="sigmoid")(x)

model = Model(inputs, outputs)

#compile the neural network
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.0001), loss=test)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 4)]               0         
                                                                 
 dense (Dense)               (None, 8)                 40        
                                                                 
 dense_1 (Dense)             (None, 1)                 9         
                                                                 
Total params: 49
Trainable params: 49
Non-trainable params: 0
_________________________________________________________________


In [40]:
#train the neural network!
history = model.fit(dataset_training, 
                    validation_data=dataset_validation, 
                    epochs=20)

#plot the model's progress
fig = px.line(
    history.history, y=['loss', 'val_loss'],
    labels={'index': 'epoch', 'value': 'test() score'}, 
    title='Training History')
fig

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [41]:
#verify model on a known test case and 4 known times
test_facility = 10085460
test_times = ['2022-08-17 15:14:00','2022-08-23 12:00:00','2022-09-05 19:45:00','2022-07-01 02:03:00']
df_460 = df[df.FACILITY_ID == test_facility].copy()
facilityMean = df_460["pressure_osi"].mean()
facilitySTD = df_460["pressure_osi"].std()

preds = []
for t in test_times:
    time = pd.to_datetime(pd.Timestamp(t))
    dfLocal = df_460[(df_460.timestamp > time-timedelta(days=1))&(df_460.timestamp < time+timedelta(days=1))]
    localMean = dfLocal["pressure_osi"].mean()
    localSTD = dfLocal["pressure_osi"].std()
    
    testInput = tf.constant([[facilityMean, facilitySTD, localMean, localSTD]])
    preds.append(model.predict(testInput)[0][0].round(20))

print("this should be close to 1 ->", preds[0])
print("this should be close to 1 ->", preds[1])
print("this should be close to 0 ->", preds[2])
print("this should be close to 0 ->", preds[3])

this should be close to 1 -> 0.9708329
this should be close to 1 -> 0.9717755
this should be close to 0 -> 0.0006714915
this should be close to 0 -> 0.0064867763


In [42]:
#save the model to 'models/model'
model.save("models/model")

INFO:tensorflow:Assets written to: models/model\assets
