In [None]:
#Updated 12/21/2023

#---Jimmy's test AI WL QC model that currently does the following:---  
#-User defines station IDs, file names, binarization threshold, and hyperparameters for the neural net
#-Loads the train/validation/test data for multiple stations
#-Separates the 8 training features (X) from the target values (y) for all datasets
#-Can apply random undersampling to the training data (uncomment and select a training option)
#-Can apply class weighting to the training data (uncomment and select a training option)
#-Builds the neural network model
#-Trains the model using the training dataset then performs a prediction on the validation data each epoch
#-Outputs the training/validation metrics per epoch in a csv
#-Performs predictions with the fully trained model on the validation data and outputs predictions in a csv 

import os
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from imblearn.under_sampling import RandomUnderSampler
from tensorflow.keras.callbacks import Callback

# Station IDs whose train/test/validation files will be loaded (user defined)
station_ids = [
    '1612340', '1617433', '8418150', '8443970', '8447386', '8449130', '8452660', '8452944',
    '8454049', '8461490', '8510560', '8534720', '8536110', '8557380', '8573364', '8574680',
    '8651370', '8658120', '8658163', '8665530', '8670870', '8720030', '8721604', '8723214',
    '8726520', '8726607', '8729108', '8729840', '8735180', '8736897', '8737048', '8741533',
    '8767816', '8767961', '8771341', '8771450', '8775870', '8779770', '9410840', '9411340',
    '9414290', '9414750', '9418767', '9419750', '9432780', '9435380', '9446484', '9447130',
    '9451054', '9451600', '9459450', '9459881', '9462450', '9462620', '9751381', '9751639',
]

# Output file names (user defined).
# Update test_number on every run: it is appended to each output file name.
test_number = '3'
output_metrics_file = f'out/metrics/metrics_{test_number}.csv'
output_model_file = f'out/models/model_{test_number}.h5'
output_val_predictions_file = f'out/predictions/val_pred_{test_number}.csv'
output_test_predictions_file = f'out/predictions/test_pred_{test_number}.csv'

# Binarization threshold applied to prediction scores for both validation
# and testing (user defined; 0.5 was the default value)
threshold = 0.2

# Undersampling ratio for the optional random undersampler (user defined, adjust as needed)
undersampling_ratio = 0.1

# Hyperparameters (user defined)
input_neurons = 8           # one input per training feature
hidden_layer1_neurons = 64
hidden_layer2_neurons = 32
output_neurons = 1
learning_rate = 0.001
epochs = 1
batch_size = 32

# Creation of a Validation callback to provide useful metrics per epoch during training
class ValidationMetricsCallback(Callback):
    """Keras callback that scores the validation set at the end of every epoch.

    After each epoch the current model predicts on ``X_val``; the scores are
    binarized with ``threshold`` and two metrics are printed and recorded:

    * the accuracy restricted to the "bad" points (rows where ``y_val == 0``)
    * the number of false negatives (predicted 0 while the true label is 1)

    One dict per epoch is appended to ``validation_metrics``.
    """

    def __init__(self, X_val, y_val, threshold):
        super().__init__()
        self.validation_metrics = []  # one entry per finished epoch
        self.threshold = threshold
        self.X_val = X_val
        self.y_val = y_val

    def on_epoch_end(self, epoch, logs=None):
        # Binarize the model's scores: >= threshold becomes 1, anything else 0
        scores = self.model.predict(self.X_val)
        predicted_labels = (scores >= self.threshold).astype(int).flatten()

        # Accuracy measured only on the rows whose true label is "bad" (0)
        bad_mask = self.y_val == 0
        bad_accuracy = accuracy_score(self.y_val[bad_mask], predicted_labels[bad_mask])

        # Rows predicted as 0 although their true label is 1
        false_negative_count = np.sum((predicted_labels == 0) & (self.y_val == 1))

        # Report the metrics for this epoch (epochs are shown 1-based) ...
        print(f'Epoch {epoch + 1} - Validation Accuracy for "Bad" Points: {bad_accuracy * 100:.2f}%')
        print(f'Epoch {epoch + 1} - Validation False Negatives: {false_negative_count}')

        # ... and keep them so they can be exported after training
        epoch_metrics = {
            'Validation_Accuracy_Bad_Points': bad_accuracy,
            'False_Negatives_Val': false_negative_count
        }
        self.validation_metrics.append(epoch_metrics)

# Feature columns fed to the network, defined once so that the train, test and
# validation splits are guaranteed to use the same columns in the same order
feature_columns = ['PRIMARY', 'PRIMARY_TRUE', 'PRIMARY_SIGMA', 'PRIMARY_SIGMA_TRUE', 'PRIMARY_RESIDUAL', 'BACKUP', 'BACKUP_TRUE', 'PREDICTION']

# Load the data for each station and concatenate them
all_train_data = []
all_test_data = []
all_val_data = []
skipped_station_ids = []

for station_id in station_ids:
    train_file = f'data/{station_id}_processed_ver_merged_wl_train.csv'
    test_file = f'data/{station_id}_processed_ver_merged_wl_test.csv'
    val_file = f'data/{station_id}_processed_ver_merged_wl_validation.csv'

    # A station is only used when all three of its split files are present
    if not (os.path.isfile(train_file) and os.path.isfile(test_file) and os.path.isfile(val_file)):
        skipped_station_ids.append(station_id)
        continue

    all_train_data.append(pd.read_csv(train_file))
    all_test_data.append(pd.read_csv(test_file))
    all_val_data.append(pd.read_csv(val_file))

# Make skipped stations visible instead of silently training on fewer stations
if skipped_station_ids:
    print(f'Skipped {len(skipped_station_ids)} station(s) with missing data files: {skipped_station_ids}')

# Fail with a clear message rather than an opaque pd.concat error on empty lists
if not all_train_data:
    raise FileNotFoundError('No station has train, test and validation files under data/ - nothing to train on.')

# Concatenate data for all stations
train_data = pd.concat(all_train_data, ignore_index=True)
test_data = pd.concat(all_test_data, ignore_index=True)
val_data = pd.concat(all_val_data, ignore_index=True)

# Separate features and labels
X_train = train_data[feature_columns].values
y_train = train_data['TARGET'].values

X_test = test_data[feature_columns].values
y_test = test_data['TARGET'].values

X_val = val_data[feature_columns].values
y_val = val_data['TARGET'].values


#Options to add undersampling/class weights# ---------------------------------------------------------------------------------------------------------------------------------------

# Apply random undersampling 
#rus = RandomUnderSampler(sampling_strategy=undersampling_ratio, random_state=42)
#X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Print the number of each class after resampling
#class_counts_resampled = pd.Series(y_train_resampled).value_counts()
#print("Class Counts After Resampling:")
#print(class_counts_resampled)


# Calculate class weights to be used in training
#class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
#class_weight = dict(zip(np.unique(y_train), class_weights))

# Calculate class weights to be used in training along with undersampling
#class_weights = compute_class_weight('balanced', classes=np.unique(y_train_resampled), y=y_train_resampled)
#class_weight = dict(zip(np.unique(y_train_resampled), class_weights))
#-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Define the neural network model
model = Sequential([
    Dense(hidden_layer1_neurons, activation='relu', input_shape=(input_neurons,)),
    Dense(hidden_layer2_neurons, activation='relu'),
    Dense(output_neurons, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])

# Create the output directories up front so that a missing folder is caught
# before training starts instead of when the results are written
for output_file in (output_metrics_file, output_model_file):
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

# Create the callback that records the extra validation metrics after every epoch
validation_metrics_callback = ValidationMetricsCallback(X_val, y_val, threshold)

#-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#TRAINING OPTIONS- Choose One

#Train the model 
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size, callbacks=[validation_metrics_callback])

# Train the model with class weights
#history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size, callbacks=[validation_metrics_callback], class_weight=class_weight )

# Train the model with the resampled data
#history = model.fit(X_train_resampled, y_train_resampled, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size, callbacks=[validation_metrics_callback])

# Train the model with the resampled data and class weights
#history = model.fit(X_train_resampled, y_train_resampled, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size, callbacks=[validation_metrics_callback], class_weight=class_weight)

#-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Save the trained model
model.save(output_model_file)

# Extract training metrics (one row per epoch)
training_metrics = pd.DataFrame({
    'Epoch': range(1, len(history.history['accuracy']) + 1),
    'Training_Loss': history.history['loss'],
    'Training_Accuracy': history.history['accuracy'],
    'Validation_Loss': history.history['val_loss'],
    'Validation_Accuracy': history.history['val_accuracy']
})

# Include the validation metrics for "bad" points in the training metrics DataFrame
validation_metrics_df = pd.DataFrame(validation_metrics_callback.validation_metrics)
training_metrics = pd.concat([training_metrics, validation_metrics_df], axis=1)

# Save training metrics to CSV
training_metrics.to_csv(output_metrics_file, index=False)

#-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#Code to predict and output predictions csv for validation data below:

# Create Function to output validation predictions in a csv
def output_predictions_csv_validation(X_val, y_val, primary_values, station_ids, file_path, accuracy_val_set, accuracy_bad_points_val_set, false_negatives_val_set, threshold):
    """Predict on the validation data and write the row-level results to a CSV.

    Relies on two module-level names that must exist before the call:
    ``model`` (used for the prediction) and ``val_data`` (source of the
    DATE_TIME column, which must have one entry per row of ``X_val``).

    Args:
        X_val: feature matrix passed to ``model.predict``.
        y_val: true target per row; 0 is treated as a "bad" point.
        primary_values: PRIMARY value per row, written unchanged.
        station_ids: station ID per row, used to group the per-station metrics.
        file_path: destination CSV; its directory is created when missing.
        accuracy_val_set: precomputed accuracy on the whole validation set.
        accuracy_bad_points_val_set: precomputed accuracy on the bad points.
        false_negatives_val_set: precomputed number of false negatives.
        threshold: scores >= threshold are binarized to 1, all others to 0.

    The CSV has one row per validation sample. The three whole-set metrics
    fill only the first row of their columns, and the per-station metrics fill
    one row per unique station starting at the top; the rest of those columns
    is left blank.
    """
    y_pred_probs_val = model.predict(X_val)
    y_pred_binary_val = (y_pred_probs_val >= threshold).astype(int).flatten()  # Apply the threshold

    df_val = pd.DataFrame({
        # Taken positionally so the rows line up with the arrays below even
        # if val_data does not carry a default 0..n-1 index
        'DATE_TIME': np.asarray(val_data['DATE_TIME']),
        'PRIMARY': primary_values,
        'STATION_ID': station_ids,
        'True_Target': y_val,
        'Predicted_Target': y_pred_binary_val,
        'Prediction_Score': y_pred_probs_val.flatten(),
    })

    # Add accuracy-related columns with a header row and one value row
    df_val = pd.concat([
        df_val,
        pd.DataFrame({
            'Accuracy_Val_Set': [accuracy_val_set],
            'Accuracy_Bad_Points_Val_Set': [accuracy_bad_points_val_set],
            'False_Negatives_Val_Set': [false_negatives_val_set]
        })
    ], axis=1)

    # Calculate accuracy on bad points and false negatives for each unique STATION_ID
    station_accuracy_bad_points_val = []
    station_false_negatives_val = []

    unique_stations_val = df_val['STATION_ID'].unique()
    for station_id_val in unique_stations_val:
        station_df_val = df_val[df_val['STATION_ID'] == station_id_val]
        is_bad_point = station_df_val['True_Target'] == 0

        if is_bad_point.any():
            accuracy_bad_points_station_val = accuracy_score(
                station_df_val.loc[is_bad_point, 'True_Target'],
                station_df_val.loc[is_bad_point, 'Predicted_Target']
            ) * 100
        else:
            # No bad points at this station: the accuracy is undefined, so
            # record NaN explicitly instead of scoring two empty arrays
            accuracy_bad_points_station_val = np.nan

        # Rows predicted as 0 although their true label is 1
        false_negatives_station_val = np.sum((station_df_val['Predicted_Target'] == 0) & (station_df_val['True_Target'] == 1))

        station_accuracy_bad_points_val.append(accuracy_bad_points_station_val)
        station_false_negatives_val.append(false_negatives_station_val)

    # Add new columns to the DataFrame
    df_val = pd.concat([
        df_val,
        pd.DataFrame({
            'Station_ID_Results_Val': unique_stations_val,
            'Accuracy_Bad_Points_Station_Val': station_accuracy_bad_points_val,
            'False_Negatives_Station_Val': station_false_negatives_val
        })
    ], axis=1)

    # Reorder the columns
    df_val = df_val[['DATE_TIME', 'STATION_ID', 'PRIMARY', 'True_Target', 'Predicted_Target', 'Prediction_Score', 'Accuracy_Val_Set',
                     'Accuracy_Bad_Points_Val_Set', 'False_Negatives_Val_Set', 'Station_ID_Results_Val',
                     'Accuracy_Bad_Points_Station_Val', 'False_Negatives_Station_Val']]

    # Make sure the destination folder exists so the finished predictions are
    # not lost to a "non-existent directory" error
    output_dir = os.path.dirname(file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df_val.to_csv(file_path, index=False)

# Score the validation data with the trained model and apply the threshold
y_pred_val = model.predict(X_val)
y_pred_binary_val = (y_pred_val >= threshold).astype(int).flatten()

# Total prediction accuracy on the validation data
accuracy_val = accuracy_score(y_val, y_pred_binary_val)
print(f'Accuracy on validation set: {accuracy_val * 100:.2f}%')

# True and predicted labels restricted to the "bad" (label 0) validation points
bad_points_mask_val = y_val == 0
true_bad_points_val = y_val[bad_points_mask_val]
predicted_bad_points_val = y_pred_binary_val[bad_points_mask_val]

# Prediction accuracy, in percent, for just the bad points in the validation data
accuracy_bad_points_val = accuracy_score(true_bad_points_val, predicted_bad_points_val) * 100
print(f'Accuracy for predicting "bad" points on validation set: {accuracy_bad_points_val:.2f}%')

# Number of false negatives (predicted 0, true label 1) on the validation set
false_negatives_val_set = np.sum((y_val == 1) & (y_pred_binary_val == 0))
print(f'Number of False Negatives on validation set: {false_negatives_val_set}')

# Whole-set metrics, both in percent, that are written into the predictions CSV
accuracy_val_set = accuracy_val * 100
accuracy_bad_points_val_set = accuracy_bad_points_val

# Write the validation predictions and metrics to the CSV
output_predictions_csv_validation(
    X_val=X_val,
    y_val=y_val,
    primary_values=val_data['PRIMARY'].values,
    station_ids=val_data['STATION_ID'].values,
    file_path=output_val_predictions_file,
    accuracy_val_set=accuracy_val_set,
    accuracy_bad_points_val_set=accuracy_bad_points_val_set,
    false_negatives_val_set=false_negatives_val_set,
    threshold=threshold,
)


In [None]:
#OPTIONAL CELL TO PREDICT AND CREATE OUTPUT PREDICTIONS CSV FOR TEST DATA

# Function for test output predictions CSV
def output_predictions_csv(X, y_true, primary_values, station_ids, file_path, accuracy_test_set, accuracy_bad_points_test_set, false_negatives_test_set, threshold):
    """Predict on the test data and write the row-level results to a CSV.

    Relies on two module-level names that must exist before the call:
    ``model`` (used for the prediction) and ``test_data`` (source of the
    DATE_TIME column, which must have one entry per row of ``X``).

    Args:
        X: feature matrix passed to ``model.predict``.
        y_true: true target per row; 0 is treated as a "bad" point.
        primary_values: PRIMARY value per row, written unchanged.
        station_ids: station ID per row, used to group the per-station metrics.
        file_path: destination CSV; its directory is created when missing.
        accuracy_test_set: precomputed accuracy on the whole test set.
        accuracy_bad_points_test_set: precomputed accuracy on the bad points.
        false_negatives_test_set: precomputed number of false negatives.
        threshold: scores >= threshold are binarized to 1, all others to 0.

    The CSV has one row per test sample. The three whole-set metrics fill
    only the first row of their columns, and the per-station metrics fill one
    row per unique station starting at the top; the rest of those columns is
    left blank.
    """
    y_pred_probs = model.predict(X)
    y_pred_binary = (y_pred_probs >= threshold).astype(int).flatten()  # Apply the threshold

    df = pd.DataFrame({
        # Taken positionally so the rows line up with the arrays below even
        # if test_data does not carry a default 0..n-1 index
        'DATE_TIME': np.asarray(test_data['DATE_TIME']),
        'STATION_ID': station_ids,
        'PRIMARY': primary_values,
        'True_Target': y_true,
        'Predicted_Target': y_pred_binary,
        'Prediction_Score': y_pred_probs.flatten(),
    })

    # Add accuracy-related columns with a header row and one value row
    df = pd.concat([
        df,
        pd.DataFrame({
            'Accuracy_Test_Set': [accuracy_test_set],
            'Accuracy_Bad_Points_Test_Set': [accuracy_bad_points_test_set],
            'False_Negatives_Test_Set': [false_negatives_test_set]
        })
    ], axis=1)

    # Calculate accuracy on bad points and false negatives for each unique STATION_ID
    station_accuracy_bad_points = []
    station_false_negatives = []

    unique_stations = df['STATION_ID'].unique()
    for station_id in unique_stations:
        station_df = df[df['STATION_ID'] == station_id]
        is_bad_point = station_df['True_Target'] == 0

        if is_bad_point.any():
            accuracy_bad_points_station = accuracy_score(
                station_df.loc[is_bad_point, 'True_Target'],
                station_df.loc[is_bad_point, 'Predicted_Target']
            ) * 100
        else:
            # No bad points at this station: the accuracy is undefined, so
            # record NaN explicitly instead of scoring two empty arrays
            accuracy_bad_points_station = np.nan

        # Rows predicted as 0 although their true label is 1
        false_negatives_station = np.sum((station_df['Predicted_Target'] == 0) & (station_df['True_Target'] == 1))

        station_accuracy_bad_points.append(accuracy_bad_points_station)
        station_false_negatives.append(false_negatives_station)

    # Add new columns to the DataFrame
    df = pd.concat([
        df,
        pd.DataFrame({
            'Station_ID_Results': unique_stations,
            'Accuracy_Bad_Points_Station': station_accuracy_bad_points,
            'False_Negatives_Station': station_false_negatives
        })
    ], axis=1)

    # Reorder the columns
    df = df[['DATE_TIME', 'STATION_ID', 'PRIMARY', 'True_Target', 'Predicted_Target', 'Prediction_Score', 'Accuracy_Test_Set',
             'Accuracy_Bad_Points_Test_Set', 'False_Negatives_Test_Set', 'Station_ID_Results', 'Accuracy_Bad_Points_Station', 'False_Negatives_Station']]

    # Make sure the destination folder exists so the finished predictions are
    # not lost to a "non-existent directory" error
    output_dir = os.path.dirname(file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(file_path, index=False)

# Score the test data with the trained model and apply the threshold
y_pred = model.predict(X_test)
y_pred_binary = (y_pred >= threshold).astype(int).flatten()

# Total prediction accuracy on the test data
accuracy = accuracy_score(y_test, y_pred_binary)
print(f'Accuracy on test set: {accuracy * 100:.2f}%')

# True and predicted labels restricted to the "bad" (label 0) test points
bad_points_mask = y_test == 0
true_bad_points = y_test[bad_points_mask]
predicted_bad_points = y_pred_binary[bad_points_mask]

# Prediction accuracy for just the bad points in the test data
accuracy_bad_points = accuracy_score(true_bad_points, predicted_bad_points)
print(f'Accuracy for predicting "bad" points: {accuracy_bad_points * 100:.2f}%')

# Number of false negatives (predicted 0, true label 1) on the test set
false_negatives_test_set = np.sum((y_test == 1) & (y_pred_binary == 0))
print(f'Number of False Negatives on test set: {false_negatives_test_set}')

# Whole-set metrics, both in percent, that are written into the predictions CSV
accuracy_test_set = accuracy * 100
accuracy_bad_points_test_set = accuracy_bad_points * 100

# Write the test predictions and metrics to the CSV
output_predictions_csv(
    X=X_test,
    y_true=y_test,
    primary_values=test_data['PRIMARY'].values,
    station_ids=test_data['STATION_ID'].values,
    file_path=output_test_predictions_file,
    accuracy_test_set=accuracy_test_set,
    accuracy_bad_points_test_set=accuracy_bad_points_test_set,
    false_negatives_test_set=false_negatives_test_set,
    threshold=threshold,
)