## Testing SW on FordA dataset

* Reading and concatenating the FordA dataset

In [2]:
import pandas as pd
from scipy.io import arff

# Paths to the ARFF files
path_train = r"C:\Users\Jaber\OneDrive - University of Florida\Educational\GitHub\Datasets\Chapters\Chapter2\Datasets\FordA\FordA_TRAIN.arff"
path_test = r"C:\Users\Jaber\OneDrive - University of Florida\Educational\GitHub\Datasets\Chapters\Chapter2\Datasets\FordA\FordA_TEST.arff"

# Load the training ARFF file
data_train, meta_train = arff.loadarff(path_train)
train = pd.DataFrame(data_train)

# Load the testing ARFF file
data_test, meta_test = arff.loadarff(path_test)
test = pd.DataFrame(data_test)

# Combine the train and test datasets into one DataFrame named FordA_data
FordA_data = pd.concat([train, test], ignore_index=True)

# Path to save the CSV file
csv_path = r"C:\Users\Jaber\OneDrive - University of Florida\Educational\GitHub\Datasets\Chapters\Chapter2\Datasets\FordA\FordA_data.csv"

# Save the DataFrame to a CSV file
FordA_data.to_csv(csv_path, index=False)

print(f"Dataset saved as a CSV file at {csv_path}")
FordA_data

Dataset saved as a CSV file at C:\Users\Jaber\OneDrive - University of Florida\Educational\GitHub\Datasets\Chapters\Chapter2\Datasets\FordA\FordA_data.csv


Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att492,att493,att494,att495,att496,att497,att498,att499,att500,target
0,-0.797172,-0.664392,-0.373015,0.040815,0.526936,0.984288,1.353120,1.578108,1.659251,1.640809,...,0.722417,0.362068,0.092083,-0.081268,-0.212573,-0.391456,-0.664392,-1.073796,-1.564343,b'-1'
1,0.804855,0.634629,0.373474,0.038343,-0.340988,-0.740860,-1.109667,-1.395357,-1.570192,-1.619951,...,0.049213,-0.258138,-0.510583,-0.683647,-0.773817,-0.785255,-0.714885,-0.560443,-0.319086,b'1'
2,0.727985,0.111284,-0.499124,-1.068629,-1.578351,-1.990534,-2.302031,-2.503403,-2.585211,-2.550600,...,0.463685,0.507735,0.517174,0.504588,0.476270,0.438513,0.394463,0.339400,0.255391,b'-1'
3,-0.234439,-0.502157,-0.732488,-0.946128,-1.139739,-1.323336,-1.490243,-1.607077,-1.620430,-1.506933,...,-0.929437,-0.922761,-0.929437,-0.909409,-0.835970,-0.695768,-0.478790,-0.188707,0.119736,b'-1'
4,-0.171328,-0.062285,0.235829,0.710396,1.239969,1.649823,1.876321,1.865535,1.703751,1.466467,...,0.725496,0.697453,0.731967,0.808545,0.839823,0.733046,0.437520,-0.026585,-0.602213,b'-1'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4916,0.143630,-0.135823,-0.510278,-0.850804,-1.058080,-1.082756,-0.961845,-0.748399,-0.575669,-0.569500,...,0.865397,1.366315,1.896845,2.229968,2.192954,1.761128,0.939424,-0.096588,-1.076587,b'-1'
4917,-0.165568,-0.504614,-0.780065,-0.937044,-0.950518,-0.854054,-0.701736,-0.544270,-0.424473,-0.357913,...,-1.916656,-1.774917,-1.439674,-0.935756,-0.298758,0.406564,1.100995,1.722323,2.191682,b'1'
4918,0.710084,0.593979,0.381886,0.127285,-0.112304,-0.274140,-0.312698,-0.195008,0.063567,0.398281,...,0.272573,-0.040822,-0.235795,-0.304746,-0.270086,-0.192379,-0.126553,-0.117822,-0.189094,b'1'
4919,0.006847,-0.140624,-0.270594,-0.378835,-0.461983,-0.515125,-0.538119,-0.532769,-0.495602,-0.436697,...,0.176298,-0.031868,-0.262996,-0.492936,-0.698291,-0.870596,-1.000708,-1.084108,-1.109963,b'1'


**Testing the main code on FordA**
* RNN-AE for anomally detection (instead of IF).

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.figure_factory as ff
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input, LSTM, RepeatVector, TimeDistributed

# Function to apply sliding time window
def create_sliding_windows(data, labels, window_size, step_size):
    windows = []
    new_labels = []
    original_indices = []
    for i in range(len(labels)):
        for j in range(0, data.shape[1] - window_size + 1, step_size):
            windows.append(data[i, j:j + window_size])
            new_labels.append(labels[i])
            original_indices.append(i)
    return np.array(windows), np.array(new_labels), original_indices

# Function to extract features from sliding windows
def extract_features(windows):
    features = []
    for window in windows:
        mean = np.mean(window)
        std = np.std(window)
        skew = np.mean((window - mean)**3) / (std**3)
        kurtosis = np.mean((window - mean)**4) / (std**4) - 3
        features.append([mean, std, skew, kurtosis])
    return np.array(features)

# Function to build and train the RNN Autoencoder
def build_rnn_autoencoder(input_shape):
    model = Sequential()
    # Encoder
    model.add(LSTM(128, activation='relu', input_shape=(input_shape[1], input_shape[2]), return_sequences=True))
    model.add(LSTM(64, activation='relu', return_sequences=False))
    model.add(RepeatVector(input_shape[1]))
    
    # Decoder
    model.add(LSTM(64, activation='relu', return_sequences=True))
    model.add(LSTM(128, activation='relu', return_sequences=True))
    model.add(TimeDistributed(Dense(input_shape[2])))
    
    model.compile(optimizer='adam', loss='mse')
    return model

# Function to classify with reject option
def classify_with_reject(probabilities, threshold, initial_predictions, y_true):
    predictions = []
    abstain_instances = []
    for i, (prob, pred, true) in enumerate(zip(probabilities, initial_predictions, y_true)):
        if max(prob) >= threshold or pred == true:
            predictions.append(pred)
        else:
            predictions.append(-1)
            abstain_instances.append(i)
    return np.array(predictions), abstain_instances

# Function to train RNN model to predict performance and determine window size increment
def train_rnn_model(input_shape):
    model = Sequential()
    model.add(Input(shape=input_shape))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mse')
    return model

# Load Data from CSV file
data_path = r"C:\Users\Jaber\OneDrive - University of Florida\Educational\GitHub\Datasets\Chapters\Chapter2\Datasets\FordA\FordA_data.csv"
save_directory = r"C:\Users\Jaber\OneDrive - University of Florida\Educational\GitHub\Datasets\Chapters\Chapter2\Results\Ford\FordA"
df = pd.read_csv(data_path)

# Initial window size and step size
window_size = 25  # 25 measurements
step_size = 12    # 12 measurements

# Extract features (time series) and labels
df['label'] = df['label'].apply(lambda x: 0 if x == 1 else 1)
df = df.select_dtypes(include=[np.number])
df.fillna(df.mean(), inplace=True)

X_time_series = df.drop(columns=['label']).values
y = df['label'].values

performance_metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'specificity': []}
stop_criteria = False

# Loop to automatically adjust the window size
while not stop_criteria:
    # Create sliding windows
    X_sliding_windows, y_sliding_windows, original_indices = create_sliding_windows(X_time_series, y, window_size, step_size)

    # Normalize the sliding windows data
    scaler = StandardScaler()
    X_sliding_windows_scaled = scaler.fit_transform(X_sliding_windows.reshape(X_sliding_windows.shape[0], -1))
    
    # Reshape the scaled data into 3D (samples, timesteps, features) for RNN
    X_sliding_windows_scaled = X_sliding_windows_scaled.reshape(X_sliding_windows.shape[0], window_size, -1)

    # Build and train the RNN Autoencoder
    input_shape = X_sliding_windows_scaled.shape
    rnn_autoencoder = build_rnn_autoencoder(input_shape)
    rnn_autoencoder.fit(X_sliding_windows_scaled, X_sliding_windows_scaled, epochs=50, batch_size=32, shuffle=True, verbose=0)

    # Encode the features using the trained RNN Autoencoder
    X_encoded_features = rnn_autoencoder.predict(X_sliding_windows_scaled)

    # Calculate reconstruction error to detect anomalies
    reconstruction_errors = np.mean(np.abs(X_sliding_windows_scaled - X_encoded_features), axis=1)
    threshold = np.percentile(reconstruction_errors, 90)  # Set threshold for anomalies
    anomaly_labels = (reconstruction_errors > threshold).astype(int)

    # Update labels based on anomaly detection (anomalies are labeled as 1, normal as 0)
    updated_labels = anomaly_labels

    # Create a DataFrame to map original samples to their generated subsamples and labels
    original_sample_data = []
    for idx, (original_index, window, label) in enumerate(zip(original_indices, X_sliding_windows, updated_labels)):
        original_sample_data.append({
            'Original Sample Index': original_index,
            'Original Sample Label': y[original_index],
            'Subsample Index': idx,
            'Subsample Label': label,
            'Subsample Data': window
        })

    df_original_samples = pd.DataFrame(original_sample_data)

    # Save the DataFrame to an Excel file
    df_original_samples.to_excel(f'{save_directory}/Original_Samples_and_Subsamples.xlsx', index=False)

    # Split the data into training and testing sets (80% training, 20% testing)
    X_train, X_test, y_train, y_test = train_test_split(X_sliding_windows, updated_labels, test_size=0.2, random_state=42)

    # Standardize the features (mean=0, std=1)
    X_train = scaler.fit_transform(X_train.reshape(X_train.shape[0], -1))
    X_test = scaler.transform(X_test.reshape(X_test.shape[0], -1))

    # Oversample the minority class using RandomOverSampler on training data
    oversampler = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

    # Train a Random Forest model with early stopping
    best_model = None
    best_score = 0
    no_improvement_epochs = 0
    patience = 2

    for epoch in range(10):
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train_resampled, y_train_resampled)

        train_accuracy = accuracy_score(y_train_resampled, model.predict(X_train_resampled))

        if train_accuracy > best_score:
            best_model = model
            best_score = train_accuracy
            no_improvement_epochs = 0
        else:
            no_improvement_epochs += 1

        if no_improvement_epochs >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    # Use the best model for predictions
    test_probabilities = best_model.predict_proba(X_test)
    initial_predictions = best_model.predict(X_test)

    # Initialize lists to store confusion matrix elements
    tp_list = []
    tn_list = []
    fp_list = []
    fn_list = []

    # Initialize a table to store results for each lambda
    table_data = []
    abstain_table_data = []
    metrics_table_data = []

    # Initialize dictionaries to store metrics for each lambda
    metrics_dict = {l: {'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'specificity': []} for l in np.arange(0.5, 0.95, 0.05)}

    # Define the range of lambda values (excluding 0.95)
    lambdas = np.arange(0.5, 0.95, 0.05)

    # Loop through lambda values and calculate metrics
    for reject_threshold in lambdas:
        predictions, abstain_indices = classify_with_reject(test_probabilities, reject_threshold, initial_predictions, y_test)

        filtered_indices = [i for i in range(len(predictions)) if predictions[i] != -1]
        y_test_filtered = y_test[filtered_indices]
        predictions_filtered = predictions[filtered_indices]

        if len(predictions_filtered) > 0:
            cm = confusion_matrix(y_test_filtered, predictions_filtered, labels=[0, 1])
            tn, fp, fn, tp = cm.ravel()
        else:
            cm = np.array([[0, 0], [0, 0]])
            tn, fp, fn, tp = 0, 0, 0, 0

        tp_list.append(tp)
        tn_list.append(tn)
        fp_list.append(fp)
        fn_list.append(fn)

        table_data.append([round(reject_threshold, 2), tn, fp, fn, tp])

        abstain_instances_info = []
        for idx in abstain_indices:
            abstain_instances_info.append((idx, y_test[idx]))

        abstain_table_data.append([round(reject_threshold, 2), abstain_instances_info])

        if len(y_test_filtered) > 0:
            accuracy = accuracy_score(y_test_filtered, predictions_filtered) * 100
            precision = precision_score(y_test_filtered, predictions_filtered, zero_division=0) * 100
            recall = recall_score(y_test_filtered, predictions_filtered, zero_division=0) * 100
            f1 = f1_score(y_test_filtered, predictions_filtered, zero_division=0) * 100
            specificity = (tn / (tn + fp)) * 100 if (tn + fp) > 0 else 0
        else:
            accuracy = precision = recall = f1 = specificity = 0

        metrics_dict[reject_threshold]['accuracy'].append(accuracy)
        metrics_dict[reject_threshold]['precision'].append(precision)
        metrics_dict[reject_threshold]['recall'].append(recall)
        metrics_dict[reject_threshold]['f1'].append(f1)
        metrics_dict[reject_threshold]['specificity'].append(specificity)

        metrics_table_data.append([round(reject_threshold, 2), f"{accuracy:.2f}%", f"{precision:.2f}%", f"{recall:.2f}%", f"{f1:.2f}%", f"{specificity:.2f}%"])

        # Show confusion matrix for each lambda
        if cm.shape != (2, 2):
            cm_padded = np.zeros((2, 2), dtype=int)
            cm_padded[:cm.shape[0], :cm.shape[1]] = cm
        else:
            cm_padded = cm

        x_labels = ['Normal', 'Abnormal']
        y_labels = ['Abnormal', 'Normal']
        cm_reversed = cm_padded[::-1]
        fig = ff.create_annotated_heatmap(z=cm_reversed, x=x_labels, y=y_labels, colorscale='Blues')
        fig.update_layout(
            title=f'Confusion Matrix, Lambda {reject_threshold:.2f}',
            xaxis=dict(title='Predicted labels', tickfont=dict(size=10)),
            yaxis=dict(title='True labels', tickfont=dict(size=10)),
            width=400,
            height=300,
            margin=dict(l=50, r=50, t=130, b=50)
        )
        fig.show()

        # Check if all metrics meet the stop criteria for this lambda
        if accuracy >= 99 and precision >= 99 and recall >= 99 and f1 >= 99 and specificity >= 99:
            stop_criteria = True
            print(f"Stopping criteria met with lambda {reject_threshold:.2f}, window size {window_size}, and step size {step_size}.")
            break

    if not stop_criteria:
        # Use RNN to predict the new window size increment
        rnn_data = np.array([list(metrics_dict[l].values()) for l in lambdas]).reshape(len(lambdas), -1, 5)
        rnn_model = train_rnn_model((rnn_data.shape[1], rnn_data.shape[2]))
        rnn_model.fit(rnn_data, np.array([window_size] * len(lambdas)), epochs=100, verbose=0)
        predicted_increment = rnn_model.predict(rnn_data[-1].reshape(1, rnn_data.shape[1], rnn_data.shape[2]))[0, 0]
        window_size += int(predicted_increment)
        step_size = window_size // 2
        print(f"Increasing window size to {window_size} and step size to {step_size}.")

    # Plot performance metrics
    if len(lambdas) > len(tp_list):
        lambdas = lambdas[:len(tp_list)]
    plt.figure(figsize=(8, 5))
    plt.plot(lambdas, tp_list, marker='o', linestyle='-', label='True Positives (TP)')
    plt.plot(lambdas, tn_list, marker='o', linestyle='-', label='True Negatives (TN)')
    plt.plot(lambdas, fp_list, marker='o', linestyle='-', label='False Positives (FP)')
    plt.plot(lambdas, fn_list, marker='o', linestyle='-', label='False Negatives (FN)')
    plt.xlabel('Lambda (Abstain Threshold)')
    plt.ylabel('Count')
    plt.title('Confusion Matrix Elements vs. Lambda Threshold')
    plt.legend()
    plt.grid(True)
    plt.show()

    df_table_cm = pd.DataFrame(table_data, columns=['Lambda Threshold', 'True Negatives (TN)', 'False Positives (FP)', 'False Negatives (FN)', 'True Positives (TP)'])
    fig_table_cm = go.Figure(data=[go.Table(
        header=dict(values=list(df_table_cm.columns), fill_color='paleturquoise', align='left'),
        cells=dict(values=[df_table_cm[col].tolist() for col in df_table_cm.columns], fill=dict(color=['lavender', 'white']), align='left')
    )])
    fig_table_cm.update_layout(width=800, height=500)
    fig_table_cm.show()

    df_table_cm.to_excel(f'{save_directory}/Lambda_Abstain_Confusion_Matrix_Elements.xlsx', index=False)

    df_abstain_table = pd.DataFrame(abstain_table_data, columns=['Lambda Threshold', 'Abstain Instances (Index, True Label)'])
    fig_abstain_table = go.Figure(data=[go.Table(
        header=dict(values=list(df_abstain_table.columns), fill_color='paleturquoise', align='left'),
        cells=dict(values=[df_abstain_table[col].tolist() for col in df_abstain_table.columns], fill=dict(color=['lavender', 'white']), align='left')
    )])
    fig_abstain_table.update_layout(width=800, height=500)
    fig_abstain_table.show()

    df_abstain_table.to_excel(f'{save_directory}/Lambda_Abstain_Instances.xlsx', index=False)

    df_metrics_table = pd.DataFrame(metrics_table_data, columns=['Lambda Threshold', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Specificity'])
    fig_metrics_table = go.Figure(data=[go.Table(
        header=dict(values=list(df_metrics_table.columns), fill_color='paleturquoise', align='left'),
        cells=dict(values=[df_metrics_table[col].tolist() for col in df_metrics_table.columns], fill=dict(color=['lavender', 'white']), align='left')
    )])
    fig_metrics_table.update_layout(width=800, height=500)
    fig_metrics_table.show()

    df_metrics_table.to_excel(f'{save_directory}/Lambda_Abstain_Results_Metrics.xlsx', index=False)

    avg_metrics_data = []
    for l in lambdas:
        avg_accuracy = np.mean(metrics_dict[l]['accuracy'])
        avg_precision = np.mean(metrics_dict[l]['precision'])
        avg_recall = np.mean(metrics_dict[l]['recall'])
        avg_f1 = np.mean(metrics_dict[l]['f1'])
        avg_specificity = np.mean(metrics_dict[l]['specificity'])

        avg_metrics_data.append([round(l, 2), f"{avg_accuracy:.2f}%", f"{avg_precision:.2f}%", f"{avg_recall:.2f}%", f"{avg_f1:.2f}%", f"{avg_specificity:.2f}%"])

    df_avg_metrics = pd.DataFrame(avg_metrics_data, columns=['Lambda', 'Average Accuracy', 'Average Precision', 'Average Recall', 'Average F1-score', 'Average Specificity'])
    fig_avg_metrics = go.Figure(data=[go.Table(
        header=dict(values=list(df_avg_metrics.columns), fill_color='paleturquoise', align='left'),
        cells=dict(values=[df_avg_metrics[col].tolist() for col in df_avg_metrics.columns], fill=dict(color=['lavender', 'white']), align='left')
    )])
    fig_avg_metrics.update_layout(width=800, height=500)
    fig_avg_metrics.show()

    df_avg_metrics.to_excel(f'{save_directory}/Average_Metrics_Per_Lambda.xlsx', index=False)

    plt.figure(figsize=(8, 5))
    plt.plot(df_avg_metrics['Lambda'], df_avg_metrics['Average Accuracy'].str.rstrip('%').astype(float), marker='o', linestyle='-', label='Average Accuracy')
    plt.plot(df_avg_metrics['Lambda'], df_avg_metrics['Average Precision'].str.rstrip('%').astype(float), marker='o', linestyle='-', label='Average Precision')
    plt.plot(df_avg_metrics['Lambda'], df_avg_metrics['Average Recall'].str.rstrip('%').astype(float), marker='o', linestyle='-', label='Average Recall')
    plt.plot(df_avg_metrics['Lambda'], df_avg_metrics['Average F1-score'].str.rstrip('%').astype(float), marker='o', linestyle='-', label='Average F1-score')
    plt.plot(df_avg_metrics['Lambda'], df_avg_metrics['Average Specificity'].str.rstrip('%').astype(float), marker='o', linestyle='-', label='Average Specificity')
    plt.xlabel('Lambda (Abstain Threshold)')
    plt.ylabel('Percentage')
    plt.title('Average Performance Metrics vs. Lambda Threshold')
    plt.legend()
    plt.grid(True)
    plt.show()

    print("\nAverage metrics for each lambda have been saved to 'Average_Metrics_Per_Lambda.xlsx'.")


  super().__init__(**kwargs)
