In [1]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
if tf.test.is_gpu_available():
    print("TensorFlow is using the GPU.")
else:
    print("TensorFlow is using the CPU.")

Num GPUs Available:  1
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
TensorFlow is using the GPU.


In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, RepeatVector, TimeDistributed
from tensorflow.keras.optimizers import Adam

#load data
filename = r"D:\NeuralNetworksDataset\Master\PHY_BRE_MASTER.csv"
df = pd.read_csv(filename)

# Convert 'Timestamp' to datetime
df['Timestamp'] = pd.to_datetime(df['Timestamp'])


In [3]:
#Display a snippit of the data
df.head()

Unnamed: 0,Timestamp,Phy_1_Temp,Phy_1_Humidity,Phy_1_NoiseL,Phy_1_NoiseM,Phy_1_NoiseH,Phy_1_NoiseA,Phy_1_Pressure,Phy_1_Lux,Phy_1_Prox,...,A6-00-K1_motion_y,A6-00-K1_temperature_y,A6-00-K1_tamper_y,A6-01-B1_humidity_y,A6-01-B1_illuminance_y,A6-01-B1_motion_y,A6-01-B1_temperature_y,OC-00-L1_contact,OC-01-HB_contact,OC-01-BD_contact
0,2022-10-18 00:00:00,25.859341,22.376295,53.510992,22.695362,9.559516,14.294312,10.105296,790.12875,0.0,...,0.0,20.5,0.0,53.0,0.0,0.0,20.4,1.0,1.0,1.0
1,2022-10-18 00:00:01,25.859341,22.376295,53.510992,22.695362,9.559516,14.294312,10.105296,790.12875,0.0,...,0.0,20.5,0.0,53.0,0.0,0.0,20.4,1.0,1.0,1.0
2,2022-10-18 00:00:02,25.859341,22.376295,53.510992,22.695362,9.559516,14.294312,10.105296,790.12875,0.0,...,0.0,20.5,0.0,53.0,0.0,0.0,20.4,1.0,1.0,1.0
3,2022-10-18 00:00:03,25.859341,22.376295,53.510992,22.695362,9.559516,14.294312,10.105296,790.12875,0.0,...,0.0,20.5,0.0,53.0,0.0,0.0,20.4,1.0,1.0,1.0
4,2022-10-18 00:00:04,25.859341,22.376295,53.510992,22.695362,9.559516,14.294312,10.105296,790.12875,0.0,...,0.0,20.5,0.0,53.0,0.0,0.0,20.4,1.0,1.0,1.0


In [4]:
# Normalize the features
scaler = MinMaxScaler()
ts_col = df['Timestamp']
df = df.drop(['Timestamp'], axis=1)

In [5]:
# Display the original DataFrame
print("Original DataFrame:")

# Compute the correlation matrix
correlation_matrix = df.corr()

# threshold for correlation, correlation is atleast 0.2 we keep them
threshold = 0.8

highly_correlated = (correlation_matrix > threshold) & (correlation_matrix < 1.0)

columns_to_drop = set()

for col in highly_correlated.columns:
    correlated_cols = list(highly_correlated.index[highly_correlated[col]])
    columns_to_drop.update(correlated_cols)

# Drop the identified columns
df_filtered = df.drop(columns=list(columns_to_drop), inplace=False)

print("\nDropped Columns:", list(columns_to_drop))

print("\nDataFrame After Dropping Columns:")
print(df_filtered)

print("\nCorrelation Matrix After Dropping Columns:")
print(df_filtered.corr())

# deleting original dataframe to free up memory :)
del df

Original DataFrame:

Dropped Columns: ['Phy_1_NumPM0.5', 'A6-00-L1_humidity', 'Phy_3_NumPM2.5', 'Phy_5_NoiseH', 'Phy_5_NumPM0.5', 'A6-00-HW_humidity', 'S3-01-B1_Number Concentration NC4.0', 'A6-02-H3_illuminance', 'Phy_2_NumPM1.0', 'Phy_2_MassPM1.0', 'Phy_2_NoiseM', 'S3-01-B1_Number Concentration NC2.5', 'Phy_2_NumPM0.5', 'S3-01-B1_Number Concentration NC1.0', 'Phy_2_MassPM4.0', 'Phy_3_MassPM4.0', 'Phy_3_NoiseH', 'A6-01-B3_illuminance', 'OC-00-L1_contact', 'OC-01-HB_contact', 'OC-02-S1_temperature', 'OC-01-B3_temperature', 'A6-02-SL_temperature', 'S3-01-B1_Mass Concentration PM10.0', 'A6-00-K1_humidity_x', 'Phy_1_Pressure', 'A6-01-B1_humidity_y', 'Phy_5_Humidity', 'A6-00-L1_temperature', 'Phy_5_NumPM1.0', 'Phy_1_MassPM1.0', 'S3-00-L1_Mass Concentration PM1.0', 'Phy_3_MassPM1.0', 'Phy_5_NoiseA', 'Phy_1_MassPM4.0', 'S3-00-L1_Number Concentration NC2.5', 'Phy_4_NumPM1.0', 'Phy_1_NoiseH', 'Phy_3_NumPM4.0', 'S3-00-L1_Number Concentration NC4.0', 'Phy_4_Pressure', 'OC-00-L1_temperature', 'Ph

In [6]:
timesteps = 1
n_features = df_filtered.shape[1]
print(n_features)

data_scaled = scaler.fit_transform(df_filtered)

# Reshape data for LSTM
n_features = data_scaled.shape[1]
data_reshaped = data_scaled.reshape((data_scaled.shape[0], timesteps, n_features))

63


In [7]:
# adding timestamp back and labelling
df_filtered = pd.concat([ts_col,df_filtered], axis=1)
df_filtered['Label'] = 0  # normal
df_filtered.loc[(df_filtered['Timestamp'] >= '2022-11-08') & (df_filtered['Timestamp'] <= '2022-11-10'), 'Label'] = 1  # anomaly


In [8]:
# Split the data based on labels
normal_data = data_reshaped[(df_filtered['Timestamp'] >= '2022-11-05') & (df_filtered['Timestamp'] <= '2022-11-07')]
anomaly_data = data_reshaped[(df_filtered['Timestamp'] >= '2022-11-08') & (df_filtered['Timestamp'] <= '2022-11-10')]


# Combine the two periods to form the training data, all this data is normal points
train_data = np.concatenate([data_reshaped[(df_filtered['Timestamp'] >= '2022-10-28') & (df_filtered['Timestamp'] <= '2022-11-04')],
                              data_reshaped[(df_filtered['Timestamp'] >= '2022-11-11') & (df_filtered['Timestamp'] <= '2022-11-17')]], axis=0)

# Create y_test for anomaly and normal data
y_test_anomaly = np.ones(len(anomaly_data))
y_test_normal = np.zeros(len(normal_data))

# delete dataframe to also free up memory :)
del df_filtered
del data_reshaped

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, TimeDistributed
from tensorflow.keras.optimizers import Adam
from keras.layers import Dropout, Dense, Flatten

class IntruderDetectionAutoencoder:
    def __init__(self, input_shape, lstm_units=50, dropout_rate=0.2, learning_rate=0.001):
        self.model = Sequential()

        # Encoder
        self.model.add(LSTM(units=lstm_units, return_sequences=True, input_shape=input_shape))
        self.model.add(Dropout(dropout_rate))
        self.model.add(LSTM(units=lstm_units))
        self.model.add(Dropout(dropout_rate))

        # Repeat vector
        self.model.add(RepeatVector(input_shape[0]))

        # Decoder
        self.model.add(LSTM(units=lstm_units, return_sequences=True))
        self.model.add(Dropout(dropout_rate))
        self.model.add(LSTM(units=lstm_units, return_sequences=True))
        self.model.add(Dropout(dropout_rate))
        self.model.add(TimeDistributed(Dense(input_shape[1])))

        self.model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')

    def train(self, X_train, epochs=100, batch_size=64, validation_data=None):
        return self.model.fit(X_train, X_train, epochs=epochs, batch_size=batch_size, validation_data=validation_data)

    def evaluate(self, X_test):
        return self.model.evaluate(X_test, X_test)

    def predict(self, X):
        return self.model.predict(X)


In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, TimeDistributed, Dropout
from tensorflow.keras.optimizers import Adam

class IntruderDetectionAutoencoder:
    def __init__(self, input_shape, lstm_units=64, dropout_rate=0.1, learning_rate=0.001):
        self.model = Sequential()

        # Encoder
        self.model.add(LSTM(units=lstm_units, return_sequences=True, input_shape=input_shape))
        self.model.add(Dropout(dropout_rate))

        # Bottleneck layer - Reducing the number of units
        bottleneck_units = max(32, lstm_units // 2)  # Ensuring at least 32 units
        self.model.add(LSTM(units=bottleneck_units, activation='relu'))
        self.model.add(RepeatVector(input_shape[0]))

        # Decoder
        self.model.add(LSTM(units=bottleneck_units, return_sequences=True, activation='relu'))
        self.model.add(Dropout(dropout_rate))
        self.model.add(TimeDistributed(Dense(input_shape[1])))

        self.model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')

    def train(self, X_train, epochs=100, batch_size=64, validation_data=None):
        return self.model.fit(X_train, X_train, epochs=epochs, batch_size=batch_size, validation_data=validation_data)

    def evaluate(self, X_test):
        return self.model.evaluate(X_test, X_test)

    def predict(self, X):
        return self.model.predict(X)


In [10]:
autoencoder = IntruderDetectionAutoencoder(input_shape=train_data.shape[1:])
history = autoencoder.train(train_data, epochs=8, batch_size=128)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [11]:
# Predicting on test data
reconstructed_anomaly = autoencoder.predict(anomaly_data)
reconstructed_normal = autoencoder.predict(normal_data)



In [12]:
# Calculate reconstruction error for anomaly and normal data
error_anomaly = np.mean(np.power(anomaly_data.squeeze() - reconstructed_anomaly.squeeze(), 2), axis=1)
error_normal = np.mean(np.power(normal_data.squeeze() - reconstructed_normal.squeeze(), 2), axis=1)

In [51]:
percentiles = [60]
best_f1 = 0
best_threshold = 0

for percentile in percentiles:
    threshold = np.percentile(np.concatenate([error_anomaly, error_normal]), percentile)

    y_pred_anomaly = [1 if e > threshold else 0 for e in error_anomaly]
    y_pred_normal = [1 if e > threshold else 0 for e in error_normal]

    y_pred = np.concatenate([y_pred_anomaly, y_pred_normal])
    y_true = np.concatenate([y_test_anomaly, y_test_normal])

    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

    print(f"Percentile: {percentile}, Threshold: {threshold}, F1 Score: {f1}")


Percentile: 60, Threshold: 0.0008810447056306201, F1 Score: 0.2139839635804811


In [52]:
y_pred_anomaly = [1 if e > best_threshold else 0 for e in error_anomaly]
y_pred_normal = [1 if e > best_threshold else 0 for e in error_normal]

y_pred = np.concatenate([y_pred_anomaly, y_pred_normal])
y_true = np.concatenate([y_test_anomaly, y_test_normal])

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

print(f"Best Threshold: {best_threshold}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Best Threshold: 0.0008810447056306201
Accuracy: 0.2925851123546739
Precision: 0.24073176553989048
Recall: 0.19258569105502862
F1 Score: 0.2139839635804811


In [1]:
from sklearn.metrics import confusion_matrix

# Assuming y_true and y_pred are already defined
# y_true: True labels
# y_pred: Predictions from the model

# Generate the confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Function to print the confusion matrix in a more readable format
def print_confusion_matrix(conf_matrix):
    print("Confusion Matrix:")
    print("          Predicted: ")
    print("          Normal Anomaly")
    print("Actual: Normal  {}      {}".format(conf_matrix[0][0], conf_matrix[0][1]))
    print("        Anomaly {}      {}".format(conf_matrix[1][0], conf_matrix[1][1]))

print_confusion_matrix(cm)


NameError: name 'y_true' is not defined