<a href="https://colab.research.google.com/github/HamzaAhmed78629/MSc-Thesis-Proposed-Product/blob/main/Completed_Implementation_%26_Testing_Another_IoT_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Installing the required libraries
!pip install pandas scikit-learn tensorflow python-docx ipywidgets
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Using cached jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
Installing collected packages: python-docx, jedi
Successfully installed jedi-0.19.1 python-docx-1.1.2


In [None]:
# Mounting Google Drive
# The dataset and the policy documents used later in this notebook are read
# from paths under /content/drive, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Loading and inspecting the dataset
# This cell is only a quick preview of the raw CSV: the next code cell resets
# `data` to None and reads the same file again inside load_data().
import pandas as pd

# Loading the dataset
data = pd.read_csv("/content/drive/MyDrive/RT_IOT2022.csv")

# Printing the first few rows to inspect the structure and columns
print("Dataset loaded. Displaying the first few rows:\n")
print(data.head())

# Printing column names
print("\nColumn names in the dataset:")
print(data.columns)

Dataset loaded. Displaying the first few rows:

   Unnamed: 0  id.orig_p  id.resp_p proto service  flow_duration  \
0           0      38667       1883   tcp    mqtt      32.011598   
1           1      51143       1883   tcp    mqtt      31.883584   
2           2      44761       1883   tcp    mqtt      32.124053   
3           3      60893       1883   tcp    mqtt      31.961063   
4           4      51087       1883   tcp    mqtt      31.902362   

   fwd_pkts_tot  bwd_pkts_tot  fwd_data_pkts_tot  bwd_data_pkts_tot  ...  \
0             9             5                  3                  3  ...   
1             9             5                  3                  3  ...   
2             9             5                  3                  3  ...   
3             9             5                  3                  3  ...   
4             9             5                  3                  3  ...   

   active.std      idle.min      idle.max      idle.tot      idle.avg  \
0         0.0

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Conv1D, Flatten, MaxPooling1D, Dropout
from docx import Document
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import ipywidgets as widgets
from IPython.display import display, clear_output
import random
import warnings

# Silences every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below - consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Widgets for interaction
load_button = widgets.Button(description="Load Data", button_style='success')
check_compliance_button = widgets.Button(description="Check Compliance", button_style='primary')
adjust_button = widgets.Button(description="Adjust Policies", button_style='danger')
enforce_button = widgets.Button(description="Enforce Policies", button_style='warning')

# Output widget: the callbacks below print inside `with output:` so their
# messages are rendered here instead of in the cell's regular output area.
output = widgets.Output()

# Displaying the interface, i.e. the four buttons stacked above the output area
display(widgets.VBox([load_button, check_compliance_button, adjust_button, enforce_button, output]))

# Global variables that store data and models.
# All of them start as None; the functions defined below fill them in through
# `global` declarations and check for None to detect steps that were skipped.
data = None
X_preprocessed = None
Y = None
X_train = X_test = y_train = y_test = None
cnn_model = None
lstm_model = None
X_train_reshaped = X_test_reshaped = None
compliance_results = None
policies = None
adjusted_policies = None

# Loading and preprocessing the data
def load_data(b=None):
    """Load the IoT dataset, preprocess it and build the train/test split.

    Usable both as a direct call and as the "Load Data" button callback. All
    progress and error messages are printed into the shared ``output`` widget.

    The preprocessing (standardisation, one-hot encoding and min-max scaling)
    is fitted on the training rows only and then applied to the test rows, so
    that no statistics of the held-out data leak into training.

    Args:
        b: The clicked button when used as an ``on_click`` callback; unused.

    Side effects:
        Sets the module-level ``data``, ``X_preprocessed``, ``Y``, ``X_train``,
        ``X_test``, ``y_train`` and ``y_test``. Any exception is caught and
        reported as "Error loading data: ..." instead of being raised.
    """
    global data, X_preprocessed, Y, X_train, X_test, y_train, y_test
    with output:
        print("Loading data.....")
        try:
            # Loads the dataset
            data = pd.read_csv('/content/drive/MyDrive/RT_IOT2022.csv')

            # Creating a synthetic 'time' column (one row per minute) based on the dataset length.
            # 'min' is the minute alias; the older 'T' spelling is deprecated in recent pandas.
            data['time'] = pd.date_range(start='2022-01-01 00:00', periods=len(data), freq='min')

            # Setting 'time' as index
            data.set_index('time', inplace=True)

            print("Dataset loaded successfully.")
            print("Displaying first few rows:")
            display(data.head())

            # Dropping irrelevant columns
            data.drop(columns=['Unnamed: 0'], inplace=True)

            # Defining target variable
            Y = data['Attack_type']
            X = data.drop(columns=['Attack_type'])

            # Identifying numerical and categorical columns
            categorical_features = ['proto', 'service']
            numeric_features = [
                col for col in X.select_dtypes(include=['float64', 'int64']).columns
                if col not in categorical_features
            ]

            # Encoding target variable
            label_encoder = LabelEncoder()
            Y_encoded = label_encoder.fit_transform(Y)
            print("Target variable encoded.")

            # Train-test split section, Test Data = 30%.
            # The split is done on the raw features so the transformers below
            # can be fitted on the training rows only.
            X_train_raw, X_test_raw, y_train_split, y_test_split = train_test_split(
                X, Y_encoded, test_size=0.3, random_state=42, stratify=Y_encoded
            )
            print("Train-test split done successfully.")

            # Preprocessor: scaling numeric data and encoding categorical data.
            # sparse_threshold=0 forces a dense result, which MinMaxScaler and
            # the later ndarray.reshape() calls require.
            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', StandardScaler(), numeric_features),
                    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
                ], remainder='drop', sparse_threshold=0)

            # Additional min-max scaling, also fitted on the training rows only
            scaler = MinMaxScaler()
            X_train_scaled = scaler.fit_transform(preprocessor.fit_transform(X_train_raw))
            X_test_scaled = scaler.transform(preprocessor.transform(X_test_raw))

            # Full matrix transformed with the train-fitted pipeline
            X_preprocessed = scaler.transform(preprocessor.transform(X))
            print("Data is preprocessed successfully.")

            # Publish the split only once every step has succeeded
            X_train, X_test = X_train_scaled, X_test_scaled
            y_train, y_test = y_train_split, y_test_split
            print(f"Training set size: {X_train.shape[0]}")
            print(f"Testing set size: {X_test.shape[0]}")

        except Exception as e:
            print(f"Error loading data: {e}")

# Reshaping data for CNN and LSTM
def reshape_data(X_train, X_test):
    """Add a trailing axis of length 1 to both feature matrices.

    Each input is reshaped from (rows, features) to (rows, features, 1), the
    layout passed to the Conv1D and LSTM input layers in this notebook.

    Returns:
        Tuple ``(X_train_reshaped, X_test_reshaped)``.
    """
    def _with_channel_axis(matrix):
        # Explicit target shape: keep both existing dimensions, append one.
        n_rows, n_features = matrix.shape[0], matrix.shape[1]
        return matrix.reshape((n_rows, n_features, 1))

    train_3d = _with_channel_axis(X_train)
    test_3d = _with_channel_axis(X_test)
    return train_3d, test_3d

# Defining and training the CNN Model
def train_cnn_model():
    """Reshape the split data and fit the 1-D convolutional classifier.

    Reads the module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``
    and stores the fitted network in ``cnn_model`` and the reshaped matrices in
    ``X_train_reshaped`` / ``X_test_reshaped``. Messages are printed into the
    ``output`` widget; exceptions are caught and reported, not raised.

    NOTE(review): the test split doubles as validation data, and the early
    stopping patience (5) equals the epoch count (5) - confirm both are intended.
    """
    global cnn_model, X_train_reshaped, X_test_reshaped
    with output:
        data_missing = X_train is None or y_train is None
        if data_missing:
            print("Please load the data first.")
            return

        print("Reshaping data for CNN...")
        try:
            X_train_reshaped, X_test_reshaped = reshape_data(X_train, X_test)
            print("Data reshaped successfully.")

            print("Training CNN Model...")
            n_steps = X_train_reshaped.shape[1]
            layer_stack = [
                Conv1D(32, 3, activation='relu', input_shape=(n_steps, 1)),
                MaxPooling1D(2),
                Conv1D(64, 3, activation='relu'),
                MaxPooling1D(2),
                Flatten(),
                Dense(64, activation='relu'),
                # One softmax unit per distinct encoded label
                Dense(len(np.unique(y_train)), activation='softmax'),
            ]
            cnn_model = Sequential(layer_stack)
            cnn_model.compile(
                optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'],
            )

            stop_early = tf.keras.callbacks.EarlyStopping(
                monitor='val_loss', patience=5, restore_best_weights=True
            )
            fit_options = dict(
                epochs=5,
                batch_size=32,
                validation_data=(X_test_reshaped, y_test),
                callbacks=[stop_early],
                verbose=2,
            )
            cnn_model.fit(X_train_reshaped, y_train, **fit_options)
            print("CNN Model trained successfully.")
        except Exception as e:
            print(f"Error training CNN model: {e}")

# Defining and training the LSTM Model
def train_lstm_model():
    """Fit the two-layer LSTM classifier on the already reshaped data.

    Requires the module-level ``X_train_reshaped`` (set by ``train_cnn_model``
    or ``main``) and ``y_train``; stores the fitted network in ``lstm_model``.
    Messages are printed into the ``output`` widget; exceptions are caught and
    reported, not raised.
    """
    global lstm_model
    with output:
        prerequisites_missing = X_train_reshaped is None or y_train is None
        if prerequisites_missing:
            print("Please load the data and train CNN model first.")
            return

        print("Training LSTM Model...")
        try:
            n_steps = X_train_reshaped.shape[1]
            layer_stack = [
                LSTM(50, return_sequences=True, input_shape=(n_steps, 1)),
                Dropout(0.2),
                LSTM(50, return_sequences=False),
                Dropout(0.2),
                # One softmax unit per distinct encoded label
                Dense(len(np.unique(y_train)), activation='softmax'),
            ]
            lstm_model = Sequential(layer_stack)
            lstm_model.compile(
                optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'],
            )

            stop_early = tf.keras.callbacks.EarlyStopping(
                monitor='val_loss', patience=5, restore_best_weights=True
            )
            fit_options = dict(
                epochs=2,
                batch_size=32,
                validation_data=(X_test_reshaped, y_test),
                callbacks=[stop_early],
                verbose=2,
            )
            lstm_model.fit(X_train_reshaped, y_train, **fit_options)
            print("LSTM Model trained successfully.")
        except Exception as e:
            print(f"Error training LSTM model: {e}")

# Detecting threats using CNN and LSTM models
def detect_threats(cnn_model, lstm_model, X_test, threshold=0.5):
    """Average the two models' predictions and flag entries above a threshold.

    Args:
        cnn_model: Object whose ``predict(X_test)`` output is averaged first.
        lstm_model: Object whose ``predict(X_test)`` output is averaged second.
        X_test: Input handed unchanged to both ``predict`` calls.
        threshold: Averaged values strictly greater than this are flagged.

    Returns:
        Tuple ``(threats, final_predictions)``: the element-wise comparison
        result and the averaged predictions it was computed from.
    """
    print("Detecting threats...")
    per_model_scores = [model.predict(X_test) for model in (cnn_model, lstm_model)]

    # Simple two-model ensemble: element-wise mean of both outputs
    averaged_scores = (per_model_scores[0] + per_model_scores[1]) / 2
    flagged = averaged_scores > threshold
    print("Final predictions generated and threat detection completed")
    return flagged, averaged_scores

# Compliance checking against regulatory compliance (GDPR, CCPA, NIST)
def regulatory_compliance_check(threats, predictions=None):
    """Compare prediction scores against the per-standard thresholds.

    A score counts as compliant for a standard when it is strictly below that
    standard's threshold (General 0.7, GDPR 0.6, CCPA 0.5, NIST 0.4).

    Args:
        threats: Not used by the computation. For backward compatibility, when
            ``predictions`` is omitted this argument is taken as the scores, so
            the single-argument call ``regulatory_compliance_check(scores)``
            works as well.
        predictions: Array-like of scores; lists are converted to ndarrays.

    Returns:
        Dict mapping each standard name to a boolean mask with the same shape
        as the scores (True = compliant), plus "Overall Compliance", the
        element-wise AND of all four masks. These are masks, not percentages;
        use ``process_compliance_results`` to summarise them.

    Raises:
        ValueError: If no scores were supplied at all.
    """
    if predictions is None:
        predictions = threats
    if predictions is None:
        raise ValueError("predictions are required for the compliance check")

    scores = np.asarray(predictions)
    thresholds = {
        "General Compliance": 0.7,
        "GDPR Compliance": 0.6,
        "CCPA Compliance": 0.5,
        "NIST Compliance": 0.4,
    }
    results = {standard: scores < limit for standard, limit in thresholds.items()}

    # Because the thresholds are nested, this AND equals the strictest (NIST) mask.
    results["Overall Compliance"] = np.logical_and.reduce(list(results.values()))
    return results

def process_compliance_results(compliance_results):
    """Print the share of compliant entries for every standard.

    Args:
        compliance_results: Mapping of standard name to a boolean array
            (True = compliant), as returned by ``regulatory_compliance_check``.

    Prints one line per standard in the form "<name>: <pct>% compliant", with
    the percentage rounded to two decimals.
    """
    for standard_name, mask in compliance_results.items():
        n_compliant = np.sum(mask)   # True counts as 1
        n_entries = mask.size        # every element, across all dimensions
        percent_compliant = (n_compliant / n_entries) * 100
        print(f"{standard_name}: {percent_compliant:.2f}% compliant")

# Compliance checking function
def check_compliance(b=None):
    """Run threat detection on the test split and report compliance.

    Usable both as a direct call and as the "Check Compliance" button callback.
    Requires both models to be trained; otherwise a hint is printed and nothing
    else happens. All messages are printed into the ``output`` widget.

    Args:
        b: The clicked button when used as an ``on_click`` callback; unused.

    Side effects:
        Stores the per-standard boolean masks in the module-level
        ``compliance_results``.
    """
    global compliance_results
    with output:
        if cnn_model is None or lstm_model is None:
            print("Please train the models first.")
            return
        print("Checking compliance...")

        # Uses the final (averaged) predictions for compliance checking
        threats, predictions = detect_threats(cnn_model, lstm_model, X_test_reshaped)
        if predictions is None:
            print("Error during threat detection.")
            return

        # Both arguments are passed by keyword: the function takes
        # (threats, predictions), and passing the scores alone as the first
        # positional argument left `predictions` unfilled.
        compliance_results = regulatory_compliance_check(
            threats=threats, predictions=predictions
        )
        process_compliance_results(compliance_results)
        print(f"Compliance Results: {compliance_results}")

# Binding widget buttons to their functions
# Binding widget buttons to their functions.
# on_click invokes the handler with the clicked button as its only argument,
# which is why both handlers declare an optional `b` parameter.
load_button.on_click(load_data)
check_compliance_button.on_click(check_compliance)

# Loading, adjusting, and saving policies stored in .docx documents
def load_policies(doc_path):
    """Read policies from a .docx file into a dict.

    Every non-empty paragraph becomes one entry. The text before the first
    colon is the key and the text after it the value, both stripped of
    surrounding whitespace; a paragraph without a colon is stored with an
    empty string as its value.

    Args:
        doc_path: Path of the document to open.

    Returns:
        Dict of policy name to policy text, in paragraph order. A later
        paragraph with the same key overwrites an earlier one.
    """
    document = Document(doc_path)
    parsed = {}
    for paragraph in document.paragraphs:
        text = paragraph.text
        if not text:
            continue
        # partition() yields an empty remainder when there is no colon, which
        # covers both the "key: value" and the "key only" paragraph forms.
        name, _, description = text.partition(":")
        parsed[name.strip()] = description.strip()
    return parsed

def save_policies(policies, doc_path):
    """Write the policies to a new .docx file.

    The document starts with the level-1 heading "Adjusted Policies", followed
    by one "key: value" paragraph per entry of ``policies``.

    Args:
        policies: Mapping of policy name to policy text.
        doc_path: Destination path handed to the document's ``save``.
    """
    report = Document()
    report.add_heading("Adjusted Policies", level=1)
    for name, text in policies.items():
        line = f"{name}: {text}"
        report.add_paragraph(line)
    report.save(doc_path)

def adjust_policies(policies, compliance_results, output_path=None):
    """Annotate each policy according to its compliance flag and save the result.

    Policies and flags are paired by position. A falsy flag marks the policy as
    "Adjust Policy: ... - Action Required"; a truthy flag leaves it as
    "... - No Policy Adjustment Needed.".

    Args:
        policies: Mapping of policy name to policy text.
        compliance_results: Iterable with one truthy/falsy flag per policy, in
            the same order as ``policies``.
        output_path: Where to save the adjusted policies as .docx. Defaults to
            "/content/drive/MyDrive/AdjustedPolicies.docx".

    Returns:
        Dict of policy name to annotated policy text.

    Raises:
        ValueError: If fewer flags than policies are supplied. Pairing with
            ``zip`` would otherwise silently drop the unmatched policies from
            the saved document.
    """
    if output_path is None:
        output_path = "/content/drive/MyDrive/AdjustedPolicies.docx"

    flags = list(compliance_results)
    if len(flags) < len(policies):
        raise ValueError(
            f"Expected one compliance flag per policy: got {len(flags)} flags "
            f"for {len(policies)} policies."
        )

    adjusted_policies = {}
    for (key, value), compliant in zip(policies.items(), flags):
        if compliant:
            adjusted_policies[key] = f"{value} - No Policy Adjustment Needed."
        else:
            adjusted_policies[key] = f"Adjust Policy: {value} - Action Required"

    save_policies(adjusted_policies, output_path)
    return adjusted_policies

# Prints each policy on its own line for an easy-to-read structure
def print_policies(policies, title="Policies"):
    """Print a titled, indented listing of the policies.

    Output is "<title>:" on the first line, then one "  key: value" line per
    entry, then an empty line as a separator.

    Args:
        policies: Mapping of policy name to policy text.
        title: Heading printed above the listing.
    """
    print("{}:".format(title))
    for name, text in policies.items():
        print("  {}: {}".format(name, text))
    # Trailing blank line separates consecutive listings
    print()

# Enforces policies
def enforce_policies(policies):
    """Print an enforcement message for every policy.

    A policy whose text contains "Adjust Policy" is reported as being
    enforced; any other policy is reported as needing no adjustment.

    Args:
        policies: Mapping of policy name to (possibly annotated) policy text.
    """
    for name, action in policies.items():
        needs_enforcement = "Adjust Policy" in action
        message = (
            f"Enforcing Policy: {action}..."
            if needs_enforcement
            else f"{name}: No adjustment needed. Data is processed in compliance."
        )
        print(message)

# Alert mechanism
def send_alert(message):
    """Print ``message`` prefixed with "ALERT: ".

    Args:
        message: Text (or any printable object) to report.
    """
    alert_line = "ALERT: {}".format(message)
    print(alert_line)

# The policy functions cannot be bound to the buttons directly: on_click calls
# its handler with the clicked button as the only argument, whereas
# adjust_policies() needs (policies, compliance_results) and enforce_policies()
# needs a dict of policies. The two adapters below bridge that gap using the
# module-level state.
def on_adjust_clicked(b=None):
    """"Adjust Policies" button callback.

    Loads the policies from the default document on first use, turns the latest
    compliance results into one flag per policy, and stores the outcome in the
    module-level ``adjusted_policies``. Messages go to the ``output`` widget.

    Args:
        b: The clicked button; unused.
    """
    global policies, adjusted_policies
    with output:
        if compliance_results is None:
            print("Please check compliance first.")
            return
        try:
            if policies is None:
                policies = load_policies("/content/drive/MyDrive/Policies.docx")

            if isinstance(compliance_results, dict):
                # NOTE(review): check_compliance() stores per-prediction masks,
                # not per-policy flags. As a conservative assumption, every
                # policy is treated as compliant only when all predictions
                # satisfy "Overall Compliance" - confirm the intended mapping.
                overall_ok = bool(np.all(compliance_results["Overall Compliance"]))
                flags = [overall_ok] * len(policies)
            else:
                flags = compliance_results

            adjusted_policies = adjust_policies(policies, flags)
            print_policies(adjusted_policies, "Adjusted Policies")
        except Exception as e:
            print(f"Error adjusting policies: {e}")


def on_enforce_clicked(b=None):
    """"Enforce Policies" button callback.

    Enforces the module-level ``adjusted_policies``; prints a hint when no
    adjusted policies are available yet. Messages go to the ``output`` widget.

    Args:
        b: The clicked button; unused.
    """
    with output:
        if adjusted_policies is None:
            print("Please adjust the policies first.")
            return
        enforce_policies(adjusted_policies)


adjust_button.on_click(on_adjust_clicked)
enforce_button.on_click(on_enforce_clicked)

# Main function to execute everything
def main():
    """Run the whole pipeline once, without using the buttons.

    Steps: load and split the data, train the CNN and LSTM models, detect
    threats on the test split, summarise regulatory compliance, then load,
    adjust, print and enforce the policies and raise alerts for every policy
    that needs adjustment.

    The function stops early with a message if loading or training failed,
    because those steps report their errors in the ``output`` widget instead
    of raising.

    Side effects:
        Sets the module-level ``X_train_reshaped``, ``X_test_reshaped``,
        ``compliance_results``, ``policies`` and ``adjusted_policies``, and
        writes the adjusted policies document via ``adjust_policies``.
    """
    global X_train_reshaped, X_test_reshaped
    global compliance_results, policies, adjusted_policies

    # Loads the data
    load_data()
    if X_train is None or X_test is None:
        print("Data could not be loaded; see the messages in the output widget.")
        return

    # Printing the shapes of the training and testing split (bold heading,
    # reset afterwards so the rest of the output is not rendered in bold)
    print("\033[1m" + "IoT data points after Splitting the Testing and Training data" + "\033[0m")
    print("Training set shape:")
    print(f"X_train: {X_train.shape}")
    print(f"y_train: {y_train.shape}")
    print("\nTesting set shape:")
    print(f"X_test: {X_test.shape}")
    print(f"y_test: {y_test.shape}")

    # Reshape data for CNN and LSTM
    X_train_reshaped, X_test_reshaped = reshape_data(X_train, X_test)

    # Train the models (both store their result in module-level variables)
    train_cnn_model()
    train_lstm_model()
    if cnn_model is None or lstm_model is None:
        print("Model training failed; see the messages in the output widget.")
        return

    # Detect threats
    threats, predictions = detect_threats(cnn_model, lstm_model, X_test_reshaped)

    # Compliance check; only the percentages are printed, not the raw masks
    compliance_results = regulatory_compliance_check(threats=threats, predictions=predictions)
    process_compliance_results(compliance_results)

    policies_file_path = "/content/drive/MyDrive/Policies.docx"
    initial_policies = load_policies(policies_file_path)
    policies = initial_policies
    print(f"Loaded Policies: {initial_policies}\n")

    # The per-policy flags are derived from the model-based compliance result
    # rather than drawn at random, so adjustments and alerts are reproducible
    # and tied to the detections.
    # NOTE(review): the masks are per prediction, not per policy. As a
    # conservative assumption, every policy is treated as compliant only when
    # all predictions satisfy "Overall Compliance" - confirm the intended mapping.
    overall_ok = bool(np.all(compliance_results["Overall Compliance"]))
    policy_flags = [overall_ok] * len(initial_policies)
    print(f"Compliance Results: {policy_flags}")

    adjusted_policies = adjust_policies(initial_policies, policy_flags)
    print(f"Adjusted Policies: {adjusted_policies}\n")
    print_policies(initial_policies, "Initial Policies")
    print_policies(adjusted_policies, "Adjusted Policies")
    enforce_policies(adjusted_policies)

    # Sending alerts for non-compliance, based on the policy adjustments
    for policy, action in adjusted_policies.items():
        if "Adjust Policy" in action:
            send_alert(f"Threats detected. {action}")
        else:
            print(f"{policy} is compliant and can continue processing data.")

# Execute main function
if __name__ == "__main__":
    main()

VBox(children=(Button(button_style='success', description='Load Data', style=ButtonStyle()), Button(button_sty…

[1mIoT data points after Splitting the Testing and Training data
Training set shape:
X_train: (86181, 94)
y_train: (86181,)

Testing set shape:
X_test: (36936, 94)
y_test: (36936,)
Detecting threats...
[1m1155/1155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step
[1m1155/1155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 26ms/step
Final predictions generated and threat detection completed
Compliance Check Results: {'General Compliance': array([[ True,  True, False, ...,  True,  True,  True],
       [ True,  True, False, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True, False,  True],
       ...,
       [ True,  True, False, ...,  True,  True,  True],
       [ True,  True, False, ...,  True,  True,  True],
       [ True,  True, False, ...,  True,  True,  True]]), 'GDPR Compliance': array([[ True,  True, False, ...,  True,  True,  True],
       [ True,  True, False, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True, False,  Tru