# NeuroGuardX: An Explainable AI Pipeline for Intrusion Detection\n\nThis notebook provides a tutorial for the NeuroGuardX system, an Explainable AI (XAI) pipeline for intrusion detection in Online Social Networks (OSN) and Network Systems. We will walk through each step of the pipeline, from data preprocessing to model explanation.

In [None]:
import pandas as pd\nimport numpy as np\nfrom scipy import stats\nfrom sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder\nimport requests\nimport os\nfrom sklearn.decomposition import PCA\nfrom sklearn.manifold import TSNE\nfrom sklearn.feature_selection import RFE\nfrom sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\nfrom sklearn.svm import SVC\nfrom sklearn.model_selection import train_test_split\nfrom tensorflow.keras.models import Sequential, Model\nfrom tensorflow.keras.layers import LSTM, Dense, Input\nfrom tensorflow.keras.optimizers import Adam\nimport shap\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\nprint('Setup Complete!')

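## 0. Data Download

This notebook uses the NSL-KDD dataset, which we download from a public Figshare repository. Because the source does not provide a separate test file, the "test" file created below is a 20% random sample of the training file, so the two files overlap; it is intended for demonstration only.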

In [None]:
# Download the training CSV (once) and derive a demonstration "test" file from it.
os.makedirs('data', exist_ok=True)

url = 'https://figshare.com/ndownloader/files/54839969'
train_path = 'data/Train_data.csv'

# Reuse an already-downloaded file so a full re-run does not hit the network again.
if not os.path.exists(train_path):
    # A timeout prevents the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    # Fail loudly on HTTP errors instead of saving an error page as the CSV.
    response.raise_for_status()
    with open(train_path, 'wb') as f:
        f.write(response.content)

# Create a dummy Test_data.csv for demonstration purposes, as a separate test file is not available from the source.
# NOTE: the sample is drawn from the training rows, so both files share records.
train_df = pd.read_csv(train_path)
test_df = train_df.sample(frac=0.2, random_state=42)
test_df.to_csv('data/Test_data.csv', index=False)

print('Data downloaded and dummy test set created.')

## 1. Data Preprocessing

In this section, we'll preprocess the OSN and Network datasets. The preprocessing pipeline consists of three main steps:

1.  **Data Collection:** We load the data from the `data` directory.
2.  **Data Cleaning:** We fill missing numeric values with the column mean (outlier removal is not implemented in this notebook).
3.  **Data Transformation:** We scale numerical features and one-hot encode categorical features.

In [None]:
def collect_data(dataset_path):
    """Load a CSV file into a DataFrame.

    Args:
        dataset_path: Location passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails. The error is
        printed rather than raised, so callers must check for ``None``.
    """
    try:
        df = pd.read_csv(dataset_path)
        print(f"Successfully loaded data from local file: {dataset_path}")
        return df
    except Exception as e:
        print(f"An error occurred while collecting data from {dataset_path}: {e}")
        return None


def clean_data(df):
    """Return a copy of ``df`` with missing numeric values mean-imputed.

    The input frame is left untouched: callers pass in slices of a larger
    frame, and writing into such a slice triggers pandas copy warnings and
    makes re-running a cell non-idempotent.

    Args:
        df: DataFrame to clean.

    Returns:
        A new DataFrame in which NaNs in numeric columns are replaced by the
        mean of their column. Non-numeric columns are copied unchanged.
    """
    cleaned = df.copy()
    numeric_cols = cleaned.select_dtypes(include=np.number).columns
    if len(numeric_cols) > 0:
        cleaned[numeric_cols] = cleaned[numeric_cols].fillna(cleaned[numeric_cols].mean())
    return cleaned


def transform_data(data, method):
    """Scale numeric columns and one-hot encode the remaining columns.

    Args:
        data: DataFrame of features.
        method: ``"Z-score"`` (StandardScaler) or ``"Min-Max"`` (MinMaxScaler).
            Only consulted when ``data`` has numeric columns.

    Returns:
        DataFrame with the scaled numeric columns followed by the one-hot
        encoded columns, sharing the index of ``data``.

    Raises:
        ValueError: If ``data`` has numeric columns and ``method`` is not one
            of the supported names.
    """
    numeric_data = data.select_dtypes(include=np.number)
    categorical_data = data.select_dtypes(exclude=np.number)

    if not numeric_data.empty:
        if method == "Z-score":
            scaler = StandardScaler()
        elif method == "Min-Max":
            scaler = MinMaxScaler()
        else:
            # Previously an unknown method fell through and crashed later
            # with an UnboundLocalError on `scaler`.
            raise ValueError(
                f"Unknown scaling method {method!r}; expected 'Z-score' or 'Min-Max'."
            )
        scaled_numeric_data = scaler.fit_transform(numeric_data)
        scaled_df = pd.DataFrame(scaled_numeric_data, columns=numeric_data.columns, index=numeric_data.index)
    else:
        scaled_df = pd.DataFrame()

    if not categorical_data.empty:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        encoded_categorical_data = encoder.fit_transform(categorical_data)
        encoded_df = pd.DataFrame(
            encoded_categorical_data,
            columns=encoder.get_feature_names_out(categorical_data.columns),
            index=categorical_data.index,
        )
    else:
        encoded_df = pd.DataFrame()

    return pd.concat([scaled_df, encoded_df], axis=1)


def preprocess_data(osn_path, network_path):
    """Load, clean and transform the OSN and Network datasets.

    The last column of each file is treated as the label; every other column
    is a feature. OSN features are Z-score scaled and Network features are
    Min-Max scaled. Scalers and encoders are fitted on each dataset
    independently, so the resulting columns and label codes of the two
    datasets are not guaranteed to line up with each other.

    Args:
        osn_path: CSV path for the OSN dataset.
        network_path: CSV path for the Network dataset.

    Returns:
        ``(osn_features, osn_labels, network_features, network_labels)`` where
        the features are DataFrames and the labels are integer-coded Series,
        or ``(None, None, None, None)`` if either file could not be loaded.
    """
    # For the purpose of this notebook, we'll assign the NSL-KDD dataset to both OSN and Network roles.
    OSN_data = collect_data(osn_path)
    Network_data = collect_data(network_path)

    if OSN_data is None or Network_data is None:
        return None, None, None, None

    # Separate features and labels
    OSN_features = OSN_data.iloc[:, :-1]
    OSN_labels = OSN_data.iloc[:, -1]
    Network_features = Network_data.iloc[:, :-1]
    Network_labels = Network_data.iloc[:, -1]

    # Encode labels (one encoder per dataset; each is fitted on its own labels)
    OSN_labels_encoded = LabelEncoder().fit_transform(OSN_labels)
    Network_labels_encoded = LabelEncoder().fit_transform(Network_labels)

    # Clean and transform features only
    OSN_cleaned_features = clean_data(OSN_features)
    Network_cleaned_features = clean_data(Network_features)
    OSN_transformed_features = transform_data(OSN_cleaned_features, "Z-score")
    Network_transformed_features = transform_data(Network_cleaned_features, "Min-Max")

    return (
        OSN_transformed_features,
        pd.Series(OSN_labels_encoded),
        Network_transformed_features,
        pd.Series(Network_labels_encoded),
    )

In [None]:
# The downloaded training file plays the OSN role; the sampled file plays the Network role.
OSN_dataset_path = 'data/Train_data.csv'
Network_dataset_path = 'data/Test_data.csv'

(
    OSN_processed_features,
    OSN_processed_labels,
    Network_processed_features,
    Network_processed_labels,
) = preprocess_data(OSN_dataset_path, Network_dataset_path)

# Show the first rows of each transformed feature table, header first.
print("OSN Processed Features Head:", OSN_processed_features.head(), sep="\n")
print("Network Processed Features Head:", Network_processed_features.head(), sep="\n")

### Data Visualization

In [None]:
# Histogram (with KDE overlay) of the first transformed OSN feature column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(OSN_processed_features.iloc[:, 0], kde=True, ax=ax)
ax.set_title('Distribution of the First Feature in the OSN Dataset')
plt.show()

## 2. Feature Selection

In this section, we'll select the most important features from the preprocessed datasets. The feature selection pipeline consists of two main steps:

1.  **Feature Importance:** We use Recursive Feature Elimination (RFE) with a random forest to keep at most five features.
2.  **Dimensionality Reduction:** We reduce the selected features to two components using PCA (t-SNE is used instead only for OSN feature sets with more than 50 numeric columns).

In [None]:
def reduce_dimensionality(features, dataset_type):
    """Project the numeric columns of ``features`` onto at most two components.

    t-SNE is used for the ``"OSN"`` dataset when it has more than 50 numeric
    columns; PCA is used in every other case. Non-numeric columns are passed
    through unchanged and appended after the components.

    Args:
        features: DataFrame of features.
        dataset_type: Dataset name; only the value ``"OSN"`` changes behaviour.

    Returns:
        DataFrame with the reduced components (integer column labels) followed
        by any non-numeric columns, or ``features`` itself when it has no
        numeric data.
    """
    numeric_features = features.select_dtypes(include=np.number)
    non_numeric_features = features.select_dtypes(exclude=np.number)

    if numeric_features.empty:
        return features

    n_samples, n_columns = numeric_features.shape

    if dataset_type == "OSN" and n_columns > 50:
        # NOTE: when called through feature_selection() the input has already
        # been cut to at most 5 columns, so this branch is not reached there.
        reducer = TSNE(n_components=2, perplexity=min(30, n_samples - 1), random_state=42)
    else:
        # PCA rejects n_components larger than min(n_samples, n_features),
        # so clamp instead of crashing on very small inputs.
        n_components = min(2, n_samples, n_columns)
        reducer = PCA(n_components=n_components, random_state=42)

    reduced_numeric_data = reducer.fit_transform(numeric_features)
    reduced_df = pd.DataFrame(reduced_numeric_data, index=numeric_features.index)

    return pd.concat([reduced_df, non_numeric_features], axis=1)


def select_important_features(features, labels):
    """Keep the numeric columns chosen by RFE with a random forest.

    Args:
        features: DataFrame of features.
        labels: Target values aligned row-wise with ``features``.

    Returns:
        DataFrame with at most five selected numeric columns followed by all
        non-numeric columns, or ``features`` itself when it has no numeric
        data.
    """
    numeric_features = features.select_dtypes(include=np.number)
    non_numeric_features = features.select_dtypes(exclude=np.number)

    if numeric_features.empty:
        return features

    estimator = RandomForestClassifier(random_state=42)
    n_features_to_select = min(5, numeric_features.shape[1])
    # step=1 drops one feature per iteration, refitting the forest each time.
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(numeric_features, labels)

    selected_numeric_features = numeric_features.loc[:, selector.support_]

    return pd.concat([selected_numeric_features, non_numeric_features], axis=1)


def feature_selection(features, labels, dataset_type):
    """Run RFE feature selection followed by dimensionality reduction.

    Args:
        features: DataFrame of preprocessed features, or ``None``.
        labels: Target values aligned row-wise with ``features``.
        dataset_type: Dataset name used in messages and forwarded to
            ``reduce_dimensionality``.

    Returns:
        The reduced feature DataFrame, or ``None`` when ``features`` is
        ``None`` or empty.
    """
    if features is None or features.empty:
        print(f"Features for {dataset_type} is empty. Skipping feature selection.")
        return None

    X_important = select_important_features(features, labels)
    X_reduced = reduce_dimensionality(X_important, dataset_type)

    print(f"Feature selection complete for {dataset_type} dataset.")
    return X_reduced

In [None]:
# Select and reduce features for both datasets before showing the results.
OSN_features_selected = feature_selection(
    OSN_processed_features, OSN_processed_labels, "OSN"
)
Network_features_selected = feature_selection(
    Network_processed_features, Network_processed_labels, "Network"
)

# Header line followed by the first rows of each reduced feature table.
print("OSN Features Head:", OSN_features_selected.head(), sep="\n")
print("Network Features Head:", Network_features_selected.head(), sep="\n")

## 3. Model Training

In [None]:
def train_deep_learning_model(features, labels, dataset_type, seed=42):
    """Train a small dense softmax classifier on the selected features.

    Args:
        features: DataFrame of input features, or ``None``.
        labels: Series of integer class codes aligned with ``features``, or
            ``None``. The output layer gets one unit per distinct label, so
            codes are assumed to be contiguous starting at 0.
        dataset_type: Dataset name, used only in the skip message.
        seed: Seed applied to the Python, NumPy and TensorFlow global random
            generators and to the train/validation split so that repeated
            runs start from the same weights and batches. Pass ``None`` to
            leave the global generators untouched and get a random split.

    Returns:
        The fitted Keras model, or ``None`` when inputs are missing or empty.
    """
    if features is None or labels is None or features.empty:
        print(f"Skipping deep learning model training for {dataset_type} due to missing data.")
        return None

    if seed is not None:
        # Without seeding, weight initialisation and batch shuffling differ on
        # every run, so a Restart & Run All does not reproduce the results.
        # Note this resets the global RNG state as a side effect.
        from tensorflow.keras.utils import set_random_seed
        set_random_seed(seed)

    num_classes = len(np.unique(labels))
    model = Sequential([
        Input(shape=(features.shape[1],)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])

    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Hold out 20% of the rows for validation.
    X_train, X_val, y_train, y_val = train_test_split(
        features.values, labels.values, test_size=0.2, random_state=seed
    )

    model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val), verbose=1)
    return model

osn_dl_model = train_deep_learning_model(OSN_features_selected, OSN_processed_labels, "OSN")
network_dl_model = train_deep_learning_model(Network_features_selected, Network_processed_labels, "Network")

## 4. Explainable AI

In [None]:
def lrp_rule(layer, R, a):
    """Propagate relevance ``R`` one layer backwards with the LRP-epsilon rule.

    Args:
        layer: Keras layer whose first weight array is the kernel and whose
            optional second array is the bias.
        R: Relevance at the layer's output, shape ``(n_samples, n_out)``.
        a: Activations fed into the layer, shape ``(n_samples, n_in)``.

    Returns:
        Relevance at the layer's input, shape ``(n_samples, n_in)``. Layers
        without weights return ``R`` unchanged (this assumes such layers keep
        the shape of their input, e.g. an input layer).
    """
    weights = layer.get_weights()
    if not weights:
        return R
    w = weights[0]
    b = weights[1] if len(weights) > 1 else 0
    epsilon = 1e-7
    z = np.dot(a, w) + b
    # Push z away from zero in the direction of its own sign; adding a plain
    # +epsilon would move small negative values *towards* zero and blow up
    # the division.
    z = z + epsilon * np.where(z >= 0, 1.0, -1.0)
    s = R / z
    c = np.dot(s, w.T)
    return a * c


def lrp_explain(model, data):
    """Compute per-feature relevance for ``data`` by layer-wise propagation.

    Relevance starts as the model's predicted outputs and is pushed back
    through every layer, including the first one, so the result has one
    column per input feature.

    Args:
        model: Keras model made of layers supported by ``lrp_rule``.
        data: Input array of shape ``(n_samples, n_features)``.

    Returns:
        Array of relevance scores with the same shape as ``data``.
    """
    layer_outputs = [layer.output for layer in model.layers]
    activation_model = Model(inputs=model.input, outputs=layer_outputs)
    activations = activation_model.predict(data)
    # A model with a single output returns a bare array instead of a list.
    if not isinstance(activations, (list, tuple)):
        activations = [activations]
    R = model.predict(data)
    # Walk from the last layer down to index 0. The first layer's input is the
    # data itself; stopping at index 1 (as before) left the relevance at the
    # width of the first hidden layer instead of the input features.
    for i in range(len(model.layers) - 1, -1, -1):
        layer_input = activations[i - 1] if i > 0 else data
        R = lrp_rule(model.layers[i], R, layer_input)
    return R


def shap_explain(model, data, model_type='deep_learning'):
    """Compute SHAP values for ``data``, using ``data`` itself as background.

    Args:
        model: Model to explain.
        data: Array used both as the explainer background and as the samples
            to explain.
        model_type: ``'deep_learning'`` (GradientExplainer), ``'tree'``
            (TreeExplainer); any other value uses KernelExplainer on
            ``model.predict``.

    Returns:
        A 2-D array of SHAP values. For multi-output models only the values
        for the first output (class 0) are returned.
    """
    if model_type == 'deep_learning':
        explainer = shap.GradientExplainer(model, data)
    elif model_type == 'tree':
        explainer = shap.TreeExplainer(model, data)
    else:
        explainer = shap.KernelExplainer(model.predict, data)

    shap_values = explainer.shap_values(data)

    if isinstance(shap_values, list):
        # Older SHAP releases: one array per model output.
        shap_values = shap_values[0]
    elif getattr(shap_values, "ndim", 0) == 3:
        # Newer SHAP releases appear to return a single array with the
        # outputs on the last axis (verify against the installed version);
        # keep the first output so the result stays 2-D either way.
        shap_values = shap_values[..., 0]

    return shap_values

osn_lrp = lrp_explain(osn_dl_model, OSN_features_selected.values)
osn_shap = shap_explain(osn_dl_model, OSN_features_selected.values, 'deep_learning')

print("OSN LRP Values Head:")
print(pd.DataFrame(osn_lrp, columns=OSN_features_selected.columns).head())

shap.summary_plot(osn_shap, OSN_features_selected)

## 5. Conclusion