## IDS - Intrusion Detection System

In [2]:
pip install pandas scikit-learn numpy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [11]:
# Load the training and testing datasets.
# The data directory can be overridden through the IDS_DATA_DIR environment variable;
# when it is unset the original local path is used, so existing setups keep working.
DATA_DIR = Path(os.environ.get("IDS_DATA_DIR", r"C:\Users\User\Documents\Cybersecurity\IDS\IDS\Data"))

# NOTE(review): the evaluation output recorded in this notebook reports a support of
# 175,341 rows for the frame loaded here as the *test* set. That figure appears to match
# the published size of the UNSW-NB15 training partition, so the two CSV files may be
# swapped relative to the intended split -- TODO confirm against the dataset description.
train_data = pd.read_csv(DATA_DIR / "UNSW_NB15_training-set.csv")
test_data = pd.read_csv(DATA_DIR / "UNSW_NB15_testing-set.csv")

In [12]:
# Print the column labels of the training frame
training_columns = train_data.columns
print(training_columns)

Index(['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label'],
      dtype='object')


In [13]:
# Discard the "id" and "attack_cat" columns from both frames
unused_columns = ["id", "attack_cat"]
train_data, test_data = (
    frame.drop(columns=unused_columns) for frame in (train_data, test_data)
)

In [14]:
# Remove every row that contains at least one missing value (no effect if there are none)
train_data, test_data = (frame.dropna() for frame in (train_data, test_data))

In [18]:
from sklearn.preprocessing import LabelEncoder

# Categorical features are taken to be every non-numeric column of the training frame
# (the target column is excluded explicitly). The list is defined here because no
# earlier cell in the notebook assigns `categorical_cols`, which would otherwise make
# this cell fail with a NameError on a fresh kernel.
categorical_cols = [
    col
    for col in train_data.select_dtypes(exclude="number").columns
    if col != "label"
]

# One fitted encoder per categorical column, keyed by column name
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    train_data[col] = encoder.fit_transform(train_data[col])
    label_encoders[col] = encoder

    # Encode the test column with the codes learned from the training data.
    # The category -> code lookup is built once per column instead of rebuilding the
    # class list for every row; categories unseen during training are mapped to -1.
    class_to_code = {category: code for code, category in enumerate(encoder.classes_)}
    test_data[col] = test_data[col].map(class_to_code).fillna(-1).astype("int64")



In [19]:
# Binarise the target: a value of 0 stays 0 (normal), any other value becomes 1 (attack)
for frame in (train_data, test_data):
    frame["label"] = frame["label"].ne(0).astype("int64")

In [20]:
# Standardise the numerical feature columns, i.e. every column that is neither
# categorical nor the target. The scaler is fitted on the training data only and
# then applied unchanged to both frames.
excluded_cols = list(categorical_cols) + ["label"]
numerical_cols = train_data.columns.difference(excluded_cols)

scaler = StandardScaler()
scaler.fit(train_data[numerical_cols])
for frame in (train_data, test_data):
    frame[numerical_cols] = scaler.transform(frame[numerical_cols])

In [21]:
# Write both preprocessed frames to CSV files, omitting the index column
csv_targets = ((train_data, "train_processed.csv"), (test_data, "test_processed.csv"))
for frame, csv_name in csv_targets:
    frame.to_csv(csv_name, index=False)

## Model Training

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [23]:
# Read the preprocessed train and test frames back from their CSV files
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("train_processed.csv", "test_processed.csv")
)

In [24]:
# Separate the feature matrix (all columns except "label") from the target vector
target_col = "label"
X_train, y_train = train_data.drop(columns=target_col), train_data[target_col]
X_test, y_test = test_data.drop(columns=target_col), test_data[target_col]

In [25]:
# Fit a random forest of 100 trees; the fixed random_state makes the run repeatable
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

In [26]:
# Score the fitted model on the test set: overall accuracy plus the per-class report
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("Classification Report:\n", report_text)

Accuracy: 0.9008617493911864
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.98      0.86     56000
           1       0.99      0.87      0.92    119341

    accuracy                           0.90    175341
   macro avg       0.88      0.92      0.89    175341
weighted avg       0.92      0.90      0.90    175341



In [27]:
import joblib

# Serialise the fitted model to disk; the call's return value is the cell output
model_path = "intrusion_detection_model_unsw.pkl"
joblib.dump(model, model_path)

['intrusion_detection_model_unsw.pkl']

## Feature Importance Analysis

In [None]:
# Install visualization libraries
pip install matplotlib seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import joblib

# Plot appearance: apply the matplotlib style sheet first, then the seaborn palette
style_sheet = 'seaborn-v0_8-darkgrid'
palette_name = "husl"
plt.style.use(style_sheet)
sns.set_palette(palette_name)

In [None]:
# Restore the fitted classifier from disk
model = joblib.load('intrusion_detection_model_unsw.pkl')

# Feature names are the columns of the preprocessed training data minus the target
train_data = pd.read_csv('train_processed.csv')
feature_names = train_data.columns.drop('label').tolist()

feature_count = len(feature_names)
print(f"Total number of features: {feature_count}")
print(f"\nFeature names: {feature_names[:10]}...")  # preview of the first 10 names

In [None]:
# Importance scores exposed by the fitted random forest, one per feature
importances = model.feature_importances_

# Pair each feature name with its score and rank from most to least important
unranked = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = unranked.sort_values(by='Importance', ascending=False)

# Report the 20 highest-ranked features and the share of importance they cover
divider = "=" * 50
top_10_share = feature_importance_df.head(10)['Importance'].sum()
top_20_share = feature_importance_df.head(20)['Importance'].sum()

print("Top 20 Most Important Features:")
print(divider)
print(feature_importance_df.head(20).to_string(index=False))
print("\n" + divider)
print(f"\nTop 10 features account for {top_10_share:.2%} of total importance")
print(f"Top 20 features account for {top_20_share:.2%} of total importance")

In [None]:
# Visualization 1: horizontal bar chart of the 20 highest-ranked features
top_20 = feature_importance_df.head(20)
bar_positions = range(len(top_20))

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, top_20['Importance'], color='steelblue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_20['Feature'])
ax.set_xlabel('Importance Score', fontsize=12, fontweight='bold')
ax.set_ylabel('Features', fontsize=12, fontweight='bold')
ax.set_title('Top 20 Most Important Features for Intrusion Detection',
             fontsize=14, fontweight='bold', pad=20)
ax.invert_yaxis()  # put the highest importance at the top
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
fig.savefig('feature_importance_top20.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Saved: feature_importance_top20.png")

In [None]:
# Visualization 2: vertical bar chart of the 15 highest-ranked features, annotated with scores
top_15 = feature_importance_df.head(15)
bar_positions = range(len(top_15))

fig, ax = plt.subplots(figsize=(14, 6))
bars = ax.bar(bar_positions, top_15['Importance'],
              color='coral', edgecolor='black', linewidth=1.2)
ax.set_xticks(bar_positions)
ax.set_xticklabels(top_15['Feature'], rotation=45, ha='right')
ax.set_xlabel('Features', fontsize=12, fontweight='bold')
ax.set_ylabel('Importance Score', fontsize=12, fontweight='bold')
ax.set_title('Top 15 Most Important Features with Importance Scores',
             fontsize=14, fontweight='bold', pad=20)
ax.grid(axis='y', alpha=0.3)

# Write each bar's value just above it, centred horizontally on the bar
for bar in bars:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width()/2.
    ax.text(bar_centre, bar_height, f'{bar_height:.4f}',
            ha='center', va='bottom', fontsize=9)

fig.tight_layout()
fig.savefig('feature_importance_top15_vertical.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Saved: feature_importance_top15_vertical.png")

In [None]:
# Visualization 3: cumulative importance over the features in ranked order
cumulative_importance = np.cumsum(feature_importance_df['Importance'])
feature_counts = range(1, len(cumulative_importance) + 1)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(feature_counts, cumulative_importance,
        linewidth=2.5, color='darkgreen', marker='o', markersize=3, markevery=5)

# Horizontal reference lines at the two thresholds (these carry the legend entries)
threshold_lines = ((0.95, 'red', '95% Threshold'), (0.90, 'orange', '90% Threshold'))
for level, line_color, line_label in threshold_lines:
    ax.axhline(y=level, color=line_color, linestyle='--', linewidth=2, label=line_label)

ax.set_xlabel('Number of Features', fontsize=12, fontweight='bold')
ax.set_ylabel('Cumulative Importance', fontsize=12, fontweight='bold')
ax.set_title('Cumulative Feature Importance', fontsize=14, fontweight='bold', pad=20)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=10)

# Number of top-ranked features at which the cumulative sum first reaches each threshold
# (argmax returns the first position where the condition holds; +1 turns it into a count)
features_90 = np.argmax(cumulative_importance >= 0.90) + 1
features_95 = np.argmax(cumulative_importance >= 0.95) + 1
ax.axvline(x=features_90, color='orange', linestyle=':', alpha=0.5)
ax.axvline(x=features_95, color='red', linestyle=':', alpha=0.5)

ax.text(features_90, 0.5, f'{features_90} features\nfor 90%',
        ha='center', fontsize=9,
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
ax.text(features_95, 0.6, f'{features_95} features\nfor 95%',
        ha='center', fontsize=9,
        bbox=dict(boxstyle='round', facecolor='lightcoral', alpha=0.5))

fig.tight_layout()
fig.savefig('cumulative_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\n✓ Saved: cumulative_feature_importance.png")
print(f"\nInsight: Only {features_90} features are needed to capture 90% of the model's predictive power")
print(f"Insight: Only {features_95} features are needed to capture 95% of the model's predictive power")

In [None]:
# Visualization 4: pairwise correlations between the 10 highest-ranked features
top_10_features = feature_importance_df['Feature'].head(10).tolist()
train_data_subset = train_data[top_10_features]
correlation_matrix = train_data_subset.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8}, ax=ax)
ax.set_title('Correlation Heatmap of Top 10 Most Important Features',
             fontsize=14, fontweight='bold', pad=20)
fig.tight_layout()
fig.savefig('top10_features_correlation.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Saved: top10_features_correlation.png")

In [None]:
# Summary statistics of the importance scores
banner = "="*60
mean_importance = importances.mean()
top_row = feature_importance_df.iloc[0]      # highest-ranked feature
bottom_row = feature_importance_df.iloc[-1]  # lowest-ranked feature
n_above_mean = (importances > mean_importance).sum()
n_below_mean = (importances < mean_importance).sum()

print("\n" + banner)
print("FEATURE IMPORTANCE ANALYSIS SUMMARY")
print(banner)
print(f"\nTotal Features: {len(feature_names)}")
print(f"\nMost Important Feature: {top_row['Feature']}")
print(f"Importance Score: {top_row['Importance']:.4f}")
print(f"\nLeast Important Feature: {bottom_row['Feature']}")
print(f"Importance Score: {bottom_row['Importance']:.6f}")
print(f"\nMean Importance: {mean_importance:.4f}")
print(f"Median Importance: {np.median(importances):.4f}")
print(f"Std Deviation: {importances.std():.4f}")
print(f"\nFeatures above mean importance: {n_above_mean}")
print(f"Features below mean importance: {n_below_mean}")
print("\n" + banner)

# Export the complete ranked table to CSV
feature_importance_df.to_csv('feature_importance_full.csv', index=False)
print("\n✓ Saved: feature_importance_full.csv")