In [5]:
import numpy as np
import random
from datetime import datetime
import pandas as pd
import joblib
import pyshark
import asyncio
import nest_asyncio
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier
from scapy.all import *

# Placeholder label encoder fitted on the five attack labels listed below.
# (Swap this for the encoder that belongs to your actual trained model.)
attack_labels = ["Normal", "DoS", "Probe", "R2L", "U2R"]
label_encoder = LabelEncoder().fit(attack_labels)

# Default dataset locations. These are machine-specific; pass other paths to
# load_data() to read the files from elsewhere.
_KDD_NAMES_PATH = "C:/Users/HP/Desktop/FILES/Intrusion-Detection-System-master/Intrusion-Detection-System-master/dataset/kddcup.names"
_KDD_DATA_PATH = "C:/Users/HP/Desktop/FILES/Intrusion-Detection-System-master/Intrusion-Detection-System-master/dataset/kddcup.data_10_percent.gz"


# Load Dataset with Correct Column Names
def load_data(names_path=_KDD_NAMES_PATH, data_path=_KDD_DATA_PATH):
    """Read the dataset into a DataFrame with named columns.

    Column names are taken from ``names_path``: every line containing a
    colon contributes the text before its first colon (stripped). Lines
    without a colon are ignored. A final column called ``"target"`` is
    appended for the label.

    Args:
        names_path: Path of the ``.names`` file describing the columns.
        data_path: Path of the header-less CSV data file (anything
            ``pandas.read_csv`` can open).

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with those column names.
    """
    with open(names_path, "r") as f:
        # Iterate the file lazily instead of materialising it with readlines().
        column_names = [line.split(":")[0].strip() for line in f if ":" in line]
    column_names.append("target")

    return pd.read_csv(data_path, names=column_names)

# Encode Categorical Features
def encode_categorical_columns(df):
    """Replace every object-dtype column of ``df`` with integer label codes.

    Each object column is encoded by its own freshly fitted ``LabelEncoder``.
    ``df`` is modified in place and also returned. The fitted encoders are
    not kept.
    """
    object_columns = list(df.select_dtypes(include=['object']).columns)
    for column in object_columns:
        encoder = LabelEncoder()
        df[column] = encoder.fit_transform(df[column])
    return df

# Rebalance dataset if needed
def rebalance_dataset(df):
    """Label-encode ``df['target']`` and resample when only one class exists.

    If fewer than two distinct target values are present, each target group
    is resampled to 500 rows (with replacement, seed 42) and the labels are
    re-encoded from the resampled frame. The class distribution is printed
    in either case.

    Returns:
        ``(df, Y_balanced)``: the (possibly resampled) frame and the encoded
        target array.
    """
    def _sample_group(group):
        return group.sample(n=500, replace=True, random_state=42)

    encoded_target = LabelEncoder().fit_transform(df['target'])
    class_count = len(np.unique(encoded_target))

    if class_count < 2:
        print("⚠️ Only one class found in dataset. Rebalancing...")
        df = df.groupby("target", group_keys=False).apply(_sample_group)
        encoded_target = LabelEncoder().fit_transform(df['target'])

    distribution = np.unique(encoded_target, return_counts=True)
    print("✅ Balanced dataset class distribution:", distribution)
    return df, encoded_target

# Perform Stratified Train-Test Split
def stratified_split(df, Y_balanced):
    """Split features and labels into stratified train/test parts (33% test).

    The features are every column of ``df`` except ``'target'``. The split
    comes from a single ``StratifiedShuffleSplit`` draw with
    ``random_state=42``. The training-set class distribution is printed.

    Returns:
        ``(X_train, X_test, Y_train, Y_test)``.
    """
    features = df.drop(columns=['target'])
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.33, random_state=42)

    # n_splits=1, so the generator yields exactly one (train, test) index pair.
    train_index, test_index = next(splitter.split(features, Y_balanced))

    X_train = features.iloc[train_index]
    X_test = features.iloc[test_index]
    Y_train = Y_balanced[train_index]
    Y_test = Y_balanced[test_index]

    print("✅ New Class Distribution in Training Set:", np.unique(Y_train, return_counts=True))
    return X_train, X_test, Y_train, Y_test

# Normalize Features
def normalize_features(X_train, X_test):
    """Min-max scale both feature sets using statistics from ``X_train`` only.

    Returns:
        ``(X_train_scaled, X_test_scaled, scaler)``, where ``scaler`` is the
        fitted ``MinMaxScaler``.
    """
    scaler = MinMaxScaler().fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)
    return train_scaled, test_scaled, scaler

# Train Models
def train_models(X_train, Y_train, X_test, Y_test, scaler=None, target=None):
    """Fit each candidate classifier, print its test accuracy, save artifacts.

    Three files are written to the current working directory with joblib:
    ``best_nids_model.pkl`` (the Extra Trees model), ``scaler.pkl`` and
    ``label_encoder.pkl``.

    Args:
        X_train, Y_train: Training features and labels.
        X_test, Y_test: Held-out features and labels used for the accuracy
            printout.
        scaler: Fitted feature scaler to persist. When omitted, the
            module-level ``scaler`` is used, which keeps the original
            four-argument call working.
        target: Target values on which the saved ``LabelEncoder`` is fitted.
            When omitted, ``df['target']`` from module scope is used.

    Returns:
        The fitted Extra Trees classifier.

    Raises:
        NameError: If ``scaler`` / ``target`` are omitted and the
            corresponding module-level ``scaler`` / ``df`` does not exist.
    """
    # Resolve the artifacts up front so a missing dependency fails before
    # the (slow) training loop rather than after it.
    module_scope = globals()
    if scaler is None:
        if "scaler" not in module_scope:
            raise NameError(
                "train_models: no scaler given and no module-level 'scaler' defined"
            )
        scaler = module_scope["scaler"]
    if target is None:
        if "df" not in module_scope:
            raise NameError(
                "train_models: no target given and no module-level 'df' defined"
            )
        target = module_scope["df"]['target']

    models = {
        "Naïve Bayes": GaussianNB(),
        "Decision Tree": DecisionTreeClassifier(criterion="entropy", max_depth=4),
        "Random Forest": RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
        "Logistic Regression": LogisticRegression(max_iter=12000, class_weight='balanced'),
        "Ridge Classifier": RidgeClassifier(class_weight='balanced', random_state=42),
        "Extra Trees Classifier": ExtraTreesClassifier(n_estimators=50, random_state=42),
        "Passive-Aggressive Classifier": PassiveAggressiveClassifier(random_state=42)
    }

    for name, model in models.items():
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)
        print(f"\n{name} Accuracy: {accuracy_score(Y_test, Y_pred) * 100:.2f}%")

    # Save best model (ETC) for real-time detection.
    # The choice is fixed by name; it is not selected from the accuracies above.
    best_model = models["Extra Trees Classifier"]
    joblib.dump(best_model, "best_nids_model.pkl")
    joblib.dump(scaler, "scaler.pkl")
    # NOTE(review): when `target` has already been label-encoded to integers
    # (as in this script's pipeline), the encoder saved here maps integers to
    # integers, not to the original attack names — confirm this is intended.
    joblib.dump(LabelEncoder().fit(target), "label_encoder.pkl")
    return best_model

# Function to simulate attack names and severities randomly
def random_attack_simulation():
    """Pick a random attack label and return it with its severity.

    Returns:
        ``(attack_name, severity)``: one of the five labels below, chosen
        with ``random.choice``, and the severity string mapped to it.
    """
    severity_by_attack = {
        "Normal": "Low",
        "DoS": "Critical",
        "Probe": "Medium",
        "R2L": "High",
        "U2R": "Severe",
    }

    # Dict keys keep insertion order, so this is the same candidate list.
    chosen = random.choice(list(severity_by_attack))
    return chosen, severity_by_attack.get(chosen, "Unknown")

# Simulate and display results with random attack names and severity
def display_simulated_output():
    """Print a separator, a header row and one row for a random attack.

    The data row holds the current local time, the upper-cased attack name
    and its severity, as returned by ``random_attack_simulation``.
    """
    current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    attack_name, severity = random_attack_simulation()

    table_lines = (
        f"\n{'='*40}",
        f"| {'Time':<20} | {'Attack Name':<20} | {'Severity':<10} |",
        f"| {current_time:<20} | {attack_name.upper():<20} | {severity:<10} |",
    )
    for table_line in table_lines:
        print(table_line)

# Capture Packets and Predict Attacks
async def capture_packets():
    """Print ten simulated detection rows, pausing two seconds after each.

    No packets are captured here; each row comes from
    ``display_simulated_output``.
    """
    print("Simulating packet capture and attack detection...\n")
    rounds_left = 10
    while rounds_left > 0:
        display_simulated_output()
        await asyncio.sleep(2)
        rounds_left -= 1

# Script entry point: load and encode the dataset, train and persist the
# models, then run the simulated detection loop.
#
# NOTE: `df` and `scaler` are deliberately module-level names here.
# train_models() reads both as globals (they are not passed as arguments),
# so the statement order and these names must be preserved.
if __name__ == "__main__":
    # Load and process dataset
    df = load_data()
    df = encode_categorical_columns(df)
    df, Y_balanced = rebalance_dataset(df)
    
    # Stratified Train-Test Split
    X_train, X_test, Y_train, Y_test = stratified_split(df, Y_balanced)
    
    # Normalize features
    X_train, X_test, scaler = normalize_features(X_train, X_test)
    
    # Train models and save the best model
    best_model = train_models(X_train, Y_train, X_test, Y_test)
    
    # Load model for real-time detection
    # nest_asyncio presumably allows run_until_complete() on an already
    # running loop (e.g. inside a notebook) — confirm against its docs.
    nest_asyncio.apply()
    # NOTE(review): `ml_model` is loaded but not used anywhere in the visible
    # code; capture_packets() only prints simulated results.
    ml_model = joblib.load("best_nids_model.pkl")
    # Rebinds the module-level placeholder `label_encoder` to the saved one.
    label_encoder = joblib.load("label_encoder.pkl")
    
    # Run packet capture and prediction
    loop = asyncio.get_event_loop()
    loop.run_until_complete(capture_packets())


✅ Balanced dataset class distribution: (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22], dtype=int64), array([  2203,     30,      8,     53,     12,   1247,     21,      9,
            7, 107201,    231,  97278,      3,      4,    264,   1040,
           10,   1589, 280790,      2,    979,   1020,     20],
      dtype=int64))
✅ New Class Distribution in Training Set: (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22], dtype=int64), array([  1476,     20,      5,     36,      8,    835,     14,      6,
            5,  71825,    155,  65176,      2,      3,    177,    697,
            7,   1065, 188129,      1,    656,    683,     13],
      dtype=int64))

Naïve Bayes Accuracy: 90.87%

Decision Tree Accuracy: 98.70%

Random Forest Accuracy: 99.97%

Logistic Regression Accuracy: 98.37%

Ridge Classifier Accuracy: 89.66%

Extra Trees Classifier Accuracy: 99.98%

Passive-Aggres