In [12]:
import os
import pandas as pd
import random
from scapy.all import rdpcap, IP, TCP, UDP

def pcapToCsv(folder):
    """Converts PCAP files inside subdirectories into a structured CSV file with a random selection of 10,000 packets per file."""
    df = pd.DataFrame(columns=["Timestamp", "Source IP", "Destination IP", "Protocol", 
                               "Source Port", "Destination Port", "Length", "Target"])
    
    for fpath in os.listdir(folder):  
        fpath_full = os.path.join(folder, fpath)
        
        if not os.path.isdir(fpath_full):  # Ensure it's a folder
            continue
        
        for filename in os.listdir(fpath_full):  
            file_path = os.path.join(fpath_full, filename)
            
            if not file_path.endswith('.pcap'):  # Skip non-PCAP files
                continue
            
            print(f"Processing: {file_path}")
            
            packets = rdpcap(file_path)  
            packet_data = []

            for packet in packets:
                if IP in packet:
                    src_ip = packet[IP].src
                    dst_ip = packet[IP].dst
                    proto = packet[IP].proto
                    length = len(packet)
                    timestamp = packet.time
                    src_port, dst_port = None, None

                    if TCP in packet:
                        src_port = packet[TCP].sport
                        dst_port = packet[TCP].dport
                    elif UDP in packet:
                        src_port = packet[UDP].sport
                        dst_port = packet[UDP].dport

                    packet_data.append([timestamp, src_ip, dst_ip, proto, src_port, dst_port, length])

            # ✅ Randomly select 10,000 packets if more exist
            if len(packet_data) > 50000:
                packet_data = random.sample(packet_data, 50000)

            # Convert list to DataFrame
            temp_df = pd.DataFrame(packet_data, columns=["Timestamp", "Source IP", "Destination IP", 
                                                         "Protocol", "Source Port", "Destination Port", "Length"])
            temp_df["Target"] = fpath  # Folder name as label

            df = pd.concat([df, temp_df], ignore_index=True)

            print(f"✅ Processed {len(temp_df)} packets from {filename}")

    return df


In [13]:
df = pcapToCsv("data")
df.to_csv("output1.csv", index=False)

Processing: data\Benign\BitTorrent.pcap
✅ Processed 15000 packets from BitTorrent.pcap
Processing: data\Benign\Facetime.pcap
✅ Processed 6000 packets from Facetime.pcap
Processing: data\Benign\FTP.pcap
✅ Processed 50000 packets from FTP.pcap
Processing: data\Benign\Gmail.pcap
✅ Processed 25000 packets from Gmail.pcap
Processing: data\Benign\MySQL.pcap
✅ Processed 50000 packets from MySQL.pcap
Processing: data\Benign\Outlook.pcap
✅ Processed 15000 packets from Outlook.pcap
Processing: data\Benign\Skype.pcap
✅ Processed 12000 packets from Skype.pcap
Processing: data\Benign\SMB-1.pcap
✅ Processed 50000 packets from SMB-1.pcap
Processing: data\Benign\SMB-2.pcap
✅ Processed 50000 packets from SMB-2.pcap
Processing: data\Benign\Weibo-1.pcap
✅ Processed 50000 packets from Weibo-1.pcap
Processing: data\Benign\Weibo-2.pcap
✅ Processed 50000 packets from Weibo-2.pcap
Processing: data\Benign\Weibo-3.pcap
✅ Processed 50000 packets from Weibo-3.pcap
Processing: data\Benign\Weibo-4.pcap
✅ Processed 

In [59]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import warnings
import socket, struct
import pickle

warnings.filterwarnings("ignore")

class NetworkAnalysis():
    """Class for handling network traffic data, converting PCAP to CSV, and training ML models."""
    def __init__(self, packetsPath):
        self.packetsPath = packetsPath

    @staticmethod
    def ip2int(ip):
        packedIP = socket.inet_aton(ip)
        return struct.unpack("!L", packedIP)[0]
    
    def trainModels(self):
        """Loads dataset, preprocesses data, trains ML models, and evaluates them."""
        df = pd.read_csv(self.packetsPath)
        print("✅ Dataset Loaded Successfully!")
        print(df.head())

        # Encode labels if present
        if 'label' in df.columns:
            le = LabelEncoder()
            df['label'] = le.fit_transform(df['label'])

        with open("label_encoder.pkl", "wb") as f:
            pickle.dump(le, f)
        df = df.fillna(0)

        # Convert IP addresses to numerical format
        df["Destination IP"] = df["Destination IP"].apply(lambda x: NetworkAnalysis.ip2int(x))
        df["Source IP"] = df["Source IP"].apply(lambda x: NetworkAnalysis.ip2int(x))
        
        X = df.drop(columns=['label'])
        y = df['label']

        scaler = StandardScaler()
        X = scaler.fit_transform(X)

        with open("scaler.pkl", "wb") as f:
            pickle.dump(scaler, f)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

        # Train traditional ML models
        models = {
            "Random Forest": RandomForestClassifier(),
            "Gradient Boosting": GradientBoostingClassifier()
        }

        param_distributions = {
            "Random Forest": {
                "n_estimators": [50, 100, 200],
                "max_depth": [10, 20, None],
                "min_samples_split": [2, 5, 10],
            },
            "Gradient Boosting": {
                "n_estimators": [50, 100, 200],
                "learning_rate": [0.01, 0.1, 0.2],
                "max_depth": [3, 5, 10],
            }
        }

        best_models = {}
        for name, model in models.items():
            print(f"\n🔍 Training {name} with Randomized Search...")
            search = RandomizedSearchCV(model, param_distributions[name], n_iter=10, cv=3, scoring="accuracy", n_jobs=-1)
            search.fit(X_train, y_train)
            best_models[name] = search.best_estimator_
            print(f"✅ Best Parameters for {name}: {search.best_params_}")

        for name, model in best_models.items():
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            print(f"\n🏆 Accuracy of {name}: {accuracy:.4f}")
            print(classification_report(y_test, y_pred))

        # MLP Neural Network Model
        input_dim = X_train.shape[1]
        
        mlp_model = Sequential([
            Dense(128, activation='relu', input_shape=(input_dim,)),
            Dropout(0.3),
            Dense(64, activation='relu'),
            Dropout(0.3),
            Dense(len(np.unique(y)), activation='softmax')
        ])

        mlp_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        print("\n🔍 Training MLP Neural Network...")
        mlp_model.fit(X_train, y_train, epochs=2, validation_data=(X_test, y_test), batch_size=32)

        mlp_loss, mlp_acc = mlp_model.evaluate(X_test, y_test)
        print(f"\n🏆 MLP Neural Network Accuracy: {mlp_acc:.4f}")

        results = {name: accuracy_score(y_test, best_models[name].predict(X_test)) for name in best_models}
        results["MLP"] = mlp_acc

        

        print("\n🔹 Final Model Comparison:")
        for model_name, acc in results.items():
            print(f"{model_name}: {acc:.4f}")

        best_model_name = max(results, key=results.get)

        print(f"\n🏆 Best Model: {best_model_name} with Accuracy {results[best_model_name]:.4f}")
        print(confusion_matrix(y_test, best_models[best_model_name].predict(X_test)))

        with open("networkIntrusion.pkl", "wb") as f:
            pickle.dump(best_models[best_model_name], f)

# Run analysis
analysis = NetworkAnalysis("output1.csv")  # Replace with actual dataset path
analysis.trainModels()


✅ Dataset Loaded Successfully!
      Timestamp   Source IP Destination IP  Protocol  Source Port  \
0  27991.263871  1.1.33.158    1.2.156.163         6      41319.0   
1  27991.263873  1.1.23.218     1.2.31.193         6       9341.0   
2  27991.263875   1.2.84.77    1.1.252.248         6        443.0   
3  27991.263875  1.1.71.180      1.2.79.35         6      65486.0   
4  27991.263876  1.1.41.222     1.2.129.94         6      14880.0   

   Destination Port  Length   label  
0             443.0    1475  Benign  
1             443.0     163  Benign  
2           22794.0      70  Benign  
3             443.0    1475  Benign  
4             443.0     136  Benign  

🔍 Training Random Forest with Randomized Search...
✅ Best Parameters for Random Forest: {'n_estimators': 100, 'min_samples_split': 2, 'max_depth': 20}

🔍 Training Gradient Boosting with Randomized Search...
✅ Best Parameters for Gradient Boosting: {'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.01}

🏆 Accuracy of 

In [23]:
import pandas as pd
from scapy.all import sniff, IP, TCP, UDP
import time

packet_data = []

# Callback function to process packets
def packet_callback(packet):
    global packet_data  # Ensure we're modifying the global list
    
    if IP in packet:  # Only process IP packets
        timestamp = time.time()  # Current timestamp
        src_ip = packet[IP].src
        dst_ip = packet[IP].dst
        proto = packet[IP].proto
        length = len(packet)

        # Extract ports for TCP/UDP packets
        src_port, dst_port = None, None
        if TCP in packet:
            src_port = packet[TCP].sport
            dst_port = packet[TCP].dport
        elif UDP in packet:
            src_port = packet[UDP].sport
            dst_port = packet[UDP].dport

        # Print packet details
        print(f"[{timestamp}] {src_ip}:{src_port} -> {dst_ip}:{dst_port} (Proto: {proto}, Length: {length})")

        # Append to list
        packet_data.append([timestamp, src_ip, dst_ip, proto, src_port, dst_port, length])

# Capture packets (Change 'Wi-Fi' to your actual network interface)
print("Capturing packets... Press Ctrl+C to stop.")
sniff(iface="Wi-Fi", prn=packet_callback, count=5)  # Captures 5 packets

df = pd.DataFrame(packet_data, columns=["Timestamp", "Source IP", "Destination IP", "Protocol", "Source Port", "Destination Port", "Length"])

Capturing packets... Press Ctrl+C to stop.
[1742922398.2025073] 10.18.7.8:5353 -> 224.0.0.251:5353 (Proto: 17, Length: 103)
[1742922398.3166573] 20.250.119.64:443 -> 10.18.7.104:62263 (Proto: 6, Length: 60)


In [28]:
packet_data

[[1742922398.2025073, '10.18.7.8', '224.0.0.251', 17, 5353, 5353, 103],
 [1742922398.3166573, '20.250.119.64', '10.18.7.104', 6, 443, 62263, 60]]

In [38]:
import numpy as np
import pandas as pd
from scapy.all import sniff, IP, TCP, UDP
import time
import pickle
import socket
import struct


# Function to convert IP to integer
def ip2int(ip):
    try:
        return struct.unpack("!L", socket.inet_aton(ip))[0]
    except socket.error:
        return 0  # Default for malformed IPs

# Load trained model & scaler
with open("networkIntrusion.pkl", "rb") as f:
    model = pickle.load(f)

with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)

with open("label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)

# List to store captured packet data
packet_data = []

# Callback function to process packets
def packet_callback(packet):
    global packet_data  # Ensure we're modifying the global list
    
    if IP in packet:  # Only process IP packets
        timestamp = time.time()  # Current timestamp
        src_ip = ip2int(packet[IP].src)
        dst_ip = ip2int(packet[IP].dst)
        proto = packet[IP].proto
        length = len(packet)

        # Extract ports for TCP/UDP packets
        src_port, dst_port = 0, 0  # Default for non-TCP/UDP packets
        if TCP in packet:
            src_port = packet[TCP].sport
            dst_port = packet[TCP].dport
        elif UDP in packet:
            src_port = packet[UDP].sport
            dst_port = packet[UDP].dport

        # Print packet details
        print(f"[{timestamp}] {packet[IP].src}:{src_port} -> {packet[IP].dst}:{dst_port} (Proto: {proto}, Length: {length})")

        # Append to list
        packet_data.append([timestamp, src_ip, dst_ip, proto, src_port, dst_port, length])

# Capture packets (Change 'Wi-Fi' to your actual network interface)
print("Capturing packets... Press Ctrl+C to stop.")
sniff(iface="Wi-Fi", prn=packet_callback, count=15)  # Captures 5 packets
packet_data = [[27991.263941, ip2int("1.2.171.122"), ip2int("1.1.23.76"), 6, 443, 38963, 906]]

# Convert to DataFrame
df = pd.DataFrame(packet_data, columns=["Timestamp", "Source IP", "Destination IP", "Protocol", "Source Port", "Destination Port", "Length"])

# Apply feature scaling
df_scaled = scaler.transform(df)

# Predict using the trained model
predictions = model.predict(df_scaled)

# Convert numerical labels back to original categories
predicted_labels = label_encoder.inverse_transform(predictions)

print(predicted_labels)

Capturing packets... Press Ctrl+C to stop.
[1742925257.1798038] 10.18.6.141:1900 -> 239.255.255.250:1900 (Proto: 17, Length: 179)
[1742925257.1858616] 10.18.6.141:1900 -> 239.255.255.250:1900 (Proto: 17, Length: 179)
[1742925257.2145019] 142.250.192.10:443 -> 10.18.7.104:50541 (Proto: 17, Length: 70)
[1742925257.2778926] 192.168.9.18:42563 -> 255.255.255.255:29810 (Proto: 17, Length: 871)
[1742925257.4279575] 10.18.7.104:61195 -> 172.16.1.80:53 (Proto: 17, Length: 92)
[1742925257.4319837] 10.18.7.104:54987 -> 172.16.1.80:53 (Proto: 17, Length: 92)
[1742925257.4566088] 10.18.7.104:62594 -> 20.50.80.209:443 (Proto: 6, Length: 171)
[1742925257.4586198] 10.18.7.104:62594 -> 20.50.80.209:443 (Proto: 6, Length: 6572)
[1742925257.464448] 172.16.1.80:53 -> 10.18.7.104:61195 (Proto: 17, Length: 218)
[1742925257.481233] 172.16.1.80:53 -> 10.18.7.104:54987 (Proto: 17, Length: 279)
[1742925257.617707] 10.18.7.104:50572 -> 172.16.1.80:53 (Proto: 17, Length: 92)
['Benign']
