In [55]:
import os
import pandas as pd
from scapy.all import *
from ipaddress import ip_address
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from statistics import median, stdev, variance

In [56]:
def _payload_length_stats(payload_lengths):
    """Summarise the per-packet payload sizes of one session.

    Standard deviation and variance need at least two samples, so both are
    reported as 0 for a single-packet session.
    """
    has_spread = len(payload_lengths) > 1
    return {
        'mean_payload_length': sum(payload_lengths) / len(payload_lengths),
        'median_payload_length': median(payload_lengths),
        'max_payload_length': max(payload_lengths),
        'min_payload_length': min(payload_lengths),
        'std_payload_length': stdev(payload_lengths) if has_spread else 0,
        'var_payload_length': variance(payload_lengths) if has_spread else 0,
    }


def _endpoint_features(pkt):
    """Return protocol, addresses, ports and TCP flags read from ``pkt``."""
    protocol_type = src_ip = dst_ip = None
    src_port = dst_port = flags = None

    if IP in pkt:
        ipv4 = pkt[IP]
        protocol_type = ipv4.proto
        src_ip = int(ip_address(ipv4.src))
        dst_ip = int(ip_address(ipv4.dst))

    # Checked after IPv4 on purpose: when both layers are present the IPv6
    # values win, exactly as in the original flat implementation.
    if IPv6 in pkt:
        ipv6 = pkt[IPv6]
        # NOTE(review): the constant 6 is also the protocol number of TCP, and
        # IPv6 addresses stay strings while IPv4 ones become ints. Both are
        # kept unchanged because these values are part of the feature encoding
        # shared by training and inference - change both sides together.
        protocol_type = 6
        src_ip = ipv6.src
        dst_ip = ipv6.dst

    if TCP in pkt:
        tcp = pkt[TCP]
        src_port, dst_port, flags = tcp.sport, tcp.dport, tcp.flags
    elif UDP in pkt:
        udp = pkt[UDP]
        src_port, dst_port = udp.sport, udp.dport

    return {
        'protocol_type': protocol_type,
        'src_ip': src_ip,
        'dst_ip': dst_ip,
        'src_port': src_port,
        'dst_port': dst_port,
        'flags': flags,
    }


def _ipv4_header_features(pkt):
    """Return the IPv4 header columns; every value is None without an IP layer."""
    if IP not in pkt:
        return dict.fromkeys((
            'version_ipv4', 'ihl_ipv4', 'type_of_service_ipv4',
            'total_length_ipv4', 'identification_ipv4', 'fragment_offset_ipv4',
            'ttl_ipv4', 'header_checksum_ipv4', 'options_ipv4',
        ))
    ipv4 = pkt[IP]
    return {
        'version_ipv4': ipv4.version,
        'ihl_ipv4': ipv4.ihl,
        'type_of_service_ipv4': ipv4.tos,
        'total_length_ipv4': ipv4.len,
        'identification_ipv4': ipv4.id,
        'fragment_offset_ipv4': ipv4.frag,
        'ttl_ipv4': ipv4.ttl,
        'header_checksum_ipv4': ipv4.chksum,
        # An empty options value is stored as None rather than an empty container.
        'options_ipv4': ipv4.options or None,
    }


def _ipv6_header_features(pkt):
    """Return the IPv6 header columns; every value is None without an IPv6 layer."""
    if IPv6 not in pkt:
        return dict.fromkeys((
            'version_ipv6', 'traffic_class_ipv6', 'flow_label_ipv6',
            'payload_length_ipv6', 'next_header_ipv6', 'hop_limit_ipv6',
        ))
    ipv6 = pkt[IPv6]
    return {
        'version_ipv6': ipv6.version,
        'traffic_class_ipv6': ipv6.tc,
        'flow_label_ipv6': ipv6.fl,
        'payload_length_ipv6': ipv6.plen,
        'next_header_ipv6': ipv6.nh,
        'hop_limit_ipv6': ipv6.hlim,
    }


def extract_features_from_session(session_packets):
    """Turn one session (a sequence of packets) into a flat feature dict.

    Session-level counters and payload statistics are computed over every
    packet; protocol, address, port and header fields are read from the first
    packet only.

    Args:
        session_packets: Non-empty, indexable sequence of packets. Each packet
            must support ``len()``, ``.payload``, ``.time`` and layer lookup
            with ``in`` / ``[]``.

    Returns:
        dict: 32 entries in a fixed key order (session counters, endpoints,
        IPv4 header, IPv6 header, payload statistics). Fields whose layer is
        absent from the first packet are None.

    Raises:
        IndexError: If ``session_packets`` is empty.
    """
    first_pkt = session_packets[0]
    # Measured once and reused for both the total and the statistics.
    payload_lengths = [len(pkt.payload) for pkt in session_packets]

    # Insertion order defines the DataFrame column order downstream, so the
    # groups are merged in the same sequence as the original literal dict.
    features = {
        'session_length': len(session_packets),
        'total_packet_length': sum(len(pkt) for pkt in session_packets),
        'total_payload_size': sum(payload_lengths),
        'start_time': first_pkt.time,
        'end_time': session_packets[-1].time,
    }
    features.update(_endpoint_features(first_pkt))
    features.update(_ipv4_header_features(first_pkt))
    features.update(_ipv6_header_features(first_pkt))
    features.update(_payload_length_stats(payload_lengths))
    return features

In [57]:
def extract_features_from_folder(folder_path):
    """Extract one feature dict per session from every capture in a folder.

    Only the folder's direct entries are considered (no recursion). Files are
    selected by extension (.pcap, .pcapng, .cap, compared case-insensitively)
    and processed in sorted name order, so the returned rows come back in the
    same order on every run regardless of how the OS lists the directory.

    Args:
        folder_path: Directory containing the capture files.

    Returns:
        list[dict]: The result of ``extract_features_from_session`` for every
        session of every matching file, in file order.
    """
    capture_suffixes = ('.pcap', '.pcapng', '.cap')
    pcap_files = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith(capture_suffixes)
    )

    all_features = []
    for pcap_file in pcap_files:
        packets = rdpcap(os.path.join(folder_path, pcap_file))
        sessions = packets.sessions()  # Group packets into sessions
        all_features.extend(
            extract_features_from_session(session_packets)
            for session_packets in sessions.values()
        )

    return all_features

In [58]:
# Folders (relative to the working directory) holding the labelled captures
attack_folder, normal_folder = 'attack', 'normal'

In [59]:
# One feature dict per session found in the 'attack' captures
attack_features = extract_features_from_folder(folder_path=attack_folder)

In [60]:
# One feature dict per session found in the 'normal' captures
normal_features = extract_features_from_folder(folder_path=normal_folder)

In [61]:
# Each list of feature dicts becomes a table with one row per session
attack_df, normal_df = (
    pd.DataFrame(session_rows)
    for session_rows in (attack_features, normal_features)
)

In [62]:
# Tag every row with the class of the folder it came from
attack_df = attack_df.assign(label='attack')
normal_df = normal_df.assign(label='normal')

In [63]:
# Stack both labelled tables into one frame with a fresh 0..n-1 index
labelled_frames = [attack_df, normal_df]
combined_df = pd.concat(labelled_frames, ignore_index=True)

In [64]:
# Shuffle the DataFrame to mix attack and normal data.
# The fixed seed keeps the row order - and everything derived from it -
# identical across runs; it matches the seed used for the train/test split.
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [65]:
# Reduce both session timestamps to whole seconds (integer columns)
for time_col in ('start_time', 'end_time'):
    combined_df[time_col] = combined_df[time_col].astype(int)

In [66]:
# Encode the "flags" column as integer codes.
# Values are stringified first so missing flags (None) get a code of their own.
# NOTE(review): this cell does not save the fitted encoder; any later scoring
# code needs the same mapping to produce comparable flag codes - confirm.
le = LabelEncoder()
flag_text = combined_df['flags'].astype(str)
combined_df['flags'] = le.fit_transform(flag_text)

In [67]:
# Target column on one side, every remaining column as features on the other
y = combined_df['label']
X = combined_df.drop(columns=['label'])

In [68]:
# Split data into train and test sets (80/20, fixed seed).
# The classes are imbalanced, so stratify on the label to keep the
# attack/normal ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [69]:
# Stringify the raw IPv4 options column before preprocessing.
# The feature extractor names this column 'options_ipv4'; the plain 'options'
# name is still accepted so frames using it keep working.
for options_col in ('options', 'options_ipv4'):
    if options_col in X_train.columns:
        # assign() builds new frames, so the split results are never written
        # to in place (avoids pandas' chained-assignment warning).
        X_train = X_train.assign(**{options_col: X_train[options_col].map(str)})
        X_test = X_test.assign(**{options_col: X_test[options_col].map(str)})

In [70]:
# Keep only the numeric columns; the column set is decided on the training
# frame and then applied unchanged to the test frame.
numeric_cols = X_train.select_dtypes(include=['number']).columns
X_train_numeric, X_test_numeric = X_train[numeric_cols], X_test[numeric_cols]

In [71]:
# Fill missing numeric values with the column mean.
# The means are learned from the training rows only and reused for the test rows.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train_numeric)
X_train_imputed = imputer.transform(X_train_numeric)
X_test_imputed = imputer.transform(X_test_numeric)

In [72]:
# Train Random Forest Classifier.
# The forest is randomised; fixing the seed makes the fitted model and the
# metrics reported below reproducible.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_imputed, y_train)

In [73]:
# Predicted label for every held-out session
y_pred = rf_classifier.predict(X=X_test_imputed)

In [74]:
# Share of held-out sessions whose predicted label matches the true one
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9973941368078176


In [75]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the held-out sessions
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:")
print(report_text)

Classification Report:
              precision    recall  f1-score   support

      attack       0.99      0.99      0.99       199
      normal       1.00      1.00      1.00      1336

    accuracy                           1.00      1535
   macro avg       0.99      1.00      0.99      1535
weighted avg       1.00      1.00      1.00      1535



In [76]:
import joblib
# Persist the fitted classifier (only the classifier object is written here)
joblib.dump(value=rf_classifier, filename='ransomware_rf2.sav')

['ransomware_rf2.sav']

# Test on new data

In [4]:
import os
import tkinter as tk
from tkinter import filedialog
import pandas as pd
from scapy.all import *
from ipaddress import ip_address
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import joblib
from statistics import median, stdev, variance

# Load the classifier saved by the training section.
# NOTE(review): joblib files are pickle-based - only load a file you created yourself.
rf_classifier = joblib.load(filename='ransomware_rf2.sav')

def extract_features_from_session(session_packets):
    """Describe one packet session as a flat dict of features.

    Counters and payload statistics cover every packet of the session, while
    the protocol, address, port and header fields are taken from the first
    packet only. Fields whose layer is missing from that packet stay None.

    Args:
        session_packets: Non-empty, indexable sequence of packets.

    Returns:
        dict: 32 feature values in a fixed key order.
    """
    head = session_packets[0]
    payload_sizes = [len(p.payload) for p in session_packets]
    more_than_one = len(payload_sizes) > 1

    # Every key is created up front, in its final position; the layer checks
    # further down only overwrite values and so never disturb the key order.
    features = {
        'session_length': len(session_packets),
        'total_packet_length': sum(map(len, session_packets)),
        'total_payload_size': sum(payload_sizes),
        'start_time': head.time,
        'end_time': session_packets[-1].time,
    }
    features.update(dict.fromkeys((
        'protocol_type', 'src_ip', 'dst_ip', 'src_port', 'dst_port', 'flags',
        # IPv4 features
        'version_ipv4', 'ihl_ipv4', 'type_of_service_ipv4', 'total_length_ipv4',
        'identification_ipv4', 'fragment_offset_ipv4', 'ttl_ipv4',
        'header_checksum_ipv4', 'options_ipv4',
        # IPv6 features
        'version_ipv6', 'traffic_class_ipv6', 'flow_label_ipv6',
        'payload_length_ipv6', 'next_header_ipv6', 'hop_limit_ipv6',
    )))
    # Payload-level features; spread measures fall back to 0 for one packet
    features.update(
        mean_payload_length=sum(payload_sizes) / len(payload_sizes),
        median_payload_length=median(payload_sizes),
        max_payload_length=max(payload_sizes),
        min_payload_length=min(payload_sizes),
        std_payload_length=stdev(payload_sizes) if more_than_one else 0,
        var_payload_length=variance(payload_sizes) if more_than_one else 0,
    )

    if IP in head:
        v4 = head[IP]
        features.update(
            protocol_type=v4.proto,
            src_ip=int(ip_address(v4.src)),
            dst_ip=int(ip_address(v4.dst)),
            version_ipv4=v4.version,
            ihl_ipv4=v4.ihl,
            type_of_service_ipv4=v4.tos,
            total_length_ipv4=v4.len,
            identification_ipv4=v4.id,
            fragment_offset_ipv4=v4.frag,
            ttl_ipv4=v4.ttl,
            header_checksum_ipv4=v4.chksum,
            options_ipv4=v4.options or None,
        )

    # Runs after the IPv4 branch, so IPv6 values win when both layers exist.
    if IPv6 in head:
        v6 = head[IPv6]
        features.update(
            # NOTE(review): fixed value 6 (the same number TCP uses as an IP
            # protocol) and string addresses - confirm this is intended.
            protocol_type=6,
            src_ip=v6.src,
            dst_ip=v6.dst,
            version_ipv6=v6.version,
            traffic_class_ipv6=v6.tc,
            flow_label_ipv6=v6.fl,
            payload_length_ipv6=v6.plen,
            next_header_ipv6=v6.nh,
            hop_limit_ipv6=v6.hlim,
        )

    if TCP in head:
        segment = head[TCP]
        features['src_port'] = segment.sport
        features['dst_port'] = segment.dport
        features['flags'] = segment.flags
    elif UDP in head:
        datagram = head[UDP]
        features['src_port'] = datagram.sport
        features['dst_port'] = datagram.dport

    return features

def extract_features_from_file(file_path):
    """Read one capture file and return a feature dict for each of its sessions.

    Args:
        file_path: Path of the capture file to read.

    Returns:
        list[dict]: One ``extract_features_from_session`` result per session,
        in the order the sessions are grouped.
    """
    grouped = rdpcap(file_path).sessions()
    return [
        extract_features_from_session(session_packets)
        for session_packets in grouped.values()
    ]

def browse_files():
    """Let the user pick a capture, classify its sessions and save a CSV.

    Opens a file dialog, extracts per-session features from the chosen file,
    builds the numeric feature matrix, runs ``rf_classifier`` on it and writes
    all extracted columns plus a ``prediction`` column to
    ``<capture name>_predictions.csv`` next to the capture. Does nothing if
    the dialog is cancelled, and only prints a message if the capture
    contains no sessions.
    """
    # Numeric feature columns, in the order this notebook printed them when the
    # saved model last predicted successfully (29 columns).
    # NOTE(review): keep in sync with the training notebook if the feature set
    # changes. Selecting by name instead of by inferred dtype keeps the matrix
    # width stable when a capture has no IPv4 or no IPv6 traffic at all
    # (all-None columns are not numeric and would otherwise be dropped).
    model_columns = [
        'session_length', 'total_packet_length', 'total_payload_size',
        'start_time', 'end_time', 'protocol_type', 'src_port', 'dst_port',
        'flags', 'version_ipv4', 'ihl_ipv4', 'type_of_service_ipv4',
        'total_length_ipv4', 'identification_ipv4', 'fragment_offset_ipv4',
        'ttl_ipv4', 'header_checksum_ipv4', 'version_ipv6',
        'traffic_class_ipv6', 'flow_label_ipv6', 'payload_length_ipv6',
        'next_header_ipv6', 'hop_limit_ipv6', 'mean_payload_length',
        'median_payload_length', 'max_payload_length', 'min_payload_length',
        'std_payload_length', 'var_payload_length',
    ]

    filename = filedialog.askopenfilename(initialdir="/", title="Select a File", filetypes=(("PCAP files", "*.pcap *.pcapng *.cap"), ("all files", "*.*")))
    if not filename:
        return

    features = extract_features_from_file(filename)
    if not features:
        # An empty frame has no 'start_time' column; stop before indexing it.
        print("No sessions found in:", filename)
        return

    df = pd.DataFrame(features)
    df['start_time'] = df['start_time'].astype(int)
    df['end_time'] = df['end_time'].astype(int)
    if 'flags' in df.columns:
        # NOTE(review): this encoder is fitted on the selected capture only, so
        # its integer codes need not match the ones seen during training;
        # reusing the training-time encoder would require persisting it.
        le = LabelEncoder()
        df['flags'] = le.fit_transform(df['flags'].astype(str))

    # Fixed column set; anything absent or non-numeric becomes NaN.
    df_numeric = df.reindex(columns=model_columns).apply(pd.to_numeric, errors='coerce')
    # Mean imputation computed from this capture; columns with no values at all
    # have no mean and are filled with 0 so the matrix keeps its width.
    # NOTE(review): 0 is a placeholder - the training-time means are not
    # available in this cell.
    df_imputed = df_numeric.fillna(df_numeric.mean()).fillna(0)

    # Print columns of df_imputed for debugging
    print("Columns of df_imputed:", df_imputed.columns)

    print("Shape of df_imputed:", df_imputed.shape)
    # Training fitted the model on a plain array, so pass an array here as well.
    predictions = rf_classifier.predict(df_imputed.to_numpy())
    df['prediction'] = predictions
    save_path = os.path.splitext(filename)[0] + '_predictions.csv'
    df.to_csv(save_path, index=False)
    print("Predictions saved to:", save_path)

# Build the main Tkinter window
root = tk.Tk()
root.geometry("400x200")
root.title("Ransomware Detection")

# A single button: clicking it calls browse_files
browse_button = tk.Button(master=root, text="Browse Files", command=browse_files)
browse_button.pack(pady=20)

# Start the Tk event loop
root.mainloop()

Columns of df_imputed: Index(['session_length', 'total_packet_length', 'total_payload_size',
       'start_time', 'end_time', 'protocol_type', 'src_port', 'dst_port',
       'flags', 'version_ipv4', 'ihl_ipv4', 'type_of_service_ipv4',
       'total_length_ipv4', 'identification_ipv4', 'fragment_offset_ipv4',
       'ttl_ipv4', 'header_checksum_ipv4', 'version_ipv6',
       'traffic_class_ipv6', 'flow_label_ipv6', 'payload_length_ipv6',
       'next_header_ipv6', 'hop_limit_ipv6', 'mean_payload_length',
       'median_payload_length', 'max_payload_length', 'min_payload_length',
       'std_payload_length', 'var_payload_length'],
      dtype='object')
Shape of df_imputed: (815, 29)




Predictions saved to: C:/Users/Ehtisham Awan/Desktop/ransomware ml/testing/smbtorture_predictions.csv
