In [2]:
import os
import pandas as pd
from scapy.all import *
from ipaddress import ip_address
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import joblib

In [3]:
# Function to extract features from a single PCAP file
def extract_features_from_pcap(pcap_file):
    packets = rdpcap(pcap_file)
    
    packet_features = []
    for pkt in packets:
        if IP in pkt:
            packet_length = len(pkt)
            protocol_type = pkt[IP].proto
            src_ip = int(ip_address(pkt[IP].src))  # Convert source IP to integer
            dst_ip = int(ip_address(pkt[IP].dst))  # Convert destination IP to integer
            if TCP in pkt:
                src_port = pkt[TCP].sport
                dst_port = pkt[TCP].dport
                flags = pkt[TCP].flags
            elif UDP in pkt:
                src_port = pkt[UDP].sport
                dst_port = pkt[UDP].dport
                flags = None
            else:
                src_port = None
                dst_port = None
                flags = None
            time_stamp = pkt.time
            payload_size = len(pkt.payload)
            
            packet_features.append({
                'packet_length': packet_length,
                'protocol_type': protocol_type,
                'src_ip': src_ip,
                'dst_ip': dst_ip,
                'src_port': src_port,
                'dst_port': dst_port,
                'flags': flags,
                'time_stamp': time_stamp,
                'payload_size': payload_size
            })
    
    return packet_features

In [4]:
# Function to convert timestamp to seconds
def convert_timestamp_to_seconds(timestamp):
    return int(timestamp)

In [5]:
# Function to extract features from all PCAP files in a folder
def extract_features_from_folder(folder_path):
    all_features = []
    pcap_files = [f for f in os.listdir(folder_path) if f.endswith('.pcap') or f.endswith('.pcapng')]
    for pcap_file in pcap_files:
        pcap_file_path = os.path.join(folder_path, pcap_file)
        features = extract_features_from_pcap(pcap_file_path)
        all_features.extend(features)
    return all_features

In [6]:
# Path to the 'attack' and 'normal' folders
attack_folder = 'attack'
normal_folder = 'normal'

In [7]:
# Extract features from 'attack' folder
attack_features = extract_features_from_folder(attack_folder)

In [8]:
# Extract features from 'normal' folder
normal_features = extract_features_from_folder(normal_folder)

In [9]:
# Convert features to DataFrame
attack_df = pd.DataFrame(attack_features)
normal_df = pd.DataFrame(normal_features)

In [10]:
# Add labels to the DataFrames
attack_df['label'] = 'attack'
normal_df['label'] = 'normal'

In [11]:
attack_df.head()

Unnamed: 0,packet_length,protocol_type,src_ip,dst_ip,src_port,dst_port,flags,time_stamp,payload_size,label
0,253,17,3232267010,3232267263,138.0,138.0,,1112048527.695158,239,attack
1,221,17,3232267009,3232267263,138.0,138.0,,1112048628.854794,207,attack
2,243,17,3232267010,3232267263,138.0,138.0,,1112048628.855234,229,attack
3,221,17,3232267009,3232267263,138.0,138.0,,1112048630.375312,207,attack
4,243,17,3232267010,3232267263,138.0,138.0,,1112048630.375724,229,attack


In [12]:
normal_df.head()

Unnamed: 0,packet_length,protocol_type,src_ip,dst_ip,src_port,dst_port,flags,time_stamp,payload_size,label
0,54,2,3232238081,3758096406,,,,1458121359.327326,40,normal
1,54,2,3232238081,3758096406,,,,1458121359.334207,40,normal
2,54,2,3232238081,3758096406,,,,1458121359.334386,40,normal
3,54,2,3232238081,3758096406,,,,1458121359.334559,40,normal
4,63,17,3232238081,3758096636,63781.0,5355.0,,1458121359.336941,49,normal


In [14]:
# Concatenate attack and normal DataFrames
combined_df = pd.concat([attack_df, normal_df], ignore_index=True)

In [15]:
# Print count of each label
print("Number of attack instances:", combined_df[combined_df['label'] == 'attack'].shape[0])
print("Number of normal instances:", combined_df[combined_df['label'] == 'normal'].shape[0])

Number of attack instances: 209277
Number of normal instances: 818


In [16]:
# Shuffle the DataFrame to mix attack and normal data
combined_df = combined_df.sample(frac=1).reset_index(drop=True)

In [19]:
# Convert timestamp to seconds
combined_df['time_stamp'] = combined_df['time_stamp'].apply(convert_timestamp_to_seconds)

In [20]:
# Label encode the "flags" column
le = LabelEncoder()
combined_df['flags'] = le.fit_transform(combined_df['flags'].astype(str))

In [21]:
# Split features and labels
X = combined_df.drop('label', axis=1)
y = combined_df['label']

In [22]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Impute missing values in X_train
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)

In [24]:
# Train RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train_imputed, y_train)

In [25]:
# Predict
X_test_imputed = imputer.transform(X_test)
y_pred = clf.predict(X_test_imputed)

In [26]:
# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [28]:
from sklearn.metrics import classification_report
# Report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

      attack       1.00      1.00      1.00     41852
      normal       1.00      1.00      1.00       167

    accuracy                           1.00     42019
   macro avg       1.00      1.00      1.00     42019
weighted avg       1.00      1.00      1.00     42019



In [30]:
# Save the model to a .sav file
joblib.dump(clf, 'ransomware_rf.sav')

['ransomware_rf.sav']

In [31]:
import pickle

# Save the model to a file
with open('ransomware_rf.pkl', 'wb') as f:
    pickle.dump(clf, f)

# Resample Normal Data

In [35]:
import os
import pandas as pd
from scapy.all import *
from ipaddress import ip_address
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report
import pickle
import joblib

In [36]:
# Function to extract features from a single PCAP file
def extract_features_from_pcap(pcap_file):
    packets = rdpcap(pcap_file)
    
    packet_features = []
    for pkt in packets:
        if IP in pkt:
            packet_length = len(pkt)
            protocol_type = pkt[IP].proto
            src_ip = int(ip_address(pkt[IP].src))  # Convert source IP to integer
            dst_ip = int(ip_address(pkt[IP].dst))  # Convert destination IP to integer
            if TCP in pkt:
                src_port = pkt[TCP].sport
                dst_port = pkt[TCP].dport
                flags = pkt[TCP].flags
            elif UDP in pkt:
                src_port = pkt[UDP].sport
                dst_port = pkt[UDP].dport
                flags = None
            else:
                src_port = None
                dst_port = None
                flags = None
            time_stamp = pkt.time
            payload_size = len(pkt.payload)
            
            packet_features.append({
                'packet_length': packet_length,
                'protocol_type': protocol_type,
                'src_ip': src_ip,
                'dst_ip': dst_ip,
                'src_port': src_port,
                'dst_port': dst_port,
                'flags': flags,
                'time_stamp': time_stamp,
                'payload_size': payload_size
            })
    
    return packet_features

In [37]:
# Function to convert timestamp to seconds
def convert_timestamp_to_seconds(timestamp):
    return int(timestamp)

In [38]:
# Function to extract features from all PCAP files in a folder
def extract_features_from_folder(folder_path):
    all_features = []
    pcap_files = [f for f in os.listdir(folder_path) if f.endswith('.pcap') or f.endswith('.pcapng')]
    for pcap_file in pcap_files:
        pcap_file_path = os.path.join(folder_path, pcap_file)
        features = extract_features_from_pcap(pcap_file_path)
        all_features.extend(features)
    return all_features

In [39]:
# Path to the 'attack' and 'normal' folders
attack_folder = 'attack'
normal_folder = 'normal'

In [40]:
# Extract features from 'attack' folder
attack_features = extract_features_from_folder(attack_folder)

In [41]:
# Extract features from 'normal' folder
normal_features = extract_features_from_folder(normal_folder)

In [42]:
# Convert features to DataFrame
attack_df = pd.DataFrame(attack_features)
normal_df = pd.DataFrame(normal_features)

In [43]:
# Add labels to the DataFrames
attack_df['label'] = 'attack'
normal_df['label'] = 'normal'

In [44]:
# Concatenate attack and normal DataFrames
combined_df = pd.concat([attack_df, normal_df], ignore_index=True)

In [45]:
# Print count of each label
print("Number of attack instances:", combined_df[combined_df['label'] == 'attack'].shape[0])
print("Number of normal instances:", combined_df[combined_df['label'] == 'normal'].shape[0])

Number of attack instances: 209277
Number of normal instances: 818


In [46]:
# Shuffle the DataFrame to mix attack and normal data
combined_df = combined_df.sample(frac=1).reset_index(drop=True)

In [47]:
# Convert timestamp to seconds
combined_df['time_stamp'] = combined_df['time_stamp'].apply(convert_timestamp_to_seconds)

In [48]:
# Label encode the "flags" column
le = LabelEncoder()
combined_df['flags'] = le.fit_transform(combined_df['flags'].astype(str))

In [49]:
# Split features and labels
X = combined_df.drop('label', axis=1)
y = combined_df['label']

In [50]:
# Resample the data to handle class imbalance
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

In [51]:
# Print the counts of each class after resampling
print("Number of attack instances after resampling:", sum(y_resampled == 'attack'))
print("Number of normal instances after resampling:", sum(y_resampled == 'normal'))

Number of attack instances after resampling: 209277
Number of normal instances after resampling: 209277


In [52]:
# Split resampled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [53]:
# Impute missing values in X_train
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)

In [54]:
# Train RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train_imputed, y_train)

In [55]:
# Predict
X_test_imputed = imputer.transform(X_test)
y_pred = clf.predict(X_test_imputed)

In [57]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy after resampling:", accuracy)

Accuracy after resampling: 1.0


In [58]:
# Report
print("Classification Report after resampling:")
print(classification_report(y_test, y_pred))

Classification Report after resampling:
              precision    recall  f1-score   support

      attack       1.00      1.00      1.00     41947
      normal       1.00      1.00      1.00     41764

    accuracy                           1.00     83711
   macro avg       1.00      1.00      1.00     83711
weighted avg       1.00      1.00      1.00     83711



In [59]:
# Save the model to a .sav file
joblib.dump(clf, 'resample_ransomware_rf.sav')

['resample_ransomware_rf.sav']

In [60]:
# Save the model to a file
with open('resample_ransomware_rf.pkl', 'wb') as f:
    pickle.dump(clf, f)

# Regularization with cross-validation

In [63]:
from sklearn.model_selection import train_test_split, cross_val_score

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values in X_train
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)

# Regularization with cross-validation
clf = RandomForestClassifier()
cv_scores = cross_val_score(clf, X_train_imputed, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

# Train RandomForestClassifier
clf.fit(X_train_imputed, y_train)

# Predict
X_test_imputed = imputer.transform(X_test)
y_pred = clf.predict(X_test_imputed)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy after regularization:", accuracy)

Cross-validation scores: [1.        1.        0.9999405 1.        1.       ]
Mean CV score: 0.9999881005503497
Accuracy after regularization: 1.0


# Test on new data

In [2]:
import os
import pandas as pd
from tkinter import Tk, Button, Label, filedialog
from scapy.all import *
from ipaddress import ip_address
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import joblib

# Load the trained model
model = joblib.load('ransomware_rf.sav')

# Function to extract features from a single PCAP file
def extract_features_from_pcap(pcap_file):
    packets = rdpcap(pcap_file)
    
    packet_features = []
    for pkt in packets:
        if IP in pkt:
            packet_length = len(pkt)
            protocol_type = pkt[IP].proto
            src_ip = int(ip_address(pkt[IP].src))  # Convert source IP to integer
            dst_ip = int(ip_address(pkt[IP].dst))  # Convert destination IP to integer
            if TCP in pkt:
                src_port = pkt[TCP].sport
                dst_port = pkt[TCP].dport
                flags = pkt[TCP].flags
            elif UDP in pkt:
                src_port = pkt[UDP].sport
                dst_port = pkt[UDP].dport
                flags = None
            else:
                src_port = None
                dst_port = None
                flags = None
            time_stamp = pkt.time
            payload_size = len(pkt.payload)
            
            packet_features.append({
                'packet_length': packet_length,
                'protocol_type': protocol_type,
                'src_ip': src_ip,
                'dst_ip': dst_ip,
                'src_port': src_port,
                'dst_port': dst_port,
                'flags': flags,
                'time_stamp': time_stamp,
                'payload_size': payload_size
            })
    
    return packet_features

# Function to convert timestamp to seconds
def convert_timestamp_to_seconds(timestamp):
    return int(timestamp)

# Function to extract features and apply model
def process_file(file_path):
    # Extract features
    features = extract_features_from_pcap(file_path)
    df = pd.DataFrame(features)
    
    # Convert timestamp to seconds
    df['time_stamp'] = df['time_stamp'].apply(convert_timestamp_to_seconds)
    
    # Label encode the "flags" column
    le = LabelEncoder()
    df['flags'] = le.fit_transform(df['flags'].astype(str))
    
    # Impute missing values
    imputer = SimpleImputer(strategy='mean')
    X = imputer.fit_transform(df)
    
    # Apply model
    predictions = model.predict(X)
    
    # Save predictions to CSV
    df['predictions'] = predictions
    df.to_csv('predictions.csv', index=False)

# Function to handle browse button click event
def browse_file():
    file_path = filedialog.askopenfilename(filetypes=[('PCAP files', '*.pcap;*.pcapng;*.cap')])
    if file_path:
        process_file(file_path)
        result_label.config(text="Predictions saved to predictions.csv")

# Create Tkinter window
root = Tk()
root.title("Ransomware Detection")
root.geometry("400x200")  # Set window size

# Create browse button
browse_button = Button(root, text="Browse", command=browse_file)
browse_button.pack()

# Create label for displaying result
result_label = Label(root, text="")
result_label.pack()

# Run the Tkinter event loop
root.mainloop()