In [1]:
import os
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Setup
folder_path = os.path.join(os.getcwd(), 'CIC')
# os.listdir returns entries in arbitrary order; sort them so the order in
# which the files are concatenated (and hence the seeded train/test split
# further down) is the same on every run and every machine.
original_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
sampled_file = 'stratified_sample.csv'

# Feature engineering
def feature_engineering(df):
    """Return ``df`` with a derived ``Packets_Duration_Ratio`` column.

    The new column is ``df['Tot sum'] / (df['Duration'] + 1e-6)``; the small
    constant keeps the denominator non-zero when ``Duration`` is 0.

    Parameters
    ----------
    df : DataFrame
        A pandas or dask DataFrame. Only ``.columns``, column selection and
        ``.assign`` are used.

    Returns
    -------
    DataFrame
        A new frame with the extra column when both ``'Tot sum'`` and
        ``'Duration'`` are present; otherwise ``df`` itself, unchanged.
        The caller's frame is never modified in place.
    """
    if 'Tot sum' not in df.columns or 'Duration' not in df.columns:
        return df
    ratio = df['Tot sum'] / (df['Duration'] + 1e-6)
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(Packets_Duration_Ratio=ratio)

# Read sampled data
print("Reading sampled file")
# Rows from the sampled file are tagged is_sampled=1 so they can be told
# apart from the original CIC rows after concatenation.
sampled_df = feature_engineering(
    dd.read_csv(sampled_file, assume_missing=True)
).assign(is_sampled=1)

# Read and sample original data
print("Reading original files")
original_dfs = []
for f in original_files:
    print(f"Processing: {f}")
    csv_path = os.path.join(folder_path, f)
    ddf = feature_engineering(dd.read_csv(csv_path, assume_missing=True))
    ddf = ddf.assign(is_sampled=0)
    # Keep a seeded 5% sample of every original file.
    sampled = ddf.sample(frac=0.05, random_state=42)
    original_dfs.append(sampled)

# Combine
print("Combining sampled and original")
original_df = dd.concat(original_dfs)
# dropna() discards every row that has a missing value in any column.
combined_df = dd.concat([original_df, sampled_df]).dropna()

# Convert to Pandas
print("Computing Dask to Pandas")
with ProgressBar():
    # Evaluate the lazy dask graph into an in-memory pandas DataFrame.
    combined_pd = combined_df.compute()

# Prepare X and y
print("Preparing X and y")

# Fix float-style labels
# Maps the string form of float-coded labels (e.g. '1.0') to class names.
# Labels that are not a key of this dict pass through replace() unchanged.
# NOTE(review): '1.0' and '11.0' both map to 'DDoS-UDP_Flood', and '2.0' and
# '10.0' both map to 'DDoS-TCP_Flood', so those codes are merged into one
# class each. Confirm against the encoding used to build the sampled file.
label_mapping = {
    '0.0': 'BenignTraffic',
    '1.0': 'DDoS-UDP_Flood',
    '2.0': 'DDoS-TCP_Flood',
    '3.0': 'DDoS-ICMP_Flood',
    '4.0': 'DDoS-ICMP_Fragmentation',
    '5.0': 'DDoS-PSHACK_Flood',
    '6.0': 'DDoS-RSTFINFlood',
    '7.0': 'DDoS-SYN_Flood',
    '8.0': 'DDoS-SlowLoris',
    '9.0': 'DDoS-SynonymousIP_Flood',
    '10.0': 'DDoS-TCP_Flood',
    '11.0': 'DDoS-UDP_Flood',
    '12.0': 'DDoS-UDP_Fragmentation',
    '13.0': 'DNS_Spoofing',
    '14.0': 'DictionaryBruteForce',
    '15.0': 'DoS-HTTP_Flood',
    '16.0': 'DoS-SYN_Flood',
    '17.0': 'DoS-TCP_Flood',
    '18.0': 'DoS-UDP_Flood',
    '19.0': 'MITM-ArpSpoofing',
    '20.0': 'Mirai-greeth_flood',
    '21.0': 'Mirai-greip_flood',
    '22.0': 'Mirai-udpplain',
    '23.0': 'Recon-HostDiscovery',
    '24.0': 'Recon-OSScan',
    '25.0': 'Recon-PingSweep',
    '26.0': 'Recon-PortScan',
    '27.0': 'SqlInjection',
    '28.0': 'Uploading_Attack',
    '29.0': 'VulnerabilityScan',
    '30.0': 'XSS',
    '31.0': 'Backdoor_Malware',
    '32.0': 'BrowserHijacking',
    '33.0': 'CommandInjection',
}

# Features: every column except the provenance flag and the target.
X = combined_pd.drop(columns=['is_sampled', 'label'], errors='ignore')
# Target: labels as strings, with float-style codes translated to names.
y = combined_pd['label'].astype(str).replace(label_mapping)

print("Encoding labels")
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# Split
print("Splitting dataset")
# Draw the held-out test set from the original rows only (is_sampled == 0).
# Rows loaded from `sampled_file` are used purely as extra training data, so
# the reported metrics are measured on original traffic and are not inflated
# by sampled rows landing on both sides of the split.
# NOTE(review): if `sampled_file` was itself derived from the CIC files, some
# of its rows may still duplicate test rows - confirm and de-duplicate if so.
sampled_mask = combined_pd['is_sampled'].to_numpy() == 1
original_pos = (~sampled_mask).nonzero()[0]
train_pos, test_pos = train_test_split(
    original_pos,
    test_size=0.3,
    random_state=42,
    stratify=y_encoded[original_pos],
)

# Training rows = 70% of the original rows plus every sampled row.
train_mask = sampled_mask.copy()
train_mask[train_pos] = True
X_train, y_train = X[train_mask], y_encoded[train_mask]
X_test, y_test = X.iloc[test_pos], y_encoded[test_pos]

# Train Random Forest
print("Training Random Forest")
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42
)
clf.fit(X_train, y_train)

# Predict and Evaluate
print("Evaluating")
predictions = clf.predict(X_test)

# Decode labels
# Turn the integer class ids back into class-name strings for the report.
predicted_labels = label_encoder.inverse_transform(predictions)
true_labels = label_encoder.inverse_transform(y_test)

accuracy = accuracy_score(true_labels, predicted_labels)
print(f"\nAccuracy of Random Forest: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(true_labels, predicted_labels))


Reading sampled file
Reading original files
Processing: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

In [1]:
import os
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Setup
folder_path = os.path.join(os.getcwd(), 'CIC')
# os.listdir returns entries in arbitrary order; sort them so the order in
# which the files are concatenated (and hence the seeded train/test split
# further down) is the same on every run and every machine.
original_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
sampled_file = 'smote_data.csv'

# Feature engineering
def feature_engineering(df):
    """Return ``df`` with a derived ``Packets_Duration_Ratio`` column.

    The new column is ``df['Tot sum'] / (df['Duration'] + 1e-6)``; the small
    constant keeps the denominator non-zero when ``Duration`` is 0.

    Parameters
    ----------
    df : DataFrame
        A pandas or dask DataFrame. Only ``.columns``, column selection and
        ``.assign`` are used.

    Returns
    -------
    DataFrame
        A new frame with the extra column when both ``'Tot sum'`` and
        ``'Duration'`` are present; otherwise ``df`` itself, unchanged.
        The caller's frame is never modified in place.
    """
    if 'Tot sum' not in df.columns or 'Duration' not in df.columns:
        return df
    ratio = df['Tot sum'] / (df['Duration'] + 1e-6)
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(Packets_Duration_Ratio=ratio)

# Read sampled data
print("Reading sampled file")
# Rows from the sampled file are tagged is_sampled=1 so they can be told
# apart from the original CIC rows after concatenation.
sampled_df = feature_engineering(
    dd.read_csv(sampled_file, assume_missing=True)
).assign(is_sampled=1)

# Read and sample original data
print("Reading original files")
original_dfs = []
for f in original_files:
    print(f"Processing: {f}")
    csv_path = os.path.join(folder_path, f)
    ddf = feature_engineering(dd.read_csv(csv_path, assume_missing=True))
    ddf = ddf.assign(is_sampled=0)
    # Keep a seeded 5% sample of every original file.
    sampled = ddf.sample(frac=0.05, random_state=42)
    original_dfs.append(sampled)

# Combine
print("Combining sampled and original")
original_df = dd.concat(original_dfs)
# dropna() discards every row that has a missing value in any column.
combined_df = dd.concat([original_df, sampled_df]).dropna()

# Convert to Pandas
print("Computing Dask to Pandas")
with ProgressBar():
    # Evaluate the lazy dask graph into an in-memory pandas DataFrame.
    combined_pd = combined_df.compute()

# Prepare X and y
print("Preparing X and y")

# Fix float-style labels
# Maps the string form of float-coded labels (e.g. '1.0') to class names.
# Labels that are not a key of this dict pass through replace() unchanged.
# NOTE(review): '1.0' and '11.0' both map to 'DDoS-UDP_Flood', and '2.0' and
# '10.0' both map to 'DDoS-TCP_Flood', so those codes are merged into one
# class each. Confirm against the encoding used to build the sampled file.
label_mapping = {
    '0.0': 'BenignTraffic',
    '1.0': 'DDoS-UDP_Flood',
    '2.0': 'DDoS-TCP_Flood',
    '3.0': 'DDoS-ICMP_Flood',
    '4.0': 'DDoS-ICMP_Fragmentation',
    '5.0': 'DDoS-PSHACK_Flood',
    '6.0': 'DDoS-RSTFINFlood',
    '7.0': 'DDoS-SYN_Flood',
    '8.0': 'DDoS-SlowLoris',
    '9.0': 'DDoS-SynonymousIP_Flood',
    '10.0': 'DDoS-TCP_Flood',
    '11.0': 'DDoS-UDP_Flood',
    '12.0': 'DDoS-UDP_Fragmentation',
    '13.0': 'DNS_Spoofing',
    '14.0': 'DictionaryBruteForce',
    '15.0': 'DoS-HTTP_Flood',
    '16.0': 'DoS-SYN_Flood',
    '17.0': 'DoS-TCP_Flood',
    '18.0': 'DoS-UDP_Flood',
    '19.0': 'MITM-ArpSpoofing',
    '20.0': 'Mirai-greeth_flood',
    '21.0': 'Mirai-greip_flood',
    '22.0': 'Mirai-udpplain',
    '23.0': 'Recon-HostDiscovery',
    '24.0': 'Recon-OSScan',
    '25.0': 'Recon-PingSweep',
    '26.0': 'Recon-PortScan',
    '27.0': 'SqlInjection',
    '28.0': 'Uploading_Attack',
    '29.0': 'VulnerabilityScan',
    '30.0': 'XSS',
    '31.0': 'Backdoor_Malware',
    '32.0': 'BrowserHijacking',
    '33.0': 'CommandInjection',
}

# Features: every column except the provenance flag and the target.
X = combined_pd.drop(columns=['is_sampled', 'label'], errors='ignore')
# Target: labels as strings, with float-style codes translated to names.
y = combined_pd['label'].astype(str).replace(label_mapping)

print("Encoding labels")
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# Split
print("Splitting dataset")
# Draw the held-out test set from the original rows only (is_sampled == 0).
# Rows loaded from `sampled_file` are used purely as extra training data, so
# the reported metrics are measured on original traffic and are not inflated
# by sampled rows landing on both sides of the split.
# NOTE(review): if `sampled_file` was itself derived from the CIC files, some
# of its rows may still duplicate test rows - confirm and de-duplicate if so.
sampled_mask = combined_pd['is_sampled'].to_numpy() == 1
original_pos = (~sampled_mask).nonzero()[0]
train_pos, test_pos = train_test_split(
    original_pos,
    test_size=0.3,
    random_state=42,
    stratify=y_encoded[original_pos],
)

# Training rows = 70% of the original rows plus every sampled row.
train_mask = sampled_mask.copy()
train_mask[train_pos] = True
X_train, y_train = X[train_mask], y_encoded[train_mask]
X_test, y_test = X.iloc[test_pos], y_encoded[test_pos]

# Train Random Forest
print("Training Random Forest")
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42
)
clf.fit(X_train, y_train)

# Predict and Evaluate
print("Evaluating")
predictions = clf.predict(X_test)

# Decode labels
# Turn the integer class ids back into class-name strings for the report.
predicted_labels = label_encoder.inverse_transform(predictions)
true_labels = label_encoder.inverse_transform(y_test)

accuracy = accuracy_score(true_labels, predicted_labels)
print(f"\nAccuracy of Random Forest: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(true_labels, predicted_labels))


Reading sampled file
Reading original files
Processing: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

In [3]:
import os
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Setup
folder_path = os.path.join(os.getcwd(), 'CIC')
# os.listdir returns entries in arbitrary order; sort them so the order in
# which the files are concatenated (and hence the seeded train/test split
# further down) is the same on every run and every machine.
original_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
sampled_file = 'data_diffusion_data.csv'

# Feature engineering
def feature_engineering(df):
    """Return ``df`` with a derived ``Packets_Duration_Ratio`` column.

    The new column is ``df['Tot sum'] / (df['Duration'] + 1e-6)``; the small
    constant keeps the denominator non-zero when ``Duration`` is 0.

    Parameters
    ----------
    df : DataFrame
        A pandas or dask DataFrame. Only ``.columns``, column selection and
        ``.assign`` are used.

    Returns
    -------
    DataFrame
        A new frame with the extra column when both ``'Tot sum'`` and
        ``'Duration'`` are present; otherwise ``df`` itself, unchanged.
        The caller's frame is never modified in place.
    """
    if 'Tot sum' not in df.columns or 'Duration' not in df.columns:
        return df
    ratio = df['Tot sum'] / (df['Duration'] + 1e-6)
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(Packets_Duration_Ratio=ratio)

# Read sampled data
print("Reading sampled file")
# Rows from the sampled file are tagged is_sampled=1 so they can be told
# apart from the original CIC rows after concatenation.
sampled_df = feature_engineering(
    dd.read_csv(sampled_file, assume_missing=True)
).assign(is_sampled=1)

# Read and sample original data
print("Reading original files")
original_dfs = []
for f in original_files:
    print(f"Processing: {f}")
    csv_path = os.path.join(folder_path, f)
    ddf = feature_engineering(dd.read_csv(csv_path, assume_missing=True))
    ddf = ddf.assign(is_sampled=0)
    # Keep a seeded 5% sample of every original file.
    sampled = ddf.sample(frac=0.05, random_state=42)
    original_dfs.append(sampled)

# Combine
print("Combining sampled and original")
original_df = dd.concat(original_dfs)
# dropna() discards every row that has a missing value in any column.
combined_df = dd.concat([original_df, sampled_df]).dropna()

# Convert to Pandas
print("Computing Dask to Pandas")
with ProgressBar():
    # Evaluate the lazy dask graph into an in-memory pandas DataFrame.
    combined_pd = combined_df.compute()

# Prepare X and y
print("Preparing X and y")

# Fix float-style labels
# Maps the string form of float-coded labels (e.g. '1.0') to class names.
# Labels that are not a key of this dict pass through replace() unchanged.
# NOTE(review): '1.0' and '11.0' both map to 'DDoS-UDP_Flood', and '2.0' and
# '10.0' both map to 'DDoS-TCP_Flood', so those codes are merged into one
# class each. Confirm against the encoding used to build the sampled file.
label_mapping = {
    '0.0': 'BenignTraffic',
    '1.0': 'DDoS-UDP_Flood',
    '2.0': 'DDoS-TCP_Flood',
    '3.0': 'DDoS-ICMP_Flood',
    '4.0': 'DDoS-ICMP_Fragmentation',
    '5.0': 'DDoS-PSHACK_Flood',
    '6.0': 'DDoS-RSTFINFlood',
    '7.0': 'DDoS-SYN_Flood',
    '8.0': 'DDoS-SlowLoris',
    '9.0': 'DDoS-SynonymousIP_Flood',
    '10.0': 'DDoS-TCP_Flood',
    '11.0': 'DDoS-UDP_Flood',
    '12.0': 'DDoS-UDP_Fragmentation',
    '13.0': 'DNS_Spoofing',
    '14.0': 'DictionaryBruteForce',
    '15.0': 'DoS-HTTP_Flood',
    '16.0': 'DoS-SYN_Flood',
    '17.0': 'DoS-TCP_Flood',
    '18.0': 'DoS-UDP_Flood',
    '19.0': 'MITM-ArpSpoofing',
    '20.0': 'Mirai-greeth_flood',
    '21.0': 'Mirai-greip_flood',
    '22.0': 'Mirai-udpplain',
    '23.0': 'Recon-HostDiscovery',
    '24.0': 'Recon-OSScan',
    '25.0': 'Recon-PingSweep',
    '26.0': 'Recon-PortScan',
    '27.0': 'SqlInjection',
    '28.0': 'Uploading_Attack',
    '29.0': 'VulnerabilityScan',
    '30.0': 'XSS',
    '31.0': 'Backdoor_Malware',
    '32.0': 'BrowserHijacking',
    '33.0': 'CommandInjection',
}

# Features: every column except the provenance flag and the target.
X = combined_pd.drop(columns=['is_sampled', 'label'], errors='ignore')
# Target: labels as strings, with float-style codes translated to names.
y = combined_pd['label'].astype(str).replace(label_mapping)

print("Encoding labels")
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# Split
print("Splitting dataset")
# Draw the held-out test set from the original rows only (is_sampled == 0).
# Rows loaded from `sampled_file` are used purely as extra training data, so
# the reported metrics are measured on original traffic and are not inflated
# by sampled rows landing on both sides of the split.
# NOTE(review): if `sampled_file` was itself derived from the CIC files, some
# of its rows may still duplicate test rows - confirm and de-duplicate if so.
sampled_mask = combined_pd['is_sampled'].to_numpy() == 1
original_pos = (~sampled_mask).nonzero()[0]
train_pos, test_pos = train_test_split(
    original_pos,
    test_size=0.3,
    random_state=42,
    stratify=y_encoded[original_pos],
)

# Training rows = 70% of the original rows plus every sampled row.
train_mask = sampled_mask.copy()
train_mask[train_pos] = True
X_train, y_train = X[train_mask], y_encoded[train_mask]
X_test, y_test = X.iloc[test_pos], y_encoded[test_pos]

# Train Random Forest
print("Training Random Forest")
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42
)
clf.fit(X_train, y_train)

# Predict and Evaluate
print("Evaluating")
predictions = clf.predict(X_test)

# Decode labels
# Turn the integer class ids back into class-name strings for the report.
predicted_labels = label_encoder.inverse_transform(predictions)
true_labels = label_encoder.inverse_transform(y_test)

accuracy = accuracy_score(true_labels, predicted_labels)
print(f"\nAccuracy of Random Forest: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(true_labels, predicted_labels))


Reading sampled file
Reading original files
Processing: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

In [5]:
import os
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Setup
folder_path = os.path.join(os.getcwd(), 'CIC')
# os.listdir returns entries in arbitrary order; sort them so the order in
# which the files are concatenated (and hence the seeded train/test split
# further down) is the same on every run and every machine.
original_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
sampled_file = 'undersampled_data.csv'

# Feature engineering
def feature_engineering(df):
    """Return ``df`` with a derived ``Packets_Duration_Ratio`` column.

    The new column is ``df['Tot sum'] / (df['Duration'] + 1e-6)``; the small
    constant keeps the denominator non-zero when ``Duration`` is 0.

    Parameters
    ----------
    df : DataFrame
        A pandas or dask DataFrame. Only ``.columns``, column selection and
        ``.assign`` are used.

    Returns
    -------
    DataFrame
        A new frame with the extra column when both ``'Tot sum'`` and
        ``'Duration'`` are present; otherwise ``df`` itself, unchanged.
        The caller's frame is never modified in place.
    """
    if 'Tot sum' not in df.columns or 'Duration' not in df.columns:
        return df
    ratio = df['Tot sum'] / (df['Duration'] + 1e-6)
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(Packets_Duration_Ratio=ratio)

# Read sampled data
print("Reading sampled file")
# Rows from the sampled file are tagged is_sampled=1 so they can be told
# apart from the original CIC rows after concatenation.
sampled_df = feature_engineering(
    dd.read_csv(sampled_file, assume_missing=True)
).assign(is_sampled=1)

# Read and sample original data
print("Reading original files")
original_dfs = []
for f in original_files:
    print(f"Processing: {f}")
    csv_path = os.path.join(folder_path, f)
    ddf = feature_engineering(dd.read_csv(csv_path, assume_missing=True))
    ddf = ddf.assign(is_sampled=0)
    # Keep a seeded 5% sample of every original file.
    sampled = ddf.sample(frac=0.05, random_state=42)
    original_dfs.append(sampled)

# Combine
print("Combining sampled and original")
original_df = dd.concat(original_dfs)
# dropna() discards every row that has a missing value in any column.
combined_df = dd.concat([original_df, sampled_df]).dropna()

# Convert to Pandas
print("Computing Dask to Pandas")
with ProgressBar():
    # Evaluate the lazy dask graph into an in-memory pandas DataFrame.
    combined_pd = combined_df.compute()

# Prepare X and y
print("Preparing X and y")

# Fix float-style labels
# Maps the string form of float-coded labels (e.g. '1.0') to class names.
# Labels that are not a key of this dict pass through replace() unchanged.
# NOTE(review): '1.0' and '11.0' both map to 'DDoS-UDP_Flood', and '2.0' and
# '10.0' both map to 'DDoS-TCP_Flood', so those codes are merged into one
# class each. Confirm against the encoding used to build the sampled file.
label_mapping = {
    '0.0': 'BenignTraffic',
    '1.0': 'DDoS-UDP_Flood',
    '2.0': 'DDoS-TCP_Flood',
    '3.0': 'DDoS-ICMP_Flood',
    '4.0': 'DDoS-ICMP_Fragmentation',
    '5.0': 'DDoS-PSHACK_Flood',
    '6.0': 'DDoS-RSTFINFlood',
    '7.0': 'DDoS-SYN_Flood',
    '8.0': 'DDoS-SlowLoris',
    '9.0': 'DDoS-SynonymousIP_Flood',
    '10.0': 'DDoS-TCP_Flood',
    '11.0': 'DDoS-UDP_Flood',
    '12.0': 'DDoS-UDP_Fragmentation',
    '13.0': 'DNS_Spoofing',
    '14.0': 'DictionaryBruteForce',
    '15.0': 'DoS-HTTP_Flood',
    '16.0': 'DoS-SYN_Flood',
    '17.0': 'DoS-TCP_Flood',
    '18.0': 'DoS-UDP_Flood',
    '19.0': 'MITM-ArpSpoofing',
    '20.0': 'Mirai-greeth_flood',
    '21.0': 'Mirai-greip_flood',
    '22.0': 'Mirai-udpplain',
    '23.0': 'Recon-HostDiscovery',
    '24.0': 'Recon-OSScan',
    '25.0': 'Recon-PingSweep',
    '26.0': 'Recon-PortScan',
    '27.0': 'SqlInjection',
    '28.0': 'Uploading_Attack',
    '29.0': 'VulnerabilityScan',
    '30.0': 'XSS',
    '31.0': 'Backdoor_Malware',
    '32.0': 'BrowserHijacking',
    '33.0': 'CommandInjection',
}

# Features: every column except the provenance flag and the target.
X = combined_pd.drop(columns=['is_sampled', 'label'], errors='ignore')
# Target: labels as strings, with float-style codes translated to names.
y = combined_pd['label'].astype(str).replace(label_mapping)

print("Encoding labels")
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# Split
print("Splitting dataset")
# Draw the held-out test set from the original rows only (is_sampled == 0).
# Rows loaded from `sampled_file` are used purely as extra training data, so
# the reported metrics are measured on original traffic and are not inflated
# by sampled rows landing on both sides of the split.
# NOTE(review): if `sampled_file` was itself derived from the CIC files, some
# of its rows may still duplicate test rows - confirm and de-duplicate if so.
sampled_mask = combined_pd['is_sampled'].to_numpy() == 1
original_pos = (~sampled_mask).nonzero()[0]
train_pos, test_pos = train_test_split(
    original_pos,
    test_size=0.3,
    random_state=42,
    stratify=y_encoded[original_pos],
)

# Training rows = 70% of the original rows plus every sampled row.
train_mask = sampled_mask.copy()
train_mask[train_pos] = True
X_train, y_train = X[train_mask], y_encoded[train_mask]
X_test, y_test = X.iloc[test_pos], y_encoded[test_pos]

# Train Random Forest
print("Training Random Forest")
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42
)
clf.fit(X_train, y_train)

# Predict and Evaluate
print("Evaluating")
predictions = clf.predict(X_test)

# Decode labels
# Turn the integer class ids back into class-name strings for the report.
predicted_labels = label_encoder.inverse_transform(predictions)
true_labels = label_encoder.inverse_transform(y_test)

accuracy = accuracy_score(true_labels, predicted_labels)
print(f"\nAccuracy of Random Forest: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(true_labels, predicted_labels))


Reading sampled file
Reading original files
Processing: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9