In [None]:
import pandas as pd
import numpy as np
import gc


# Manipulate columns here
# These names are handed to `pd.read_csv(..., usecols=columns)` by the loaders below,
# so only these fields are read from each capture file.
columns = [
    'Protocol',
    'Duration',
    'Origin_Bytes',
    'Response_Bytes',
    'Connection_State',
    'Origin_Packets',
    'Response_Packets',
    'Origin_IP_Bytes',
    'Response_IP_Bytes',
    'Label',
    'Detailed_Label',
]

def load_df_sampled(capture_number):
    """Read one honeypot capture's ``output.csv`` into a DataFrame.

    Despite the name, no sampling is performed: the whole file is read,
    restricted to the fields listed in the module-level ``columns``.

    Args:
        capture_number: Capture suffix used in the directory name
            (``CTU-Honeypot-Capture-<capture_number>``), e.g. ``'4-1'``.

    Returns:
        pandas.DataFrame holding the selected columns of that capture.
    """
    csv_path = (
        '/kaggle/input/iot-master/IoT-Dataset-Master/'
        f'CTU-Honeypot-Capture-{capture_number}/bro/output.csv'
    )
    return pd.read_csv(csv_path, usecols=columns)


def load_df_filtered(capture_number, labels_to_keep, chunksize=10**5):
    """Stream one malware capture and keep only rows with the wanted labels.

    The CSV is read in pieces of ``chunksize`` rows so the full file never has
    to be held in memory; each piece is reduced to the rows whose
    ``Detailed_Label`` is in ``labels_to_keep`` before the pieces are joined.

    Args:
        capture_number: Capture suffix used in the directory name
            (``CTU-IoT-Malware-Capture-<capture_number>``), e.g. ``'33-1'``.
        labels_to_keep: Collection of ``Detailed_Label`` values to retain.
        chunksize: Number of rows read per chunk.

    Returns:
        pandas.DataFrame with the retained rows. If the reader yields no
        chunks at all, an empty frame with the ``columns`` header is returned.
    """
    csv_path = (
        '/kaggle/input/iot-master/IoT-Dataset-Master/'
        f'CTU-IoT-Malware-Capture-{capture_number}/bro/output.csv'
    )
    reader = pd.read_csv(csv_path, usecols=columns, chunksize=chunksize)

    kept_parts = [
        part[part['Detailed_Label'].isin(labels_to_keep)]
        for part in reader
    ]

    if kept_parts:
        return pd.concat(kept_parts)
    # pd.concat rejects an empty list, so hand back an empty, correctly-headed frame.
    return pd.DataFrame(columns=columns)

# Detailed_Label values kept when filtering a malware capture:
# the "-" placeholder plus the single attack label of interest.
labels_port_scan = [
    "-",
    "PartOfAHorizontalPortScan",
]
labels_ddos = [
    "-",
    "DDoS",
]
labels_okiru = [
    "-",
    "Okiru",
]

In [None]:
# Benign
# Load the three honeypot captures and stack them; building the list inline
# means no per-capture frames linger in the notebook namespace afterwards.
df_benign = pd.concat(
    [load_df_sampled(capture) for capture in ('4-1', '5-1', '7-1')]
)
gc.collect()

In [None]:
# PartOfAHorizontalPortScan
# Only capture 33-1 is loaded; captures 39-1 and 17-1 are currently left out.
df_port_scan = load_df_filtered('33-1', labels_port_scan)
gc.collect()

In [None]:
# DDoS
# Filter captures 60-1 and 17-1 down to the DDoS label set and stack them.
df_ddos = pd.concat(
    [load_df_filtered(capture, labels_ddos) for capture in ('60-1', '17-1')]
)
gc.collect()

In [None]:
# Okiru
# Filter captures 7-1 and 36-1 down to the Okiru label set and stack them.
df_okiru = pd.concat(
    [load_df_filtered(capture, labels_okiru) for capture in ('7-1', '36-1')]
)
gc.collect()

In [None]:
# Stack every per-class frame into one dataset, then release the pieces.
df_combined = pd.concat(
    [
        df_benign,
        df_port_scan,
        df_ddos,
        df_okiru,
    ]
)
del df_benign, df_port_scan, df_ddos, df_okiru
gc.collect()

In [None]:
# Rows whose Label is 'benign' get that tag; every other row takes its Detailed_Label.
df_combined['Attack_Type'] = np.where(
    df_combined['Label'].eq('benign'),
    'benign',
    df_combined['Detailed_Label'],
)
# The two source label columns are now folded into Attack_Type.
df_combined.drop(columns=['Label', 'Detailed_Label'], inplace=True)
# Both 'benign' and the '-' placeholder collapse into a single 'Benign' class.
df_combined.loc[df_combined['Attack_Type'].isin(['benign', '-']), 'Attack_Type'] = 'Benign'

In [None]:
# A '-' in either byte counter is replaced with 0 so the row survives the
# numeric conversion instead of being coerced to NaN.
for byte_col in ('Origin_Bytes', 'Response_Bytes'):
    df_combined.loc[df_combined[byte_col] == '-', byte_col] = 0

In [None]:
# Byte counters: parse as numbers (unparseable values become NaN) and shrink
# to the smallest integer dtype that fits.
for byte_col in ("Origin_Bytes", "Response_Bytes"):
    df_combined[byte_col] = pd.to_numeric(
        df_combined[byte_col], errors='coerce', downcast='integer'
    )
# Duration is parsed the same way but without downcasting.
df_combined["Duration"] = pd.to_numeric(df_combined["Duration"], errors='coerce')
# Discard every row that still contains a missing value in any column.
df_combined.dropna(inplace=True)

# Exploratory Data Analysis (EDA)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Pie chart of how many rows fall into each Attack_Type class.
attack_type_counts = df_combined['Attack_Type'].value_counts()
fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(attack_type_counts, labels=attack_type_counts.index, autopct="%1.1f%%")
ax.set_title("Distribution of Attack Types")
fig.savefig("1.png")


In [None]:
# Pie chart of how often each protocol appears in the combined dataset.
# Reads from df_combined (the only frame defined at notebook level) and plots
# the same `protocol_counts` series that supplies the wedge labels.
protocol_counts = df_combined['Protocol'].value_counts()
plt.figure(figsize=(8, 8))
plt.pie(protocol_counts, labels=protocol_counts.index, autopct='%1.1f%%', startangle=140)
plt.title('Protocol Frequency')
plt.axis('equal')  # equal aspect ratio keeps the pie circular
# Save before show(): once the figure has been shown it may no longer be the
# current figure, and saving afterwards can write an empty image.
plt.savefig("2.png")
plt.show()

In [None]:
# Histogram of connection durations across the whole combined dataset.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=df_combined, x="Duration", ax=ax)
ax.set_title("Distribution of Duration")
plt.show()


In [None]:
# Print a summary of df_combined (column dtypes, non-null counts, memory use)
# as a check on the cleaning steps above.
df_combined.info()

In [None]:
# One-hot encode the two categorical features; all other columns pass through unchanged.
df_combined = pd.get_dummies(
    data=df_combined,
    columns=["Protocol", "Connection_State"],
)

In [None]:
import sklearn
from sklearn.model_selection import train_test_split
# Separate the model inputs (X) from the class label (Y), then drop the
# reference to the combined frame.
X = df_combined.drop(columns=["Attack_Type"])
Y = df_combined["Attack_Type"]
del df_combined

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample with SMOTE using a fixed seed so the resampled set is reproducible.
# NOTE(review): this resamples the full dataset; if a train/test split is added
# later, resample the training portion only to avoid leaking synthetic rows.
smote = SMOTE(
    sampling_strategy='auto',
    k_neighbors=5,
    random_state=42,
)
X_resampled, Y_resampled = smote.fit_resample(X, Y)

# from imblearn.under_sampling import RandomUnderSampler
# rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
# X_resampled, Y_resampled = rus.fit_resample(X, Y)

In [None]:
# Put the resampled features and their labels back side by side and write them to disk.
combined = pd.concat(
    objs=[
        pd.DataFrame(X_resampled),
        pd.DataFrame(Y_resampled, columns=["Attack_Type"]),
    ],
    axis="columns",
)
combined.to_csv(path_or_buf='oversampled.csv')
# combined.to_csv('undersampled.csv')