In [1]:
import pandas as pd
import numpy as np
import os
import argparse


In [None]:

# Path to the CSV files
mal_data_path = '/gpuserver/caputo/keep_calm/data/ToN_IoT/malicious.csv'
ben_data_path = '/gpuserver/caputo/keep_calm/data/ToN_IoT/benign.csv'

print("Reading CSV files...")
mal_data = pd.read_csv(mal_data_path, index_col=0)
ben_data = pd.read_csv(ben_data_path, index_col=0)

print(f"Len Malicious: {len(mal_data)}")
print(f'Len benign: {len(ben_data)}')
    
df = pd.concat([mal_data, ben_data], ignore_index=True)

df = df[df.proto == 'tcp']
df['proto'] = 0


df['src_ip_port'] = df['src_ip'].astype(str) + ':' + df['src_port'].astype(str)
df['dst_ip_port'] = df['dst_ip'].astype(str) + ':' + df['dst_port'].astype(str)

import ipaddress

def is_internal(ip):
    try:
        return int(ipaddress.ip_address(ip).is_private)
    except ValueError:
        return 0  # IP non valido → trattalo come esterno
    
df['IPSrcType'] = df['src_ip'].apply(is_internal)   
df['IPDstType'] = df['dst_ip'].apply(is_internal)


# Converto sport e dsport a numeri (eventuali NaN diventano -1 per coerenza)
df['src_port'] = pd.to_numeric(df['src_port'], errors='coerce').fillna(-1).astype(int)
df['dst_port'] = pd.to_numeric(df['dst_port'], errors='coerce').fillna(-1).astype(int)


# Source Port
df['SrcPortWellKnown']   = ((df['src_port'] >= 0) & (df['src_port'] <= 1023)).astype(int)
df['SrcPortRegistered']  = ((df['src_port'] >= 1024) & (df['src_port'] <= 49151)).astype(int)
df['SrcPortPrivate']     = (df['src_port'] > 49151).astype(int)

# Destination Port
df['DstPortWellKnown']   = ((df['dst_port'] >= 0) & (df['dst_port'] <= 1023)).astype(int)
df['DstPortRegistered']  = ((df['dst_port'] >= 1024) & (df['dst_port'] <= 49151)).astype(int)
df['DstPortPrivate']     = (df['dst_port'] > 49151).astype(int)

print("Dropping some columns...")

df.drop(columns=['src_port', 'dst_port', 'src_ip', 'dst_ip',
                'http_uri', 'weird_name', 'weird_addl', 'weird_notice',
                'dns_query', 'ssl_version', 'ssl_cipher', 
                'ssl_subject', 'ssl_issuer', 'http_user_agent',
                'http_method', 'http_version', 'http_request_body_len',
                'http_response_body_len', 'http_status_code', 'http_user_agent',
                'http_orig_mime_types', 'http_resp_mime_types', 'http_trans_depth'], inplace=True)

print("Encoding boolean columns")
for c in ['dns_AA', 'dns_RA', 'dns_RD', 'dns_rejected', 'ssl_resumed', 'ssl_established']:
    df[c].replace('-', 'F', inplace=True)
    df.loc[df[c] == 'F', c] = 0
    df.loc[df[c] == 'T', c] = 1

# 2. One-hot encoding delle categoriche
df = pd.get_dummies(df, columns=['conn_state', 'service'], drop_first=True)

df.rename({'label': 'Label',
            'duration': 'Dur',
            'src_bytes': 'SrcBytes',
            'dst_bytes': 'DstBytes',
            'src_pkts': 'SrcPkts',
            'dst_pkts': 'DstPkts'
            }, axis=1, inplace=True)
df['TotBytes'] = df.SrcBytes + df.DstBytes
df['TotPkts'] = df.SrcPkts + df.DstPkts

Reading CSV files...
Len Malicious: 161043
Len benign: 300000
Dropping some columns...
Encoding boolean columns


In [6]:
df

Unnamed: 0,proto,Dur,SrcBytes,DstBytes,missed_bytes,SrcPkts,src_ip_bytes,DstPkts,dst_ip_bytes,dns_qclass,...,service_dce_rpc,service_dns,service_ftp,service_gssapi,service_http,service_smb,service_smb;gssapi,service_ssl,TotBytes,TotPkts
0,0,0.0,0,0,0,1,44,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0.0,0,0,0,1,44,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0.0,0,0,0,1,44,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0.0,0,0,0,1,44,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0.0,0,0,0,1,44,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461038,0,0.0,0,0,0,0,0,1,1500,0,...,0,0,0,0,0,0,0,0,0,1
461039,0,0.0,0,0,0,1,40,0,0,0,...,0,0,0,0,0,0,0,0,0,1
461040,0,0.0,0,0,0,0,0,1,1500,0,...,0,0,0,0,0,0,0,0,0,1
461041,0,0.0,0,0,0,0,0,1,1500,0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
#salva il dataset
df.to_csv("/gpuserver/caputo/keep_calm/data/ToN_IoT/ToN_preprocessed.csv", index=False)

In [2]:
df = pd.read_csv("/gpuserver/caputo/keep_calm/data/ToN_IoT/ToN_preprocessed.csv", index_col=0)
df

Unnamed: 0_level_0,Dur,SrcBytes,DstBytes,missed_bytes,SrcPkts,src_ip_bytes,DstPkts,dst_ip_bytes,dns_qclass,dns_qtype,...,service_dce_rpc,service_dns,service_ftp,service_gssapi,service_http,service_smb,service_smb;gssapi,service_ssl,TotBytes,TotPkts
proto,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0,0,0,1,44,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
0,0.0,0,0,0,1,44,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
0,0.0,0,0,0,1,44,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
0,0.0,0,0,0,1,44,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
0,0.0,0,0,0,1,44,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.0,0,0,0,0,0,1,1500,0,0,...,0,0,0,0,0,0,0,0,0,1
0,0.0,0,0,0,1,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
0,0.0,0,0,0,0,0,1,1500,0,0,...,0,0,0,0,0,0,0,0,0,1
0,0.0,0,0,0,0,0,1,1500,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
#mostra tutte le colonne
pd.set_option('display.max_columns', None)
df

Unnamed: 0_level_0,Dur,SrcBytes,DstBytes,missed_bytes,SrcPkts,src_ip_bytes,DstPkts,dst_ip_bytes,dns_qclass,dns_qtype,dns_rcode,dns_AA,dns_RD,dns_RA,dns_rejected,ssl_resumed,ssl_established,Label,src_ip_port,dst_ip_port,IPSrcType,IPDstType,SrcPortWellKnown,SrcPortRegistered,SrcPortPrivate,DstPortWellKnown,DstPortRegistered,DstPortPrivate,conn_state_REJ,conn_state_RSTO,conn_state_RSTOS0,conn_state_RSTR,conn_state_RSTRH,conn_state_S0,conn_state_S1,conn_state_S2,conn_state_S3,conn_state_SF,conn_state_SH,conn_state_SHR,service_dce_rpc,service_dns,service_ftp,service_gssapi,service_http,service_smb,service_smb;gssapi,service_ssl,TotBytes,TotPkts,attack_cat
proto,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
0,0.0,0,0,0,1,44,0,0,0,0,0,0,0,0,0,0,0,1,192.168.1.30:42908,192.168.1.250:7435,1,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,scanning
0,0.0,0,0,0,1,44,0,0,0,0,0,0,0,0,0,0,0,1,192.168.1.30:42908,192.168.1.46:1641,1,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,scanning
0,0.0,0,0,0,1,44,0,0,0,0,0,0,0,0,0,0,0,1,192.168.1.30:42909,192.168.1.103:2046,1,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,scanning
0,0.0,0,0,0,1,44,0,0,0,0,0,0,0,0,0,0,0,1,192.168.1.30:42909,192.168.1.46:2004,1,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,scanning
0,0.0,0,0,0,1,44,0,0,0,0,0,0,0,0,0,0,0,1,192.168.1.30:42911,192.168.1.49:992,1,1,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,scanning
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.0,0,0,0,0,0,1,1500,0,0,0,0,0,0,0,0,0,0,192.168.1.195:49884,13.107.4.50:80,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,normal
0,0.0,0,0,0,1,40,0,0,0,0,0,0,0,0,0,0,0,0,192.168.1.195:49885,13.107.4.50:80,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,normal
0,0.0,0,0,0,0,0,1,1500,0,0,0,0,0,0,0,0,0,0,192.168.1.195:49884,13.107.4.50:80,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,normal
0,0.0,0,0,0,0,0,1,1500,0,0,0,0,0,0,0,0,0,0,192.168.1.195:49885,13.107.4.50:80,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,normal


In [9]:
df['attack_cat'].value_counts()

normal        131532
backdoor       19990
password       19820
ransomware     19774
scanning       19424
injection      19270
xss            18600
dos            18520
ddos           14730
Name: attack_cat, dtype: int64

In [5]:
#from type to attack_cat
df['attack_cat'] = df['type']
df = df.drop(columns=['type'])

In [11]:
# Rimuovo le righe con meno di 1000 campioni
df = df[df['attack_cat'].isin(df['attack_cat'].value_counts()[df['attack_cat'].value_counts() > 1000].index)]

# Creo un df con soli benigni
df_benigni = df[df['Label'] == 0]
tot_benigni = len(df_benigni)

for attack in df['attack_cat'].unique():
    if attack != 'normal':
        df_attack = df[df['attack_cat'] == attack]
        attack_len = len(df_attack)

        # Provo con 20:1
        num_needed = attack_len * 20
        if num_needed <= tot_benigni:
            ratio_used = 20
        else:
            # Provo con 10:1
            num_needed = attack_len * 10
            if num_needed <= tot_benigni:
                ratio_used = 10
            else:
                # Calcolo quanti attacchi posso tenere per mantenere 10:1
                max_attacks = tot_benigni // 10
                print(f"ATTENZIONE: non abbastanza benigni per {attack}. Uso solo {max_attacks} attacchi.")
                df_attack = df_attack.sample(n=max_attacks, random_state=42)
                num_needed = max_attacks * 10
                ratio_used = 10

        df_sampled_benigni = df_benigni.sample(n=num_needed, replace=False, random_state=42)
        df_balanced = pd.concat([df_attack, df_sampled_benigni])

        df_balanced.to_csv(f'/gpuserver/caputo/keep_calm/data/ToN_IoT/ToN_IoT_{attack}.csv', index=False)
        print(f"Dataset {attack} creato con rapporto {ratio_used}:1 e {len(df_balanced)} campioni")


ATTENZIONE: non abbastanza benigni per scanning. Uso solo 13153 attacchi.
Dataset scanning creato con rapporto 10:1 e 144683 campioni
ATTENZIONE: non abbastanza benigni per dos. Uso solo 13153 attacchi.
Dataset dos creato con rapporto 10:1 e 144683 campioni
ATTENZIONE: non abbastanza benigni per injection. Uso solo 13153 attacchi.
Dataset injection creato con rapporto 10:1 e 144683 campioni
ATTENZIONE: non abbastanza benigni per ddos. Uso solo 13153 attacchi.
Dataset ddos creato con rapporto 10:1 e 144683 campioni
ATTENZIONE: non abbastanza benigni per password. Uso solo 13153 attacchi.
Dataset password creato con rapporto 10:1 e 144683 campioni
ATTENZIONE: non abbastanza benigni per xss. Uso solo 13153 attacchi.
Dataset xss creato con rapporto 10:1 e 144683 campioni
ATTENZIONE: non abbastanza benigni per ransomware. Uso solo 13153 attacchi.
Dataset ransomware creato con rapporto 10:1 e 144683 campioni
ATTENZIONE: non abbastanza benigni per backdoor. Uso solo 13153 attacchi.
Dataset ba