# import modules

In [1]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.ensemble  import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# Load the train and test datasets

In [2]:
# Locations of the two CSV shards: part-00000 is used for training, part-00001 for testing.
train_csv_file_path = './data/part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv'
test_csv_file_path = './data/part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv'

# Read each shard into its own DataFrame (train first, then test).
train_raw_data, test_raw_data = (
    pd.read_csv(csv_path) for csv_path in (train_csv_file_path, test_csv_file_path)
)

In [3]:
# Split each raw frame column-wise: every column but the last becomes the
# feature matrix, and the last column is taken as the label.
train_data, train_label = train_raw_data.iloc[:, :-1], train_raw_data.iloc[:, -1]
test_data, test_label = test_raw_data.iloc[:, :-1], test_raw_data.iloc[:, -1]

In [4]:
# Wrap each label Series in its own single-column DataFrame.
# copy=True gives the frames their own data, so later edits to the label frames
# cannot write through to the raw frames they were sliced from (and do not
# trigger chained-assignment warnings).
train_label = pd.DataFrame(train_label, copy=True)
test_label = pd.DataFrame(test_label, copy=True)

In [5]:
# Coarse attack families that the fine-grained labels are grouped under.
attacks = ['DDoS', 'DoS', 'Mirai', 'Recon']
# Distinct fine-grained labels present in the training labels.
unique = train_label['label'].unique()

In [6]:
def attackLists(dataframe):
    """Group the distinct labels of a frame by attack family.

    Args:
        dataframe (pandas.DataFrame): frame with a ``label`` column holding
            attack names as strings.

    Returns:
        tuple: ``(unique, attack_specifics)``. ``unique`` holds the distinct
        values of the ``label`` column; ``attack_specifics`` is a list of four
        lists with the labels belonging to DDoS, DoS, Mirai and Recon, in that
        order.
    """
    unique = dataframe["label"].unique()

    def _matching(keyword, excluded=None):
        # Labels containing `keyword`, optionally skipping those that also
        # contain `excluded`.
        return [
            name
            for name in unique
            if keyword in name and (excluded is None or excluded not in name)
        ]

    attack_specifics = [
        _matching('DDoS'),
        # 'DoS' is a substring of 'DDoS', so DDoS labels are excluded explicitly.
        _matching('DoS', excluded='DDoS'),
        _matching('Mirai'),
        _matching('Recon'),
    ]

    return unique, attack_specifics

In [7]:
# Distinct training labels, plus the per-family breakdown of those labels.
train_unique, train_attack_specifics = attackLists(train_label)

print(f'types of attacks in training lable: {len(train_unique)}')

# attackLists returns the groups in the same order as `attacks`, so pair them up.
for family, members in zip(attacks, train_attack_specifics):
    print(f'types of {family}: {len(members)}')

types of attacks in training lable: 34
types of DDoS: 12
types of DoS: 4
types of Mirai: 3
types of Recon: 4


In [8]:
def mergeAttacks(dataframe, attacks):
    """Collapse fine-grained attack labels into their attack families.

    Every label that contains one of the family names as a literal substring
    is replaced by that family name. ``'DDoS'`` is always merged and always
    checked first, because ``'DoS'`` is a substring of ``'DDoS'``. The other
    families are then applied in the order given, and the first family that
    matches a label wins. Labels that match no family, and missing labels,
    are left unchanged.

    Args:
        dataframe (pandas.DataFrame): frame with a string ``label`` column.
        attacks (iterable of str): family names to merge into, e.g.
            ``['DDoS', 'DoS', 'Mirai', 'Recon']``.

    Returns:
        pandas.DataFrame: a copy of ``dataframe`` with merged labels. The
        input frame is not modified.
    """
    source_labels = dataframe['label']
    merged = dataframe.copy()

    # DDoS goes first so that plain 'DoS' never captures a 'DDoS-*' label.
    families = ['DDoS'] + [attack for attack in attacks if attack != 'DDoS']

    claimed = None  # rows already assigned to an earlier family
    for family in families:
        # regex=False: family names are literal text, not patterns.
        # na=False: a missing label matches nothing instead of breaking the mask.
        hits = source_labels.str.contains(family, regex=False, na=False).to_numpy(dtype=bool)
        if claimed is None:
            claimed = hits
        else:
            hits = hits & ~claimed
            claimed = claimed | hits
        merged.loc[hits, 'label'] = family

    return merged

In [9]:
# Fold the fine-grained training labels into their attack families, then
# report how many distinct labels remain.
train_label =  mergeAttacks(train_label, attacks)
unique = train_label["label"].unique()
print(f'types of attacks in train_label after merged: {len(unique)}')

types of attacks in train_label after merged: 15


In [10]:
# Apply the same family merge to the test labels and report the distinct count.
test_label = mergeAttacks(test_label, attacks)
unique_test = test_label["label"].unique()
print(f'types of attacks after merged: {len(unique_test)}')

types of attacks after merged: 15


# check the distribution of raw training data

In [11]:
# Merge labels on a copy so train_raw_data keeps its original fine-grained
# labels, regardless of whether mergeAttacks modifies its argument.
train_merged_data = mergeAttacks(train_raw_data.copy(), attacks)

In [12]:
# Per-class row counts of the merged training frame.
label_counts = train_merged_data['label'].value_counts()
# NOTE: despite its name, num_columns holds the number of ROWS.
num_columns = len(train_merged_data.index)
print(f'total traffics: {num_columns}')

total traffics: 238687


In [13]:
# Display the per-class row counts computed above.
label_counts

label
DDoS                    173777
DoS                      41276
Mirai                    13435
BenignTraffic             5600
Recon                     1650
MITM-ArpSpoofing          1614
DNS_Spoofing               925
VulnerabilityScan          210
DictionaryBruteForce        63
SqlInjection                31
BrowserHijacking            30
CommandInjection            28
Backdoor_Malware            22
XSS                         18
Uploading_Attack             8
Name: count, dtype: int64

In [14]:
# BenignTraffic makes up less than 3% of the training rows.
# For now, every class that is rarer than BenignTraffic is treated as "minor":
minor_attacks = [
    'Recon',
    'MITM-ArpSpoofing',
    'DNS_Spoofing',
    'VulnerabilityScan',
    'DictionaryBruteForce',
    'SqlInjection',
    'BrowserHijacking',
    'CommandInjection',
    'Backdoor_Malware',
    'XSS',
    'Uploading_Attack',
]
print(len(minor_attacks))

# In other words: every class except DDoS, DoS, Mirai and BenignTraffic.

11


# load the synthesized data

In [15]:
import pickle

In [16]:
# Load the v1 synthesized features and labels.
# NOTE: pickle.load can execute arbitrary code; only load pickles you produced
# yourself or otherwise trust.
with open('./data/1_gan_train_v1.pkl', 'rb') as data_handle:
    gan_data_v1 = pickle.load(data_handle)
with open('./data/1_gan_label_v1.pkl', 'rb') as label_handle:
    gan_label_v1 = pickle.load(label_handle)

In [17]:
# Load the v2 synthesized features and labels.
# NOTE: pickle.load can execute arbitrary code; only load pickles you produced
# yourself or otherwise trust.
with open('./data/1_gan_train_v2.pkl', 'rb') as data_handle:
    gan_data_v2 = pickle.load(data_handle)
with open('./data/1_gan_label_v2.pkl', 'rb') as label_handle:
    gan_label_v2 = pickle.load(label_handle)

In [18]:
# Load the label-name -> integer-id mapping stored alongside the GAN data.
# NOTE: pickle.load can execute arbitrary code; only load trusted pickles.
with open('./data/class_mapping.pkl', 'rb') as mapping_handle:
    class_mapping = pickle.load(mapping_handle)

In [19]:
# Display the loaded mapping from label name to integer class id.
class_mapping

{'Backdoor_Malware': 0,
 'BenignTraffic': 1,
 'BrowserHijacking': 2,
 'CommandInjection': 3,
 'DDoS-ACK_Fragmentation': 4,
 'DDoS-HTTP_Flood': 5,
 'DDoS-ICMP_Flood': 6,
 'DDoS-ICMP_Fragmentation': 7,
 'DDoS-PSHACK_Flood': 8,
 'DDoS-RSTFINFlood': 9,
 'DDoS-SYN_Flood': 10,
 'DDoS-SlowLoris': 11,
 'DDoS-SynonymousIP_Flood': 12,
 'DDoS-TCP_Flood': 13,
 'DDoS-UDP_Flood': 14,
 'DDoS-UDP_Fragmentation': 15,
 'DNS_Spoofing': 16,
 'DictionaryBruteForce': 17,
 'DoS-HTTP_Flood': 18,
 'DoS-SYN_Flood': 19,
 'DoS-TCP_Flood': 20,
 'DoS-UDP_Flood': 21,
 'MITM-ArpSpoofing': 22,
 'Mirai-greeth_flood': 23,
 'Mirai-greip_flood': 24,
 'Mirai-udpplain': 25,
 'Recon-HostDiscovery': 26,
 'Recon-OSScan': 27,
 'Recon-PingSweep': 28,
 'Recon-PortScan': 29,
 'SqlInjection': 30,
 'Uploading_Attack': 31,
 'VulnerabilityScan': 32,
 'XSS': 33}

In [20]:
# Invert the mapping so integer class ids can be translated back to label names.
flipped_dict = {class_id: name for name, class_id in class_mapping.items()}

In [21]:
# Compare the container types of the original and synthesized data/labels.
for inspected in (train_data, train_label, gan_data_v1, gan_label_v1):
    print(type(inspected))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [22]:
# Turn each synthesized label Series into a single-column DataFrame so it has
# the same container type as the original label frames.
gan_label_v1, gan_label_v2 = (
    pd.DataFrame(label_series) for label_series in (gan_label_v1, gan_label_v2)
)

In [23]:
# Peek at the v1 synthesized labels (still integer class ids at this point).
gan_label_v1

Unnamed: 0,label
0,12
1,20
2,6
3,6
4,6
...,...
707417,1
707418,1
707419,1
707420,1


In [24]:
# Translate integer class ids back to label names in both synthesized label
# frames. Values without an entry in flipped_dict are left as they are.
for gan_labels in (gan_label_v1, gan_label_v2):
    gan_labels['label'] = gan_labels['label'].replace(flipped_dict)

In [25]:
# Per-class row counts of the v1 synthesized labels, now shown by name.
gan_label_v1['label'].value_counts()

label
DDoS-ICMP_Flood            109020
DDoS-UDP_Flood              83069
DDoS-TCP_Flood              69361
DDoS-PSHACK_Flood           63438
DDoS-SYN_Flood              62197
DDoS-RSTFINFlood            61387
DDoS-SynonymousIP_Flood     54308
DoS-UDP_Flood               50928
DoS-TCP_Flood               40991
DoS-SYN_Flood               30809
BenignTraffic               14901
Mirai-greeth_flood          14874
Mirai-udpplain              13800
Mirai-greip_flood           11246
DDoS-ICMP_Fragmentation      7005
DDoS-ACK_Fragmentation       4497
DDoS-UDP_Fragmentation       4479
MITM-ArpSpoofing             3147
DNS_Spoofing                 1682
Recon-HostDiscovery          1478
Recon-OSScan                 1166
DoS-HTTP_Flood                943
Recon-PortScan                898
VulnerabilityScan             589
DDoS-HTTP_Flood               515
DDoS-SlowLoris                305
DictionaryBruteForce          136
BrowserHijacking               82
SqlInjection                   74
CommandI

# Merge attacks in synthesized data

In [26]:
# Fold the fine-grained synthesized labels into the same attack families
# used for the original data.
gan_label_v1, gan_label_v2 = (
    mergeAttacks(gan_labels, attacks) for gan_labels in (gan_label_v1, gan_label_v2)
)

# Drop the major attacks: DDoS, DoS, Mirai

In [27]:
def dropMajor(data, label, major_attacks=('DDoS', 'DoS', 'Mirai')):
    """Join features with labels and remove the rows of the designated attacks.

    Rows are removed with a boolean mask rather than by index label, so only
    the rows whose own label is a major attack are dropped, even when the
    index contains duplicate values.

    Args:
        data (pandas.DataFrame): features without labels.
        label (pandas.DataFrame): labels for ``data``, with a ``label``
            column and the same index as ``data``.
        major_attacks (iterable of str): label values to remove. Defaults to
            ``('DDoS', 'DoS', 'Mirai')``.

    Returns:
        pandas.DataFrame: ``data`` and ``label`` joined column-wise, without
        the rows whose label is in ``major_attacks``.
    """
    df = pd.concat([data, label], axis=1)
    is_major = df['label'].isin(list(major_attacks))
    # .copy() returns an independent frame that callers can modify freely.
    return df.loc[~is_major].copy()


In [28]:
# Remove the major-attack rows from both synthesized sets.
gan_dropped_v1, gan_dropped_v2 = (
    dropMajor(gan_features, gan_labels)
    for gan_features, gan_labels in (
        (gan_data_v1, gan_label_v1),
        (gan_data_v2, gan_label_v2),
    )
)

In [29]:
# Per-class row counts left in the v1 synthesized set after dropping major attacks.
gan_dropped_v1['label'].value_counts()

label
BenignTraffic           14901
Recon                    3550
MITM-ArpSpoofing         3147
DNS_Spoofing             1682
VulnerabilityScan         589
DictionaryBruteForce      136
BrowserHijacking           82
SqlInjection               74
CommandInjection           44
Backdoor_Malware           26
XSS                        15
Uploading_Attack            4
Name: count, dtype: int64

In [30]:
# Per-class row counts left in the v2 synthesized set after dropping major attacks.
gan_dropped_v2['label'].value_counts()

label
BenignTraffic           4986
Recon                   1206
MITM-ArpSpoofing        1067
DNS_Spoofing             574
VulnerabilityScan        201
DictionaryBruteForce      44
BrowserHijacking          27
SqlInjection              23
CommandInjection          13
Backdoor_Malware           8
XSS                        6
Uploading_Attack           3
Name: count, dtype: int64

In [31]:
# Split each filtered synthesized set back into a feature frame and a
# single-column label frame.
gan_dropped_data_v1, gan_dropped_label_v1 = (
    gan_dropped_v1.drop(columns=['label']),
    pd.DataFrame(gan_dropped_v1['label']),
)
gan_dropped_data_v2, gan_dropped_label_v2 = (
    gan_dropped_v2.drop(columns=['label']),
    pd.DataFrame(gan_dropped_v2['label']),
)

# Merge original and synthesized data together

In [32]:
# Stack the original training set on top of each filtered synthesized set.
# ignore_index=True renumbers the rows 0..n-1. Without it the synthesized rows
# keep their own index labels, which may collide with those of the original
# rows and make any later label-based selection or drop ambiguous.
# Data and labels are stacked in the same order, so they stay aligned row by row.
train_data_v1 = pd.concat([train_data, gan_dropped_data_v1], axis=0, ignore_index=True)
train_label_v1 = pd.concat([train_label, gan_dropped_label_v1], axis=0, ignore_index=True)
train_data_v2 = pd.concat([train_data, gan_dropped_data_v2], axis=0, ignore_index=True)
train_label_v2 = pd.concat([train_label, gan_dropped_label_v2], axis=0, ignore_index=True)

In [45]:
# Class distribution of the training labels after adding the v1 synthesized rows.
train_label_v1.value_counts()

label
DDoS                    173777
DoS                      41276
BenignTraffic            20501
Mirai                    13435
Recon                     5200
MITM-ArpSpoofing          4761
DNS_Spoofing              2607
VulnerabilityScan          799
DictionaryBruteForce       199
BrowserHijacking           112
SqlInjection               105
CommandInjection            72
Backdoor_Malware            48
XSS                         33
Uploading_Attack            12
Name: count, dtype: int64

In [44]:
# Class distribution of the training labels after adding the v2 synthesized rows.
train_label_v2.value_counts()

label
DDoS                    173777
DoS                      41276
Mirai                    13435
BenignTraffic            10586
Recon                     2856
MITM-ArpSpoofing          2681
DNS_Spoofing              1499
VulnerabilityScan          411
DictionaryBruteForce       107
BrowserHijacking            57
SqlInjection                54
CommandInjection            41
Backdoor_Malware            30
XSS                         24
Uploading_Attack            11
Name: count, dtype: int64

# Random Forest

In [34]:
# Reduce each single-column label frame to a plain Series by selecting its
# 'label' column.
train_label_v1, train_label_v2, test_label = (
    label_frame['label'] for label_frame in (train_label_v1, train_label_v2, test_label)
)

In [35]:
# Two identically configured forests (100 trees, seed 23), one per augmented
# training set.
random_forest_clf_v1, random_forest_clf_v2 = (
    RandomForestClassifier(n_estimators=100, random_state=23) for _ in range(2)
)

In [36]:
# Fit the first forest on the v1-augmented training set.
# Timing note from the original run: about 25 seconds on an M1 Mac.
random_forest_clf_v1.fit(train_data_v1, train_label_v1)

In [37]:
# Fit the second forest on the v2-augmented training set.
random_forest_clf_v2.fit(train_data_v2, train_label_v2)

# prediction

In [46]:
# Predict labels for the held-out test features with the v1 model.
prediction_v1 = random_forest_clf_v1.predict(test_data)

In [47]:
# Predict labels for the held-out test features with the v2 model.
prediction_v2 = random_forest_clf_v2.predict(test_data)

# evaluate

In [48]:
# Overall accuracy of each model on the test set (v1 first, then v2).
accuracy_v1, accuracy_v2 = (
    accuracy_score(test_label, predicted) for predicted in (prediction_v1, prediction_v2)
)
print(accuracy_v1)

print(accuracy_v2)

0.994118050318777
0.9941866045108658


# Traffic that is predicted as BenignTraffic

In [50]:
# Positions of the test rows that each model predicted as "BenignTraffic".
predicted_benign_v1 = prediction_v1 == "BenignTraffic"
predicted_benign_v2 = prediction_v2 == "BenignTraffic"
indices_unknown_v1 = np.where(predicted_benign_v1)[0]
indices_unknown_v2 = np.where(predicted_benign_v2)[0]

In [51]:
# Select, by position, the features and true labels of the rows each model
# predicted as BenignTraffic.
test_unknown_data_v1, test_unknown_label_v1 = (
    test_data.iloc[indices_unknown_v1],
    test_label.iloc[indices_unknown_v1],
)
test_unknown_data_v2, test_unknown_label_v2 = (
    test_data.iloc[indices_unknown_v2],
    test_label.iloc[indices_unknown_v2],
)

In [52]:
# True-label breakdown of the rows the v1 model predicted as BenignTraffic.
tmp_v1 = test_unknown_label_v1.value_counts()
print(tmp_v1)

label
BenignTraffic           5073
Recon                    266
MITM-ArpSpoofing         179
DNS_Spoofing             125
DictionaryBruteForce      18
SqlInjection              10
CommandInjection           4
BrowserHijacking           4
XSS                        3
Backdoor_Malware           2
DoS                        1
VulnerabilityScan          1
Name: count, dtype: int64


In [53]:
# True-label breakdown of the rows the v2 model predicted as BenignTraffic.
tmp_v2 = test_unknown_label_v2.value_counts()
print(tmp_v2)

label
BenignTraffic           5062
Recon                    258
MITM-ArpSpoofing         191
DNS_Spoofing             135
DictionaryBruteForce      21
SqlInjection              11
BrowserHijacking           4
Backdoor_Malware           3
CommandInjection           3
XSS                        2
VulnerabilityScan          1
Name: count, dtype: int64


# Export the "unknown" data (test samples predicted as BenignTraffic)

In [54]:
# Check the container types of the selected features and labels before exporting.
print(type(test_unknown_data_v1))
print(type(test_unknown_label_v1))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [55]:
# Write the v1 selection to CSV. index=False omits the row index, so the
# original test-row positions are not stored in the files.
test_unknown_data_v1.to_csv('./data/3_rf_test_unknown_data_v1.csv', index=False)
test_unknown_label_v1.to_csv('./data/3_rf_test_unknown_label_v1.csv', index=False)

In [56]:
# Write the v2 selection to CSV. index=False omits the row index, so the
# original test-row positions are not stored in the files.
test_unknown_data_v2.to_csv('./data/3_rf_test_unknown_data_v2.csv', index=False)
test_unknown_label_v2.to_csv('./data/3_rf_test_unknown_label_v2.csv', index=False)