# import modules

In [1]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.ensemble  import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# Load the train and test datasets

In [2]:
# CSV shards on disk: part-00000 is used for training, part-00001 for testing.
train_csv_file_path = './data/part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv'
test_csv_file_path = './data/part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv'

# Read both shards into DataFrames (training shard first, then the test shard).
train_raw_data, test_raw_data = (
    pd.read_csv(csv_path)
    for csv_path in (train_csv_file_path, test_csv_file_path)
)

In [3]:
# Every column except the last is kept as a feature; the last column is the label.
train_data, train_label = train_raw_data.iloc[:, :-1], train_raw_data.iloc[:, -1]
test_data, test_label = test_raw_data.iloc[:, :-1], test_raw_data.iloc[:, -1]

In [4]:
# Wrap each label Series in a single-column DataFrame so it can be
# addressed by column name (frame["label"]).
train_label, test_label = (
    pd.DataFrame(label_series) for label_series in (train_label, test_label)
)

In [5]:
# Broad attack categories used when grouping the fine-grained labels.
attacks = ['DDoS', 'DoS', 'Mirai', 'Recon']

# Distinct label values present in the training labels.
unique = train_label["label"].unique()

In [6]:
def attackLists(dataframe, attacks=('DDoS', 'DoS', 'Mirai', 'Recon')):
    """Group the distinct labels of a dataframe by attack category.

    A label belongs to a category when the category name is a substring of
    the label. When one category name is contained in another (``'DoS'`` is
    a substring of ``'DDoS'``), a label that matches the longer name is NOT
    counted for the shorter one, so ``'DDoS-...'`` labels never appear in
    the ``'DoS'`` group.

    Args:
        dataframe (pandas.DataFrame): Must have a ``"label"`` column.
        attacks (iterable of str): Category names to group by. Defaults to
            the four categories this function was originally written for.

    Returns:
        tuple: ``(unique, attack_specifics)`` where ``unique`` is the array
        returned by ``dataframe["label"].unique()`` and ``attack_specifics``
        is a list of lists, one inner list of matching labels per entry of
        ``attacks`` (same order).
    """

    unique = dataframe["label"].unique()
    attacks = list(attacks)

    # Missing labels (NaN/None) are not strings and cannot be substring-tested;
    # they stay in `unique` but are left out of every category.
    text_labels = [element for element in unique if isinstance(element, str)]

    attack_specifics = []
    for attack in attacks:
        # Longer category names that contain this one (e.g. 'DDoS' for 'DoS').
        shadowing = [other for other in attacks if other != attack and attack in other]
        attack_specifics.append([
            element for element in text_labels
            if attack in element
            and not any(other in element for other in shadowing)
        ])

    return unique, attack_specifics

In [7]:
# Distinct training labels, plus those labels grouped per attack category.
train_unique, train_attack_specifics = attackLists(train_label)

print(f'types of attacks in training lable: {len(train_unique)}')

# Walk the groups by position; position i lines up with attacks[i].
for index in range(len(train_attack_specifics)):
    attack = train_attack_specifics[index]
    print(f'types of {attacks[index]}: {len(attack)}')

types of attacks in training lable: 34
types of DDoS: 12
types of DoS: 4
types of Mirai: 3
types of Recon: 4


In [8]:
def mergeAttacks(dataframe, attacks):
    """Collapse fine-grained attack labels into their broad category.

    Every label containing ``'DDoS'`` becomes ``'DDoS'``. Each remaining label
    is replaced by the first entry of ``attacks`` that occurs in it as a
    literal substring. Labels that match nothing, and missing labels, are
    left untouched.

    Args:
        dataframe (pandas.DataFrame): Must have a string ``"label"`` column.
            The column is rewritten IN PLACE.
        attacks (iterable of str): Category names, in priority order. A
            ``'DDoS'`` entry is skipped because DDoS is always handled first.

    Returns:
        pandas.DataFrame: the same ``dataframe`` object, with merged labels.
    """

    original = dataframe['label']

    # Match against the ORIGINAL labels with literal (non-regex) substrings.
    # na=False keeps missing labels out of the masks; a NaN in a boolean mask
    # would otherwise make the assignment raise.
    ddos_mask = original.str.contains('DDoS', regex=False, na=False)

    merged = original.copy()
    merged.loc[ddos_mask] = 'DDoS'

    # Rows already assigned a category. 'DoS' is a substring of 'DDoS', so
    # DDoS rows must be claimed first and then excluded from later matches.
    assigned = ddos_mask.copy()

    for attack in attacks:
        if attack == 'DDoS':
            continue
        mask = original.str.contains(attack, regex=False, na=False) & ~assigned
        merged.loc[mask] = attack
        assigned = assigned | mask

    dataframe['label'] = merged

    return dataframe

In [9]:
# Collapse the fine-grained training labels into the categories listed in
# `attacks`, then report how many distinct labels remain.
train_label =  mergeAttacks(train_label, attacks)
unique = train_label["label"].unique()
print(f'types of attacks in train_label after merged: {len(unique)}')

types of attacks in train_label after merged: 15


In [10]:
# Apply the same category merge to the test labels and report how many
# distinct labels remain.
test_label = mergeAttacks(test_label, attacks)
unique_test = test_label["label"].unique()
print(f'types of attacks after merged: {len(unique_test)}')

types of attacks after merged: 15


# Random Forest

In [11]:
# Turn each single-column label DataFrame back into a pandas Series by
# selecting its "label" column.
train_label, test_label = train_label['label'], test_label['label']

In [12]:
# Random forest with 100 trees; random_state is fixed so repeated runs
# build the same forest.
random_forest_clf = RandomForestClassifier(n_estimators=100, random_state=23)

In [13]:
# Train on the training features against the merged category labels.
random_forest_clf.fit(train_data, train_label)

# Takes about 25 seconds on an M1 Mac.

# prediction

In [14]:
# Predict a label for every row of the test features.
prediction = random_forest_clf.predict(test_data)

In [15]:
# Sanity check: show the container type and the first five predicted labels.
print(type(prediction))
print(prediction[:5])

<class 'numpy.ndarray'>
['DDoS' 'DDoS' 'DDoS' 'DDoS' 'DDoS']


# evaluate

In [16]:
# Accuracy of the predictions against the merged test labels.
accuracy = accuracy_score(test_label, prediction)
print(accuracy)

0.9941683233929755


# Traffic predicted as BenignTraffic

In [17]:
# Positions of the test rows that the model predicted as "BenignTraffic".
indices_unknown = np.flatnonzero(prediction == "BenignTraffic")

In [18]:
# Pull the features and the true labels of the selected rows by position.
test_unknown_data, test_unknown_label = (
    test_data.iloc[indices_unknown],
    test_label.iloc[indices_unknown],
)

In [19]:
# Count how often each true label occurs among the selected rows.
tmp = test_unknown_label.value_counts()
print(tmp)

label
BenignTraffic           5067
Recon                    271
MITM-ArpSpoofing         199
DNS_Spoofing             138
DictionaryBruteForce      19
SqlInjection              13
BrowserHijacking           7
CommandInjection           5
Backdoor_Malware           4
DoS                        2
XSS                        2
VulnerabilityScan          1
Name: count, dtype: int64


# Export the unknown data

In [20]:
# Show the container types of the selected features and labels before export.
print(type(test_unknown_data))
print(type(test_unknown_label))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [21]:
# Write the selected features and their true labels to separate CSV files;
# index=False leaves the row index out of the files.
test_unknown_data.to_csv('./data/2_rf_test_unknown_data.csv', index=False)
test_unknown_label.to_csv('./data/2_rf_test_unknown_label.csv', index=False)