- This file builds one training dataset and five test datasets (one per sensor) from all sensors.
- The training data is the accumulation of 70% of the data from each sensor.
- Each test dataset consists of the remaining 30% of the data from its sensor.

In [1]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# One flow file per sensor, listed in the same order as the ids
# (1, 2, 5, 9, 13) used when the per-sensor test files are written.
_sources = (
    'scenario_dataset_1/dataset_result.binetflow',
    'scenario_dataset_2/dataset_result.binetflow',
    'scenario_dataset_5/dataset_result.binetflow',
    'scenario_dataset_9/dataset_result.binetflow',
    'scenario_dataset_13/dataset_result.binetflow',
)
df_1, df_2, df_5, df_9, df_13 = map(pd.read_csv, _sources)

# Keep the frames in a list so every later step can loop over all sensors.
df = [df_1, df_2, df_5, df_9, df_13]

In [3]:
# Rough preview of how many rows a 70/30 split yields per sensor and in total.
# int() truncates, so the two parts of one sensor may add up to one row less
# than its size; the real split further down is done per label class.
sizes = [len(frame) for frame in df]

for size in sizes:
    print(f"{size}\t{int(size * 0.7)}\t{int(size * 0.3)}")

train_sum = sum(int(size * 0.7) for size in sizes)
test_sum = sum(int(size * 0.3) for size in sizes)

print('train_sum', train_sum)
print('test_sum', test_sum)

2112224	1478556	633667
1465182	1025627	439554
92917	65041	27875
1573304	1101312	471991
1876489	1313542	562946
train_sum 4984078
test_sum 2136033


In [None]:
# simplify the label column
def categorize_label(label):
    """Collapse a raw flow label into one of three coarse classes.

    The label is converted to a string and matched case-insensitively:

    - contains "botnet" and "spam" -> 'botnet_spam'
    - contains "botnet" only       -> 'botnet'
    - anything else                -> 'normal'

    Non-string values (e.g. None or NaN) are stringified first, so they end
    up as 'normal' as well.
    """
    text = str(label).lower()

    # Everything that does not mention a botnet is treated as normal traffic,
    # even if it mentions spam.
    if 'botnet' not in text:
        return 'normal'

    return 'botnet_spam' if 'spam' in text else 'botnet'

# Apply the function to the 'Label' column
# Rewrite each sensor's 'Label' column in place with the simplified class.
for frame in df:
    raw_labels = frame['Label']
    frame['Label'] = raw_labels.apply(categorize_label)

To keep the data simple (following my paper), this code removes unnecessary features such as:
- dTos
- sTos
- ActivityLabel (only in NCC-2)
- BotnetName (only in NCC-2)
- SensorId (only in NCC-2)
- StartTime

In [5]:
# Replace every frame in the list with a copy that lacks the unused features.
# errors='ignore' is needed because some of these columns only exist in NCC-2.
for position, frame in enumerate(df):
    df[position] = frame.drop(
        columns=['dTos', 'sTos', 'ActivityLabel', 'BotnetName', 'SensorId', 'StartTime'],
        errors='ignore',
    )

#### Splitting

In [6]:
# Per-sensor row subsets, one list per simplified label class.
# Each list is ordered like `df`, i.e. entry k belongs to sensor k.
normal_df = [frame[frame['Label'] == 'normal'] for frame in df]
botnet_df = [frame[frame['Label'] == 'botnet'] for frame in df]
botnet_spam_df = [frame[frame['Label'] == 'botnet_spam'] for frame in df]

In [7]:
normal_train, normal_test = [], []
botnet_train, botnet_test = [], []
botnet_spam_train, botnet_spam_test = [], []

# Split every class of every sensor 70/30 with a fixed seed, so each sensor
# keeps its own class proportions in both the train and the test part.
# NOTE(review): presumably every sensor contains rows of all three classes;
# train_test_split raises ValueError on an empty frame - confirm.
_class_buckets = (
    (normal_df, normal_train, normal_test),
    (botnet_df, botnet_train, botnet_test),
    (botnet_spam_df, botnet_spam_train, botnet_spam_test),
)

for class_frames, train_bucket, test_bucket in _class_buckets:
    for frame in class_frames:
        train_part, test_part = train_test_split(frame, test_size=0.3, random_state=42)
        train_bucket.append(train_part)
        test_bucket.append(test_part)

In [8]:
# combining training data into one
# Training data: merge the three class parts of each sensor first, then
# stack all sensors into a single frame.
temp_train_df = [
    pd.concat([normal_part, botnet_part, spam_part], ignore_index=True)
    for normal_part, botnet_part, spam_part
    in zip(normal_train, botnet_train, botnet_spam_train)
]
train_df = pd.concat(temp_train_df, ignore_index=True)

# Test data: merge the three class parts per sensor, but keep one frame per
# sensor instead of stacking them.
test_df = [
    pd.concat([normal_part, botnet_part, spam_part], ignore_index=True)
    for normal_part, botnet_part, spam_part
    in zip(normal_test, botnet_test, botnet_spam_test)
]

# Shuffle rows with a fixed seed so the classes are no longer stored in
# contiguous runs, and renumber the index afterwards.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_df = [
    sensor_test.sample(frac=1, random_state=42).reset_index(drop=True)
    for sensor_test in test_df
]

In [9]:
# Write the combined, shuffled training set to disk.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (no-op when it is already there).
Path('final_dataset').mkdir(parents=True, exist_ok=True)
train_df.to_csv('final_dataset/train.csv', index=False)

In [10]:
# Sensor ids, in the same order as the frames stored in `test_df`.
filenames = ['1', '2', '5', '9', '13']

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (no-op when it is already there).
Path('final_dataset').mkdir(parents=True, exist_ok=True)

# One test file per sensor: final_dataset/test_<sensor id>.csv
for sensor_test, sensor_id in zip(test_df, filenames):
    sensor_test.to_csv(f'final_dataset/test_{sensor_id}.csv', index=False)