# Random Under Sampling Balancing

In [1]:
# Dependencies
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Load the training split (path is relative to the notebook's working directory).
df_train = pd.read_csv(filepath_or_buffer='Datasets/train-train.csv')

In [3]:
# Report the total row count, a blank separator line, then the per-class
# row counts of the raw (not yet resampled) training frame.
print('train_sum:', len(df_train), end='\n\n')
print('Before RUS Balancing:')
print(df_train['Label'].value_counts())

train_sum: 15030469

Before RUS Balancing:
Label
normal         14346045
botnet           614556
botnet_spam       69868
Name: count, dtype: int64


### Balancing

In [4]:
def stage1_label(label):
    """Map a label to a binary code: 0 for 'normal', 1 for anything else.

    Presumably the first (normal vs. botnet) stage of a two-stage
    classifier -- confirm against the notebooks that consume it.
    """
    if label == 'normal':
        return 0
    return 1

def stage2_label(label):
    """Map a label to a binary code: 1 if it contains 'spam', otherwise 0.

    The check is a substring test, so any label containing 'spam'
    (e.g. 'botnet_spam') yields 1.
    """
    if 'spam' in label:
        return 1
    return 0

def multiclass_label(label):
    """Encode a string label as an integer class id.

    Returns:
        0 for 'normal', 2 for any label containing 'spam',
        and 1 for every other label.
    """
    if label == 'normal':
        return 0
    # Everything that is not 'normal' is split on the 'spam' substring.
    return 2 if 'spam' in label else 1

# Encode the string labels as integer class ids on the training frame,
# then split it into the target vector and the feature matrix.
df_train['Multiclass_Label'] = df_train['Label'].apply(multiclass_label)
y_train = df_train['Multiclass_Label']
# Features are every column except the raw and the encoded label.
x_train = df_train.drop(['Label', 'Multiclass_Label'], axis='columns')

In [5]:
# Random undersampling picks rows at random; pin the seed so the balanced
# set is identical on every Restart & Run All.
RANDOM_STATE = 42
rs = RandomUnderSampler(random_state=RANDOM_STATE)
x_train_balance, y_train_balance = rs.fit_resample(x_train, y_train)
# Re-attach the resampled target to the resampled features; join aligns
# the two on their index.
train_balance = x_train_balance.join(y_train_balance)

In [6]:
def labeling(label):
    """Decode an integer class id back to its string label.

    Returns:
        'normal' for 0, 'botnet' for 1, and 'botnet_spam' for any
        other value.
    """
    if label == 0:
        return 'normal'
    return 'botnet' if label == 1 else 'botnet_spam'

# Restore the human-readable 'Label' column from the integer class ids and
# discard the intermediate 'Multiclass_Label' column.
train_balance = (
    train_balance
    .assign(Label=train_balance['Multiclass_Label'].apply(labeling))
    .drop(columns=['Multiclass_Label'])
)

In [7]:
# Display the column names of the balanced frame to check its final schema.
train_balance.columns

Index(['TotBytes', 'Dur', 'udt', 'rsvp', 'State', 'arp', 'udp', 'SrcBytes',
       'DstAddr', 'ipv6-icmp', 'icmp', 'tcp', 'SrcAddr', 'Dport', 'esp',
       'Sport', 'llc', 'ipv6', 'rtp', 'pim', 'gre', 'unas', 'TotPkts', 'Dir',
       'ipnip', 'igmp', 'rtcp', 'ipx/spx', 'rarp', 'Label'],
      dtype='object')

In [8]:
# Report the size and per-class row counts of the resampled frame.
# The heading says "After" because this cell describes train_balance,
# i.e. the data once random undersampling has been applied.
print('train_sum:', len(train_balance))
print()
print('After RUS Balancing:')
print(train_balance['Label'].value_counts())

train_sum: 209604

Before RUS Balancing:
Label
normal         69868
botnet         69868
botnet_spam    69868
Name: count, dtype: int64


In [None]:
# Uncomment to write the balanced training set to disk:
# train_balance.to_csv('train-rus.csv', index=False)