In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import FastICA
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv('UNSW_2018_IoT_Botnet_Final_10_best_Training.csv')

In [3]:
data.loc[data['category'] == 'Normal', 'category'] = 0
data.loc[data['category'] == 'DDoS', 'category'] = 1
data.loc[data['category'] == 'DoS', 'category'] = 2
data.loc[data['category'] == 'Reconnaissance', 'category'] = 3
data.loc[data['category'] == 'Theft', 'category'] = 4

In [6]:
X=data[['seq','stddev','N_IN_Conn_P_SrcIP', 'min', 'state_number', 'mean', 'N_IN_Conn_P_DstIP',
       'drate', 'srate', 'max']]
y=data[['category']]

In [8]:
np.shape(X)

(2934817, 10)

In [9]:
# Standardize the features
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

# Apply FastICA to perform dimensionality reduction
ica = FastICA(n_components=8)
reduced_data = ica.fit_transform(X_std)

ICA_df = pd.DataFrame(data=reduced_data, columns=['ICA1', 'ICA2', 'ICA3', 'ICA4', 'ICA5','ICA6','ICA7','ICA8'])
ICA_df['category'] = y



In [31]:
ica.get_params()

{'algorithm': 'parallel',
 'fun': 'logcosh',
 'fun_args': None,
 'max_iter': 200,
 'n_components': 8,
 'random_state': None,
 'tol': 0.0001,
 'w_init': None,
 'whiten': 'warn',
 'whiten_solver': 'svd'}

In [11]:
ICA_df['category'].value_counts()

1    1541315
2    1320148
3      72919
0        370
4         65
Name: category, dtype: int64

In [12]:
data['category'].value_counts()

1    1541315
2    1320148
3      72919
0        370
4         65
Name: category, dtype: int64

In [13]:
just_normal = ICA_df[ICA_df['category'] ==0]
just_ddos =ICA_df[ICA_df['category'] ==1]
just_dos =ICA_df[ICA_df['category'] ==2]
just_recon =ICA_df[ICA_df['category'] ==3]
just_theft =ICA_df[ICA_df['category'] ==4]

In [14]:
#################### Test
ddos_0_reduced=just_ddos.sample(n=115413, random_state=50)
dos_0_reduced = just_dos.sample(n=99090, random_state=50)
norm_0_reduced = just_normal.sample(n=31, random_state=50)
recon_0_reduced = just_recon.sample(n=5572, random_state=50)
theft_0_reduced = just_theft.sample(n=5, random_state=50)


data0_ICA_test = pd.concat([dos_0_reduced, norm_0_reduced,recon_0_reduced,theft_0_reduced, ddos_0_reduced], ignore_index=True)
data0_ICA_test = data0_ICA_test.sample(frac=1).reset_index(drop=True)
data0_ICA_test.to_csv('data0_ICA_test.csv',index=False)

######################## Train
dos_0_reduced = just_dos.sample(n=288752, random_state=55)
norm_0_reduced = just_normal.sample(n=84, random_state=55)
recon_0_reduced = just_recon.sample(n=15979, random_state=55)
theft_0_reduced = just_theft.sample(n=13, random_state=55)


data0_ICA_train = pd.concat([dos_0_reduced, norm_0_reduced,recon_0_reduced,theft_0_reduced], ignore_index=True)
data0_ICA_train = data0_ICA_train.sample(frac=1).reset_index(drop=True)
data0_ICA_train.to_csv('data0_ICA_train.csv',index=False)

In [16]:
#################### Test
ddos_1_reduced=just_ddos.sample(n=115382, random_state=50)
dos_1_reduced = just_dos.sample(n=99303, random_state=50)
norm_1_reduced = just_normal.sample(n=26, random_state=50)
recon_1_reduced = just_recon.sample(n=5398, random_state=50)
theft_1_reduced = just_theft.sample(n=2, random_state=50)


data1_ICA_test = pd.concat([dos_1_reduced, norm_1_reduced,recon_1_reduced,theft_1_reduced, ddos_1_reduced], ignore_index=True)
data1_ICA_test = data1_ICA_test.sample(frac=1).reset_index(drop=True)
data1_ICA_test.to_csv('data1_ICA_test.csv',index=False)

######################## Train
ddos_1_reduced=just_ddos.sample(n=337162, random_state=55)
dos_1_reduced = just_dos.sample(n=288752, random_state=55)
norm_1_reduced = just_normal.sample(n=84, random_state=55)
theft_1_reduced = just_theft.sample(n=14, random_state=55)


data1_ICA_train = pd.concat([dos_1_reduced, norm_1_reduced,ddos_1_reduced,theft_1_reduced], ignore_index=True)
data1_ICA_train = data1_ICA_train.sample(frac=1).reset_index(drop=True)
data1_ICA_train.to_csv('data1_ICA_train.csv',index=False)

In [19]:
#################### Test
ddos_2_reduced=just_ddos.sample(n=115459, random_state=50)
dos_2_reduced = just_dos.sample(n=99193, random_state=50)
norm_2_reduced = just_normal.sample(n=34, random_state=50)
recon_2_reduced = just_recon.sample(n=5420, random_state=50)
theft_2_reduced = just_theft.sample(n=5, random_state=50)


data2_ICA_test = pd.concat([dos_2_reduced, norm_2_reduced,recon_2_reduced,theft_2_reduced, ddos_2_reduced], ignore_index=True)
data2_ICA_test = data2_ICA_test.sample(frac=1).reset_index(drop=True)
data2_ICA_test.to_csv('data2_ICA_test.csv',index=False)

######################## Train
ddos_2_reduced=just_ddos.sample(n=337163, random_state=55)
dos_2_reduced = just_dos.sample(n=288752, random_state=55)
norm_2_reduced = just_normal.sample(n=84, random_state=55)
recon_2_reduced = just_recon.sample(n=15979, random_state=55)


data2_ICA_train = pd.concat([dos_2_reduced, norm_2_reduced,ddos_2_reduced,recon_2_reduced], ignore_index=True)
data2_ICA_train = data2_ICA_train.sample(frac=1).reset_index(drop=True)
data2_ICA_train.to_csv('data2_ICA_train.csv',index=False)

In [22]:
#################### Test
ddos_3_reduced=just_ddos.sample(n=115570, random_state=50)
dos_3_reduced = just_dos.sample(n=99016, random_state=50)
norm_3_reduced = just_normal.sample(n=32, random_state=50)
recon_3_reduced = just_recon.sample(n=5490, random_state=50)
theft_3_reduced = just_theft.sample(n=3, random_state=50)


data3_ICA_test = pd.concat([dos_3_reduced, norm_3_reduced,recon_3_reduced,theft_3_reduced, ddos_3_reduced], ignore_index=True)
data3_ICA_test = data3_ICA_test.sample(frac=1).reset_index(drop=True)
data3_ICA_test.to_csv('data3_ICA_test.csv',index=False)

######################## Train
ddos_3_reduced=just_ddos.sample(n=337163, random_state=55)
theft_3_reduced = just_theft.sample(n=14, random_state=55)
norm_3_reduced = just_normal.sample(n=84, random_state=55)
recon_3_reduced = just_recon.sample(n=15979, random_state=55)


data3_ICA_train = pd.concat([theft_3_reduced, norm_3_reduced,ddos_3_reduced,recon_3_reduced], ignore_index=True)
data3_ICA_train = data3_ICA_train.sample(frac=1).reset_index(drop=True)
data3_ICA_train.to_csv('data3_ICA_train.csv',index=False)

In [29]:
#################### Test
ddos_4_reduced=just_ddos.sample(n=115406, random_state=50)
dos_4_reduced = just_dos.sample(n=99354, random_state=50)
norm_4_reduced = just_normal.sample(n=26, random_state=50)
recon_4_reduced = just_recon.sample(n=5318, random_state=50)
theft_4_reduced = just_theft.sample(n=7, random_state=50)


data4_ICA_test = pd.concat([dos_4_reduced, norm_4_reduced,recon_4_reduced,theft_4_reduced, ddos_4_reduced], ignore_index=True)
data4_ICA_test = data4_ICA_test.sample(frac=1).reset_index(drop=True)
data4_ICA_test.to_csv('data4_ICA_test.csv',index=False)

######################## Train
ddos_4_reduced=just_ddos.sample(n=337163, random_state=55)
theft_4_reduced = just_theft.sample(n=14, random_state=55)
dos_4_reduced = just_dos.sample(n=288752, random_state=55)
recon_4_reduced = just_recon.sample(n=15979, random_state=55)


data4_ICA_train = pd.concat([theft_4_reduced, dos_4_reduced,ddos_4_reduced,recon_4_reduced], ignore_index=True)
data4_ICA_train = data4_ICA_train.sample(frac=1).reset_index(drop=True)
data4_ICA_train.to_csv('data4_ICA_train.csv',index=False)

In [30]:
data4_ICA_test['category'].value_counts()

1    115406
2     99354
3      5318
0        26
4         7
Name: category, dtype: int64