In [29]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.preprocessing import StandardScaler

In [30]:
# Read the training CSV into a DataFrame; the path is relative to the
# notebook's working directory.
csv_path = 'UNSW_2018_IoT_Botnet_Final_10_best_Training.csv'
data = pd.read_csv(csv_path)

In [31]:
# Replace each textual traffic category with its integer code, in place.
# Labels are processed in the listed order; rows whose category matches none
# of these labels are left unchanged.
category_codes = {
    'Normal': 0,
    'DDoS': 1,
    'DoS': 2,
    'Reconnaissance': 3,
    'Theft': 4,
}
for label, code in category_codes.items():
    data.loc[data['category'] == label, 'category'] = code

In [32]:
# Feature matrix: the ten input columns fed to NMF (order matters for the
# factorization output, so it is kept exactly as listed).
feature_columns = [
    'seq',
    'stddev',
    'N_IN_Conn_P_SrcIP',
    'min',
    'state_number',
    'mean',
    'N_IN_Conn_P_DstIP',
    'drate',
    'srate',
    'max',
]
X = data[feature_columns]

# Target kept as a single-column DataFrame (double brackets), not a Series.
y = data[['category']]

In [33]:
# Configure the NMF model used for dimensionality reduction: 8 output
# components, random initialisation, and a fixed seed.
nmf = NMF(
    n_components=8,
    init='random',
    random_state=0,
)

In [34]:
# Fit the NMF model on X and obtain X's representation in the component
# space in a single call.
# NOTE(review): NMF requires non-negative input; X is assumed to contain no
# negative values — confirm against the dataset.
reduced_data = nmf.fit_transform(X)

In [36]:
# Wrap the NMF output in a DataFrame with columns nmf1..nmf8, then attach the
# class label from y as a 'category' column.
component_names = [f'nmf{i}' for i in range(1, 9)]
NMF_df = pd.DataFrame(data=reduced_data, columns=component_names)
NMF_df['category'] = y

In [37]:
# Display the reduced frame (NMF components plus the category label).
NMF_df

Unnamed: 0,nmf1,nmf2,nmf3,nmf4,nmf5,nmf6,nmf7,nmf8,category
0,547.184058,0.000560,318.288274,0.000000,0.000000,53.483249,0.000051,0.002712,1
1,602.424511,0.000275,323.985627,0.000000,0.000000,0.000000,0.000000,0.000502,1
2,58.465549,0.000361,93.520367,0.000131,0.001831,102.510486,0.000000,0.005175,1
3,144.360982,0.000535,147.081719,0.000000,0.000000,98.262419,0.000024,0.001991,2
4,218.568462,0.001205,150.533854,0.000163,0.002243,26.550930,0.000000,0.005311,1
...,...,...,...,...,...,...,...,...,...
2934812,593.627023,0.000271,322.471447,0.000000,0.000000,0.000000,0.000000,0.002935,2
2934813,532.141864,0.000567,293.399862,0.000000,0.000000,13.610288,0.000028,0.003472,1
2934814,319.818631,0.000241,205.543089,0.000000,0.000000,61.425866,0.000009,0.004135,2
2934815,407.518832,0.000189,230.563617,0.000000,0.000000,16.162645,0.000004,0.004273,2


In [38]:
# Show how many rows fall into each category code.
NMF_df['category'].value_counts()

1    1541315
2    1320148
3      72919
0        370
4         65
Name: category, dtype: int64

In [39]:
# Split the reduced frame into one sub-frame per category code
# (0=Normal, 1=DDoS, 2=DoS, 3=Reconnaissance, 4=Theft).
category_column = NMF_df['category']
just_normal = NMF_df[category_column.eq(0)]
just_ddos = NMF_df[category_column.eq(1)]
just_dos = NMF_df[category_column.eq(2)]
just_recon = NMF_df[category_column.eq(3)]
just_theft = NMF_df[category_column.eq(4)]

In [40]:
#################### Test
ddos_0_reduced=just_ddos.sample(n=115413, random_state=50)
dos_0_reduced = just_dos.sample(n=99090, random_state=50)
norm_0_reduced = just_normal.sample(n=31, random_state=50)
recon_0_reduced = just_recon.sample(n=5572, random_state=50)
theft_0_reduced = just_theft.sample(n=5, random_state=50)


data0_ICA_test = pd.concat([dos_0_reduced, norm_0_reduced,recon_0_reduced,theft_0_reduced, ddos_0_reduced], ignore_index=True)
data0_ICA_test = data0_ICA_test.sample(frac=1).reset_index(drop=True)
data0_ICA_test.to_csv('data0_NMF_test.csv',index=False)

######################## Train
dos_0_reduced = just_dos.sample(n=288752, random_state=55)
norm_0_reduced = just_normal.sample(n=84, random_state=55)
recon_0_reduced = just_recon.sample(n=15979, random_state=55)
theft_0_reduced = just_theft.sample(n=13, random_state=55)


data0_ICA_train = pd.concat([dos_0_reduced, norm_0_reduced,recon_0_reduced,theft_0_reduced], ignore_index=True)
data0_ICA_train = data0_ICA_train.sample(frac=1).reset_index(drop=True)
data0_ICA_train.to_csv('data0_NMF_train.csv',index=False)

In [41]:
#################### Test
ddos_1_reduced=just_ddos.sample(n=115382, random_state=50)
dos_1_reduced = just_dos.sample(n=99303, random_state=50)
norm_1_reduced = just_normal.sample(n=26, random_state=50)
recon_1_reduced = just_recon.sample(n=5398, random_state=50)
theft_1_reduced = just_theft.sample(n=2, random_state=50)


data1_ICA_test = pd.concat([dos_1_reduced, norm_1_reduced,recon_1_reduced,theft_1_reduced, ddos_1_reduced], ignore_index=True)
data1_ICA_test = data1_ICA_test.sample(frac=1).reset_index(drop=True)
data1_ICA_test.to_csv('data1_NMF_test.csv',index=False)

######################## Train
ddos_1_reduced=just_ddos.sample(n=337162, random_state=55)
dos_1_reduced = just_dos.sample(n=288752, random_state=55)
norm_1_reduced = just_normal.sample(n=84, random_state=55)
theft_1_reduced = just_theft.sample(n=14, random_state=55)


data1_ICA_train = pd.concat([dos_1_reduced, norm_1_reduced,ddos_1_reduced,theft_1_reduced], ignore_index=True)
data1_ICA_train = data1_ICA_train.sample(frac=1).reset_index(drop=True)
data1_ICA_train.to_csv('data1_NMF_train.csv',index=False)

In [42]:
#################### Test
ddos_2_reduced=just_ddos.sample(n=115459, random_state=50)
dos_2_reduced = just_dos.sample(n=99193, random_state=50)
norm_2_reduced = just_normal.sample(n=34, random_state=50)
recon_2_reduced = just_recon.sample(n=5420, random_state=50)
theft_2_reduced = just_theft.sample(n=5, random_state=50)


data2_ICA_test = pd.concat([dos_2_reduced, norm_2_reduced,recon_2_reduced,theft_2_reduced, ddos_2_reduced], ignore_index=True)
data2_ICA_test = data2_ICA_test.sample(frac=1).reset_index(drop=True)
data2_ICA_test.to_csv('data2_NMF_test.csv',index=False)

######################## Train
ddos_2_reduced=just_ddos.sample(n=337163, random_state=55)
dos_2_reduced = just_dos.sample(n=288752, random_state=55)
norm_2_reduced = just_normal.sample(n=84, random_state=55)
recon_2_reduced = just_recon.sample(n=15979, random_state=55)


data2_ICA_train = pd.concat([dos_2_reduced, norm_2_reduced,ddos_2_reduced,recon_2_reduced], ignore_index=True)
data2_ICA_train = data2_ICA_train.sample(frac=1).reset_index(drop=True)
data2_ICA_train.to_csv('data2_NMF_train.csv',index=False)

In [43]:
#################### Test
ddos_3_reduced=just_ddos.sample(n=115570, random_state=50)
dos_3_reduced = just_dos.sample(n=99016, random_state=50)
norm_3_reduced = just_normal.sample(n=32, random_state=50)
recon_3_reduced = just_recon.sample(n=5490, random_state=50)
theft_3_reduced = just_theft.sample(n=3, random_state=50)


data3_ICA_test = pd.concat([dos_3_reduced, norm_3_reduced,recon_3_reduced,theft_3_reduced, ddos_3_reduced], ignore_index=True)
data3_ICA_test = data3_ICA_test.sample(frac=1).reset_index(drop=True)
data3_ICA_test.to_csv('data3_NMF_test.csv',index=False)

######################## Train
ddos_3_reduced=just_ddos.sample(n=337163, random_state=55)
theft_3_reduced = just_theft.sample(n=14, random_state=55)
norm_3_reduced = just_normal.sample(n=84, random_state=55)
recon_3_reduced = just_recon.sample(n=15979, random_state=55)


data3_ICA_train = pd.concat([theft_3_reduced, norm_3_reduced,ddos_3_reduced,recon_3_reduced], ignore_index=True)
data3_ICA_train = data3_ICA_train.sample(frac=1).reset_index(drop=True)
data3_ICA_train.to_csv('data3_NMF_train.csv',index=False)

In [44]:
#################### Test
ddos_4_reduced=just_ddos.sample(n=115406, random_state=50)
dos_4_reduced = just_dos.sample(n=99354, random_state=50)
norm_4_reduced = just_normal.sample(n=26, random_state=50)
recon_4_reduced = just_recon.sample(n=5318, random_state=50)
theft_4_reduced = just_theft.sample(n=7, random_state=50)


data4_ICA_test = pd.concat([dos_4_reduced, norm_4_reduced,recon_4_reduced,theft_4_reduced, ddos_4_reduced], ignore_index=True)
data4_ICA_test = data4_ICA_test.sample(frac=1).reset_index(drop=True)
data4_ICA_test.to_csv('data4_NMF_test.csv',index=False)

######################## Train
ddos_4_reduced=just_ddos.sample(n=337163, random_state=55)
theft_4_reduced = just_theft.sample(n=14, random_state=55)
dos_4_reduced = just_dos.sample(n=288752, random_state=55)
recon_4_reduced = just_recon.sample(n=15979, random_state=55)


data4_ICA_train = pd.concat([theft_4_reduced, dos_4_reduced,ddos_4_reduced,recon_4_reduced], ignore_index=True)
data4_ICA_train = data4_ICA_train.sample(frac=1).reset_index(drop=True)
data4_ICA_train.to_csv('data4_NMF_train.csv',index=False)