# Load Dataset

In [2]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split


In [3]:
# Read the two BoTNeTIoT-L01 CSV exports (paths are relative to the working directory).
# NOTE(review): the file names differ in casing ("BoTNeT..." vs "BotNeT...") — confirm
# both match the files on disk, since case-sensitive filesystems will not forgive a typo.
botnet_df_v2 = pd.read_csv(filepath_or_buffer='BoTNeTIoT-L01-v2.csv')
botnet_df_no_duplicates = pd.read_csv(
    filepath_or_buffer='BotNeTIoT-L01_label_NoDuplicates.csv'
)

In [4]:
# Draw a small, reproducible random sample from each dataset and save it to disk.
SAMPLE_SIZE = 1000  # maximum number of rows kept per dataset
SAMPLE_SEED = 1     # fixed seed so the same rows are drawn on every run

# Cap n at the frame length: DataFrame.sample raises ValueError when n exceeds
# the number of rows (sampling is without replacement by default).
sample_botnet_df_v2 = botnet_df_v2.sample(
    n=min(SAMPLE_SIZE, len(botnet_df_v2)),
    random_state=SAMPLE_SEED,
)
sample_botnet_df_no_duplicates = botnet_df_no_duplicates.sample(
    n=min(SAMPLE_SIZE, len(botnet_df_no_duplicates)),
    random_state=SAMPLE_SEED,
)

# Save the sampled data (index=False: the original row index is not written out)
sample_botnet_df_v2.to_csv('sample_botnet_df_v2.csv', index=False)
sample_botnet_df_no_duplicates.to_csv('sample_botnet_df_no_duplicates.csv', index=False)

# Column names of the v2 sample (sampling rows does not change the columns)
print('botnet_df_v2 features:')
print(sample_botnet_df_v2.columns)
print('\n')
print('*'*50)

botnet_df_v2 features:
Index(['MI_dir_L0.1_weight', 'MI_dir_L0.1_mean', 'MI_dir_L0.1_variance',
       'H_L0.1_weight', 'H_L0.1_mean', 'H_L0.1_variance', 'HH_L0.1_weight',
       'HH_L0.1_mean', 'HH_L0.1_std', 'HH_L0.1_magnitude', 'HH_L0.1_radius',
       'HH_L0.1_covariance', 'HH_L0.1_pcc', 'HH_jit_L0.1_weight',
       'HH_jit_L0.1_mean', 'HH_jit_L0.1_variance', 'HpHp_L0.1_weight',
       'HpHp_L0.1_mean', 'HpHp_L0.1_std', 'HpHp_L0.1_magnitude',
       'HpHp_L0.1_radius', 'HpHp_L0.1_covariance', 'HpHp_L0.1_pcc',
       'Device_Name', 'Attack', 'Attack_subType', 'label'],
      dtype='object')


**************************************************


In [15]:
# Preview the first five rows of the full v2 dataset under a labelled banner.
print(f"{'-' * 10} HEAD {'-' * 10}")
print(botnet_df_v2.head(n=5))
print(end='\n\n')  # two trailing blank lines, same as print('\n')

---------- HEAD ----------
   MI_dir_L0.1_weight  MI_dir_L0.1_mean  MI_dir_L0.1_variance  H_L0.1_weight  \
0            1.000000         98.000000          0.000000e+00       1.000000   
1            1.931640         98.000000          1.818989e-12       1.931640   
2            2.904273         86.981750          2.311822e+02       2.904273   
3            3.902546         83.655268          2.040614e+02       3.902546   
4            4.902545         81.685828          1.775746e+02       4.902545   

   H_L0.1_mean  H_L0.1_variance  HH_L0.1_weight  HH_L0.1_mean   HH_L0.1_std  \
0    98.000000     0.000000e+00         1.00000          98.0  0.000000e+00   
1    98.000000     1.818989e-12         1.93164          98.0  1.348699e-06   
2    86.981750     2.311822e+02         1.00000          66.0  0.000000e+00   
3    83.655268     2.040614e+02         1.00000          74.0  0.000000e+00   
4    81.685828     1.775746e+02         2.00000          74.0  9.536743e-07   

   HH_L0.1_magnit

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
print(f"{'-' * 10} DESCRIBE {'-' * 10}")
print(botnet_df_v2.describe())
print(end='\n\n')  # two trailing blank lines, same as print('\n')

---------- DESCRIBE ----------
       MI_dir_L0.1_weight  MI_dir_L0.1_mean  MI_dir_L0.1_variance  \
count        7.062606e+06      7.062606e+06          7.062606e+06   
mean         3.400682e+03      1.794441e+02          1.931062e+04   
std          2.897012e+03      1.537109e+02          2.636844e+04   
min          1.000000e+00      6.000000e+01          0.000000e+00   
25%          1.000000e+00      6.000000e+01          0.000000e+00   
50%          3.644882e+03      7.412707e+01          9.807711e+01   
75%          6.354692e+03      3.486463e+02          4.887076e+04   
max          8.946997e+03      1.401994e+03          4.520011e+05   

       H_L0.1_weight   H_L0.1_mean  H_L0.1_variance  HH_L0.1_weight  \
count   7.062606e+06  7.062606e+06     7.062606e+06    7.062606e+06   
mean    3.400682e+03  1.794441e+02     1.931066e+04    1.892359e+03   
std     2.897012e+03  1.537107e+02     2.636842e+04    2.523083e+03   
min     1.000000e+00  6.000000e+01     0.000000e+00    1.000000

In [17]:
# Column names and dtypes of the full v2 dataset.
print('-'*10, 'INFO', '-'*10)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() — that would append a stray "None" line.
botnet_df_v2.info()
print('\n')

---------- INFO ----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7062606 entries, 0 to 7062605
Data columns (total 27 columns):
 #   Column                Dtype  
---  ------                -----  
 0   MI_dir_L0.1_weight    float64
 1   MI_dir_L0.1_mean      float64
 2   MI_dir_L0.1_variance  float64
 3   H_L0.1_weight         float64
 4   H_L0.1_mean           float64
 5   H_L0.1_variance       float64
 6   HH_L0.1_weight        float64
 7   HH_L0.1_mean          float64
 8   HH_L0.1_std           float64
 9   HH_L0.1_magnitude     float64
 10  HH_L0.1_radius        float64
 11  HH_L0.1_covariance    float64
 12  HH_L0.1_pcc           float64
 13  HH_jit_L0.1_weight    float64
 14  HH_jit_L0.1_mean      float64
 15  HH_jit_L0.1_variance  float64
 16  HpHp_L0.1_weight      float64
 17  HpHp_L0.1_mean        float64
 18  HpHp_L0.1_std         float64
 19  HpHp_L0.1_magnitude   float64
 20  HpHp_L0.1_radius      float64
 21  HpHp_L0.1_covariance  float64
 22  HpHp_L0.1_pcc  

## Check for null values and duplicates.

In [18]:
# Count the null / missing entries in each column.
print(f"{'-' * 10} MISSING VALUES {'-' * 10}")
print(botnet_df_v2.isna().sum())  # isna() is the same method as isnull()
print(end='\n\n')  # two trailing blank lines, same as print('\n')

---------- MISSING VALUES ----------
MI_dir_L0.1_weight      0
MI_dir_L0.1_mean        0
MI_dir_L0.1_variance    0
H_L0.1_weight           0
H_L0.1_mean             0
H_L0.1_variance         0
HH_L0.1_weight          0
HH_L0.1_mean            0
HH_L0.1_std             0
HH_L0.1_magnitude       0
HH_L0.1_radius          0
HH_L0.1_covariance      0
HH_L0.1_pcc             0
HH_jit_L0.1_weight      0
HH_jit_L0.1_mean        0
HH_jit_L0.1_variance    0
HpHp_L0.1_weight        0
HpHp_L0.1_mean          0
HpHp_L0.1_std           0
HpHp_L0.1_magnitude     0
HpHp_L0.1_radius        0
HpHp_L0.1_covariance    0
HpHp_L0.1_pcc           0
Device_Name             0
Attack                  0
Attack_subType          0
label                   0
dtype: int64




In [19]:
# Count rows that are exact repeats of an earlier row (the first occurrence is not counted).
print(f"{'-' * 10} DUPLICATES {'-' * 10}")
print(botnet_df_v2.duplicated(keep='first').sum())
print(end='\n\n')  # two trailing blank lines, same as print('\n')

---------- DUPLICATES ----------
621659




In [20]:
# Number of distinct values in each column.
print(f"{'-' * 10} UNIQUE VALUES {'-' * 10}")
print(botnet_df_v2.nunique(axis=0))
print(end='\n\n')  # two trailing blank lines, same as print('\n')

---------- UNIQUE VALUES ----------
MI_dir_L0.1_weight      2276268
MI_dir_L0.1_mean        2265915
MI_dir_L0.1_variance    2266031
H_L0.1_weight           2276383
H_L0.1_mean             2266034
H_L0.1_variance         2266175
HH_L0.1_weight          1530792
HH_L0.1_mean            1368526
HH_L0.1_std             1392808
HH_L0.1_magnitude       1369829
HH_L0.1_radius          1383985
HH_L0.1_covariance       454133
HH_L0.1_pcc              431687
HH_jit_L0.1_weight      1530792
HH_jit_L0.1_mean        1997489
HH_jit_L0.1_variance    1507219
HpHp_L0.1_weight         666797
HpHp_L0.1_mean           223133
HpHp_L0.1_std            256578
HpHp_L0.1_magnitude      264999
HpHp_L0.1_radius         273714
HpHp_L0.1_covariance     215749
HpHp_L0.1_pcc            194141
Device_Name                   9
Attack                        3
Attack_subType                9
label                         2
dtype: int64




In [21]:
# Show the dtype pandas assigned to each column.
print(f"{'-' * 10} DATA TYPES {'-' * 10}")
print(botnet_df_v2.dtypes)
print(end='\n\n')  # two trailing blank lines, same as print('\n')

---------- DATA TYPES ----------
MI_dir_L0.1_weight      float64
MI_dir_L0.1_mean        float64
MI_dir_L0.1_variance    float64
H_L0.1_weight           float64
H_L0.1_mean             float64
H_L0.1_variance         float64
HH_L0.1_weight          float64
HH_L0.1_mean            float64
HH_L0.1_std             float64
HH_L0.1_magnitude       float64
HH_L0.1_radius          float64
HH_L0.1_covariance      float64
HH_L0.1_pcc             float64
HH_jit_L0.1_weight      float64
HH_jit_L0.1_mean        float64
HH_jit_L0.1_variance    float64
HpHp_L0.1_weight        float64
HpHp_L0.1_mean          float64
HpHp_L0.1_std           float64
HpHp_L0.1_magnitude     float64
HpHp_L0.1_radius        float64
HpHp_L0.1_covariance    float64
HpHp_L0.1_pcc           float64
Device_Name              object
Attack                   object
Attack_subType           object
label                     int64
dtype: object




## Before proceeding to the correlation analysis, we need to convert the categorical features (`Device_Name`, `Attack`, `Attack_subType`) to numerical values, because correlation can only be computed on numeric columns