# Load Dataset

In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split


In [2]:
# Read the two BoTNeTIoT-L01 CSV files (resolved relative to the working
# directory) into DataFrames: the v2 export and the "NoDuplicates" label file.
botnet_df_v2 = pd.read_csv(filepath_or_buffer='BoTNeTIoT-L01-v2.csv')
botnet_df_no_duplicates = pd.read_csv(filepath_or_buffer='BotNeTIoT-L01_label_NoDuplicates.csv')

In [5]:
# Draw a reproducible 1000-row random sample from each dataset
# (random_state=1 pins which rows are selected).
sample_botnet_df_v2 = botnet_df_v2.sample(random_state=1, n=1000)
sample_botnet_df_no_duplicates = botnet_df_no_duplicates.sample(random_state=1, n=1000)

# Persist both samples as CSV files, leaving the index column out.
sample_botnet_df_v2.to_csv('sample_botnet_df_v2.csv', index=False)
sample_botnet_df_no_duplicates.to_csv('sample_botnet_df_no_duplicates.csv', index=False)

# List the column names of the v2 sample, then two blank lines and a
# 50-character separator (a single print call joined with newlines).
print(
    'botnet_df_v2 features:',
    sample_botnet_df_v2.columns,
    '\n',
    '*' * 50,
    sep='\n',
)

botnet_df_v2 features:
Index(['MI_dir_L0.1_weight', 'MI_dir_L0.1_mean', 'MI_dir_L0.1_variance',
       'H_L0.1_weight', 'H_L0.1_mean', 'H_L0.1_variance', 'HH_L0.1_weight',
       'HH_L0.1_mean', 'HH_L0.1_std', 'HH_L0.1_magnitude', 'HH_L0.1_radius',
       'HH_L0.1_covariance', 'HH_L0.1_pcc', 'HH_jit_L0.1_weight',
       'HH_jit_L0.1_mean', 'HH_jit_L0.1_variance', 'HpHp_L0.1_weight',
       'HpHp_L0.1_mean', 'HpHp_L0.1_std', 'HpHp_L0.1_magnitude',
       'HpHp_L0.1_radius', 'HpHp_L0.1_covariance', 'HpHp_L0.1_pcc',
       'Device_Name', 'Attack', 'Attack_subType', 'label'],
      dtype='object')


**************************************************


In [6]:
# Preview the first rows of the v2 sample under a banner line;
# the trailing newlines leave two blank lines after the table.
print(f"{'-' * 10} HEAD {'-' * 10}")
print(sample_botnet_df_v2.head(), end='\n\n\n')

---------- HEAD ----------
         MI_dir_L0.1_weight  MI_dir_L0.1_mean  MI_dir_L0.1_variance  \
5193259         6412.533913        438.400005          48269.772437   
3927186            1.000000         60.000000              0.000000   
2620008         6086.053458         69.344549             43.568915   
6952954            5.455274        288.458823          32981.400870   
5430966         7680.011326         70.946996             34.217549   

         H_L0.1_weight  H_L0.1_mean  H_L0.1_variance  HH_L0.1_weight  \
5193259    6412.533913   438.400005     48269.772437     4796.285520   
3927186       1.000000    60.000000         0.000000        1.000000   
2620008    6086.053458    69.344549        43.568915     4059.361398   
6952954       5.455274   288.458823     32981.400870        1.015808   
5430966    7680.011326    70.946996        34.217549     5951.725926   

         HH_L0.1_mean  HH_L0.1_std  HH_L0.1_magnitude  ...  HpHp_L0.1_mean  \
5193259    565.873179     8.009717 

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the v2 sample, followed by two blank lines.
print(f"{'-' * 10} DESCRIBE {'-' * 10}")
print(sample_botnet_df_v2.describe(), end='\n\n\n')

---------- DESCRIBE ----------
       MI_dir_L0.1_weight  MI_dir_L0.1_mean  MI_dir_L0.1_variance  \
count         1000.000000       1000.000000           1000.000000   
mean          3403.377800        176.600370          19230.777108   
std           2882.209441        151.815803          26036.151182   
min              1.000000         60.000000              0.000000   
25%              1.000000         60.000000              0.000000   
50%           3715.019481         74.104370             49.682294   
75%           6352.035071        344.240510          49855.542664   
max           8881.818074        445.399305         107005.417900   

       H_L0.1_weight  H_L0.1_mean  H_L0.1_variance  HH_L0.1_weight  \
count    1000.000000  1000.000000      1000.000000     1000.000000   
mean     3403.377800   176.600370     19230.777108     1833.363209   
std      2882.209441   151.815803     26036.151182     2506.832212   
min         1.000000    60.000000         0.000000        1.000000 

In [8]:
# Show the frame summary: index range, column names, non-null counts, dtypes.
print('-'*10, 'INFO', '-'*10)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
sample_botnet_df_v2.info()
print('\n')

---------- INFO ----------
<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 5193259 to 5483726
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   MI_dir_L0.1_weight    1000 non-null   float64
 1   MI_dir_L0.1_mean      1000 non-null   float64
 2   MI_dir_L0.1_variance  1000 non-null   float64
 3   H_L0.1_weight         1000 non-null   float64
 4   H_L0.1_mean           1000 non-null   float64
 5   H_L0.1_variance       1000 non-null   float64
 6   HH_L0.1_weight        1000 non-null   float64
 7   HH_L0.1_mean          1000 non-null   float64
 8   HH_L0.1_std           1000 non-null   float64
 9   HH_L0.1_magnitude     1000 non-null   float64
 10  HH_L0.1_radius        1000 non-null   float64
 11  HH_L0.1_covariance    1000 non-null   float64
 12  HH_L0.1_pcc           1000 non-null   float64
 13  HH_jit_L0.1_weight    1000 non-null   float64
 14  HH_jit_L0.1_mean      1000 non-null   flo

## Check for null values, duplicates, unique values, and data types

In [11]:
# Per-column count of null (missing) entries in the v2 sample,
# printed under a banner and followed by two blank lines.
print(f"{'-' * 10} MISSING VALUES {'-' * 10}")
print(sample_botnet_df_v2.isnull().sum(), end='\n\n\n')

---------- MISSING VALUES ----------
MI_dir_L0.1_weight      0
MI_dir_L0.1_mean        0
MI_dir_L0.1_variance    0
H_L0.1_weight           0
H_L0.1_mean             0
H_L0.1_variance         0
HH_L0.1_weight          0
HH_L0.1_mean            0
HH_L0.1_std             0
HH_L0.1_magnitude       0
HH_L0.1_radius          0
HH_L0.1_covariance      0
HH_L0.1_pcc             0
HH_jit_L0.1_weight      0
HH_jit_L0.1_mean        0
HH_jit_L0.1_variance    0
HpHp_L0.1_weight        0
HpHp_L0.1_mean          0
HpHp_L0.1_std           0
HpHp_L0.1_magnitude     0
HpHp_L0.1_radius        0
HpHp_L0.1_covariance    0
HpHp_L0.1_pcc           0
Device_Name             0
Attack                  0
Attack_subType          0
label                   0
dtype: int64




In [12]:
# Number of rows in the v2 sample flagged by duplicated(),
# printed under a banner and followed by two blank lines.
print(f"{'-' * 10} DUPLICATES {'-' * 10}")
print(sample_botnet_df_v2.duplicated().sum(), end='\n\n\n')

---------- DUPLICATES ----------
0




In [13]:
# Number of distinct values in each column of the v2 sample,
# printed under a banner and followed by two blank lines.
print(f"{'-' * 10} UNIQUE VALUES {'-' * 10}")
print(sample_botnet_df_v2.nunique(), end='\n\n\n')

---------- UNIQUE VALUES ----------
MI_dir_L0.1_weight      747
MI_dir_L0.1_mean        747
MI_dir_L0.1_variance    747
H_L0.1_weight           747
H_L0.1_mean             747
H_L0.1_variance         747
HH_L0.1_weight          495
HH_L0.1_mean            457
HH_L0.1_std             462
HH_L0.1_magnitude       458
HH_L0.1_radius          463
HH_L0.1_covariance      169
HH_L0.1_pcc             161
HH_jit_L0.1_weight      495
HH_jit_L0.1_mean        999
HH_jit_L0.1_variance    495
HpHp_L0.1_weight        181
HpHp_L0.1_mean           51
HpHp_L0.1_std            72
HpHp_L0.1_magnitude      58
HpHp_L0.1_radius         81
HpHp_L0.1_covariance     57
HpHp_L0.1_pcc            45
Device_Name               9
Attack                    3
Attack_subType            9
label                     2
dtype: int64




In [15]:
# dtype of each column in the v2 sample,
# printed under a banner and followed by two blank lines.
print(f"{'-' * 10} DATA TYPES {'-' * 10}")
print(sample_botnet_df_v2.dtypes, end='\n\n\n')

---------- DATA TYPES ----------
MI_dir_L0.1_weight      float64
MI_dir_L0.1_mean        float64
MI_dir_L0.1_variance    float64
H_L0.1_weight           float64
H_L0.1_mean             float64
H_L0.1_variance         float64
HH_L0.1_weight          float64
HH_L0.1_mean            float64
HH_L0.1_std             float64
HH_L0.1_magnitude       float64
HH_L0.1_radius          float64
HH_L0.1_covariance      float64
HH_L0.1_pcc             float64
HH_jit_L0.1_weight      float64
HH_jit_L0.1_mean        float64
HH_jit_L0.1_variance    float64
HpHp_L0.1_weight        float64
HpHp_L0.1_mean          float64
HpHp_L0.1_std           float64
HpHp_L0.1_magnitude     float64
HpHp_L0.1_radius        float64
HpHp_L0.1_covariance    float64
HpHp_L0.1_pcc           float64
Device_Name              object
Attack                   object
Attack_subType           object
label                     int64
dtype: object




## Before proceeding to the correlation analysis, convert the categorical features (`Device_Name`, `Attack`, `Attack_subType`) to numerical values so they can be processed alongside the numeric columns