# Exploratory Data Analysis: CIC-Bell-DNS-EXF-2021

In [1]:
#-----------
# REFERENCE
#-----------

# https://www.unb.ca/cic/datasets/dns-exf-2021.html

# Samaneh Mahdavifar, Amgad Hanafy Salem, Princy Victor, Miguel Garzon, Amir H. Razavi, Natasha Hellberg, Arash Habibi Lashkari, “Lightweight Hybrid Detection of Data Exfiltration using DNS based on Machine Learning”, The 11th IEEE International Conference on Communication and Network Security (ICCNS), Dec. 3-5, 2021, Beijing Jiaotong University, Weihai, China.

In [2]:
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Stateful Heavy Attack

In [3]:
# ----------------------
# stateful_heavy_attack
# ----------------------


PATH = '/kaggle/input/cicbelldnsexf2021/Attack_heavy_Benign/Attacks/'

stateful_heavy_attack_audio = pd.read_csv(PATH+'stateful_features-heavy_audio.pcap.csv')
stateful_heavy_attack_compressed = pd.read_csv(PATH+'stateful_features-heavy_compressed.pcap.csv')
stateful_heavy_attack_exe = pd.read_csv(PATH+'stateful_features-heavy_exe.pcap.csv')
stateful_heavy_attack_image = pd.read_csv(PATH+'stateful_features-heavy_image.pcap.csv')
stateful_heavy_attack_text = pd.read_csv(PATH+'stateful_features-heavy_text.pcap.csv')
stateful_heavy_attack_video = pd.read_csv(PATH+'stateful_features-heavy_video.pcap.csv')

number_of_records = len(stateful_heavy_attack_audio)+len(stateful_heavy_attack_compressed)+\
                        len(stateful_heavy_attack_exe)+len(stateful_heavy_attack_image)+\
                        len(stateful_heavy_attack_text)+len(stateful_heavy_attack_video)

# verifying number of stateful heavy attack records
# https://www.unb.ca/cic/datasets/dns-exf-2021.html
print('Total Stateful Heavy Attacks:', number_of_records)

Total Stateful Heavy Attacks: 72028


In [4]:
# ----------------------
# stateful_heavy_attack
# ----------------------

dfs = [stateful_heavy_attack_audio, stateful_heavy_attack_compressed, 
      stateful_heavy_attack_exe, stateful_heavy_attack_image, 
      stateful_heavy_attack_text, stateful_heavy_attack_video]

columns = stateful_heavy_attack_audio.columns

# checking if all dataframes, of stateful heavy attack type, have same column names
columns_stateful_heavy_attack_audio = stateful_heavy_attack_audio.columns
columns_stateful_heavy_attack_compressed = stateful_heavy_attack_compressed.columns
columns_stateful_heavy_attack_exe = stateful_heavy_attack_exe.columns
columns_stateful_heavy_attack_image = stateful_heavy_attack_image.columns
columns_stateful_heavy_attack_text = stateful_heavy_attack_text.columns
columns_stateful_heavy_attack_video = stateful_heavy_attack_video.columns

dfs_columns = [ columns_stateful_heavy_attack_compressed,
            columns_stateful_heavy_attack_exe,
            columns_stateful_heavy_attack_image,
            columns_stateful_heavy_attack_text,
            columns_stateful_heavy_attack_video ]

for cols in dfs_columns:
    print('', columns_stateful_heavy_attack_audio.isin(cols).all())

 True
 True
 True
 True
 True


In [5]:
# ----------------------
# stateful_heavy_attack
# ----------------------

# merging dataframes 

stateful_heavy_attack = pd.concat([stateful_heavy_attack_audio, 
                                   stateful_heavy_attack_compressed,
                                   stateful_heavy_attack_exe,
                                   stateful_heavy_attack_image,
                                   stateful_heavy_attack_text,
                                   stateful_heavy_attack_video])

# verifying number of stateful heavy attack records after concatenation
# https://www.unb.ca/cic/datasets/dns-exf-2021.html
len(stateful_heavy_attack)

72028

In [6]:
stateful_heavy_attack.head()

Unnamed: 0,rr,A_frequency,NS_frequency,CNAME_frequency,SOA_frequency,NULL_frequency,PTR_frequency,HINFO_frequency,MX_frequency,TXT_frequency,AAAA_frequency,SRV_frequency,OPT_frequency,rr_type,rr_count,rr_name_entropy,rr_name_length,distinct_ns,distinct_ip,unique_country,unique_asn,distinct_domains,reverse_dns,a_records,unique_ttl,ttl_mean,ttl_variance
0,0.0,0,0,0,0,0,2,0,0,0,0,0,0,{'PTR'},0,3.146084,25,0,set(),set(),set(),{},unknown,0,"[1, 1]",1.0,0.0
1,0.0,0,0,0,0,0,0,0,0,0,0,0,0,set(),0,2.689019,15,0,set(),set(),set(),{},unknown,0,[1],1.0,0.0
2,0.0,0,0,0,0,0,7,0,0,0,0,0,0,{'PTR'},0,3.102731,24,0,set(),set(),set(),{},unknown,0,"[1, 1, 1, 1, 1, 1, 1]",1.0,0.0
3,0.0,0,0,0,0,0,1,0,0,0,0,0,0,{'PTR'},0,2.745694,16,0,set(),set(),set(),{},unknown,0,[255],255.0,0.0
4,0.0,0,0,0,0,0,2,0,0,0,0,0,0,{'PTR'},0,3.146319,25,0,set(),set(),set(),{},unknown,0,"[1, 1]",1.0,0.0


In [7]:
stateful_heavy_attack.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72028 entries, 0 to 10896
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   rr                72028 non-null  float64
 1   A_frequency       72028 non-null  int64  
 2   NS_frequency      72028 non-null  int64  
 3   CNAME_frequency   72028 non-null  int64  
 4   SOA_frequency     72028 non-null  int64  
 5   NULL_frequency    72028 non-null  int64  
 6   PTR_frequency     72028 non-null  int64  
 7   HINFO_frequency   72028 non-null  int64  
 8   MX_frequency      72028 non-null  int64  
 9   TXT_frequency     72028 non-null  int64  
 10  AAAA_frequency    72028 non-null  int64  
 11  SRV_frequency     72028 non-null  int64  
 12  OPT_frequency     72028 non-null  int64  
 13  rr_type           72028 non-null  object 
 14  rr_count          72028 non-null  int64  
 15  rr_name_entropy   72028 non-null  float64
 16  rr_name_length    72028 non-null  int64 

In [8]:
for column in columns:
    print(column, ': ', stateful_heavy_attack[column].notnull().unique())

rr :  [ True]
A_frequency :  [ True]
NS_frequency :  [ True]
CNAME_frequency :  [ True]
SOA_frequency :  [ True]
NULL_frequency :  [ True]
PTR_frequency :  [ True]
HINFO_frequency :  [ True]
MX_frequency :  [ True]
TXT_frequency :  [ True]
AAAA_frequency :  [ True]
SRV_frequency :  [ True]
OPT_frequency :  [ True]
rr_type :  [ True]
rr_count :  [ True]
rr_name_entropy :  [ True]
rr_name_length :  [ True]
distinct_ns :  [ True]
distinct_ip :  [ True]
unique_country :  [ True]
unique_asn :  [ True]
distinct_domains :  [ True]
reverse_dns :  [ True]
a_records :  [ True]
unique_ttl :  [ True]
ttl_mean :  [ True]
ttl_variance :  [ True]


In [9]:
numeric_columns = stateful_heavy_attack.select_dtypes(exclude=[object]).columns
object_columns =  stateful_heavy_attack.select_dtypes(include=[object]).columns

In [10]:
stateful_heavy_attack[numeric_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72028 entries, 0 to 10896
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   rr               72028 non-null  float64
 1   A_frequency      72028 non-null  int64  
 2   NS_frequency     72028 non-null  int64  
 3   CNAME_frequency  72028 non-null  int64  
 4   SOA_frequency    72028 non-null  int64  
 5   NULL_frequency   72028 non-null  int64  
 6   PTR_frequency    72028 non-null  int64  
 7   HINFO_frequency  72028 non-null  int64  
 8   MX_frequency     72028 non-null  int64  
 9   TXT_frequency    72028 non-null  int64  
 10  AAAA_frequency   72028 non-null  int64  
 11  SRV_frequency    72028 non-null  int64  
 12  OPT_frequency    72028 non-null  int64  
 13  rr_count         72028 non-null  int64  
 14  rr_name_entropy  72028 non-null  float64
 15  rr_name_length   72028 non-null  int64  
 16  distinct_ns      72028 non-null  int64  
 17  a_records   

In [11]:
stateful_heavy_attack[object_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72028 entries, 0 to 10896
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rr_type           72028 non-null  object
 1   distinct_ip       72028 non-null  object
 2   unique_country    72028 non-null  object
 3   unique_asn        72028 non-null  object
 4   distinct_domains  72028 non-null  object
 5   reverse_dns       72028 non-null  object
 6   unique_ttl        72028 non-null  object
dtypes: object(7)
memory usage: 4.4+ MB


In [12]:
stateful_heavy_attack[numeric_columns].var()

rr                    0.039157
A_frequency           0.039125
NS_frequency          0.000000
CNAME_frequency       0.000000
SOA_frequency         0.000000
NULL_frequency        0.000000
PTR_frequency         5.544608
HINFO_frequency       0.000000
MX_frequency          0.000000
TXT_frequency         0.000028
AAAA_frequency        0.000056
SRV_frequency         0.000000
OPT_frequency         0.000000
rr_count              0.000333
rr_name_entropy       0.053900
rr_name_length       12.222764
distinct_ns           0.000097
a_records             0.000000
ttl_mean           2120.546396
ttl_variance          0.016393
dtype: float64

In [13]:
for obj_col in object_columns:
    print(obj_col, ':', '\n', stateful_heavy_attack[obj_col].unique(), '\n\n\n')

rr_type : 
 ["{'PTR'}" 'set()' "{'A'}" '{None}' "{'TXT'}" "{'AAAA', 'A'}"] 



distinct_ip : 
 ['set()'] 



unique_country : 
 ['set()' "{'US'}"] 



unique_asn : 
 ['set()' "{'AS15169'}" "{'AS3598'}"] 



distinct_domains : 
 ['{}' "{'172.217.164.227': {'ssl.gstatic.com'}}"
 "{'131.107.255.255': {'dns.msftncsi.com'}}"
 "{'172.217.164.238': {'play.google.com'}}"
 "{'172.217.164.238': {'redirector.gvt1.com'}}"] 



reverse_dns : 
 ['unknown' 'yyz12s05-in-f3.1e100.net' 'dns.msftncsi.com'
 'yyz12s05-in-f14.1e100.net'] 



unique_ttl : 
 ['[1, 1]' '[1]' '[1, 1, 1, 1, 1, 1, 1]' '[255]' '[128]'
 '[1, 1, 1, 1, 1, 1]' '[1, 1, 1, 1]' '[128, 128, 128]'
 '[1, 1, 1, 1, 1, 1, 1, 1]' '[1, 1, 1, 1, 1]' '[1, 1, 1]'
 '[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]'
 '[64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64]'
 '[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]' '[128, 128, 128, 128, 128, 128]'
 '[1, 1, 1, 1, 1, 1, 1, 1, 1]' '[64, 64, 64]'
 '[64, 64, 64, 64, 64, 64, 64, 64, 64]' '[128, 128]' '[128, 122]'
 '[64, 64, 64, 64, 64,

## Stateful Heavy Benign

In [14]:
# ---------------------
# stateful_heavy_benign
# ---------------------

PATH = '/kaggle/input/cicbelldnsexf2021/Attack_heavy_Benign/Benign/'
stateful_heavy_benign1 = pd.read_csv(PATH+'stateful_features-benign_heavy_1.pcap.csv')
stateful_heavy_benign2 = pd.read_csv(PATH+'stateful_features-benign_heavy_2.pcap.csv')
stateful_heavy_benign3 = pd.read_csv(PATH+'stateful_features-benign_heavy_3.pcap.csv')

PATH = '/kaggle/input/cicbelldnsexf2021/Benign/'
stateful_benign1 = pd.read_csv(PATH+'stateful_features-benign_1.pcap.csv')
stateful_benign2 = pd.read_csv(PATH+'stateful_features-benign_2.pcap.csv')

number_of_records = len(stateful_heavy_benign1)+len(stateful_heavy_benign2)+\
                        len(stateful_heavy_benign3)+\
                        len(stateful_benign1)+len(stateful_benign2)

# verifying number of stateful heavy benign records
# https://www.unb.ca/cic/datasets/dns-exf-2021.html
print('Total Stateful Heavy Benign:', number_of_records)

Total Stateful Heavy Benign: 156014


In [15]:
# ---------------------
# stateful_heavy_benign
# ---------------------

dfs = [stateful_heavy_benign1, stateful_heavy_benign2, stateful_heavy_benign3,
               stateful_benign1, stateful_benign2]

columns = stateful_heavy_benign1.columns

# checking if all dataframes, of stateful heavy benign type, have same column names
columns_stateful_heavy_benign1 = stateful_heavy_benign1.columns
columns_stateful_heavy_benign2 = stateful_heavy_benign2.columns
columns_stateful_heavy_benign3 = stateful_heavy_benign3.columns
columns_stateful_benign1 = stateful_benign1.columns
columns_stateful_benign2 = stateful_benign2.columns


dfs_columns = [ columns_stateful_heavy_benign2,
                columns_stateful_heavy_benign3, 
                columns_stateful_benign1,
                columns_stateful_benign2]

for cols in dfs_columns:
    print('', columns_stateful_heavy_benign1.isin(cols).all())

 True
 True
 True
 True


In [16]:
# ----------------------
# stateful_heavy_benign
# ----------------------

# merging dataframes 

stateful_heavy_benign = pd.concat([stateful_heavy_benign1, 
                                   stateful_heavy_benign2,
                                   stateful_heavy_benign3,
                                   stateful_benign1,
                                   stateful_benign2
                                   ])

# verifying number of stateful heavy benign records after concatenation
# https://www.unb.ca/cic/datasets/dns-exf-2021.html
len(stateful_heavy_benign)

156014

In [17]:
for column in columns:
    print(column, ': ', stateful_heavy_benign[column].notnull().unique())

rr :  [ True]
A_frequency :  [ True]
NS_frequency :  [ True]
CNAME_frequency :  [ True]
SOA_frequency :  [ True]
NULL_frequency :  [ True]
PTR_frequency :  [ True]
HINFO_frequency :  [ True]
MX_frequency :  [ True]
TXT_frequency :  [ True]
AAAA_frequency :  [ True]
SRV_frequency :  [ True]
OPT_frequency :  [ True]
rr_type :  [ True]
rr_count :  [ True]
rr_name_entropy :  [ True]
rr_name_length :  [ True]
distinct_ns :  [ True]
distinct_ip :  [ True]
unique_country :  [ True]
unique_asn :  [ True]
distinct_domains :  [ True]
reverse_dns :  [ True]
a_records :  [ True]
unique_ttl :  [ True]
ttl_mean :  [ True]
ttl_variance :  [ True]


In [18]:
stateful_heavy_benign.head()

Unnamed: 0,rr,A_frequency,NS_frequency,CNAME_frequency,SOA_frequency,NULL_frequency,PTR_frequency,HINFO_frequency,MX_frequency,TXT_frequency,AAAA_frequency,SRV_frequency,OPT_frequency,rr_type,rr_count,rr_name_entropy,rr_name_length,distinct_ns,distinct_ip,unique_country,unique_asn,distinct_domains,reverse_dns,a_records,unique_ttl,ttl_mean,ttl_variance
0,0.0,0,0,0,0,0,3,0,0,0,0,0,0,{'PTR'},0,3.222463,27,0,set(),set(),set(),{},unknown,0,"[1, 1, 1]",1.0,0.0
1,0.0,0,0,0,0,0,2,0,0,0,0,0,0,{'PTR'},0,3.223147,27,0,set(),set(),set(),{},unknown,0,"[1, 1]",1.0,0.0
2,0.0,0,0,0,0,0,10,0,0,0,0,0,0,{'PTR'},0,3.102731,24,0,set(),set(),set(),{},unknown,0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1.0,0.0
3,0.0,0,0,0,0,0,0,0,0,0,0,0,0,{None},0,3.463985,32,0,set(),set(),set(),{},unknown,0,"[64, 64, 64]",64.0,0.0
4,0.0,0,0,0,0,0,2,0,0,0,0,0,0,{'PTR'},0,3.222243,27,0,set(),set(),set(),{},unknown,0,"[1, 1]",1.0,0.0


In [19]:
stateful_heavy_benign.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156014 entries, 0 to 34559
Data columns (total 27 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   rr                156014 non-null  float64
 1   A_frequency       156014 non-null  int64  
 2   NS_frequency      156014 non-null  int64  
 3   CNAME_frequency   156014 non-null  int64  
 4   SOA_frequency     156014 non-null  int64  
 5   NULL_frequency    156014 non-null  int64  
 6   PTR_frequency     156014 non-null  int64  
 7   HINFO_frequency   156014 non-null  int64  
 8   MX_frequency      156014 non-null  int64  
 9   TXT_frequency     156014 non-null  int64  
 10  AAAA_frequency    156014 non-null  int64  
 11  SRV_frequency     156014 non-null  int64  
 12  OPT_frequency     156014 non-null  int64  
 13  rr_type           156014 non-null  object 
 14  rr_count          156014 non-null  int64  
 15  rr_name_entropy   156014 non-null  float64
 16  rr_name_length    156

In [20]:
numeric_columns = stateful_heavy_benign.select_dtypes(exclude=[object]).columns
object_columns =  stateful_heavy_benign.select_dtypes(include=[object]).columns

In [21]:
stateful_heavy_benign[numeric_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156014 entries, 0 to 34559
Data columns (total 20 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   rr               156014 non-null  float64
 1   A_frequency      156014 non-null  int64  
 2   NS_frequency     156014 non-null  int64  
 3   CNAME_frequency  156014 non-null  int64  
 4   SOA_frequency    156014 non-null  int64  
 5   NULL_frequency   156014 non-null  int64  
 6   PTR_frequency    156014 non-null  int64  
 7   HINFO_frequency  156014 non-null  int64  
 8   MX_frequency     156014 non-null  int64  
 9   TXT_frequency    156014 non-null  int64  
 10  AAAA_frequency   156014 non-null  int64  
 11  SRV_frequency    156014 non-null  int64  
 12  OPT_frequency    156014 non-null  int64  
 13  rr_count         156014 non-null  int64  
 14  rr_name_entropy  156014 non-null  float64
 15  rr_name_length   156014 non-null  int64  
 16  distinct_ns      156014 non-null  int64

In [22]:
stateful_heavy_benign[object_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156014 entries, 0 to 34559
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   rr_type           156014 non-null  object
 1   distinct_ip       156014 non-null  object
 2   unique_country    156014 non-null  object
 3   unique_asn        156014 non-null  object
 4   distinct_domains  156014 non-null  object
 5   reverse_dns       156014 non-null  object
 6   unique_ttl        156014 non-null  object
dtypes: object(7)
memory usage: 9.5+ MB


In [23]:
stateful_heavy_benign[numeric_columns].var()

rr                    1.904849
A_frequency           1.904840
NS_frequency          0.000000
CNAME_frequency       0.000000
SOA_frequency         0.000000
NULL_frequency        0.000000
PTR_frequency         4.147252
HINFO_frequency       0.000000
MX_frequency          0.000000
TXT_frequency         0.000013
AAAA_frequency        0.000051
SRV_frequency         0.000000
OPT_frequency         0.000000
rr_count              0.630726
rr_name_entropy       0.150845
rr_name_length       42.540177
distinct_ns           0.243661
a_records             0.000000
ttl_mean           3367.752697
ttl_variance         19.063997
dtype: float64

In [24]:
for obj_col in object_columns:
    print(obj_col, ':', '\n', stateful_heavy_benign[obj_col].unique(), '\n\n\n')

rr_type : 
 ["{'PTR'}" '{None}' "{'A'}" 'set()' "{'TXT'}" "{'AAAA', 'A'}"] 



distinct_ip : 
 ['set()'] 



unique_country : 
 ['set()' "{'DE'}" "{'US'}" "{'CH'}" "{'NL'}" "{'FR'}" "{'RU'}" "{'TR'}"
 "{'CY'}" "{'RO'}" "{'BY'}" "{'AT'}" "{'BE'}" "{'CZ'}" "{'UZ'}" "{'EE'}"
 "{'PL'}" "{'GB'}" "{'CA'}" "{'HU'}" "{'MK'}" "{'SE'}" "{'KZ'}" "{'UA'}"
 "{'AU'}" "{'AE'}" "{'IN'}" "{'IR'}" "{'CN'}" "{'ES'}" "{'MX'}" "{'PE'}"
 "{'LT'}" "{'TW'}" "{'IT'}" "{'MY'}" "{'KE'}" "{'GR'}" "{'BG'}" "{'VN'}"
 "{'IL'}" "{'KW'}" "{'BZ'}" "{'HK'}" "{'RS'}" "{'JP'}" "{'SC'}" "{'ID'}"
 "{'ZA'}" "{'TH'}" "{'AR'}" "{'NZ'}" "{'MD'}" "{'SG'}" "{'MZ'}" "{'NO'}"
 "{'CO'}" "{'NI'}" "{'EG'}" "{'PT'}" "{'DZ'}" "{'DK'}" "{'PR'}" "{'IE'}"
 "{'AZ'}" "{'VE'}" "{'OM'}" "{'CL'}" "{'SK'}" "{'BF'}" "{'GE'}" "{'ET'}"
 "{'EU'}" "{'KR'}" "{'TZ'}" "{'TJ'}" "{'FI'}" "{'AI'}" "{'MA'}" "{'PA'}"
 "{''}" "{'BA'}" "{'SA'}" "{'AM'}" "{'PK'}" "{'SI'}" "{'MT'}" "{'CW'}"
 "{'VG'}" "{'MN'}" "{'EC'}" "{'IM'}" "{'BD'}" "{'NL', 'US'}" "{'QA'}"
 "

## Stateful Light Attack

In [25]:
# ---------------------
# stateful_light_attack
# ---------------------

PATH = '/kaggle/input/cicbelldnsexf2021/Attack_Light_Benign/Attacks/'
stateful_light_attack_audio = pd.read_csv(PATH+'stateful_features-light_audio.pcap.csv')
stateful_light_attack_compressed = pd.read_csv(PATH+'stateful_features-light_compressed.pcap.csv')
stateful_light_attack_exe = pd.read_csv(PATH+'stateful_features-light_exe.pcap.csv')
stateful_light_attack_image = pd.read_csv(PATH+'stateful_features-light_image.pcap.csv')
stateful_light_attack_text = pd.read_csv(PATH+'stateful_features-light_text.pcap.csv')
stateful_light_attack_video = pd.read_csv(PATH+'stateful_features-light_video.pcap.csv')

number_of_records = len(stateful_light_attack_audio)+len(stateful_light_attack_compressed)+\
                        len(stateful_light_attack_exe)+len(stateful_light_attack_image)+\
                        len(stateful_light_attack_text)+len(stateful_light_attack_video)
        
# verifying number of stateful light attack records
# https://www.unb.ca/cic/datasets/dns-exf-2021.html
print('Total Stateful Light Attack:', number_of_records)

Total Stateful Light Attack: 11295


In [26]:
# ---------------------
# stateful_light_attack
# ---------------------

dfs = [stateful_light_attack_audio, stateful_light_attack_compressed, 
      stateful_light_attack_exe, stateful_light_attack_image, 
      stateful_light_attack_text, stateful_light_attack_video]

columns = stateful_light_attack_audio.columns

# checking if all dataframes, of stateful light attack type, have same column names
columns_stateful_light_attack_audio = stateful_light_attack_audio.columns
columns_stateful_light_attack_compressed = stateful_light_attack_compressed.columns
columns_stateful_light_attack_exe = stateful_light_attack_exe.columns
columns_stateful_light_attack_image = stateful_light_attack_image.columns
columns_stateful_light_attack_text = stateful_light_attack_text.columns
columns_stateful_light_attack_video = stateful_light_attack_video.columns

dfs_columns = [ columns_stateful_light_attack_compressed,
            columns_stateful_light_attack_exe,
            columns_stateful_light_attack_image,
            columns_stateful_light_attack_text,
            columns_stateful_light_attack_video ]

for cols in dfs_columns:
    print('', columns_stateful_light_attack_audio.isin(cols).all())

 True
 True
 True
 True
 True


In [27]:
# ----------------------
# stateful_light_attack
# ----------------------

# merging dataframes 

stateful_light_attack = pd.concat([stateful_light_attack_audio, 
                                   stateful_light_attack_compressed,
                                   stateful_light_attack_exe,
                                   stateful_light_attack_image,
                                   stateful_light_attack_text,
                                   stateful_light_attack_video])

# verifying number of stateful light attack records after concatenation
# https://www.unb.ca/cic/datasets/dns-exf-2021.html
len(stateful_light_attack)

11295

In [28]:
for column in columns:
    print(column, ': ', stateful_light_attack[column].notnull().unique())

rr :  [ True]
A_frequency :  [ True]
NS_frequency :  [ True]
CNAME_frequency :  [ True]
SOA_frequency :  [ True]
NULL_frequency :  [ True]
PTR_frequency :  [ True]
HINFO_frequency :  [ True]
MX_frequency :  [ True]
TXT_frequency :  [ True]
AAAA_frequency :  [ True]
SRV_frequency :  [ True]
OPT_frequency :  [ True]
rr_type :  [ True]
rr_count :  [ True]
rr_name_entropy :  [ True]
rr_name_length :  [ True]
distinct_ns :  [ True]
distinct_ip :  [ True]
unique_country :  [ True]
unique_asn :  [ True]
distinct_domains :  [ True]
reverse_dns :  [ True]
a_records :  [ True]
unique_ttl :  [ True]
ttl_mean :  [ True]
ttl_variance :  [ True]


In [29]:
stateful_light_attack.head()

Unnamed: 0,rr,A_frequency,NS_frequency,CNAME_frequency,SOA_frequency,NULL_frequency,PTR_frequency,HINFO_frequency,MX_frequency,TXT_frequency,AAAA_frequency,SRV_frequency,OPT_frequency,rr_type,rr_count,rr_name_entropy,rr_name_length,distinct_ns,distinct_ip,unique_country,unique_asn,distinct_domains,reverse_dns,a_records,unique_ttl,ttl_mean,ttl_variance
0,0.0,0,0,0,0,0,2,0,0,0,0,0,0,{'PTR'},0,3.222463,27,0,set(),set(),set(),{},unknown,0,"[1, 1]",1.0,0.0
1,0.0,0,0,0,0,0,10,0,0,0,0,0,0,{'PTR'},0,3.102731,24,0,set(),set(),set(),{},unknown,0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1.0,0.0
2,0.0,0,0,0,0,0,2,0,0,0,0,0,0,{'PTR'},0,3.146084,25,0,set(),set(),set(),{},unknown,0,"[1, 1]",1.0,0.0
3,0.0,0,0,0,0,0,2,0,0,0,0,0,0,{'PTR'},0,3.146319,25,0,set(),set(),set(),{},unknown,0,"[1, 1]",1.0,0.0
4,0.0,0,0,0,0,0,2,0,0,0,0,0,0,{'PTR'},0,3.223294,27,0,set(),set(),set(),{},unknown,0,"[1, 1]",1.0,0.0


In [30]:
stateful_light_attack.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11295 entries, 0 to 1244
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   rr                11295 non-null  float64
 1   A_frequency       11295 non-null  int64  
 2   NS_frequency      11295 non-null  int64  
 3   CNAME_frequency   11295 non-null  int64  
 4   SOA_frequency     11295 non-null  int64  
 5   NULL_frequency    11295 non-null  int64  
 6   PTR_frequency     11295 non-null  int64  
 7   HINFO_frequency   11295 non-null  int64  
 8   MX_frequency      11295 non-null  int64  
 9   TXT_frequency     11295 non-null  int64  
 10  AAAA_frequency    11295 non-null  int64  
 11  SRV_frequency     11295 non-null  int64  
 12  OPT_frequency     11295 non-null  int64  
 13  rr_type           11295 non-null  object 
 14  rr_count          11295 non-null  int64  
 15  rr_name_entropy   11295 non-null  float64
 16  rr_name_length    11295 non-null  int64  

In [31]:
numeric_columns = stateful_light_attack.select_dtypes(exclude=[object]).columns
object_columns =  stateful_light_attack.select_dtypes(include=[object]).columns

In [32]:
stateful_light_attack[numeric_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11295 entries, 0 to 1244
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   rr               11295 non-null  float64
 1   A_frequency      11295 non-null  int64  
 2   NS_frequency     11295 non-null  int64  
 3   CNAME_frequency  11295 non-null  int64  
 4   SOA_frequency    11295 non-null  int64  
 5   NULL_frequency   11295 non-null  int64  
 6   PTR_frequency    11295 non-null  int64  
 7   HINFO_frequency  11295 non-null  int64  
 8   MX_frequency     11295 non-null  int64  
 9   TXT_frequency    11295 non-null  int64  
 10  AAAA_frequency   11295 non-null  int64  
 11  SRV_frequency    11295 non-null  int64  
 12  OPT_frequency    11295 non-null  int64  
 13  rr_count         11295 non-null  int64  
 14  rr_name_entropy  11295 non-null  float64
 15  rr_name_length   11295 non-null  int64  
 16  distinct_ns      11295 non-null  int64  
 17  a_records    

In [33]:
stateful_light_attack[object_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11295 entries, 0 to 1244
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rr_type           11295 non-null  object
 1   distinct_ip       11295 non-null  object
 2   unique_country    11295 non-null  object
 3   unique_asn        11295 non-null  object
 4   distinct_domains  11295 non-null  object
 5   reverse_dns       11295 non-null  object
 6   unique_ttl        11295 non-null  object
dtypes: object(7)
memory usage: 705.9+ KB


In [34]:
stateful_light_attack[numeric_columns].var()

rr                    0.034564
A_frequency           0.034564
NS_frequency          0.000000
CNAME_frequency       0.000000
SOA_frequency         0.000000
NULL_frequency        0.000000
PTR_frequency         7.910261
HINFO_frequency       0.000000
MX_frequency          0.000000
TXT_frequency         0.000089
AAAA_frequency        0.000000
SRV_frequency         0.000000
OPT_frequency         0.000000
rr_count              0.000266
rr_name_entropy       0.053946
rr_name_length       12.438706
distinct_ns           0.000177
a_records             0.000000
ttl_mean           2214.212320
ttl_variance          0.021510
dtype: float64

In [35]:
for obj_col in object_columns:
    print(obj_col, ':', '\n', stateful_light_attack[obj_col].unique(), '\n\n\n')

rr_type : 
 ["{'PTR'}" '{None}' 'set()' "{'A'}" "{'TXT'}"] 



distinct_ip : 
 ['set()'] 



unique_country : 
 ['set()' "{'US'}"] 



unique_asn : 
 ['set()' "{'AS3598'}"] 



distinct_domains : 
 ['{}' "{'131.107.255.255': {'dns.msftncsi.com'}}"] 



reverse_dns : 
 ['unknown' 'dns.msftncsi.com'] 



unique_ttl : 
 ['[1, 1]' '[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]' '[1, 1, 1, 1, 1, 1]'
 '[1, 1, 1, 1, 1, 1, 1]' '[1, 1, 1, 1]' '[1, 1, 1, 1, 1]'
 '[1, 1, 1, 1, 1, 1, 1, 1]' '[255]' '[64, 64, 64, 64]' '[1]'
 '[128, 128, 128, 128, 128, 128]' '[128, 128, 128]' '[64]' '[64, 64, 64]'
 '[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]' '[1, 1, 1]'
 '[1, 1, 1, 1, 1, 1, 1, 1, 1]' '[128]'
 '[128, 128, 128, 128, 128, 128, 128]' '[128, 128]'
 '[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]'
 '[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]'
 '[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]' '[255, 255]'
 '[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]'
 '[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]'
 '[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

## Stateful Light Benign

In [36]:
# ---------------------
# stateful_light_benign
# ---------------------

PATH = '/kaggle/input/cicbelldnsexf2021/Attack_Light_Benign/Benign/'
stateful_light_benign = pd.read_csv(PATH+'stateful_features-_light_benign.pcap.csv')

PATH = '/kaggle/input/cicbelldnsexf2021/Benign/'
stateful_benign1 = pd.read_csv(PATH+'stateful_features-benign_1.pcap.csv')
stateful_benign2 = pd.read_csv(PATH+'stateful_features-benign_2.pcap.csv')

number_of_records = len(stateful_light_benign)+len(stateful_benign1)+len(stateful_benign2)

# verifying number of stateful light benign records
# https://www.unb.ca/cic/datasets/dns-exf-2021.html
print('Total Stateful Light Benign:', number_of_records)

Total Stateful Light Benign: 109766


In [37]:
# ---------------------
# stateful_light_benign
# ---------------------

dfs = [stateful_light_benign, stateful_benign1, stateful_benign2]

columns = stateful_light_benign.columns

# checking if all dataframes, of stateful light benign type, have same column names
columns_stateful_light_benign = stateful_light_benign.columns
columns_stateful_benign1 = stateful_benign1.columns
columns_stateful_benign2 = stateful_benign2.columns


dfs_columns = [ columns_stateful_benign1,
                columns_stateful_benign2
              ]

for cols in dfs_columns:
    print('', columns_stateful_light_benign.isin(cols).all())

 True
 True


In [38]:
# ----------------------
# stateful_light_benign
# ----------------------

# merging dataframes 

stateful_light_benign = pd.concat([stateful_light_benign, 
                                   stateful_benign1, 
                                   stateful_benign2
                                  ])

# verifying number of stateful light benign records after concatenation
# https://www.unb.ca/cic/datasets/dns-exf-2021.html
len(stateful_light_benign)

109766

In [39]:
for column in columns:
    print(column, ': ', stateful_light_benign[column].notnull().unique())

rr :  [ True]
A_frequency :  [ True]
NS_frequency :  [ True]
CNAME_frequency :  [ True]
SOA_frequency :  [ True]
NULL_frequency :  [ True]
PTR_frequency :  [ True]
HINFO_frequency :  [ True]
MX_frequency :  [ True]
TXT_frequency :  [ True]
AAAA_frequency :  [ True]
SRV_frequency :  [ True]
OPT_frequency :  [ True]
rr_type :  [ True]
rr_count :  [ True]
rr_name_entropy :  [ True]
rr_name_length :  [ True]
distinct_ns :  [ True]
distinct_ip :  [ True]
unique_country :  [ True]
unique_asn :  [ True]
distinct_domains :  [ True]
reverse_dns :  [ True]
a_records :  [ True]
unique_ttl :  [ True]
ttl_mean :  [ True]
ttl_variance :  [ True]


In [40]:
stateful_light_benign.head()

Unnamed: 0,rr,A_frequency,NS_frequency,CNAME_frequency,SOA_frequency,NULL_frequency,PTR_frequency,HINFO_frequency,MX_frequency,TXT_frequency,AAAA_frequency,SRV_frequency,OPT_frequency,rr_type,rr_count,rr_name_entropy,rr_name_length,distinct_ns,distinct_ip,unique_country,unique_asn,distinct_domains,reverse_dns,a_records,unique_ttl,ttl_mean,ttl_variance
0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,{'PTR'},0,3.222243,27,0,set(),set(),set(),{},unknown,0,[1],1.0,0.0
1,0.0,0,0,0,0,0,6,0,0,0,0,0,0,{'PTR'},0,3.146084,25,0,set(),set(),set(),{},unknown,0,"[1, 1, 1, 1, 1, 1]",1.0,0.0
2,0.0,0,0,0,0,0,6,0,0,0,0,0,0,{'PTR'},0,3.223294,27,0,set(),set(),set(),{},unknown,0,"[1, 1, 1, 1, 1, 1]",1.0,0.0
3,0.0,0,0,0,0,0,6,0,0,0,0,0,0,{'PTR'},0,3.222463,27,0,set(),set(),set(),{},unknown,0,"[1, 1, 1, 1, 1, 1]",1.0,0.0
4,0.0,0,0,0,0,0,10,0,0,0,0,0,0,{'PTR'},0,3.102731,24,0,set(),set(),set(),{},unknown,0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1.0,0.0


In [41]:
stateful_light_benign.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109766 entries, 0 to 34559
Data columns (total 27 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   rr                109766 non-null  float64
 1   A_frequency       109766 non-null  int64  
 2   NS_frequency      109766 non-null  int64  
 3   CNAME_frequency   109766 non-null  int64  
 4   SOA_frequency     109766 non-null  int64  
 5   NULL_frequency    109766 non-null  int64  
 6   PTR_frequency     109766 non-null  int64  
 7   HINFO_frequency   109766 non-null  int64  
 8   MX_frequency      109766 non-null  int64  
 9   TXT_frequency     109766 non-null  int64  
 10  AAAA_frequency    109766 non-null  int64  
 11  SRV_frequency     109766 non-null  int64  
 12  OPT_frequency     109766 non-null  int64  
 13  rr_type           109766 non-null  object 
 14  rr_count          109766 non-null  int64  
 15  rr_name_entropy   109766 non-null  float64
 16  rr_name_length    109

In [42]:
numeric_columns = stateful_light_benign.select_dtypes(exclude=[object]).columns
object_columns =  stateful_light_benign.select_dtypes(include=[object]).columns

In [43]:
stateful_light_benign[numeric_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109766 entries, 0 to 34559
Data columns (total 20 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   rr               109766 non-null  float64
 1   A_frequency      109766 non-null  int64  
 2   NS_frequency     109766 non-null  int64  
 3   CNAME_frequency  109766 non-null  int64  
 4   SOA_frequency    109766 non-null  int64  
 5   NULL_frequency   109766 non-null  int64  
 6   PTR_frequency    109766 non-null  int64  
 7   HINFO_frequency  109766 non-null  int64  
 8   MX_frequency     109766 non-null  int64  
 9   TXT_frequency    109766 non-null  int64  
 10  AAAA_frequency   109766 non-null  int64  
 11  SRV_frequency    109766 non-null  int64  
 12  OPT_frequency    109766 non-null  int64  
 13  rr_count         109766 non-null  int64  
 14  rr_name_entropy  109766 non-null  float64
 15  rr_name_length   109766 non-null  int64  
 16  distinct_ns      109766 non-null  int64

In [44]:
stateful_light_benign[object_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109766 entries, 0 to 34559
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   rr_type           109766 non-null  object
 1   distinct_ip       109766 non-null  object
 2   unique_country    109766 non-null  object
 3   unique_asn        109766 non-null  object
 4   distinct_domains  109766 non-null  object
 5   reverse_dns       109766 non-null  object
 6   unique_ttl        109766 non-null  object
dtypes: object(7)
memory usage: 6.7+ MB


In [45]:
stateful_light_benign[numeric_columns].var()

rr                    1.874918
A_frequency           1.874905
NS_frequency          0.000000
CNAME_frequency       0.000000
SOA_frequency         0.000000
NULL_frequency        0.000000
PTR_frequency         4.143794
HINFO_frequency       0.000000
MX_frequency          0.000000
TXT_frequency         0.000009
AAAA_frequency        0.000073
SRV_frequency         0.000000
OPT_frequency         0.000000
rr_count              0.627047
rr_name_entropy       0.154295
rr_name_length       43.240511
distinct_ns           0.239664
a_records             0.000000
ttl_mean           3360.945784
ttl_variance         27.915706
dtype: float64

In [46]:
for obj_col in object_columns:
    print(obj_col, ':', '\n', stateful_light_benign[obj_col].unique(), '\n\n\n')

rr_type : 
 ["{'PTR'}" "{'A'}" '{None}' 'set()' "{'AAAA', 'A'}" "{'TXT'}"] 



distinct_ip : 
 ['set()'] 



unique_country : 
 ['set()' "{'DE'}" "{'US'}" "{'CH'}" "{'NL'}" "{'FR'}" "{'RU'}" "{'TR'}"
 "{'CY'}" "{'RO'}" "{'BY'}" "{'AT'}" "{'BE'}" "{'CZ'}" "{'UZ'}" "{'EE'}"
 "{'PL'}" "{'GB'}" "{'CA'}" "{'HU'}" "{'MK'}" "{'SE'}" "{'KZ'}" "{'UA'}"
 "{'AU'}" "{'AE'}" "{'IN'}" "{'KW'}" "{'IR'}" "{'CN'}" "{'ES'}" "{'MX'}"
 "{'PE'}" "{'LT'}" "{'TW'}" "{'IT'}" "{'MY'}" "{'KE'}" "{'GR'}" "{'BG'}"
 "{'VN'}" "{'IL'}" "{'BZ'}" "{'HK'}" "{'RS'}" "{'JP'}" "{'SC'}" "{'ID'}"
 "{'ZA'}" "{'US', 'DE'}" "{'TH'}" "{'AR'}" "{'NZ'}" "{'MD'}" "{'SG'}"
 "{'MZ'}" "{'NO'}" "{'CO'}" "{'NI'}" "{'EG'}" "{'PT'}" "{'DZ'}" "{'DK'}"
 "{'PR'}" "{'IE'}" "{'AZ'}" "{'VE'}" "{'OM'}" "{'CL'}" "{'SK'}" "{'BF'}"
 "{'GE'}" "{'ET'}" "{'EU'}" "{'KR'}" "{'TZ'}" "{'TJ'}" "{'FI'}" "{'AI'}"
 "{'MA'}" "{'PA'}" "{''}" "{'BA'}" "{'SA'}" "{'AM'}" "{'PK'}" "{'SI'}"
 "{'MT'}" "{'CW'}" "{'VG'}" "{'MN'}" "{'EC'}" "{'IM'}" "{'BD'}" "{'QA'}"
 "

## Stateless Heavy Attack

In [47]:
# stateless_heavy_attack
PATH = '/kaggle/input/cicbelldnsexf2021/Attack_heavy_Benign/Attacks/'
stateless_heavy_attack_audio = pd.read_csv(PATH+'stateless_features-heavy_audio.pcap.csv')
stateless_heavy_attack_compressed = pd.read_csv(PATH+'stateless_features-heavy_compressed.pcap.csv')
stateless_heavy_attack_exe = pd.read_csv(PATH+'stateless_features-heavy_exe.pcap.csv')
stateless_heavy_attack_image = pd.read_csv(PATH+'stateless_features-heavy_image.pcap.csv')
stateless_heavy_attack_text = pd.read_csv(PATH+'stateless_features-heavy_text.pcap.csv')
stateless_heavy_attack_video = pd.read_csv(PATH+'stateless_features-heavy_video.pcap.csv')

number_of_records = len(stateless_heavy_attack_audio)+len(stateless_heavy_attack_compressed)+\
                         len(stateless_heavy_attack_exe)+len(stateless_heavy_attack_image)+\
                         len(stateless_heavy_attack_text)+len(stateless_heavy_attack_video)

# verifying number of stateless heavy attack records
# https://www.unb.ca/cic/datasets/dns-exf-2021.html
print('Total Stateless Heavy Attacks:', number_of_records)

Total Stateless Heavy Attacks: 251670


In [48]:
# ----------------------
# stateless_heavy_attack
# ----------------------

dfs = [stateless_heavy_attack_audio, stateless_heavy_attack_compressed, 
      stateless_heavy_attack_exe, stateless_heavy_attack_image, 
      stateless_heavy_attack_text, stateless_heavy_attack_video]

columns = stateless_heavy_attack_audio.columns

# checking if all dataframes, of stateful heavy attack type, have same column names
columns_stateless_heavy_attack_audio = stateless_heavy_attack_audio.columns
columns_stateless_heavy_attack_compressed = stateless_heavy_attack_compressed.columns
columns_stateless_heavy_attack_exe = stateless_heavy_attack_exe.columns
columns_stateless_heavy_attack_image = stateless_heavy_attack_image.columns
columns_stateless_heavy_attack_text = stateless_heavy_attack_text.columns
columns_stateless_heavy_attack_video = stateless_heavy_attack_video.columns

dfs_columns = [ columns_stateless_heavy_attack_compressed,
            columns_stateless_heavy_attack_exe,
            columns_stateless_heavy_attack_image,
            columns_stateless_heavy_attack_text,
            columns_stateless_heavy_attack_video ]

for cols in dfs_columns:
    print('', columns_stateless_heavy_attack_audio.isin(cols).all())

 True
 True
 True
 True
 True


In [49]:
# ----------------------
# stateless_heavy_attack
# ----------------------

# merging dataframes 

stateless_heavy_attack = pd.concat([stateless_heavy_attack_audio, 
                                   stateless_heavy_attack_compressed,
                                   stateless_heavy_attack_exe,
                                   stateless_heavy_attack_image,
                                   stateless_heavy_attack_text,
                                   stateless_heavy_attack_video])

# verifying number of stateless heavy attack records after concatenation
# https://www.unb.ca/cic/datasets/dns-exf-2021.html
len(stateless_heavy_attack)

251670

In [50]:
for column in columns:
    print(column, ': ', stateless_heavy_attack[column].notnull().unique())

timestamp :  [ True]
FQDN_count :  [ True]
subdomain_length :  [ True]
upper :  [ True]
lower :  [ True]
numeric :  [ True]
entropy :  [ True]
special :  [ True]
labels :  [ True]
labels_max :  [ True]
labels_average :  [ True]
longest_word :  [ True]
sld :  [ True]
len :  [ True]
subdomain :  [ True]


In [51]:
stateless_heavy_attack.head()

Unnamed: 0,timestamp,FQDN_count,subdomain_length,upper,lower,numeric,entropy,special,labels,labels_max,labels_average,longest_word,sld,len,subdomain
0,2020-11-22 14:52:31.248351,25,8,0,10,9,2.556642,6,6,7,3.333333,2,192,12,1
1,2020-11-22 14:52:31.658668,25,8,0,10,9,2.556642,6,6,7,3.333333,2,192,12,1
2,2020-11-22 14:52:31.898872,15,0,11,0,3,3.625,1,1,15,15.0,C,DESKTOP-3JF04TC,16,0
3,2020-11-22 14:52:32.071032,24,7,0,10,8,2.054029,6,6,7,3.166667,4,224,11,1
4,2020-11-22 14:52:32.481373,24,7,0,10,8,2.054029,6,6,7,3.166667,4,224,11,1


In [52]:
stateless_heavy_attack.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 251670 entries, 0 to 38011
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   timestamp         251670 non-null  object 
 1   FQDN_count        251670 non-null  int64  
 2   subdomain_length  251670 non-null  int64  
 3   upper             251670 non-null  int64  
 4   lower             251670 non-null  int64  
 5   numeric           251670 non-null  int64  
 6   entropy           251670 non-null  float64
 7   special           251670 non-null  int64  
 8   labels            251670 non-null  int64  
 9   labels_max        251670 non-null  int64  
 10  labels_average    251670 non-null  float64
 11  longest_word      251670 non-null  object 
 12  sld               251670 non-null  object 
 13  len               251670 non-null  int64  
 14  subdomain         251670 non-null  int64  
dtypes: float64(2), int64(10), object(3)
memory usage: 30.7+ MB


In [53]:
numeric_columns = stateless_heavy_attack.select_dtypes(exclude=[object]).columns
object_columns =  stateless_heavy_attack.select_dtypes(include=[object]).columns

In [54]:
stateless_heavy_attack[numeric_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 251670 entries, 0 to 38011
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   FQDN_count        251670 non-null  int64  
 1   subdomain_length  251670 non-null  int64  
 2   upper             251670 non-null  int64  
 3   lower             251670 non-null  int64  
 4   numeric           251670 non-null  int64  
 5   entropy           251670 non-null  float64
 6   special           251670 non-null  int64  
 7   labels            251670 non-null  int64  
 8   labels_max        251670 non-null  int64  
 9   labels_average    251670 non-null  float64
 10  len               251670 non-null  int64  
 11  subdomain         251670 non-null  int64  
dtypes: float64(2), int64(10)
memory usage: 25.0 MB


In [55]:
stateless_heavy_attack[object_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 251670 entries, 0 to 38011
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   timestamp     251670 non-null  object
 1   longest_word  251670 non-null  object
 2   sld           251670 non-null  object
dtypes: object(3)
memory usage: 7.7+ MB


In [56]:
stateless_heavy_attack[numeric_columns].var()

FQDN_count           8.673164
subdomain_length     5.342062
upper               33.330024
lower                5.129758
numeric              6.345085
entropy              0.123122
special              1.644210
labels               1.261852
labels_max          20.530816
labels_average      27.023740
len                 16.162201
subdomain            0.047681
dtype: float64

In [57]:
for obj_col in object_columns:
    print(obj_col, ':', '\n', stateless_heavy_attack[obj_col].unique(), '\n\n\n')

timestamp : 
 ['2020-11-22 14:52:31.248351' '2020-11-22 14:52:31.658668'
 '2020-11-22 14:52:31.898872' ... '2020-11-24 23:16:25.531377'
 '2020-11-24 23:16:25.943739' '2020-11-24 23:16:26.353886'] 



longest_word : 
 ['2' 'C' '4' 'local' 'dns' '9' 'A' 'ad' 'N' 'microsoft' 'L' 'live'
 'windows' 'sfx' 'google' 'M' 'tap' 'bing' 'tic' 'skype' 'cert' 'onenote'
 'tnc' 'edge' '1' 'rocket' 'client'] 



sld : 
 ['192' 'DESKTOP-3JF04TC' '224' 'local' 'dns' '239' '172'
 'FHFAEBEECACACACACACACACACACACAAA' 'wpad'
 'FHEPFCELEHFCEPFFFACACACACACACABN' 'microsoft' 'DC'
 'FHEPFCELEHFCEPFFFACACACACACACABL' 'live' 'windowsupdate' 'sfx'
 'googleapis' 'EEEFFDELFEEPFACNDCEJECDIFGEDFFBM'
 'EJFDEBFEEBFACACACACACACACACACAAA' 'isatap' 'bing' 'gstatic' 'skype'
 'FHEPFCELEHFCEPFFFACACACACACACACA' 'digicert' 'onenote' 'msftncsi'
 'msedge' 'windows' 'google' 'gvt1' 'logrocket'
 'FEFDEDEMEJEFEOFECACACACACACACACA' 'tsclient'] 





## Stateless Heavy Benign

In [58]:
# ---------------------
# stateless_heavy_benign
# ---------------------

PATH = '/kaggle/input/cicbelldnsexf2021/Attack_heavy_Benign/Benign/'
stateless_heavy_benign1 = pd.read_csv(PATH+'stateless_features-benign_heavy_1.pcap.csv')
stateless_heavy_benign2 = pd.read_csv(PATH+'stateless_features-benign_heavy_2.pcap.csv')
stateless_heavy_benign3 = pd.read_csv(PATH+'stateless_features-benign_heavy_3.pcap.csv')

PATH = '/kaggle/input/cicbelldnsexf2021/Benign/'
stateless_benign1 = pd.read_csv(PATH+'stateless_features-benign_1.pcap.csv')
stateless_benign2 = pd.read_csv(PATH+'stateless_features-benign_2.pcap.csv')

number_of_records = len(stateless_heavy_benign1)+len(stateless_heavy_benign2)+\
                        len(stateless_heavy_benign3)+\
                        len(stateless_benign1)+len(stateless_benign2)

# verifying number of stateless heavy benign records
# https://www.unb.ca/cic/datasets/dns-exf-2021.html
print('Total Stateless Heavy Benign:', number_of_records)

Total Stateless Heavy Benign: 402767


In [59]:
# ---------------------
# stateless_heavy_benign
# ---------------------

dfs = [stateless_heavy_benign1, stateless_heavy_benign2, stateless_heavy_benign3,
               stateless_benign1, stateless_benign2]

columns = stateless_heavy_benign1.columns

# checking if all dataframes, of stateless heavy benign type, have same column names
columns_stateless_heavy_benign1 = stateless_heavy_benign1.columns
columns_stateless_heavy_benign2 = stateless_heavy_benign2.columns
columns_stateless_heavy_benign3 = stateless_heavy_benign3.columns
columns_stateless_benign1 = stateless_benign1.columns
columns_stateless_benign2 = stateless_benign2.columns


dfs_columns = [ columns_stateless_heavy_benign2,
                columns_stateless_heavy_benign3, 
                columns_stateless_benign1,
                columns_stateless_benign2]

for cols in dfs_columns:
    print('', columns_stateless_heavy_benign1.isin(cols).all())

 True
 True
 True
 True


In [60]:
# ----------------------
# stateless_heavy_benign
# ----------------------

# merging dataframes 

stateless_heavy_benign = pd.concat([stateless_heavy_benign1, 
                                   stateless_heavy_benign2,
                                   stateless_heavy_benign3,
                                   stateless_benign1,
                                   stateless_benign2
                                   ])

# verifying number of stateless heavy benign records after concatenation
# https://www.unb.ca/cic/datasets/dns-exf-2021.html
len(stateless_heavy_benign)

402767

In [61]:
for column in columns:
    print(column, ': ', stateless_heavy_benign[column].notnull().unique())

timestamp :  [ True]
FQDN_count :  [ True]
subdomain_length :  [ True]
upper :  [ True]
lower :  [ True]
numeric :  [ True]
entropy :  [ True]
special :  [ True]
labels :  [ True]
labels_max :  [ True]
labels_average :  [ True]
longest_word :  [ True False]
sld :  [ True]
len :  [ True]
subdomain :  [ True]


In [62]:
print(stateless_heavy_benign['longest_word'].notnull().value_counts())

True     402740
False        27
Name: longest_word, dtype: int64


In [63]:
stateless_heavy_benign.head()

Unnamed: 0,timestamp,FQDN_count,subdomain_length,upper,lower,numeric,entropy,special,labels,labels_max,labels_average,longest_word,sld,len,subdomain
0,2020-11-22 10:52:54.331014,27,10,0,10,11,2.570417,6,6,7,3.666667,2,192,14,1
1,2020-11-22 10:52:54.743949,27,10,0,10,11,2.48148,6,6,7,3.666667,2,192,14,1
2,2020-11-22 10:52:55.154578,27,10,0,10,11,2.48148,6,6,7,3.666667,2,192,14,1
3,2020-11-22 10:52:55.566957,24,7,0,10,8,2.054029,6,6,7,3.166667,4,224,11,1
4,2020-11-22 10:52:55.927177,32,0,32,0,0,2.735132,0,1,32,32.0,N,FHEPFCELEHFCEPFFFACACACACACACABN,33,0


In [64]:
stateless_heavy_benign.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 402767 entries, 0 to 88573
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   timestamp         402767 non-null  object 
 1   FQDN_count        402767 non-null  int64  
 2   subdomain_length  402767 non-null  int64  
 3   upper             402767 non-null  int64  
 4   lower             402767 non-null  int64  
 5   numeric           402767 non-null  int64  
 6   entropy           402767 non-null  float64
 7   special           402767 non-null  int64  
 8   labels            402767 non-null  int64  
 9   labels_max        402767 non-null  int64  
 10  labels_average    402767 non-null  float64
 11  longest_word      402740 non-null  object 
 12  sld               402767 non-null  object 
 13  len               402767 non-null  int64  
 14  subdomain         402767 non-null  int64  
dtypes: float64(2), int64(10), object(3)
memory usage: 49.2+ MB


In [65]:
numeric_columns = stateless_heavy_benign.select_dtypes(exclude=[object]).columns
object_columns =  stateless_heavy_benign.select_dtypes(include=[object]).columns

In [66]:
stateless_heavy_benign[numeric_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 402767 entries, 0 to 88573
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   FQDN_count        402767 non-null  int64  
 1   subdomain_length  402767 non-null  int64  
 2   upper             402767 non-null  int64  
 3   lower             402767 non-null  int64  
 4   numeric           402767 non-null  int64  
 5   entropy           402767 non-null  float64
 6   special           402767 non-null  int64  
 7   labels            402767 non-null  int64  
 8   labels_max        402767 non-null  int64  
 9   labels_average    402767 non-null  float64
 10  len               402767 non-null  int64  
 11  subdomain         402767 non-null  int64  
dtypes: float64(2), int64(10)
memory usage: 39.9 MB


In [67]:
stateless_heavy_benign[object_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 402767 entries, 0 to 88573
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   timestamp     402767 non-null  object
 1   longest_word  402740 non-null  object
 2   sld           402767 non-null  object
dtypes: object(3)
memory usage: 12.3+ MB


In [68]:
stateless_heavy_benign[numeric_columns].var()

FQDN_count          46.031196
subdomain_length    16.303479
upper               13.681285
lower               14.181049
numeric             21.466613
entropy              0.219152
special              5.172049
labels               3.375105
labels_max          17.156227
labels_average      13.203356
len                 18.099550
subdomain            0.248221
dtype: float64

In [69]:
for obj_col in object_columns:
    print(obj_col, ':', '\n', stateless_heavy_benign[obj_col].unique(), '\n\n\n')

timestamp : 
 ['2020-11-22 10:52:54.331014' '2020-11-22 10:52:54.743949'
 '2020-11-22 10:52:55.154578' ... '2020-11-21 04:57:36.949071'
 '2020-11-21 04:57:37.359119' '2020-11-21 04:57:38.977982'] 



longest_word : 
 ['2' '4' 'N' ... 'ceremony' 'bravi' 'dresden'] 



sld : 
 ['192' '224' 'FHEPFCELEHFCEPFFFACACACACACACABN' ... 'pnftqrdk'
 'dypbeabfcrkhj' 'gvt1'] 





## Stateless Light Attack

In [70]:
# ----------------------
# stateless_light_attack
# ----------------------

PATH = '/kaggle/input/cicbelldnsexf2021/Attack_Light_Benign/Attacks/'

stateless_light_attack_audio = pd.read_csv(PATH+'stateless_features-light_audio.pcap.csv')
stateless_light_attack_compressed = pd.read_csv(PATH+'stateless_features-light_compressed.pcap.csv')
stateless_light_attack_exe = pd.read_csv(PATH+'stateless_features-light_exe.pcap.csv')
stateless_light_attack_image = pd.read_csv(PATH+'stateless_features-light_image.pcap.csv')
stateless_light_attack_text = pd.read_csv(PATH+'stateless_features-light_text.pcap.csv')
stateless_light_attack_video = pd.read_csv(PATH+'stateless_features-light_video.pcap.csv')

number_of_records = len(stateless_light_attack_audio)+len(stateless_light_attack_compressed)+\
                         len(stateless_light_attack_exe)+len(stateless_light_attack_image)+\
                         len(stateless_light_attack_text)+len(stateless_light_attack_video)

# verifying number of stateless light attack records
# https://www.unb.ca/cic/datasets/dns-exf-2021.html
print('Total Stateless Light Attack:', number_of_records)

Total Stateless Light Attack: 42683


In [71]:
# ---------------------
# stateless_light_attack
# ---------------------

dfs = [stateless_light_attack_audio, stateless_light_attack_compressed, 
      stateless_light_attack_exe, stateless_light_attack_image, 
      stateless_light_attack_text, stateless_light_attack_video]

columns = stateless_light_attack_audio.columns

# checking if all dataframes, of stateful light attack type, have same column names
columns_stateless_light_attack_audio = stateless_light_attack_audio.columns
columns_stateless_light_attack_compressed = stateless_light_attack_compressed.columns
columns_stateless_light_attack_exe = stateless_light_attack_exe.columns
columns_stateless_light_attack_image = stateless_light_attack_image.columns
columns_stateless_light_attack_text = stateless_light_attack_text.columns
columns_stateless_light_attack_video = stateless_light_attack_video.columns

dfs_columns = [ columns_stateless_light_attack_compressed,
            columns_stateless_light_attack_exe,
            columns_stateless_light_attack_image,
            columns_stateless_light_attack_text,
            columns_stateless_light_attack_video ]

for cols in dfs_columns:
    print('', columns_stateless_light_attack_audio.isin(cols).all())

 True
 True
 True
 True
 True


In [72]:
# ----------------------
# stateless_light_attack
# ----------------------

# merging dataframes 

stateless_light_attack = pd.concat([stateless_light_attack_audio, 
                                   stateless_light_attack_compressed,
                                   stateless_light_attack_exe,
                                   stateless_light_attack_image,
                                   stateless_light_attack_text,
                                   stateless_light_attack_video])

# verifying number of stateless light attack records after concatenation
# https://www.unb.ca/cic/datasets/dns-exf-2021.html
len(stateless_light_attack)

42683

In [73]:
for column in columns:
    print(column, ': ', stateless_light_attack[column].notnull().unique())

timestamp :  [ True]
FQDN_count :  [ True]
subdomain_length :  [ True]
upper :  [ True]
lower :  [ True]
numeric :  [ True]
entropy :  [ True]
special :  [ True]
labels :  [ True]
labels_max :  [ True]
labels_average :  [ True]
longest_word :  [ True]
sld :  [ True]
len :  [ True]
subdomain :  [ True]


In [74]:
stateless_light_attack.head()

Unnamed: 0,timestamp,FQDN_count,subdomain_length,upper,lower,numeric,entropy,special,labels,labels_max,labels_average,longest_word,sld,len,subdomain
0,2020-11-21 19:13:27.034607,27,10,0,10,11,2.570417,6,6,7,3.666667,2,192,14,1
1,2020-11-21 19:13:27.444721,27,10,0,10,11,2.570417,6,6,7,3.666667,2,192,14,1
2,2020-11-21 19:13:27.857202,24,7,0,10,8,2.054029,6,6,7,3.166667,4,224,11,1
3,2020-11-21 19:13:28.267312,24,7,0,10,8,2.054029,6,6,7,3.166667,4,224,11,1
4,2020-11-21 19:13:29.503346,25,8,0,10,9,2.556642,6,6,7,3.333333,2,192,12,1


In [75]:
stateless_light_attack.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42683 entries, 0 to 4370
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   timestamp         42683 non-null  object 
 1   FQDN_count        42683 non-null  int64  
 2   subdomain_length  42683 non-null  int64  
 3   upper             42683 non-null  int64  
 4   lower             42683 non-null  int64  
 5   numeric           42683 non-null  int64  
 6   entropy           42683 non-null  float64
 7   special           42683 non-null  int64  
 8   labels            42683 non-null  int64  
 9   labels_max        42683 non-null  int64  
 10  labels_average    42683 non-null  float64
 11  longest_word      42683 non-null  object 
 12  sld               42683 non-null  object 
 13  len               42683 non-null  int64  
 14  subdomain         42683 non-null  int64  
dtypes: float64(2), int64(10), object(3)
memory usage: 5.2+ MB


In [76]:
numeric_columns = stateless_light_attack.select_dtypes(exclude=[object]).columns
object_columns =  stateless_light_attack.select_dtypes(include=[object]).columns

In [77]:
stateless_light_attack[numeric_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42683 entries, 0 to 4370
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   FQDN_count        42683 non-null  int64  
 1   subdomain_length  42683 non-null  int64  
 2   upper             42683 non-null  int64  
 3   lower             42683 non-null  int64  
 4   numeric           42683 non-null  int64  
 5   entropy           42683 non-null  float64
 6   special           42683 non-null  int64  
 7   labels            42683 non-null  int64  
 8   labels_max        42683 non-null  int64  
 9   labels_average    42683 non-null  float64
 10  len               42683 non-null  int64  
 11  subdomain         42683 non-null  int64  
dtypes: float64(2), int64(10)
memory usage: 4.2 MB


In [78]:
stateless_light_attack[object_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42683 entries, 0 to 4370
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   timestamp     42683 non-null  object
 1   longest_word  42683 non-null  object
 2   sld           42683 non-null  object
dtypes: object(3)
memory usage: 1.3+ MB


In [79]:
stateless_light_attack[numeric_columns].var()

FQDN_count           8.299787
subdomain_length     5.268947
upper               33.389115
lower                5.005458
numeric              6.215458
entropy              0.120786
special              1.612241
labels               1.238987
labels_max          20.547299
labels_average      27.060016
len                 16.042261
subdomain            0.046780
dtype: float64

In [80]:
for obj_col in object_columns:
    print(obj_col, ':', '\n', stateless_light_attack[obj_col].unique(), '\n\n\n')

timestamp : 
 ['2020-11-21 19:13:27.034607' '2020-11-21 19:13:27.444721'
 '2020-11-21 19:13:27.857202' ... '2020-11-22 03:38:33.091830'
 '2020-11-22 03:38:33.575700' '2020-11-22 03:38:33.985499'] 



longest_word : 
 ['2' '4' 'C' 'local' 'N' '9' 'L' 'microsoft' 'A' 'ad' 'live' 'bing' 'cert'
 'tap' 'windows' 'M' 'tnc' 'google' 'office' 'cres' 'shark'] 



sld : 
 ['192' '224' 'DESKTOP-3JF04TC' 'local' 'FHEPFCELEHFCEPFFFACACACACACACABN'
 '239' 'FHEPFCELEHFCEPFFFACACACACACACABL' '172' 'DC' 'microsoft'
 'FHFAEBEECACACACACACACACACACACAAA' 'wpad' 'live' 'bing' 'digicert'
 'EJFDEBFEEBFACACACACACACACACACAAA' 'isatap' 'windowsupdate'
 'EEEFFDELFEEPFACNDEDBDDFCFBEIEBBM' 'msftncsi' 'googleapis' 'office'
 'cicresearch' 'wireshark'] 





## Stateless Light Benign

In [81]:
# ----------------------
# stateless_light_benign
# ----------------------

PATH = '/kaggle/input/cicbelldnsexf2021/Attack_Light_Benign/Benign/'
stateless_light_benign = pd.read_csv(PATH+'stateless_features-light_benign.pcap.csv')

PATH = '/kaggle/input/cicbelldnsexf2021/Benign/'
stateless_benign1 = pd.read_csv(PATH+'stateless_features-benign_1.pcap.csv')
stateless_benign2 = pd.read_csv(PATH+'stateless_features-benign_2.pcap.csv')

number_of_records = len(stateless_light_benign)+len(stateless_benign1)+len(stateless_benign2)

# verifying number of stateless light benign records
# https://www.unb.ca/cic/datasets/dns-exf-2021.html
print('Total Stateless Light Benign:', number_of_records)

Total Stateless Light Benign: 281164


In [82]:
# ---------------------
# stateless_light_benign
# ---------------------

dfs = [stateless_light_benign, stateless_benign1, stateless_benign2]

columns = stateless_light_benign.columns

# checking if all dataframes, of stateless light benign type, have same column names
columns_stateless_light_benign = stateless_light_benign.columns
columns_stateless_benign1 = stateless_benign1.columns
columns_stateless_benign2 = stateless_benign2.columns


dfs_columns = [ columns_stateless_benign1,
                columns_stateless_benign2
              ]

for cols in dfs_columns:
    print('', columns_stateless_light_benign.isin(cols).all())

 True
 True


In [83]:
# ----------------------
# stateless_light_benign
# ----------------------

# merging dataframes 

stateless_light_benign = pd.concat([stateless_light_benign, 
                                   stateless_benign1, 
                                   stateless_benign2
                                  ])

# verifying number of stateless light benign records after concatenation
# https://www.unb.ca/cic/datasets/dns-exf-2021.html
len(stateless_light_benign)

281164

In [84]:
for column in columns:
    print(column, ': ', stateless_light_benign[column].notnull().unique())

timestamp :  [ True]
FQDN_count :  [ True]
subdomain_length :  [ True]
upper :  [ True]
lower :  [ True]
numeric :  [ True]
entropy :  [ True]
special :  [ True]
labels :  [ True]
labels_max :  [ True]
labels_average :  [ True]
longest_word :  [ True False]
sld :  [ True]
len :  [ True]
subdomain :  [ True]


In [85]:
print(stateless_light_benign['longest_word'].notnull().value_counts())

True     281143
False        21
Name: longest_word, dtype: int64


In [86]:
stateless_light_benign.head()

Unnamed: 0,timestamp,FQDN_count,subdomain_length,upper,lower,numeric,entropy,special,labels,labels_max,labels_average,longest_word,sld,len,subdomain
0,2020-11-21 14:18:54.998708,27,10,0,10,11,2.570417,6,6,7,3.666667,2,192,14,1
1,2020-11-21 14:18:55.411010,25,8,0,10,9,2.556642,6,6,7,3.333333,2,192,12,1
2,2020-11-21 14:18:55.821592,25,8,0,10,9,2.556642,6,6,7,3.333333,2,192,12,1
3,2020-11-21 14:18:56.234074,27,10,0,10,11,2.767195,6,6,7,3.666667,2,192,14,1
4,2020-11-21 14:18:56.644294,27,10,0,10,11,2.767195,6,6,7,3.666667,2,192,14,1


In [87]:
stateless_light_benign.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 281164 entries, 0 to 88573
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   timestamp         281164 non-null  object 
 1   FQDN_count        281164 non-null  int64  
 2   subdomain_length  281164 non-null  int64  
 3   upper             281164 non-null  int64  
 4   lower             281164 non-null  int64  
 5   numeric           281164 non-null  int64  
 6   entropy           281164 non-null  float64
 7   special           281164 non-null  int64  
 8   labels            281164 non-null  int64  
 9   labels_max        281164 non-null  int64  
 10  labels_average    281164 non-null  float64
 11  longest_word      281143 non-null  object 
 12  sld               281164 non-null  object 
 13  len               281164 non-null  int64  
 14  subdomain         281164 non-null  int64  
dtypes: float64(2), int64(10), object(3)
memory usage: 34.3+ MB


In [88]:
numeric_columns = stateless_light_benign.select_dtypes(exclude=[object]).columns
object_columns =  stateless_light_benign.select_dtypes(include=[object]).columns

In [89]:
stateless_light_benign[numeric_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 281164 entries, 0 to 88573
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   FQDN_count        281164 non-null  int64  
 1   subdomain_length  281164 non-null  int64  
 2   upper             281164 non-null  int64  
 3   lower             281164 non-null  int64  
 4   numeric           281164 non-null  int64  
 5   entropy           281164 non-null  float64
 6   special           281164 non-null  int64  
 7   labels            281164 non-null  int64  
 8   labels_max        281164 non-null  int64  
 9   labels_average    281164 non-null  float64
 10  len               281164 non-null  int64  
 11  subdomain         281164 non-null  int64  
dtypes: float64(2), int64(10)
memory usage: 27.9 MB


In [90]:
stateless_light_benign[object_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 281164 entries, 0 to 88573
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   timestamp     281164 non-null  object
 1   longest_word  281143 non-null  object
 2   sld           281164 non-null  object
dtypes: object(3)
memory usage: 8.6+ MB


In [91]:
stateless_light_benign[numeric_columns].var()

FQDN_count          46.848618
subdomain_length    16.257019
upper               13.786562
lower               13.688014
numeric             21.492431
entropy              0.220245
special              5.146893
labels               3.368047
labels_max          16.904929
labels_average      13.166181
len                 18.124606
subdomain            0.247775
dtype: float64

for obj_col in object_columns:
    print(obj_col, ':', '\n', stateless_light_benign[obj_col].unique(), '\n\n\n')