In [3]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from sklearn import preprocessing

In [4]:
# Column names handed to every LITNET `pd.read_csv(..., names=header_orig)` call in this notebook.
# NOTE(review): passing `names=` without `header=` makes pandas treat the first line of each file as data,
# so this presumably relies on the LITNET exports having no header row — confirm.
# Only the timestamp, duration, address/port, protocol, packet/byte, `_flag1`..`_flag6` and `class` columns
# are used by the LITNET conversion function later in the notebook.
header_orig = ["ID","ts_year","ts_month","ts_day","ts_hour","ts_min","ts_second","te_year","te_month",
          "te_day","te_hour","te_min","te_second","duration","Src IP Addr","Dst IP Addr","Src Port","Dst Port","Protocol","_flag1","_flag2",
          "_flag3","_flag4","_flag5","_flag6","fwd","stos","Packets","Bytes","opkt","obyt","_in","out",
          "sas","das","smk","dmk","dtos","_dir","nh","nhb","svln","dvln","ismc","odmc","idmc","osmc",
          "mpls1","mpls2","mpls3","mpls4","mpls5","mpls6","mpls7","mpls8","mpls9","mpls10","cl","sl",
          "al","ra","eng","exid","tr","icmp_dst_ip_b","icmp_src_ip","udp_dst_p","tcp_f_s","tcp_f_n_a",
          "tcp_f_n_f","tcp_f_n_r","tcp_f_n_p","tcp_f_n_u","tcp_dst_p","tcp_src_dst_f_s","tcp_src_tftp",
          "tcp_src_kerb","tcp_src_rpc","tcp_dst_p_src","smtp_dst","udp_p_r_range","p_range_dst",
          "udp_src_p_0","class","attack_a"]

In [3]:
# Load the twelve LITNET capture files, one DataFrame per attack scenario.
# Every read applies the shared `header_orig` column names.
blasterWorm = pd.read_csv('BLASTER_WORM_v2.csv',names=header_orig)

In [4]:
fragmentation = pd.read_csv('FRAGMENTATION_v2.csv',names=header_orig)

In [5]:
httpFlood = pd.read_csv('HTTP_FLOOD_v2.csv',names = header_orig)

In [6]:
icmpFlood = pd.read_csv('ICMP_FLOOD_v2.csv',names=header_orig)

In [7]:
reaperWorm = pd.read_csv('REAPER_WORM_v2.csv',names = header_orig)

In [8]:
redWorm = pd.read_csv('RED_WORM_v2.csv',names=header_orig)

In [9]:
udpFlood = pd.read_csv('UDP_FLOOD_v2.csv',names=header_orig)

In [10]:
landAttack = pd.read_csv('LAND_ATTACK_v2.csv',names=header_orig)

In [11]:
smurf = pd.read_csv('SMURF_v2.csv',names=header_orig)

In [12]:
scanning = pd.read_csv('SCANNING_SPREAD_v2.csv',names=header_orig)

In [13]:
spam = pd.read_csv('SPAM_v2.csv',names=header_orig)

In [14]:
# NOTE(review): unlike the eleven reads above, this file name carries no `_v2` suffix —
# confirm it is the intended version of the SYN flood capture.
synFlood = pd.read_csv('SYN_FLOOD.csv',names=header_orig)

In [15]:
# Raw LITNET frames gathered in one list.
# NOTE(review): `litnet` is not referenced again in the visible part of this notebook.
litnet=[blasterWorm, fragmentation,httpFlood,icmpFlood, reaperWorm,redWorm,udpFlood,landAttack,smurf,scanning,spam,synFlood]

In [16]:
# Tag every row with the scenario of the capture file it was loaded from.
# This adds a column to the raw frames in place; the LITNET conversion function defined below
# selects `attackType`, so this cell has to run before any conversion call.
blasterWorm["attackType"]="blasterWorm"
fragmentation["attackType"]="fragmentation"
httpFlood["attackType"]="httpFlood"
icmpFlood["attackType"]="icmpFlood"
reaperWorm["attackType"]="reaperWorm"
redWorm["attackType"]="redWorm"
udpFlood["attackType"]="udpFlood"
landAttack["attackType"]="landAttack"
smurf["attackType"]="smurf"
scanning["attackType"]="scanning"
spam["attackType"]="spam"
synFlood["attackType"]="synFlood"

In [17]:
def get_df_name(df, ignore=("df",)):
    """Return the module-level variable name that is bound to ``df``.

    The lookup compares object identity against every entry of ``globals()``.
    Several names can point at the same object at once: a ``for df in ...``
    loop run at notebook top level leaves ``df`` bound to the last frame, and
    IPython keeps recent outputs under underscore-prefixed names. Those
    transient aliases are skipped whenever a more meaningful name exists, so
    the function no longer answers ``"df"`` for a frame that also has a real
    variable name.

    Parameters
    ----------
    df : object
        The object (normally a DataFrame) whose variable name is wanted.
    ignore : tuple of str, optional
        Alias names to pass over when another name is available.

    Returns
    -------
    str
        The first preferred name; if only transient aliases exist, the first
        of those.

    Raises
    ------
    IndexError
        When no module-level name refers to ``df`` (same exception type the
        original ``[...][0]`` lookup produced, now with a message).
    """
    aliases = [key for key, value in globals().items() if value is df]
    preferred = [key for key in aliases if key not in ignore and not key.startswith("_")]
    if preferred:
        return preferred[0]
    if aliases:
        return aliases[0]
    raise IndexError("no module-level variable is bound to the given object")

In [18]:
def tcp_flags(elem):
    """Return 1 when ``elem`` is exactly one of the TCP flag letters, otherwise 0.

    Any other value (a placeholder character, an empty string, a longer
    string, a non-string) counts as "flag not set".
    """
    flag_letters = ("U", "A", "P", "R", "S", "F")
    return 1 if elem in flag_letters else 0

# TCP flag letters recognised by tcp_flags:
# U --> Urgent
# A --> Acknowledgment
# P --> Push
# R --> Reset
# S --> Synchronise
# F --> Finish

In [19]:
def modifying_litnet_dataset_into_common_format(df):
    """Convert one raw LITNET frame into the 17-column common layout.

    Steps:
      1. keep only the columns the common layout is built from (on an
         explicit copy, so no assignment ever targets a slice of the input);
      2. rename protocol / port / duration columns to the common names;
      3. assemble ``Date first seen`` from the six ``ts_*`` components;
      4. turn ``_flag1``..``_flag6`` into 0/1 columns named U, A, P, R, S, F
         (1 when the cell holds any of the six flag letters);
      5. map ``class`` to ``normal`` (raw value ``'none'``) or ``victim``
         (anything else) and reset ``attackType`` to ``'none'`` on normal rows;
      6. order the columns as in the common layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw LITNET frame. Must already contain an ``attackType`` column in
        addition to the ``ts_*``, ``duration``, address/port, ``Protocol``,
        ``Packets``, ``Bytes``, ``_flag*`` and ``class`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with columns Date first seen, Duration, Proto, Src IP Addr,
        Src Pt, Dst IP Addr, Dst Pt, Packets, Bytes, U, A, P, R, S, F, class,
        attackType. The input frame is left untouched.

    Raises
    ------
    KeyError
        If any required column is missing from ``df``.
    """
    # raw timestamp column -> component name understood by pd.to_datetime
    timestamp_parts = {'ts_year': 'year', 'ts_month': 'month', 'ts_day': 'day',
                       'ts_hour': 'hour', 'ts_min': 'minute', 'ts_second': 'second'}
    # positional raw flag column -> flag letter used as the output column name
    flag_letters = {'_flag1': 'U', '_flag2': 'A', '_flag3': 'P',
                    '_flag4': 'R', '_flag5': 'S', '_flag6': 'F'}
    renamed = {'Protocol': 'Proto', 'Dst Port': 'Dst Pt', 'Src Port': 'Src Pt', 'duration': 'Duration'}
    passthrough = ["Src IP Addr", "Dst IP Addr", "Packets", "Bytes", "class", "attackType"]
    common_columns = ["Date first seen", "Duration", "Proto", "Src IP Addr", "Src Pt", "Dst IP Addr",
                      "Dst Pt", "Packets", "Bytes", "U", "A", "P", "R", "S", "F", "class", "attackType"]

    needed = list(timestamp_parts) + list(renamed) + passthrough + list(flag_letters)
    out = df[needed].copy().rename(columns=renamed)

    # Let pandas assemble the timestamp from its numeric components instead of
    # concatenating strings row by row and parsing the text back.
    out['Date first seen'] = pd.to_datetime(
        out[list(timestamp_parts)].rename(columns=timestamp_parts)
    )

    # Vectorised replacement for applymap(tcp_flags): a cell counts as "set"
    # when it holds any of the six flag letters, regardless of its column.
    letters = list(flag_letters.values())
    for raw_column, letter in flag_letters.items():
        out[letter] = out[raw_column].isin(letters).astype("int64")

    # Raw 'none' marks benign traffic; every other raw value is labelled 'victim'.
    is_normal = out['class'] == 'none'
    out['class'] = np.where(is_normal, 'normal', 'victim')
    # Benign rows must not keep the scenario label of the capture file.
    out['attackType'] = np.where(is_normal, 'none', out['attackType'])

    # reindex also discards the ts_* and _flag* helper columns.
    return out.reindex(columns=common_columns)

In [20]:
# Convert every raw LITNET frame to the common column layout; each result is stored as `<name>_procd`.
blasterWorm_procd=modifying_litnet_dataset_into_common_format(blasterWorm)

In [21]:
fragmentation_procd=modifying_litnet_dataset_into_common_format(fragmentation)

In [22]:
httpFlood_procd=modifying_litnet_dataset_into_common_format(httpFlood)

In [23]:
icmpFlood_procd=modifying_litnet_dataset_into_common_format(icmpFlood)

In [24]:
reaperWorm_procd=modifying_litnet_dataset_into_common_format(reaperWorm)

In [25]:
redWorm_procd=modifying_litnet_dataset_into_common_format(redWorm)

In [26]:
udpFlood_procd=modifying_litnet_dataset_into_common_format(udpFlood)

In [27]:
landAttack_procd=modifying_litnet_dataset_into_common_format(landAttack)

In [28]:
smurf_procd=modifying_litnet_dataset_into_common_format(smurf)

In [29]:
scanning_procd=modifying_litnet_dataset_into_common_format(scanning)

In [30]:
spam_procd=modifying_litnet_dataset_into_common_format(spam)

In [31]:
synFlood_procd=modifying_litnet_dataset_into_common_format(synFlood)

In [32]:
# Processed frames, listed in the same order as the raw `litnet` list.
litnet_procd=[blasterWorm_procd, fragmentation_procd,httpFlood_procd,icmpFlood_procd, reaperWorm_procd,redWorm_procd,udpFlood_procd,landAttack_procd,smurf_procd,scanning_procd,spam_procd,synFlood_procd]

In [33]:
def check_for_null(df):
    """Print True if ``df`` contains at least one missing value anywhere, else False."""
    missing_mask = df.isna()
    # axis=None reduces over rows and columns in a single step.
    print(missing_mask.any(axis=None))

In [34]:
# One True/False line per processed LITNET frame, in list order.
# Note: this top-level loop leaves the global name `df` bound to the last frame of the list.
for df in litnet_procd:
    check_for_null(df)

False
False
False
False
False
False
False
False
False
False
False
False


In [35]:
def check_for_duplicates(df):
    """Print True if any row of ``df`` repeats an earlier row in full, else False."""
    repeated_rows = df.duplicated()
    found_any = repeated_rows.any()
    print(found_any)

In [36]:
# One True/False line per processed LITNET frame, in list order (before de-duplication).
for df in litnet_procd:
    check_for_duplicates(df)

True
True
True
True
True
True
True
True
True
False
True
True


In [37]:
# Drop rows that repeat an earlier row in full; `duplicated()` marks every occurrence after the first.
# scanning_procd is not filtered here: the duplicate check recorded above printed False for the
# tenth frame of `litnet_procd`, which is scanning_procd.
blasterWorm_procd=blasterWorm_procd[~blasterWorm_procd.duplicated()]

In [38]:
fragmentation_procd=fragmentation_procd[~fragmentation_procd.duplicated()]

In [39]:
httpFlood_procd=httpFlood_procd[~httpFlood_procd.duplicated()]

In [40]:
icmpFlood_procd=icmpFlood_procd[~icmpFlood_procd.duplicated()]

In [41]:
reaperWorm_procd=reaperWorm_procd[~reaperWorm_procd.duplicated()]

In [42]:
redWorm_procd=redWorm_procd[~redWorm_procd.duplicated()]

In [43]:
udpFlood_procd=udpFlood_procd[~udpFlood_procd.duplicated()]

In [44]:
landAttack_procd=landAttack_procd[~landAttack_procd.duplicated()]

In [45]:
smurf_procd=smurf_procd[~smurf_procd.duplicated()]

In [46]:
spam_procd=spam_procd[~spam_procd.duplicated()]

In [47]:
synFlood_procd=synFlood_procd[~synFlood_procd.duplicated()]

In [48]:
# Rebuild the list: the filtered frames are new objects, so the earlier `litnet_procd`
# list still refers to the unfiltered ones.
litnet_procd=[blasterWorm_procd, fragmentation_procd,httpFlood_procd,icmpFlood_procd, reaperWorm_procd,redWorm_procd,udpFlood_procd,landAttack_procd,smurf_procd,scanning_procd,spam_procd,synFlood_procd]

In [49]:
# Re-run the duplicate check on the rebuilt list; every line is expected to read False now.
for df in litnet_procd:
    check_for_duplicates(df)

False
False
False
False
False
False
False
False
False
False
False
False


In [50]:
def counter(df, name=None):
  """Print the name and the normal/attack row counts of one processed LITNET frame.

  Rows whose ``attackType`` equals ``"none"`` count as normal, all others as attack.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with an ``attackType`` column.
  name : str, optional
      Label to print. When omitted, the label is looked up by object identity
      in ``globals()``. Transient aliases are skipped if a better name exists:
      the ``df`` left behind by a top-level ``for df in ...`` loop and IPython's
      underscore-prefixed output caches. A frame with no module-level name is
      reported as ``<unnamed>`` instead of raising ``IndexError``.
  """
  if name is None:
    aliases = [key for key, value in globals().items() if value is df]
    stable = [key for key in aliases if key != "df" and not key.startswith("_")]
    name = (stable or aliases or ["<unnamed>"])[0]
  is_normal = df['attackType'] == "none"
  print(name)
  print("total: ",df.shape[0])
  print("normal: ",is_normal.sum())
  print("attack: ",(~is_normal).sum())

In [51]:
def custom_summary(dataset_list):
    """Print the ``counter`` report of each frame in ``dataset_list``, each followed by a divider line."""
    divider = "----------------------------------------"
    for frame in dataset_list:
        counter(frame)
        print(divider)

In [52]:
# Per-frame totals for the de-duplicated LITNET data.
# NOTE(review): the recorded output below lists identical totals for icmpFlood_procd and smurf_procd
# (4401200 / 4330170 / 71030) — confirm that the two source CSVs really are different captures.
# The recorded output also stops after scanning_procd, so the spam and synFlood figures are not shown.
custom_summary(litnet_procd)

blasterWorm_procd
total:  3119415
normal:  3095124
attack:  24291
----------------------------------------
fragmentation_procd
total:  1327349
normal:  1326874
attack:  475
----------------------------------------
httpFlood_procd
total:  4108999
normal:  4086040
attack:  22959
----------------------------------------
icmpFlood_procd
total:  4401200
normal:  4330170
attack:  71030
----------------------------------------
reaperWorm_procd
total:  4672898
normal:  4671722
attack:  1176
----------------------------------------
redWorm_procd
total:  5637301
normal:  4381599
attack:  1255702
----------------------------------------
udpFlood_procd
total:  630123
normal:  536540
attack:  93583
----------------------------------------
landAttack_procd
total:  4032754
normal:  3980337
attack:  52417
----------------------------------------
smurf_procd
total:  4401200
normal:  4330170
attack:  71030
----------------------------------------
scanning_procd
total:  6687
normal:  455
attack:  6232
--

In [53]:
# Load the four CIDDS_int_*.csv files (first line of each file is used as the header)
# and drop rows that repeat an earlier row in full right after reading.
CIDDS_int_1=pd.read_csv('CIDDS_int_1.csv')
CIDDS_int_1=CIDDS_int_1[~CIDDS_int_1.duplicated()]

In [54]:
CIDDS_int_2=pd.read_csv('CIDDS_int_2.csv')
CIDDS_int_2=CIDDS_int_2[~CIDDS_int_2.duplicated()]

In [55]:
CIDDS_int_3=pd.read_csv('CIDDS_int_3.csv')
CIDDS_int_3=CIDDS_int_3[~CIDDS_int_3.duplicated()]

In [56]:
CIDDS_int_4=pd.read_csv('CIDDS_int_4.csv')
CIDDS_int_4=CIDDS_int_4[~CIDDS_int_4.duplicated()]

In [57]:
def modifying_cidds_dataset_into_common_format(df):
    """Convert one raw CIDDS frame into the 17-column common layout.

    Drops the Flows, Tos, attackID and attackDescription columns, parses
    ``Date first seen`` as datetimes, replaces the ``'---'`` placeholder in
    ``attackType`` with ``'none'``, expands the ``Flags`` string into one 0/1
    column per flag letter (U, A, P, R, S, F; 1 when the letter occurs in the
    string) and returns the columns in the common order. The input frame is
    not modified.
    """
    common_columns = ["Date first seen","Duration","Proto","Src IP Addr","Src Pt","Dst IP Addr","Dst Pt","Packets","Bytes","U","A","P","R","S","F","class","attackType"]
    unused_columns = ['Flows', 'Tos',"attackID","attackDescription"]

    frame = df.drop(unused_columns, axis=1)
    frame['attackType'] = frame['attackType'].replace('---', 'none')
    frame['Date first seen'] = pd.to_datetime(frame['Date first seen'])

    # One indicator column per flag letter; the letter is bound as a default
    # argument so each lambda tests its own character.
    flag_strings = frame['Flags']
    for letter in ('U', 'A', 'P', 'R', 'S', 'F'):
        frame[letter] = flag_strings.apply(lambda text, wanted=letter: int(wanted in text))

    frame = frame.drop('Flags', axis=1)
    return frame.reindex(columns=common_columns)

In [58]:
# Convert the four internal CIDDS frames to the common column layout.
CIDDS_int_1_procd=modifying_cidds_dataset_into_common_format(CIDDS_int_1)

In [59]:
CIDDS_int_2_procd=modifying_cidds_dataset_into_common_format(CIDDS_int_2)

In [60]:
CIDDS_int_3_procd=modifying_cidds_dataset_into_common_format(CIDDS_int_3)

In [61]:
CIDDS_int_4_procd=modifying_cidds_dataset_into_common_format(CIDDS_int_4)

In [62]:
# Processed internal frames in file order 1..4.
cidds_procd=[CIDDS_int_1_procd,CIDDS_int_2_procd,CIDDS_int_3_procd,CIDDS_int_4_procd]

In [63]:
# One True/False line per processed internal CIDDS frame, in list order.
# Note: after this top-level loop the global name `df` is bound to CIDDS_int_4_procd.
for df in cidds_procd:
    check_for_null(df)

False
False
False
False


In [64]:
# The raw frames were de-duplicated right after loading; this checks the converted frames again.
for df in cidds_procd:
    check_for_duplicates(df)

False
False
False
False


In [65]:
def counter_cidds(df, name=None):
  """Print the name and the per-class row counts of one processed CIDDS frame.

  ``class`` values ``attacker``, ``victim`` and ``suspicious`` are summed as
  "attack"; ``normal`` and ``unknown`` are reported separately.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with a ``class`` column.
  name : str, optional
      Label to print. When omitted, the label is looked up by object identity
      in ``globals()``. Transient aliases are skipped if a better name exists:
      the ``df`` left behind by a top-level ``for df in ...`` loop (which made
      the original print ``df`` for the last frame of a list) and IPython's
      underscore-prefixed output caches. A frame with no module-level name is
      reported as ``<unnamed>`` instead of raising ``IndexError``.
  """
  if name is None:
    aliases = [key for key, value in globals().items() if value is df]
    stable = [key for key in aliases if key != "df" and not key.startswith("_")]
    name = (stable or aliases or ["<unnamed>"])[0]
  labels = df['class']
  print(name)
  print("total: ",df.shape[0])
  print("normal: ",(labels =="normal").sum())
  print("attack: ",labels.isin(["attacker", "victim", "suspicious"]).sum())
  print("unknown: ",(labels =="unknown").sum())

In [66]:
def custom_summary_cidds(dataset_list):
    """Print the ``counter_cidds`` report of each frame in ``dataset_list``, each followed by a divider line."""
    divider = "----------------------------------------"
    for frame in dataset_list:
        counter_cidds(frame)
        print(divider)

In [67]:
# Per-frame class totals for the internal CIDDS data.
# NOTE(review): the recorded output labels the fourth frame `df` rather than CIDDS_int_4_procd,
# because the name lookup in counter_cidds returns the first global bound to the frame and the
# loop variable `df` from the preceding cells still points at it.
custom_summary_cidds(cidds_procd)

CIDDS_int_1_procd
total:  8248753
normal:  7010112
attack:  1238641
unknown:  0
----------------------------------------
CIDDS_int_2_procd
total:  10036995
normal:  8514291
attack:  1522704
unknown:  0
----------------------------------------
CIDDS_int_3_procd
total:  6347711
normal:  6347711
attack:  0
unknown:  0
----------------------------------------
df
total:  6173941
normal:  6173941
attack:  0
unknown:  0
----------------------------------------


In [68]:
# Load the four CIDDS-001 external weekly files and drop fully repeated rows right after reading.
CIDDS_ext_1=pd.read_csv('CIDDS-001-external-week1.csv')
CIDDS_ext_1=CIDDS_ext_1[~CIDDS_ext_1.duplicated()]

In [69]:
CIDDS_ext_2=pd.read_csv('CIDDS-001-external-week2.csv')
CIDDS_ext_2=CIDDS_ext_2[~CIDDS_ext_2.duplicated()]

In [70]:
CIDDS_ext_3=pd.read_csv('CIDDS-001-external-week3.csv')
CIDDS_ext_3=CIDDS_ext_3[~CIDDS_ext_3.duplicated()]

In [71]:
CIDDS_ext_4=pd.read_csv('CIDDS-001-external-week4.csv')
CIDDS_ext_4=CIDDS_ext_4[~CIDDS_ext_4.duplicated()]

In [72]:
# Convert the external frames with the same function used for the internal CIDDS files.
CIDDS_ext_1_procd=modifying_cidds_dataset_into_common_format(CIDDS_ext_1)

In [73]:
CIDDS_ext_2_procd=modifying_cidds_dataset_into_common_format(CIDDS_ext_2)

In [74]:
CIDDS_ext_3_procd=modifying_cidds_dataset_into_common_format(CIDDS_ext_3)

In [75]:
CIDDS_ext_4_procd=modifying_cidds_dataset_into_common_format(CIDDS_ext_4)

In [76]:
# Processed external frames in week order 1..4.
cidds_ext_procd=[CIDDS_ext_1_procd,CIDDS_ext_2_procd,CIDDS_ext_3_procd,CIDDS_ext_4_procd]

In [77]:
# One True/False line per processed external CIDDS frame, in list order.
# Note: after this top-level loop the global name `df` is bound to CIDDS_ext_4_procd.
for df in cidds_ext_procd:
    check_for_null(df)

False
False
False
False


In [78]:
# The raw frames were de-duplicated right after loading; this checks the converted frames again.
for df in cidds_ext_procd:
    check_for_duplicates(df)

False
False
False
False


In [79]:
# Per-frame class totals for the external CIDDS data.
# NOTE(review): as with the internal files, the recorded output labels the fourth frame `df`
# instead of CIDDS_ext_4_procd (stale loop variable picked up by the name lookup).
custom_summary_cidds(cidds_ext_procd)

CIDDS_ext_1_procd
total:  172838
normal:  49606
attack:  107344
unknown:  15888
----------------------------------------
CIDDS_ext_2_procd
total:  159373
normal:  28436
attack:  121366
unknown:  9571
----------------------------------------
CIDDS_ext_3_procd
total:  153026
normal:  6180
attack:  113009
unknown:  33837
----------------------------------------
df
total:  186004
normal:  50018
attack:  117359
unknown:  18627
----------------------------------------


In [84]:
# Write every processed frame to ./PreprocessedCSVs as CSV, without the index column.
# NOTE(review): the target directory must already exist — `to_csv` does not create missing directories.
blasterWorm_procd.to_csv('./PreprocessedCSVs/blasterWormProcessed.csv', index=False)

In [86]:
fragmentation_procd.to_csv('./PreprocessedCSVs/fragmentationProcessed.csv', index=False)

In [87]:
httpFlood_procd.to_csv('./PreprocessedCSVs/httpFloodProcessed.csv', index=False)

In [88]:
icmpFlood_procd.to_csv('./PreprocessedCSVs/icmpFloodProcessed.csv', index=False)

In [89]:
reaperWorm_procd.to_csv('./PreprocessedCSVs/reaperWormProcessed.csv', index=False)

In [90]:
redWorm_procd.to_csv('./PreprocessedCSVs/redWormProcessed.csv', index=False)

In [91]:
udpFlood_procd.to_csv('./PreprocessedCSVs/udpFloodProcessed.csv', index=False)

In [92]:
landAttack_procd.to_csv('./PreprocessedCSVs/landAttackProcessed.csv', index=False)

In [93]:
smurf_procd.to_csv('./PreprocessedCSVs/smurfProcessed.csv', index=False)

In [94]:
scanning_procd.to_csv('./PreprocessedCSVs/scanningProcessed.csv', index=False)

In [95]:
spam_procd.to_csv('./PreprocessedCSVs/spamProcessed.csv', index=False)

In [96]:
synFlood_procd.to_csv('./PreprocessedCSVs/synFloodProcessed.csv', index=False)

In [97]:
# Internal CIDDS frames.
CIDDS_int_1_procd.to_csv('./PreprocessedCSVs/ciddsint1Processed.csv', index=False)

In [98]:
CIDDS_int_2_procd.to_csv('./PreprocessedCSVs/ciddsint2Processed.csv', index=False)

In [99]:
CIDDS_int_3_procd.to_csv('./PreprocessedCSVs/ciddsint3Processed.csv', index=False)

In [100]:
CIDDS_int_4_procd.to_csv('./PreprocessedCSVs/ciddsint4Processed.csv', index=False)

In [101]:
# External CIDDS frames.
CIDDS_ext_1_procd.to_csv('./PreprocessedCSVs/ciddslogext1Processed.csv', index=False)

In [102]:
CIDDS_ext_2_procd.to_csv('./PreprocessedCSVs/ciddslogext2Processed.csv', index=False)

In [103]:
CIDDS_ext_3_procd.to_csv('./PreprocessedCSVs/ciddslogext3Processed.csv', index=False)

In [104]:
CIDDS_ext_4_procd.to_csv('./PreprocessedCSVs/ciddslogext4Processed.csv', index=False)

In [110]:
# Sanity check: stack frames from different sources row-wise and compare the result's shape with the
# sum of the inputs' row counts. A column count of 17 means the column layouts line up.
# Pair 1: LITNET + internal CIDDS.
concattestout= pd.concat([blasterWorm_procd, CIDDS_int_1_procd], axis=0)

In [111]:
concattestout.shape

(11368168, 17)

In [113]:
blasterWorm_procd.shape[0]+CIDDS_int_1_procd.shape[0]

11368168

In [114]:
# Pair 2: LITNET + external CIDDS.
concattestout2= pd.concat([blasterWorm_procd, CIDDS_ext_1_procd], axis=0)

In [115]:
concattestout2.shape

(3292253, 17)

In [116]:
blasterWorm_procd.shape[0]+CIDDS_ext_1_procd.shape[0]

3292253

In [117]:
# Pair 3: external CIDDS + internal CIDDS.
concattestout3= pd.concat([CIDDS_ext_1_procd, CIDDS_int_1_procd], axis=0)

In [119]:
concattestout3.shape

(8421591, 17)

In [120]:
CIDDS_int_1_procd.shape[0]+CIDDS_ext_1_procd.shape[0]

8421591

In [7]:
# Read one exported file back to inspect what was written.
# NOTE(review): the recorded output below contains an `Unnamed: 0` column, which a file written with
# `index=False` would not produce. The execution counts of these cells (7, 8) are also lower than those
# of the export cells. Presumably the file on disk came from an earlier run that wrote the index —
# re-run the export and this check on a fresh kernel to confirm.
blasterWormFinalTest = pd.read_csv('./PreprocessedCSVs/blasterWormProcessed.csv')

In [8]:
blasterWormFinalTest.head()

Unnamed: 0,Date first seen,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,U,A,P,R,S,F,class,attackType
0,2020-01-25 03:44:57,0.28,TCP,193.219.74.172,59758,83.169.5.121,3306,10,1080,0,1,1,0,0,1,normal,none
1,2020-01-25 03:44:52,4.73,TCP,184.73.156.247,443,193.219.75.85,58895,10,7300,0,1,0,0,0,1,normal,none
2,2020-01-25 03:44:57,0.0,TCP,83.171.40.3,6666,80.82.77.132,56667,5,200,0,1,0,1,0,0,normal,none
3,2020-01-25 03:44:38,19.29,TCP,95.108.213.17,62443,158.129.192.240,80,20,1080,0,1,0,0,1,1,normal,none
4,2020-01-25 03:44:55,1.82,TCP,124.115.207.226,56637,158.129.192.178,33896,10,825,0,1,1,1,0,0,normal,none
