In [2]:
import os
import pandas as pd
import numpy as np
import pprint
from IPython.display import Markdown, display
from datetime import datetime
from datetime import timezone
import pickle

##local python file holding the paths to the directories I store the log files in
from directories_to_use import argus_text_files_dir, getTestingDir, getRadiflowNormal, getRadiflowAttack

In [33]:
def readDataIntoDataframe(argus_text_files_dir):
    """
    Read Argus text files into one single dataframe.

    Every entry of the directory is parsed with ``pd.read_csv``. The date
    prefix derived from the file name (see ``removeDateFromName``) is
    prepended to each ``StartTime`` and ``LastTime`` value before the
    per-file frames are concatenated.

    Parameters:
        argus_text_files_dir: directory holding the files; a trailing path
            separator is optional.

    Returns: one dataframe with the rows of all files and a fresh 0..n-1 index.

    Raises:
        ValueError: if the directory contains no files.
    """
    frames = []
    # os.listdir gives no ordering guarantee; sort so the row order is reproducible.
    for t_file in sorted(os.listdir(argus_text_files_dir)):
        #print(t_file)
        # os.path.join also works when the directory has no trailing separator.
        frame = pd.read_csv(os.path.join(argus_text_files_dir, t_file))
        date = removeDateFromName(t_file)
        for time_col in ("StartTime", "LastTime"):
            frame[time_col] = frame[time_col].apply(lambda x : date + x)
        frames.append(frame)
    if not frames:
        raise ValueError("No files found in directory: " + str(argus_text_files_dir))
    # A single concat at the end avoids re-copying the accumulated frame for every file.
    return pd.concat(frames, ignore_index=True)

def removeDateFromName(filename):
    """
    Build a "DD-MM-YYYY " prefix from the part of the file name after the last "_".

    The first eight characters of that part are read as YYYYMMDD; no
    validation is done, so other names yield whatever characters sit in
    those positions. The result always ends with a single space.
    """
    stamp = filename.split("_")[-1]
    year, month, day = stamp[0:4], stamp[4:6], stamp[6:8]
    return "-".join((day, month, year)) + " "

# Earlier run against the directory imported as argus_text_files_dir, kept for reference.
#data2017_df = readDataIntoDataframe(argus_text_files_dir)
#print(data2017_df.shape)

# Load the two Radiflow directories (paths come from the local directories_to_use module)
# into one dataframe each.
data_radiflow_normal_df = readDataIntoDataframe(getRadiflowNormal())
data_radiflow_attack_df = readDataIntoDataframe(getRadiflowAttack())

In [34]:
def removeNanColumns(dataframe):
    """
    Loops through all the columns in the dataframe and removes any that only contain nans.

    Prints the shape before and after, plus the names of the removed columns.
    The input frame is not modified.

    Returns: the dataframe without its all-NaN columns (the same object when
    nothing had to be removed).
    """
    print("Shape before: ", dataframe.shape)
    has_rows = len(dataframe) > 0
    # isna() works for every dtype; np.isnan raised TypeError on string values.
    columns_removed = [
        col for col in dataframe.columns
        if has_rows and dataframe[col].isna().all()
    ]
    if columns_removed:
        dataframe = dataframe.drop(columns=columns_removed)
    print("Shape after: ", dataframe.shape)
    print("Columns with only Nans: " + str(columns_removed))
    return dataframe

def removeSingleColumns(dataframe):
    """
    Remove columns that only contain one unique value.

    NaN counts as a value, so an all-NaN column is removed as well. Prints the
    shape before and after, plus the names of the removed columns.

    Returns: the reduced dataframe (the same object when nothing was removed).
    """
    print("Shape before: ", dataframe.shape)
    columns_removed = [
        name for name in dataframe.columns
        if len(dataframe[name].unique()) == 1
    ]
    if columns_removed:
        dataframe = dataframe.drop(columns_removed, axis=1)
    print("Shape after: ", dataframe.shape)
    print("Columns with only one unique value: " + str(columns_removed))
    return dataframe
def printColumnsAndUniqueVals(dataframe):
    """
    Prints the columns in a dataframe and all its unique values.

    Each column is rendered through printmd as one Markdown line: the column
    name in bold followed by the array of its unique values.
    """
    for col in dataframe.columns:
        # str(col) keeps non-string column labels (e.g. integers) from breaking the concatenation.
        printmd("**" + str(col) + "**: " + str(dataframe[col].unique()))
        
def printmd(string):
    """
    Show ``string`` in the cell output rendered as Markdown (HTML embedded in
    the text is rendered too).
    """
    rendered = Markdown(string)
    display(rendered)


# Prune uninformative columns from both captures: first the all-NaN columns,
# then every column holding a single unique value. Each name is rebound to the
# pruned frame, so re-running this cell works on the already-pruned data.
data_radiflow_normal_df = removeNanColumns(data_radiflow_normal_df)
data_radiflow_normal_df = removeSingleColumns(data_radiflow_normal_df)
data_radiflow_attack_df = removeNanColumns(data_radiflow_attack_df)
data_radiflow_attack_df = removeSingleColumns(data_radiflow_attack_df)
            

Shape before:  (31253, 118)
Shape after:  (31253, 73)
Columns with only Nans: ['SrcMac', 'DstMac', 'SrcOui', 'DstOui', 'sCo', 'dCo', 'sMpls', 'dMpls', 'sAS', 'dAS', 'iAS', 'NStrok', 'sNStrok', 'dNStrok', 'SIntPkt', 'SIntDist', 'SIntPktAct', 'SIntActDist', 'SIntPktIdl', 'SIntIdlDist', 'DIntPkt', 'DIntDist', 'DIntPktAct', 'DIntActDist', 'DIntPktIdl', 'DIntIdlDist', 'SrcJitter', 'SrcJitAct', 'DstJitter', 'DstJitAct', 'Label', 'srcUdata', 'dstUdata', 'sVpri', 'dVpri', 'SRange', 'ERange', 'Inode', 'sPktSz', 'sMaxPktSz', 'dPktSz', 'dMaxPktSz', 'sMinPktSz', 'dMinPktSz', 'dMinPktSz.1']
Shape before:  (31253, 73)
Shape after:  (31253, 61)
Columns with only one unique value: ['Trans', 'IdleTime', 'StdDev', 'AutoId', 'TotAppByte', 'SAppBytes', 'DAppBytes', 'PCRatio', 'Retrans', 'SrcRetra', 'DstRetra', 'pRetran']
Shape before:  (10328, 118)
Shape after:  (10328, 74)
Columns with only Nans: ['SrcMac', 'DstMac', 'SrcOui', 'DstOui', 'sCo', 'dCo', 'sMpls', 'dMpls', 'sAS', 'dAS', 'iAS', 'NStrok', 'sNSt

In [18]:
"""Print columns and the percentage of nans present in each column"""
# Raise the row display limit so the per-column listing is not truncated.
pd.options.display.max_rows = 4000
# isnull().mean() is the fraction of missing values per column; * 100 turns it into a percentage.
data_radiflow_normal_df.isnull().mean() * 100

StartTime     0.0
LastTime      0.0
Flgs          0.0
Dur           0.0
RunTime       0.0
Mean          0.0
Sum           0.0
Min           0.0
Max           0.0
SrcAddr       0.0
DstAddr       0.0
Proto         0.0
Sport         0.0
Dport         0.0
Cause         0.0
TotPkts       0.0
SrcPkts       0.0
DstPkts       0.0
TotBytes      0.0
SrcBytes      0.0
DstBytes      0.0
Load          0.0
SrcLoad       0.0
DstLoad       0.0
Loss          0.0
SrcLoss       0.0
DstLoss       0.0
pLoss         0.0
Rate          0.0
SrcRate       0.0
DstRate       0.0
Dir           0.0
State         0.0
TcpRtt        0.0
SynAck        0.0
AckDat        0.0
Offset        0.0
sMeanPktSz    0.0
dMeanPktSz    0.0
dtype: float64

In [35]:
def dropChosenColumns(dataframe, column_names):
    """
    Drop the named columns from the dataframe.

    Names that are not present are skipped silently. The input frame is not
    modified.

    Returns: a dataframe without the chosen columns (the same object when none
    of the names is present).
    """
    # dict.fromkeys removes repeated names while keeping their order.
    present = [name for name in dict.fromkeys(column_names) if name in dataframe]
    if not present:
        return dataframe
    return dataframe.drop(present, axis=1)
        
def replaceNansWithZero(dataframe, column_names):
    """
    Replace Nans with 0 in the chosen columns.

    The columns are overwritten on the dataframe that was passed in; that same
    object is returned. A name that is not a column raises KeyError.
    """
    for name in column_names:
        zero_filled = dataframe[name].fillna(0)
        dataframe[name] = zero_filled
    return dataframe

def removeNanRows(dataframe, column_names):
    """
    Remove rows with nans for chosen columns.

    A row is kept only when every chosen column holds a non-missing value.
    The input frame is not modified.

    Returns: the filtered dataframe (the same object when no column is chosen).
    """
    required = list(column_names)
    if not required:
        return dataframe
    # One combined mask is the AND of the per-column notna() filters.
    complete = dataframe[required].notna().all(axis=1)
    return dataframe[complete]


# "rowless" variant: also removes rows where sTtl is missing. Its drop list
# omits sDSb, sTos, sTtl and sHops, which the list further down includes.
data_radiflow_normal_rowless = removeNanRows(data_radiflow_normal_df, ['Sport', 'Dport', "sTtl"])
data_radiflow_normal_rowless = dropChosenColumns(data_radiflow_normal_rowless, ["TcpOpt", "sVid", "dVid", "Seq", "sIpId", "dIpId", "SrcTCPBase", "DstTCPBase", "SrcWin", "DstWin", "dDSb", "dTos", "dHops", "SrcGap", "DstGap", "dTtl", "sVlan", "dVlan"])
print(data_radiflow_normal_rowless.shape)



# Main variant: drops the longer column list and only removes rows lacking a
# source or destination port. The name is rebound here, so the rowless variant
# above has to be derived first.
data_radiflow_normal_df = dropChosenColumns(data_radiflow_normal_df, ["TcpOpt", "sVid", "dVid", "Seq", "sIpId", "dIpId", "SrcTCPBase", "DstTCPBase", "SrcWin", "DstWin", "sDSb", "dDSb", "dTos", "sTos", "dHops", "SrcGap", "DstGap", "sTtl", "dTtl", "sHops", "sVlan", "dVlan"])
data_radiflow_normal_df = removeNanRows(data_radiflow_normal_df, ['Sport', 'Dport'])
print(data_radiflow_normal_df.shape)




(24884, 43)
(30837, 39)


In [36]:
def convertToNum(category, val, unique_vals):
    """
    Translate a categorical string into an integer code.

    Needs to know the category e.g. IPs and the value to look up in the dict
    of dicts (unique_vals: category -> stripped value -> code). unique_vals is
    updated in place: an unseen category gets a new inner dict, and an unseen
    value gets the next free code, i.e. the current size of that inner dict.

    Returns: the unique int assigned to the value
    """
    codes = unique_vals.setdefault(category, {})
    key = val.strip()
    if key in codes:
        return codes[key]
    codes[key] = len(codes)
    return codes[key]
    
def convertPortToNum(val):
    """
    Convert a port value to an int.

    Port numbers are a special case. They're integers stored as strings.
    They are either hex numbers or standard int strings - therefore, we
    need to check for hex before casting. Non-string values (Python or
    numpy ints and floats) are passed straight to int().

    Returns: the port as an int
    Raises: ValueError if the value cannot be interpreted as an integer
    """
    # isinstance (rather than an exact type comparison) means every non-string
    # input, including numpy scalars, reaches the plain int() cast below.
    if isinstance(val, str) and "0x" in val.lower():
        return int(val, base=16)
    return int(val)
    
    

def createTimestamp(datetime_string):
    """
    Parse a date-time string and return its POSIX timestamp.

    Takes a string in the following format:
    14-06-2017 11:25:58.288831
    The parsed value is interpreted as UTC.

    Returns: Timestamp (float seconds since the epoch)
    """
    parsed = datetime.strptime(datetime_string, "%d-%m-%Y %H:%M:%S.%f")
    as_utc = parsed.replace(tzinfo=timezone.utc)
    return as_utc.timestamp()

# Shared registry of categorical encodings, filled by convertToNum:
# {category: {stripped value: int code}}.
unique_vals = dict()




In [88]:
# Encode the Dir strings of both normal variants as integer codes kept in unique_vals["dir"].
# convertToNum calls .strip() on each value, so running this cell again on the
# already-encoded integers raises AttributeError.
# NOTE(review): this cell's execution count (88) is higher than that of the
# cells that follow it - confirm the notebook still runs top to bottom.
data_radiflow_normal_df["Dir"] = data_radiflow_normal_df["Dir"].apply(lambda x : convertToNum("dir", x, unique_vals))
data_radiflow_normal_rowless["Dir"] = data_radiflow_normal_rowless["Dir"].apply(lambda x : convertToNum("dir", x, unique_vals))

In [37]:
# Apply the same encoding to both variants of the normal capture. The frame and
# column order is kept as-is because convertToNum hands out codes in first-seen order.
for normal_frame in (data_radiflow_normal_df, data_radiflow_normal_rowless):
    for time_col in ("StartTime", "LastTime"):
        # Keep only what follows ".c" in the stored string and prepend the hard-coded date.
        normal_frame[time_col] = normal_frame[time_col].apply(lambda x : createTimestamp("24-06-2021 " + x.split(".c")[1].strip()))
    for addr_col in ("SrcAddr", "DstAddr"):
        normal_frame[addr_col] = normal_frame[addr_col].apply(lambda x : convertToNum("ips", x, unique_vals))
    for port_col in ("Sport", "Dport"):
        normal_frame[port_col] = normal_frame[port_col].apply(convertPortToNum)

In [38]:
def alignToTrainingData(train_df, test_df):
    """
    Drop every column of test_df that train_df does not have.

    Columns that exist only in train_df are not added and the column order of
    test_df is kept. The input frames are not modified.

    Returns: the reduced test dataframe (the same object when it has no extra
    columns).
    """
    known = train_df.columns
    extra = [col for col in test_df.columns if col not in known]
    if not extra:
        return test_df
    return test_df.drop(extra, axis=1)

#data2019_mod_df = data2019_df.copy(deep=True)
# Rowless attack variant: keep only the attack columns that also exist in the
# rowless normal frame. This has to run before data_radiflow_attack_df is
# rebound below, because it needs the wider, un-aligned attack frame.
data_radiflow_attack_rowless = alignToTrainingData(data_radiflow_normal_rowless, data_radiflow_attack_df)
print(data_radiflow_attack_rowless.shape)

# Main attack variant: keep only the columns that also exist in the main normal frame.
data_radiflow_attack_df = alignToTrainingData(data_radiflow_normal_df, data_radiflow_attack_df)
print(data_radiflow_attack_df.shape)

(10328, 43)
(10328, 39)


In [39]:
# Same row filters as for the normal capture: the rowless variant also requires sTtl.
data_radiflow_attack_rowless = removeNanRows(data_radiflow_attack_rowless, ['Sport', 'Dport', "sTtl"])
data_radiflow_attack_df = removeNanRows(data_radiflow_attack_df, ['Sport', 'Dport'])

In [40]:
# Percentage of missing values per column of the attack frame.
data_radiflow_attack_df.isnull().mean() * 100

StartTime     0.0
LastTime      0.0
Flgs          0.0
Dur           0.0
RunTime       0.0
Mean          0.0
Sum           0.0
Min           0.0
Max           0.0
SrcAddr       0.0
DstAddr       0.0
Proto         0.0
Sport         0.0
Dport         0.0
Cause         0.0
TotPkts       0.0
SrcPkts       0.0
DstPkts       0.0
TotBytes      0.0
SrcBytes      0.0
DstBytes      0.0
Load          0.0
SrcLoad       0.0
DstLoad       0.0
Loss          0.0
SrcLoss       0.0
DstLoss       0.0
pLoss         0.0
Rate          0.0
SrcRate       0.0
DstRate       0.0
Dir           0.0
State         0.0
TcpRtt        0.0
SynAck        0.0
AckDat        0.0
Offset        0.0
sMeanPktSz    0.0
dMeanPktSz    0.0
dtype: float64

In [41]:
# Apply the same encoding to both variants of the attack capture. The frame and
# column order is kept as-is because convertToNum hands out codes in first-seen order.
for attack_frame in (data_radiflow_attack_df, data_radiflow_attack_rowless):
    for time_col in ("StartTime", "LastTime"):
        # Keep only what follows ".c" in the stored string and prepend the hard-coded date.
        attack_frame[time_col] = attack_frame[time_col].apply(lambda x : createTimestamp("23-06-2021 " + x.split(".c")[1].strip()))
    for addr_col in ("SrcAddr", "DstAddr"):
        attack_frame[addr_col] = attack_frame[addr_col].apply(lambda x : convertToNum("ips", x, unique_vals))
    for port_col in ("Sport", "Dport"):
        attack_frame[port_col] = attack_frame[port_col].apply(convertPortToNum)

In [89]:
# Encode the Dir strings of both attack variants with the codes kept in unique_vals["dir"].
# convertToNum calls .strip() on each value, so running this cell again on the
# already-encoded integers raises AttributeError.
data_radiflow_attack_df["Dir"] = data_radiflow_attack_df["Dir"].apply(lambda x : convertToNum("dir", x, unique_vals))
data_radiflow_attack_rowless["Dir"] = data_radiflow_attack_rowless["Dir"].apply(lambda x : convertToNum("dir", x, unique_vals))

In [44]:
# Give every row of all four frames the default label 0.
for labelled_frame in (
    data_radiflow_attack_df,
    data_radiflow_attack_rowless,
    data_radiflow_normal_df,
    data_radiflow_normal_rowless,
):
    labelled_frame["Classification"] = 0

In [57]:
# Label rows by the time window their StartTime falls into
# (window start inclusive, window end exclusive).
attack_start_times = data_radiflow_attack_df["StartTime"]

first_attack = attack_start_times.ge(createTimestamp("23-06-2021 12:45:00.00")) & attack_start_times.lt(createTimestamp("23-06-2021 12:52:00.00"))
data_radiflow_attack_df.loc[first_attack, "Classification"] = 1

second_attack = attack_start_times.ge(createTimestamp("23-06-2021 12:56:00.00")) & attack_start_times.lt(createTimestamp("23-06-2021 13:02:00.00"))
data_radiflow_attack_df.loc[second_attack, "Classification"] = 2

third_attack = attack_start_times.ge(createTimestamp("23-06-2021 13:06:00.00")) & attack_start_times.lt(createTimestamp("23-06-2021 13:12:00.00"))
data_radiflow_attack_df.loc[third_attack, "Classification"] = 3

In [58]:
# Label rows of the rowless variant by the time window their StartTime falls into
# (window start inclusive, window end exclusive).
rowless_start_times = data_radiflow_attack_rowless["StartTime"]

first_attack = rowless_start_times.ge(createTimestamp("23-06-2021 12:45:00.00")) & rowless_start_times.lt(createTimestamp("23-06-2021 12:52:00.00"))
data_radiflow_attack_rowless.loc[first_attack, "Classification"] = 1

second_attack = rowless_start_times.ge(createTimestamp("23-06-2021 12:56:00.00")) & rowless_start_times.lt(createTimestamp("23-06-2021 13:02:00.00"))
data_radiflow_attack_rowless.loc[second_attack, "Classification"] = 2

third_attack = rowless_start_times.ge(createTimestamp("23-06-2021 13:06:00.00")) & rowless_start_times.lt(createTimestamp("23-06-2021 13:12:00.00"))
data_radiflow_attack_rowless.loc[third_attack, "Classification"] = 3

In [91]:
# Stack the normal rows first and the attack rows after them; ignore_index
# renumbers the combined rows from 0.
data_full_df = pd.concat([data_radiflow_normal_df, data_radiflow_attack_df], ignore_index=True, sort=False)
data_full_rowless = pd.concat([data_radiflow_normal_rowless, data_radiflow_attack_rowless], ignore_index=True, sort=False)

print(data_full_df.shape)
print(data_full_rowless.shape)

(40980, 40)
(33062, 44)


In [92]:
# Distinct flag strings in first-seen order; filled by add_unique.
unique_flags = []
def add_unique(flags_str):
    """
    Record flags_str in the module-level unique_flags list if it is new.

    Returns: flags_str when it was added, None when it was already recorded.
    """
    if flags_str in unique_flags:
        return None
    unique_flags.append(flags_str)
    return flags_str
    
# Single flag characters in first-seen order; filled by extractIndividualFlags.
individual_flags = []
def extractIndividualFlags(unique_flags):
    """
    Collect the distinct characters used in the given flag strings.

    Every string is split on spaces. Tokens that are empty or already stored
    as a whole in individual_flags are skipped; the characters of the
    remaining tokens are appended to the module-level individual_flags list
    unless they are already present.
    """
    for flag_str in unique_flags:
        for flag in flag_str.split(" "):
            if flag == "" or flag in individual_flags:
                continue
            for char in flag:
                # Checking one character at a time stores a character that is
                # repeated inside a single token only once.
                if char not in individual_flags:
                    individual_flags.append(char)
                
# apply is used only for its side effect of filling unique_flags with the
# stripped Flgs strings; the Series it returns is discarded.
data_full_df["Flgs"].apply(lambda x : add_unique(x.strip()))
print(unique_flags)
extractIndividualFlags(unique_flags)
#Ugly but needs to be done manually. This is also a possible unique value.
# "*2" is the suffix of the Flg-*2 column that convertFlagToNum writes to.
individual_flags.append("*2")
print(individual_flags)

['e', '*', '* d', '* *', '* s', '*  S', '* g', '* dD', '* *D', '*  D', '*U']
['e', '*', 'd', 's', 'S', 'g', 'D', 'U', '*2']


In [93]:
def add_new_columns(df, individual_flags):
    """
    Add one zero-filled "Flg-<flag>" column per entry of individual_flags.

    The columns are added to the dataframe that was passed in; that same
    object is returned.
    """
    for column_name in ("Flg-" + flg for flg in individual_flags):
        df[column_name] = 0
    return df

# add_new_columns adds the columns to the frame it is given, so the return values are not needed here.
add_new_columns(data_full_df, individual_flags)
add_new_columns(data_full_rowless, individual_flags)
print(data_full_df.head())

      StartTime      LastTime        Flgs  Dur  RunTime  Mean  Sum  Min  Max  \
0  1.624540e+09  1.624540e+09   e          0.0      0.0   0.0  0.0  0.0  0.0   
1  1.624540e+09  1.624540e+09   e          0.0      0.0   0.0  0.0  0.0  0.0   
2  1.624540e+09  1.624540e+09   e          0.0      0.0   0.0  0.0  0.0  0.0   
3  1.624540e+09  1.624540e+09   e          0.0      0.0   0.0  0.0  0.0  0.0   
4  1.624540e+09  1.624540e+09   e          0.0      0.0   0.0  0.0  0.0  0.0   

   SrcAddr  ...  Classification  Flg-e  Flg-*  Flg-d Flg-s  Flg-S  Flg-g  \
0        0  ...               0      0      0      0     0      0      0   
1        1  ...               0      0      0      0     0      0      0   
2        2  ...               0      0      0      0     0      0      0   
3        1  ...               0      0      0      0     0      0      0   
4        1  ...               0      0      0      0     0      0      0   

   Flg-D  Flg-U  Flg-*2  
0      0      0       0  
1      0  

In [94]:
def convertFlagToNum(row, flg_str):
    """
    Set the "Flg-..." indicators of a row according to its flag string.

    Spaces are removed from flg_str first. Every character other than "*"
    sets row["Flg-<char>"] to 1. For "*":
      - exactly two "*" in the string set both Flg-* and Flg-*2,
      - otherwise a "*" that opens the string (or is the whole string) sets Flg-*,
      - otherwise Flg-*2 is set.

    Returns: the same row object, modified in place.
    """
    compact = flg_str.replace(" ", "")
    star_count = compact.count("*")
    for flg_char in compact:
        if flg_char != '*':
            row["Flg-" + flg_char] = 1
            continue
        if len(compact) == 1:
            row["Flg-*"] = 1
        elif star_count == 2:
            row["Flg-*"] = 1
            row["Flg-*2"] = 1
        elif compact.index("*") == 0:
            row["Flg-*"] = 1
        else:
            row["Flg-*2"] = 1
    return row
                    
# Row-wise apply (axis=1): each row is passed in together with its own Flgs string
# and comes back with the matching Flg-* indicators set.
data_full_df = data_full_df.apply(lambda x : convertFlagToNum(x, x.Flgs), axis=1)
data_full_rowless = data_full_rowless.apply(lambda x : convertFlagToNum(x, x.Flgs), axis=1)

In [95]:
# The raw flag string is no longer needed now that the Flg-* indicators are set.
data_full_df = data_full_df.drop(columns=["Flgs"])
data_full_rowless = data_full_rowless.drop(columns=["Flgs"])

In [96]:
# One-hot encode Cause: the indicator columns are named after the raw values
# (no prefix) and replace the original column.
cause_full = pd.get_dummies(data_full_df["Cause"])
data_full_df = data_full_df.drop(columns=["Cause"]).join(cause_full)

In [97]:
# One-hot encode Cause for the rowless variant: the indicator columns are named
# after the raw values (no prefix) and replace the original column.
cause_full = pd.get_dummies(data_full_rowless["Cause"])
data_full_rowless = data_full_rowless.drop(columns=["Cause"]).join(cause_full)

In [98]:
# One-hot encode State: the indicator columns are named after the raw values
# (no prefix) and replace the original column.
state_full = pd.get_dummies(data_full_df["State"])
data_full_df = data_full_df.drop(columns=["State"]).join(state_full)

In [99]:
# One-hot encode State for the rowless variant: the indicator columns are named
# after the raw values (no prefix) and replace the original column.
state_full = pd.get_dummies(data_full_rowless["State"])
data_full_rowless = data_full_rowless.drop(columns=["State"]).join(state_full)

In [100]:
# Move the Classification label to the last column.
cols = data_full_df.columns
cols_list = [name for name in cols if name != "Classification"]
cols_list.append("Classification")
data_full_df = data_full_df[cols_list]

In [101]:
# Move the Classification label to the last column of the rowless variant.
cols = data_full_rowless.columns
cols_list = [name for name in cols if name != "Classification"]
cols_list.append("Classification")
data_full_rowless = data_full_rowless[cols_list]

In [102]:
# Write the main dataset to the working directory; index=False leaves out the row index.
data_full_df.to_csv("data_full_radiflow_df.csv", index=False)

In [103]:
# Write the rowless dataset to the working directory; index=False leaves out the row index.
data_full_rowless.to_csv("data_full_rowless_radiflow_df.csv", index=False)

In [104]:
"""
This code is taken straight from: https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file/32216025
"""

def save_obj(obj, name ):
    """
    Pickle obj to 'obj/<name>.pkl' (relative to the working directory),
    using the highest pickle protocol. An existing file is overwritten.
    """
    # Create the target folder on first use instead of failing with FileNotFoundError.
    os.makedirs('obj', exist_ok=True)
    with open('obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """
    Load and return the object pickled in 'obj/<name>.pkl' (relative to the
    working directory).

    Unpickling can execute arbitrary code - only load files you created yourself.
    """
    pickle_path = 'obj/' + name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [105]:
# Persist the category -> value -> code registry as obj/unique_vals_radiflow.pkl.
save_obj(unique_vals, "unique_vals_radiflow")

In [106]:
import json
# Invert the ip -> code mapping so that codes can be translated back to addresses.
ip_dict_reversed = {num: ip for ip, num in unique_vals["ips"].items()}

# JSON object keys are always strings, so the integer codes are written as text.
with open('ipdict_radiflow.json', 'w+') as fp:
    json.dump(ip_dict_reversed, fp)

In [107]:
# Store the column names of both datasets via save_obj (pickled under obj/).
features = list(data_full_df.columns)
save_obj(features, 'radiflow_features')

# list(dataframe) yields the column labels, same as list(dataframe.columns) above.
features_rowless = list(data_full_rowless)
save_obj(features_rowless, 'radiflow_rowless_features')

In [108]:
# Convert both datasets to plain numpy arrays (the column names are not carried over).
# NOTE(review): no visible cell encodes Proto numerically - if it is still
# string-typed, these arrays end up with object dtype; confirm.
data_full_arr = data_full_df.to_numpy()
data_full_rowless_arr = data_full_rowless.to_numpy()

In [112]:
# Write both arrays as .npy files into the working directory.
np.save('radiflow_datafull.npy', data_full_arr)
np.save('radiflow_rowless.npy', data_full_rowless_arr)

In [117]:
# Display the two timestamp columns and dMeanPktSz of the final dataset.
data_full_df[["StartTime", "LastTime", "dMeanPktSz"]]

Unnamed: 0,StartTime,LastTime,dMeanPktSz
0,1.624540e+09,1.624540e+09,0.000000
1,1.624540e+09,1.624540e+09,0.000000
2,1.624540e+09,1.624540e+09,0.000000
3,1.624540e+09,1.624540e+09,0.000000
4,1.624540e+09,1.624540e+09,0.000000
...,...,...,...
40975,1.624454e+09,1.624454e+09,96.250000
40976,1.624454e+09,1.624454e+09,96.250000
40977,1.624454e+09,1.624454e+09,401.600006
40978,1.624454e+09,1.624454e+09,66.000000
