In [1]:
from fastai import *
from fastai.tabular import *

In [29]:
from google.colab import drive
# Mount Google Drive in the Colab runtime at /content/drive/.
drive.mount('/content/drive/')

# Root data directory used by every later cell: inputs are read from its
# 'source/' subfolder, outputs go to 'feature-engineered/' and 'train.csv'.
# Change this path for your project
path = "/content/drive/My Drive/FastAI/DoH-Exfiltration-Detection/data/"

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Labeling

In [21]:
def label(df, target, OS, DoH, DoH_label, Software, Session):
  """Annotate a packet table with target, direction, host and traffic-type labels.

  The function works on a copy, so the DataFrame passed in is left untouched.

  Args:
    df: Table with at least 'Source', 'Destination' and 'Protocol' columns.
    target: Address of the monitored host. Every occurrence in 'Source' or
      'Destination' is replaced by the literal string 'Target'.
    OS: Value written to 'OS' on rows sent by the target.
    DoH: Address compared against 'Source' and 'Destination'; matching rows
      get `DoH_label` as their 'Comm_Type', overriding the protocol-based label.
    DoH_label: 'Comm_Type' value for rows matching `DoH`
      (the notebook uses 'DoH' and 'Exfiltration').
    Software: Value written to 'Software' on rows sent by the target.
    Session: Identifier stored in 'Session' on every row.

  Returns:
    A new DataFrame with the columns 'From_Target', 'OS', 'Software',
    'Comm_Type' and 'Session' added, holding only the rows whose 'Comm_Type'
    is not 'NA' (i.e. DNS, one of the browsing protocols, or `DoH` traffic).
  """
  browsing_protocols = ['HTTP', 'HTTP/XML', 'ICMP', 'SSL', 'TCP', 'TLSv1.2', 'TLSv1.3', 'UDP']

  # Never write into the caller's frame: re-running the cell must not depend
  # on what a previous call left behind.
  df = df.copy()

  df.loc[df['Source'] == target, 'Source'] = 'Target'
  df.loc[df['Destination'] == target, 'Destination'] = 'Target'

  # Label if it's inbound or outbound
  from_target = df['Source'] == 'Target'
  df['From_Target'] = from_target

  # Label os / software (only known for packets the target itself sent)
  df.loc[from_target, 'OS'] = OS
  df.loc[from_target, 'Software'] = Software

  # Label comm type; later assignments win over earlier ones
  df['Comm_Type'] = 'NA'
  df.loc[df['Protocol'] == 'DNS', 'Comm_Type'] = 'DNS'
  df.loc[df['Protocol'].isin(browsing_protocols), 'Comm_Type'] = 'Browsing'
  df.loc[(df['Source'] == DoH) | (df['Destination'] == DoH), 'Comm_Type'] = DoH_label

  df['Session'] = Session

  # Return an independent frame so later column assignments do not trigger
  # SettingWithCopy warnings.
  return df.loc[df['Comm_Type'] != 'NA'].copy()

In [43]:
# All raw inputs are CSV files in the 'source/' folder under `path`.
source_dir = path + 'source/'

# Browsing sessions
df_browsing_1 = pd.read_csv(source_dir + 'browsing-1.csv')
df_browsing_2 = pd.read_csv(source_dir + 'browsing-2.csv')

# Exfiltration sessions
df_simple = pd.read_csv(source_dir + 'simple.csv')
df_variations = pd.read_csv(source_dir + 'variations.csv')
df_stealth = pd.read_csv(source_dir + 'stealth.csv')

# Sessions kept out of the training set
df_unknown_1 = pd.read_csv(source_dir + 'unknown-1.csv')
df_test = pd.read_csv(source_dir + 'test.csv')

In [44]:
# Browsing sessions: traffic to/from 1.1.1.1 is labelled 'DoH'.
df_browsing_1 = label(df_browsing_1, target='192.168.1.100', OS='Windows 10', DoH='1.1.1.1', DoH_label='DoH', Software='Chrome', Session=1)
df_browsing_2 = label(df_browsing_2, target='192.168.207.132', OS='Windows 10', DoH='1.1.1.1', DoH_label='DoH', Software='Chrome', Session=2)

# Exfiltration sessions: traffic to/from 1.1.1.1 is labelled 'Exfiltration'.
df_simple = label(df_simple, target='10.0.2.15', OS='Linux', DoH='1.1.1.1', DoH_label='Exfiltration', Software='Python', Session=3)
df_variations = label(df_variations, target='10.0.2.15', OS='Linux', DoH='1.1.1.1', DoH_label='Exfiltration', Software='Python', Session=4)
df_stealth = label(df_stealth, target='10.0.2.15', OS='Linux', DoH='1.1.1.1', DoH_label='Exfiltration', Software='Python', Session=5)

# Empty DoH address: only rows whose address is literally '' would get the DoH label.
df_unknown_1 = label(df_unknown_1, target='192.168.205.208', OS='Windows 10', DoH='', DoH_label='DoH', Software='Firefox', Session=6)
# NOTE(review): same target address and software as the Linux/Python sessions
# above, but OS is 'Windows 10' and the label is 'DoH' - confirm this is intended.
df_test = label(df_test, target='10.0.2.15', OS='Windows 10', DoH='1.1.1.1', DoH_label='DoH', Software='Python', Session=7)

# Feature Engineering

This script is somewhat dated. A more efficient, generalized version was later written for a different project, but this one still gets part of the job done for now.

In [24]:
from re import *

def re_pull_attr(sig, info):
  """Extract the integer that directly follows `sig` in `info`.

  Args:
    sig: Prefix to look for, e.g. 'Seq='. It is inserted into the regular
      expression unescaped, so regex metacharacters keep their meaning.
    info: Text to search (the packet's 'Info' field).

  Returns:
    The digits following the first occurrence of `sig` as an int, or
    `np.nan` when `sig` followed by digits does not occur.
  """
  # Raw string for the digit class: '\d' in a plain literal is an invalid
  # escape sequence. np.nan instead of np.NaN: the alias is gone in NumPy 2.0.
  search = re.search('(' + sig + r')(\d+)', info)
  return int(search.group(2)) if search is not None else np.nan

def re_pull_handshake(info):
  """Extract the first bracketed flag group from `info`.

  Recognises a single word in brackets ('[ACK]') or two words separated by
  ', ' ('[PSH, ACK]'). Bracketed text of any other shape is not matched.

  Args:
    info: Text to search (the packet's 'Info' field).

  Returns:
    The matched text including its brackets, or `np.nan` when there is none.
  """
  # Raw string: '\[' and '\w' are invalid escapes in a plain literal.
  search = re.search(r'(\[\w+\])|(\[\w+, \w+\])', info)
  if search is None:
    # np.nan rather than np.NaN: the alias was removed in NumPy 2.0.
    return np.nan
  # Each alternative is wrapped in its own group, so the whole match equals
  # whichever of group 1 / group 2 took part.
  return search.group(0)

def index_df(df):
  """Return a copy of `df` with a leading 'Index' column of row positions.

  The position of every row is stored as a string ('0', '1', ...) both in
  the new first column 'Index' and as the index of the returned frame.

  Args:
    df: DataFrame to index; it is not modified.

  Returns:
    A new DataFrame built from the raw values via `np_to_df`, with 'Index'
    as its first column followed by the original columns.
  """
  tmp_df = df.copy()
  # np.nan rather than np.NaN: the alias was removed in NumPy 2.0.
  tmp_df['Index'] = np.nan
  # 'Index' was appended last; move it to the front.
  tmp_df = tmp_df[['Index'] + tmp_df.columns[:-1].tolist()]

  arr = tmp_df.values
  for i in range(len(arr)):
    arr[i][0] = str(i)
  return np_to_df(arr, tmp_df, False)

def get_session(session, arr):
  """Collect the rows of `arr` whose column 12 equals `session`.

  Args:
    session: Session identifier to keep.
    arr: Sequence of rows (the engineered value array, where column 12 is
      'Session').

  Returns:
    A new array built from the matching rows, in their original order.
  """
  matching_rows = [row for row in arr if row[12] == session]
  return array(matching_rows)

def get_conversation(ip, arr):
  """Collect the rows of `arr` exchanged with the peer address `ip`.

  The peer of a row is column 3, unless column 3 is 'Target', in which case
  it is column 4.

  Args:
    ip: Peer address to keep.
    arr: Sequence of rows (columns 3 and 4 are 'Source' and 'Destination'
      in the engineered value array).

  Returns:
    A new array built from the matching rows, in their original order.
  """
  exchanged = []
  for row in arr:
    peer = row[4] if row[3] == 'Target' else row[3]
    if peer == ip:
      exchanged.append(row)
  return array(exchanged)

def filter_info(info):
  """Return `info` with every run of the digits 0-9 removed."""
  digit_runs = re.compile(r'[0-9]+')
  return digit_runs.sub('', info)

def np_to_df(arr, df, rm_index=True):
  """Turn a 2-D value array back into a DataFrame.

  Args:
    arr: 2-D array whose column 0 supplies the index of the result.
    df: DataFrame whose column labels are reused for the result.
    rm_index: When truthy, column 0 of `arr` and the first label of `df`
      are left out of the data; otherwise they are kept as a regular column.

  Returns:
    A DataFrame indexed by `arr[:, 0]`.
  """
  first_col = 1 if rm_index else 0
  return pd.DataFrame(
    data=arr[:, first_col:],
    index=arr[:, 0],
    columns=df.columns[first_col:],
  )




def engineer(df, sessions):
  """Add Info-derived and per-conversation history features to a labelled table.

  A conversation is the ordered list of rows, within one session, exchanged
  between 'Target' and one peer address. For every such row the function fills:

    * 'Info_Filtered', 'Handshake', 'Seq', 'Ack', 'Win', 'Len' - parsed from 'Info';
    * 'Conversation_dT' - 'Time' minus the 'Time' of the previous row of the
      same conversation (left empty for the first row);
    * 'Conversation_{Protocol,Length,From_Target,dT}_T-1 .. T-9' - the values
      of up to nine preceding rows of the conversation, most recent first.

  Rows belonging to a session not listed in `sessions` keep empty features.

  Args:
    df: Output of `label`, holding the twelve columns selected below.
    sessions: Iterable of 'Session' values to process. Values that match no
      row are skipped.

  Returns:
    A new DataFrame indexed by the row position (as a string) with the
    original twelve columns followed by the engineered ones.
  """
  df = df[['No.', 'Time', 'Source', 'Destination', 'Protocol', 'Length', 'Info', 'From_Target', 'OS', 'Software', 'Comm_Type', 'Session']]
  df = index_df(df)

  ## Create columns
  #################

  # Info
  # 13 - 18
  df = df.join(pd.DataFrame(columns=['Info_Filtered', 'Handshake', 'Seq', 'Ack', 'Win', 'Len']))
  # Conversation
  # 19, 20-28, 29-37, 38-46, 47-55
  df = df.join(pd.DataFrame(columns=['Conversation_dT'] + ['Conversation_Protocol_T-' + str(x) for x in range(1,10)] + ['Conversation_Length_T-' + str(x) for x in range(1,10)] + ['Conversation_From_Target_T-' + str(x) for x in range(1,10)] + ['Conversation_dT_T-' + str(x) for x in range(1,10)]))

  # Work on raw data instead
  df_arr = df.values

  # Column positions in df_arr (column 0 is the 'Index' added by index_df).
  time_col, info_col, dT_col = 2, 7, 19
  # (source column, first destination column) of each nine-step history:
  # Protocol -> 20-28, Length -> 29-37, From_Target -> 38-46, dT -> 47-55.
  history_blocks = ((5, 20), (6, 29), (8, 38), (dT_col, 47))
  history_depth = 9

  ## Add data
  ###########
  for session in sessions:
    ses_arr = get_session(session, df_arr)
    if len(ses_arr) == 0:
      # No rows for this session: the 2-D slice below would raise.
      continue

    # Every peer address of the session. 'Target' is removed by value, not by
    # sort position, so addresses that sort after 'Target' (e.g. lowercase
    # IPv6) are not dropped by mistake.
    addrs = [addr for addr in np.unique(ses_arr[:,3:5]) if addr != 'Target']

    for ip in addrs:
      # Private copy of the peer's rows; its dT column is filled in as we go
      # so that later rows can read it as history.
      conversation = get_conversation(ip, ses_arr)

      for i in range(len(conversation)):
        item = conversation[i]
        # View on the output row; column 0 holds its position in df_arr.
        row = df_arr[int(item[0])]

        # Info (Doesn't update ses_arr)
        info = item[info_col]
        row[13] = filter_info(info)
        row[14] = re_pull_handshake(info)
        row[15] = re_pull_attr('Seq=', info)
        row[16] = re_pull_attr('Ack=', info)
        row[17] = re_pull_attr('Win=', info)
        row[18] = re_pull_attr('Len=', info)

        # Conversation_dT
        if i > 0:
          dT = item[time_col] - conversation[i-1][time_col]
          row[dT_col] = dT
          item[dT_col] = dT

        # Protocol / Length / From_Target / Conversation_dT history
        for src_col, dst_start in history_blocks:
          hist = np.flip(conversation[max(0,i-history_depth):i,src_col])
          for j in range(len(hist)):
            row[dst_start+j] = hist[j]

  return np_to_df(df_arr, df)

In [45]:
# Each frame holds a single session, so exactly one session id is passed per call.
df_browsing_1 = engineer(df_browsing_1, sessions=[1])
df_browsing_2 = engineer(df_browsing_2, sessions=[2])

df_simple = engineer(df_simple, sessions=[3])
df_variations = engineer(df_variations, sessions=[4])
df_stealth = engineer(df_stealth, sessions=[5])

df_unknown_1 = engineer(df_unknown_1, sessions=[6])
df_test = engineer(df_test, sessions=[7])

In [26]:
# Spot-check the engineered columns on the last ten rows of the first browsing session.
df_browsing_1.tail(10)

Unnamed: 0,No.,Time,Source,Destination,Protocol,Length,Info,From_Target,OS,Software,Comm_Type,Session,Info_Filtered,Handshake,Seq,Ack,Win,Len,Conversation_dT,Conversation_Protocol_T-1,Conversation_Protocol_T-2,Conversation_Protocol_T-3,Conversation_Protocol_T-4,Conversation_Protocol_T-5,Conversation_Protocol_T-6,Conversation_Protocol_T-7,Conversation_Protocol_T-8,Conversation_Protocol_T-9,Conversation_Length_T-1,Conversation_Length_T-2,Conversation_Length_T-3,Conversation_Length_T-4,Conversation_Length_T-5,Conversation_Length_T-6,Conversation_Length_T-7,Conversation_Length_T-8,Conversation_Length_T-9,Conversation_From_Target_T-1,Conversation_From_Target_T-2,Conversation_From_Target_T-3,Conversation_From_Target_T-4,Conversation_From_Target_T-5,Conversation_From_Target_T-6,Conversation_From_Target_T-7,Conversation_From_Target_T-8,Conversation_From_Target_T-9,Conversation_dT_T-1,Conversation_dT_T-2,Conversation_dT_T-3,Conversation_dT_T-4,Conversation_dT_T-5,Conversation_dT_T-6,Conversation_dT_T-7,Conversation_dT_T-8,Conversation_dT_T-9
22855,25285,651.9,Target,74.125.136.136,UDP,70,64912 > 443 Len=28,True,Windows 10,Chrome,Browsing,1,> Len=,,,,,28.0,0.001133,UDP,UDP,UDP,UDP,UDP,UDP,UDP,UDP,UDP,281,179,1392,1392,62,111,70,70,241,False,False,False,False,False,True,True,True,False,4e-06,1e-06,2e-06,0.026821,0.055303,1.04127,0.000306,0.000719,1e-06
22856,25286,651.9,Target,74.125.136.136,UDP,70,64912 > 443 Len=28,True,Windows 10,Chrome,Browsing,1,> Len=,,,,,28.0,0.000196,UDP,UDP,UDP,UDP,UDP,UDP,UDP,UDP,UDP,70,281,179,1392,1392,62,111,70,70,True,False,False,False,False,False,True,True,True,0.001133,4e-06,1e-06,2e-06,0.026821,0.055303,1.04127,0.000306,0.000719
22857,25289,654.097,Target,162.159.133.234,TLSv1.2,104,Application Data,True,Windows 10,Chrome,Browsing,1,Application Data,,,,,,2.2781,TCP,TCP,TLSv1.2,TCP,TLSv1.2,TCP,TCP,TLSv1.2,TCP,66,449,449,66,108,66,108,108,54,True,False,False,True,False,True,False,False,True,8.3e-05,1e-06,2.44304,0.000136,0.031282,0.000248,2e-06,36.4094,0.040864
22858,25290,654.1,162.159.133.234,Target,TLSv1.2,108,Application Data,False,,,Browsing,1,Application Data,,,,,,0.002528,TLSv1.2,TCP,TCP,TLSv1.2,TCP,TLSv1.2,TCP,TCP,TLSv1.2,104,66,449,449,66,108,66,108,108,True,True,False,False,True,False,True,False,False,2.2781,8.3e-05,1e-06,2.44304,0.000136,0.031282,0.000248,2e-06,36.4094
22859,25291,654.14,Target,162.159.133.234,TCP,54,63269 > 443 [ACK] Seq=801 Ack=3293 Win=513 L...,True,Windows 10,Chrome,Browsing,1,> [ACK] Seq= Ack= Win= Len=,[ACK],801.0,3293.0,513.0,0.0,0.039869,TLSv1.2,TLSv1.2,TCP,TCP,TLSv1.2,TCP,TLSv1.2,TCP,TCP,108,104,66,449,449,66,108,66,108,False,True,True,False,False,True,False,True,False,0.002528,2.2781,8.3e-05,1e-06,2.44304,0.000136,0.031282,0.000248,2e-06
22860,25292,654.145,162.159.133.234,Target,TLSv1.2,86,Application Data,False,,,Browsing,1,Application Data,,,,,,0.00531,TCP,TLSv1.2,TLSv1.2,TCP,TCP,TLSv1.2,TCP,TLSv1.2,TCP,54,108,104,66,449,449,66,108,66,True,False,True,True,False,False,True,False,True,0.039869,0.002528,2.2781,8.3e-05,1e-06,2.44304,0.000136,0.031282,0.000248
22861,25293,654.186,Target,162.159.133.234,TCP,54,63269 > 443 [ACK] Seq=801 Ack=3325 Win=512 L...,True,Windows 10,Chrome,Browsing,1,> [ACK] Seq= Ack= Win= Len=,[ACK],801.0,3325.0,512.0,0.0,0.040886,TLSv1.2,TCP,TLSv1.2,TLSv1.2,TCP,TCP,TLSv1.2,TCP,TLSv1.2,86,54,108,104,66,449,449,66,108,False,True,False,True,True,False,False,True,False,0.00531,0.039869,0.002528,2.2781,8.3e-05,1e-06,2.44304,0.000136,0.031282
22862,25295,655.17,Target,192.168.1.71,TCP,164,"63334 > 8009 [PSH, ACK] Seq=14499 Ack=14592 ...",True,Windows 10,Chrome,Browsing,1,"> [PSH, ACK] Seq= Ack= Win= Len= [TCP segm...","[PSH, ACK]",14499.0,14592.0,513.0,110.0,4.95936,TCP,TCP,TCP,TCP,TCP,TCP,TCP,TCP,TCP,54,261,230,54,261,230,54,164,164,True,False,True,True,False,True,True,False,True,0.042021,0.00541,3.7147,0.041134,0.007432,2.15232,0.041373,0.007012,4.9611
22863,25296,655.176,192.168.1.71,Target,TCP,164,"8009 > 63334 [PSH, ACK] Seq=14592 Ack=14609 ...",False,,,Browsing,1,"> [PSH, ACK] Seq= Ack= Win= Len= [TCP segm...","[PSH, ACK]",14592.0,14609.0,329.0,110.0,0.006584,TCP,TCP,TCP,TCP,TCP,TCP,TCP,TCP,TCP,164,54,261,230,54,261,230,54,164,True,True,False,True,True,False,True,True,False,4.95936,0.042021,0.00541,3.7147,0.041134,0.007432,2.15232,0.041373,0.007012
22864,25297,655.217,Target,192.168.1.71,TCP,54,63334 > 8009 [ACK] Seq=14609 Ack=14702 Win=5...,True,Windows 10,Chrome,Browsing,1,> [ACK] Seq= Ack= Win= Len=,[ACK],14609.0,14702.0,512.0,0.0,0.040803,TCP,TCP,TCP,TCP,TCP,TCP,TCP,TCP,TCP,164,164,54,261,230,54,261,230,54,False,True,True,False,True,True,False,True,True,0.006584,4.95936,0.042021,0.00541,3.7147,0.041134,0.007432,2.15232,0.041373


In [46]:
# Persist every engineered frame as '<name>-FE.csv' in 'feature-engineered/' under `path`.
fe_dir = path + 'feature-engineered/'

df_browsing_1.to_csv(fe_dir + 'browsing-1-FE.csv')
df_browsing_2.to_csv(fe_dir + 'browsing-2-FE.csv')

df_simple.to_csv(fe_dir + 'simple-FE.csv')
df_variations.to_csv(fe_dir + 'variations-FE.csv')
df_stealth.to_csv(fe_dir + 'stealth-FE.csv')

df_unknown_1.to_csv(fe_dir + 'unknown-1-FE.csv')
df_test.to_csv(fe_dir + 'test-FE.csv')

# Save Training Set

In [32]:
# The training set is the two browsing sessions plus the three exfiltration
# sessions; df_unknown_1 and df_test are left out.
training_frames = [df_browsing_1, df_browsing_2, df_simple, df_variations, df_stealth]
df_train = pd.concat(training_frames)

In [33]:
# index=False: the index is not written to the file.
train_csv = path + 'train.csv'
df_train.to_csv(train_csv, index=False)