In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import re

In [3]:
# Load the three traffic CSV files (normal/train, normal/test, anomalous/test)
# in a single pass; the files are read in the order the names are listed.
df_normal_train, df_normal_test, df_anomal_test = map(
    pd.read_csv,
    (
        "normalTrafficTraining.csv",
        "normalTrafficTest.csv",
        "anomalousTrafficTest.csv",
    ),
)

# Report the (rows, columns) shape of each frame as a quick sanity check.
for loaded_frame in (df_normal_train, df_normal_test, df_anomal_test):
    print(loaded_frame.shape)

(36000, 14)
(36000, 14)
(24668, 14)


## Content Length, Method Analysis
- Question: does `Method = 1` hold if and only if `ContentLen` is non-null, for every data point?

In [8]:
import pandas as pd

def check_method_contentlen_relation(df):
    """Print whether ``Method == 1`` and a non-null ``ContentLen`` coincide.

    Two kinds of violating rows are collected:
      * ``Method`` equals 1 while ``ContentLen`` is null, and
      * ``ContentLen`` is non-null while ``Method`` differs from 1.

    When neither kind exists a confirmation line is printed. Otherwise a
    failure line is printed, followed by a heading and the first rows
    (``DataFrame.head()``) of each non-empty violation set.

    Args:
        df: DataFrame with ``Method`` and ``ContentLen`` columns.

    Returns:
        None. The result is reported on standard output only.
    """
    method_col = df['Method']
    length_col = df['ContentLen']

    # Rows breaking "Method = 1  =>  ContentLen present".
    missing_length = df[method_col.eq(1) & length_col.isna()]
    # Rows breaking "ContentLen present  =>  Method = 1".
    unexpected_length = df[length_col.notna() & method_col.ne(1)]

    if missing_length.empty and unexpected_length.empty:
        print("The conditions hold: Method = 1 implies ContentLen is not null and vice versa.")
        return

    print("The conditions do not hold for all data points.")
    reports = (
        ("\nRows where Method = 1 but ContentLen is null:", missing_length),
        ("\nRows where ContentLen is not null but Method != 1:", unexpected_length),
    )
    for heading, offending_rows in reports:
        if offending_rows.empty:
            continue
        print(heading)
        print(offending_rows.head())


# Run the Method/ContentLen consistency check on each split, in load order:
# normal training, normal test, anomalous test.
for traffic_frame in (df_normal_train, df_normal_test, df_anomal_test):
    check_method_contentlen_relation(traffic_frame)

The conditions hold: Method = 1 implies ContentLen is not null and vice versa.
The conditions hold: Method = 1 implies ContentLen is not null and vice versa.
The conditions do not hold for all data points.

Rows where ContentLen is not null but Method != 1:
     Method                                                URL  \
60        0  http://localhost:8080/CFIDE/administrator/logi...   
67        0  http://localhost:8080/tienda1/4861362529278789730   
183       0  http://localhost:8080/tienda1/publico/producto...   
214       0  http://localhost:8080/tienda1/publico/carrito....   
341       0    http://localhost:8080/tienda1/asf-logo-wide.BAK   

                               Cookie  ContentLen  \
60   50E143DACA21CB0B6409C4685683A385        63.0   
67   6FB8AE3341778EE7D2DC267E2149B25D         4.0   
183  8544C51DF8265593FC790361341BD4A8       237.0   
214  D2A37B66BA1AE2571C7BACB75DB2F3DC        33.0   
341  6718B31F23E303469B218F1ED72D91AF       249.0   

                          

**Observations:**
- In the datasets above, a GET request (`Method != 1`) with a non-null content length appears only in the anomalous traffic, so any such request must be anomalous. A boolean feature marking whether `ContentLen` is non-null would therefore give the model a signal it can learn.

## ID presence and Format

In [16]:
# Function that takes the dataset, and performs the augmentation and returns the dataset

def add_session_features(df):
    """Return a copy of ``df`` with two features derived from ``Cookie``.

    Added columns:
        SessionIdPresent: 1 when the cookie is a non-empty string, else 0.
        SessionIdFormat: integer code describing the cookie's shape:
            1 -- exactly 40 hexadecimal characters
            2 -- exactly 32 hexadecimal characters
            3 -- 32-40 alphanumeric characters that matched neither 1 nor 2
                 (this includes hex strings of 33-39 characters)
            0 -- anything else, including missing or non-string values

    Args:
        df: DataFrame containing a ``Cookie`` column.

    Returns:
        A new DataFrame; the input frame is left unmodified.

    Raises:
        KeyError: if ``df`` has no ``Cookie`` column.
    """
    df_copy = df.copy()

    def has_session_id(cookie):
        # Only a non-empty string counts as a session id.
        return isinstance(cookie, str) and bool(cookie)

    def identify_format(cookie):
        # pandas represents a missing cookie as NaN, which is a *truthy*
        # float, so a bare ``not cookie`` test lets it through and
        # ``re.fullmatch`` would then raise TypeError. Use the same
        # "non-empty string" test as SessionIdPresent so both features agree.
        if not has_session_id(cookie):
            return 0
        # ``fullmatch`` already anchors at both ends; no ^/$ needed.
        if re.fullmatch(r'[a-fA-F0-9]{40}', cookie):
            return 1
        if re.fullmatch(r'[a-fA-F0-9]{32}', cookie):
            return 2
        if re.fullmatch(r'[a-zA-Z0-9]{32,40}', cookie):
            return 3
        return 0

    df_copy['SessionIdPresent'] = df_copy['Cookie'].apply(
        lambda cookie: 1 if has_session_id(cookie) else 0
    )
    df_copy['SessionIdFormat'] = df_copy['Cookie'].apply(identify_format)

    return df_copy

In [18]:
# Augment every split with the SessionIdPresent / SessionIdFormat columns.
# The originals are left untouched; the "_new" frames carry the extra features.
df_normal_train_new, df_normal_test_new, df_anomal_test_new = [
    add_session_features(split)
    for split in (df_normal_train, df_normal_test, df_anomal_test)
]

In [22]:
# Preview the first rows of the augmented normal-training frame to confirm
# that the SessionIdPresent / SessionIdFormat columns were appended.
df_normal_train_new.head()

Unnamed: 0,Method,URL,Cookie,ContentLen,Payload,ReqLen,ArgLen,NumArgs,NumDigitsArgs,PathLen,NumLettersArgs,NumLettersPath,NumSpecialCharsPath,MaxByteValReq,SessionIdPresent,SessionIdFormat
0,0,http://localhost:8080/tienda1/index.jsp,1F767F17239C9B670A39E9B10C3825F4,,,39,0,0,0,39,0,27,7,120,1,2
1,0,http://localhost:8080/tienda1/publico/anadir.jsp,81761ACA043B0E6014CA42A4BCD06AB5,,id=3&nombre=Vino+Rioja&precio=100&cantidad=55&...,116,35,5,7,48,24,35,8,117,1,2
2,1,http://localhost:8080/tienda1/publico/anadir.jsp,933185092E0B668B90676E0A2B0767AF,68.0,id=3&nombre=Vino+Rioja&precio=100&cantidad=55&...,116,35,5,7,48,24,35,8,117,1,2
3,0,http://localhost:8080/tienda1/publico/autentic...,8FA18BA82C5336D03D3A8AFA3E68CBB0,,modo=entrar&login=choong&pwd=d1se3ci%F3n&remem...,115,32,5,3,52,28,39,8,119,1,2
4,1,http://localhost:8080/tienda1/publico/autentic...,7104E6C68A6BCF1423DAE990CE49FEE2,63.0,modo=entrar&login=choong&pwd=d1se3ci%F3n&remem...,115,32,5,3,52,28,39,8,119,1,2


In [23]:
# Preview the first rows of the augmented normal-test frame to confirm
# that the SessionIdPresent / SessionIdFormat columns were appended.
df_normal_test_new.head()

Unnamed: 0,Method,URL,Cookie,ContentLen,Payload,ReqLen,ArgLen,NumArgs,NumDigitsArgs,PathLen,NumLettersArgs,NumLettersPath,NumSpecialCharsPath,MaxByteValReq,SessionIdPresent,SessionIdFormat
0,0,http://localhost:8080/tienda1/index.jsp,EA414B3E327DED6875848530C864BD8F,,,39,0,0,0,39,0,27,7,120,1,2
1,0,http://localhost:8080/tienda1/publico/anadir.jsp,54E25FF4B7F0E4E855B112F882E9EEA5,,id=1&nombre=Jam%F3n+Ib%E9rico&precio=39&cantid...,122,41,5,8,48,27,35,8,117,1,2
2,1,http://localhost:8080/tienda1/publico/anadir.jsp,788887A0F479749C4CEEA1E268B4A501,74.0,id=1&nombre=Jam%F3n+Ib%E9rico&precio=39&cantid...,122,41,5,8,48,27,35,8,117,1,2
3,0,http://localhost:8080/tienda1/publico/autentic...,94ECD5EE8EF7EFE4BB26C701B150ED7B,,modo=entrar&login=caria&pwd=egipciaca&remember...,112,29,5,0,52,29,39,8,119,1,2
4,1,http://localhost:8080/tienda1/publico/autentic...,23391DBBADEC19FE01E02D201F278C6A,60.0,modo=entrar&login=caria&pwd=egipciaca&remember...,112,29,5,0,52,29,39,8,119,1,2


In [24]:
# Preview the first rows of the augmented anomalous-test frame to confirm
# that the SessionIdPresent / SessionIdFormat columns were appended.
df_anomal_test_new.head()

Unnamed: 0,Method,URL,Cookie,ContentLen,Payload,ReqLen,ArgLen,NumArgs,NumDigitsArgs,PathLen,NumLettersArgs,NumLettersPath,NumSpecialCharsPath,MaxByteValReq,SessionIdPresent,SessionIdFormat
0,0,http://localhost:8080/tienda1/publico/anadir.jsp,B92A8B48B9008CD29F622A994E0F650D,,id=2&nombre=Jam%F3n+Ib%E9rico&precio=85&cantid...,194,113,5,14,48,76,35,8,117,1,2
1,1,http://localhost:8080/tienda1/publico/anadir.jsp,AE29AEEBDE479D5E1A18B4108C8E3CE0,146.0,id=2&nombre=Jam%F3n+Ib%E9rico&precio=85&cantid...,194,113,5,14,48,76,35,8,117,1,2
2,0,http://localhost:8080/tienda1/publico/anadir.jsp,F563B5262843F12ECAE41815ABDEEA54,,id=2%2F&nombre=Jam%F3n+Ib%E9rico&precio=85&can...,125,44,5,9,48,28,35,8,117,1,2
3,1,http://localhost:8080/tienda1/publico/anadir.jsp,3B654D6DF7F1466EE80D7F756B00E5D1,77.0,id=2%2F&nombre=Jam%F3n+Ib%E9rico&precio=85&can...,125,44,5,9,48,28,35,8,117,1,2
4,0,http://localhost:8080/asf-logo-wide.gif~,51A7470173188BBB993947F2283059E4,,,40,0,0,0,40,0,27,9,126,1,2


In [19]:
# Count the distinct values each new session feature takes in every split.
# A result of 1 means the feature is constant within that split.
# Print order per split: SessionIdFormat first, then SessionIdPresent.
for augmented_frame in (df_normal_train_new, df_normal_test_new, df_anomal_test_new):
    for feature_name in ("SessionIdFormat", "SessionIdPresent"):
        print(augmented_frame[feature_name].nunique())

1
1
1
1
1
1


## Requests Per Session

In [1]:
def add_cookie_count_column(df, *dfs):
    """Return a copy of ``df`` with a ``cookie_request_count`` column.

    The count for a row is the number of times its ``Cookie`` value occurs
    across the ``Cookie`` columns of all frames in ``dfs``. Frames in ``dfs``
    without a ``Cookie`` column are ignored. ``df`` itself is only counted
    if the caller also passes it in ``dfs``.

    Args:
        df: DataFrame to annotate; needs a ``Cookie`` column whenever at
            least one frame in ``dfs`` provides cookies.
        *dfs: DataFrames whose ``Cookie`` columns are pooled for counting.

    Returns:
        A new DataFrame with the integer ``cookie_request_count`` column
        added. Cookies not found in the pool (including missing values,
        which ``value_counts`` drops) get a count of 0. When ``dfs`` is empty
        or no frame in it has a ``Cookie`` column, every count is 0.

    Raises:
        KeyError: if cookies are available in ``dfs`` but ``df`` has no
            ``Cookie`` column.
    """
    # Distinct loop name: the original comprehension reused ``df`` and
    # shadowed the parameter, which made the code easy to misread.
    cookie_columns = [
        source['Cookie'] for source in dfs if 'Cookie' in source.columns
    ]

    df_copy = df.copy()

    # ``pd.concat`` raises ValueError on an empty list, so handle the
    # "nothing to count against" case explicitly instead of crashing.
    if not cookie_columns:
        df_copy['cookie_request_count'] = 0
        return df_copy

    cookie_counts = pd.concat(cookie_columns).value_counts()
    df_copy['cookie_request_count'] = (
        df_copy['Cookie'].map(cookie_counts).fillna(0).astype(int)
    )

    return df_copy


In [2]:
# Annotate each split with per-cookie request counts. The counts are pooled
# over all three splits, so the same three frames are passed as the counting
# sources for every call. NOTE: this cell needs the data-loading cell to have
# been run first in the current kernel.
df1_with_counts, df2_with_counts, df3_with_counts = [
    add_cookie_count_column(target, df_normal_train, df_normal_test, df_anomal_test)
    for target in (df_normal_train, df_normal_test, df_anomal_test)
]

# Show the first rows of each annotated frame under its own heading.
for preview_heading, counted_frame in (
    ("DataFrame 1 with Cookie Counts:\n", df1_with_counts),
    ("\nDataFrame 2 with Cookie Counts:\n", df2_with_counts),
    ("\nDataFrame 3 with Cookie Counts:\n", df3_with_counts),
):
    print(preview_heading, counted_frame.head())


NameError: name 'df_normal_train' is not defined

In [None]:
# Number of distinct per-cookie request counts in each annotated split;
# a value of 1 means every cookie in that split has the same count.
for counted_frame in (df1_with_counts, df2_with_counts, df3_with_counts):
    print(counted_frame['cookie_request_count'].nunique())

1
1
1
