In [2]:
# Copy anomalousTrafficTest.txt to anomalousTrafficTest_clean.txt, keeping
# only the lines that still contain something after whitespace is stripped.
with open('anomalousTrafficTest.txt', 'r') as file:
    lines = file.readlines()

# str.strip returns '' (falsy) for empty / whitespace-only lines, so filter()
# discards exactly those and keeps every other line untouched.
non_blank_lines = list(filter(str.strip, lines))

with open('anomalousTrafficTest_clean.txt', 'w') as file:
    # Lines still carry their original newline characters, so a plain join
    # reproduces them verbatim.
    file.write(''.join(non_blank_lines))

In [3]:
import csv
import re

# CSV schema: one column per field of a parsed request record. The order here
# is the column order of the written CSV file.
columns = [
    # Values read directly from the request text (request line, headers, body).
    'Method', 'URL', 'Cookie', 'ContentLen', 'Payload',
    # Numeric features computed by extract_info.
    'ReqLen', 'ArgLen', 'NumArgs', 'NumDigitsArgs', 'PathLen',
    'NumLettersArgs', 'NumLettersPath', 'NumSpecialCharsPath', 'MaxByteValReq',
]

# Accumulates one dict per parsed request, in the order they appear in the file.
rows = list()

def extract_info(entry):
    """Normalise a parsed request record and add its derived features, in place.

    Reads ``entry['URL']``, ``entry['Payload']``, ``entry['Cookie']`` and
    ``entry['Method']`` and then overwrites / adds the following keys:

    * ``URL``     - the URL with any query string (text after the first ``?``)
      removed.
    * ``Payload`` - the body payload followed by the URL query string, joined
      with ``&`` when both are present; ``''`` when neither is.
    * ``Cookie``  - the cookie value with every ``JSESSIONID=`` occurrence removed
      (left untouched when empty/None).
    * ``Method``  - ``0`` when the method is the string ``'GET'``, otherwise ``1``.
    * ``ReqLen``, ``ArgLen``, ``NumArgs``, ``NumDigitsArgs``, ``PathLen``,
      ``NumLettersArgs``, ``NumLettersPath``, ``NumSpecialCharsPath``,
      ``MaxByteValReq`` - counts computed from the stripped URL and the
      combined payload.

    Args:
        entry: dict holding at least the keys ``URL`` (str), ``Payload``
            (str or None), ``Cookie`` (str or None) and ``Method``.

    Returns:
        None. ``entry`` is mutated.

    Raises:
        KeyError: if one of the four input keys is missing.

    Note:
        Call this once per entry: after the first call ``Method`` holds ``0``/``1``
        rather than ``'GET'``, so a second call would always store ``1``.
    """
    url = entry['URL']
    payload = entry['Payload'] if entry['Payload'] else ''

    # Move the query string out of the URL and treat it as part of the payload.
    if '?' in url:
        url, url_payload = url.split('?', 1)
        payload = payload + '&' + url_payload if payload else url_payload

    length_of_request = len(url) + len(payload)

    # The query string has already been split off, so this only matches
    # '&name=value' fragments that sit inside the remaining URL itself.
    url_args = re.findall(r'[?&]([^=&]+)=([^&]*)', url)
    payload_args = re.findall(r'([^=&]+)=([^&]*)', payload)

    # Only the argument values (not the names) feed the argument statistics.
    arg_values = [value for _name, value in url_args + payload_args]

    length_of_arguments = sum(len(value) for value in arg_values)
    number_of_arguments = len(arg_values)
    number_of_digits_in_arguments = sum(len(re.findall(r'\d', value)) for value in arg_values)
    number_of_letters_in_arguments = sum(len(re.findall(r'[a-zA-Z]', value)) for value in arg_values)

    path = re.split(r'[?#]', url)[0]
    length_of_path = len(path)
    number_of_letter_chars_in_path = len(re.findall(r'[a-zA-Z]', path))
    number_of_special_chars_in_path = len(re.findall(r'[^a-zA-Z0-9]', path))

    # default=0 keeps an entirely empty request (no URL and no payload) from
    # raising ValueError on an empty sequence.
    max_byte_value_in_request = max((ord(char) for char in url + payload), default=0)

    # Remove 'JSESSIONID=' prefix from the Cookie value
    if entry['Cookie']:
        entry['Cookie'] = entry['Cookie'].replace('JSESSIONID=', '')

    # Store 0 for GET and 1 for any other method in the Method column
    entry['Method'] = 0 if entry['Method'] == 'GET' else 1

    entry.update({
        'URL': url,
        'Payload': payload,
        'ReqLen': length_of_request,
        'ArgLen': length_of_arguments,
        'NumArgs': number_of_arguments,
        'NumDigitsArgs': number_of_digits_in_arguments,
        'PathLen': length_of_path,
        'NumLettersArgs': number_of_letters_in_arguments,
        'NumLettersPath': number_of_letter_chars_in_path,
        'NumSpecialCharsPath': number_of_special_chars_in_path,
        'MaxByteValReq': max_byte_value_in_request
    })


# Parse the blank-line-free request dump into one record (dict) per request,
# derive the numeric features for each record, and write everything to CSV.
#
# A record starts at a line beginning with GET or POST; the lines that follow
# are either 'Name: value' headers or body text, and belong to that record
# until the next request line.
#
# NOTE(review): only GET and POST request lines are recognised. A request line
# using any other method contains no ': ' and would therefore be appended to
# the previous record's payload - confirm the capture contains no such lines.
with open('anomalousTrafficTest_clean.txt', 'r') as file:
    lines = file.readlines()
    entry = {}

    for line in lines:
        line = line.strip()

        if line.startswith(('GET', 'POST')):
            # A new request begins: finish and store the previous record first.
            if entry:
                extract_info(entry)
                rows.append(entry)

            parts = line.split(' ', 2)
            if len(parts) < 2:
                # Request line without a URL: drop it (and, via the guard
                # below, its headers/body) instead of failing on parts[1].
                entry = {}
                continue

            entry = {col: None for col in columns}
            entry['Method'] = parts[0]
            entry['URL'] = parts[1]

        elif not entry:
            # Header or body text with no open record (before the first request
            # line, or after a dropped one). Storing it would create a partial
            # record without 'URL' that extract_info cannot process.
            continue

        elif ': ' in line:
            key, value = line.split(': ', 1)
            # Headers whose name equals a column name (e.g. 'Cookie') are
            # stored as-is; all other headers are ignored.
            if key in entry:
                entry[key] = value
            if key == 'Content-Length':
                entry['ContentLen'] = int(value)

        else:
            # Body text: concatenate onto whatever payload was collected so far.
            entry['Payload'] = (entry['Payload'] or '') + line

    # The last record has no following request line to trigger its flush.
    if entry:
        extract_info(entry)
        rows.append(entry)

with open('anomalousTrafficTest.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=columns)
    writer.writeheader()
    writer.writerows(rows)


In [2]:
# Read the three feature tables (normal training, normal test, anomalous test)
# and stack them into a single DataFrame.
import pandas as pd

normalTrafficTraining, normalTrafficTest, anomalousTrafficTest = (
    pd.read_csv(csv_name)
    for csv_name in (
        'normalTrafficTraining.csv',
        'normalTrafficTest.csv',
        'anomalousTrafficTest.csv',
    )
)

# All normal traffic: training rows first, then test rows.
normalTraffic = pd.concat((normalTrafficTraining, normalTrafficTest))

# Everything: normal traffic first, anomalous traffic appended after it.
# Each frame keeps its own row labels (ignore_index is not set), so the
# combined index can contain repeated labels.
allTraffic = pd.concat((normalTraffic, anomalousTrafficTest))

# Preview the first rows of the combined table.
allTraffic.head()

Unnamed: 0,Method,URL,Cookie,ContentLen,Payload,ReqLen,ArgLen,NumArgs,NumDigitsArgs,PathLen,NumLettersArgs,NumLettersPath,NumSpecialCharsPath,MaxByteValReq
0,0,http://localhost:8080/tienda1/index.jsp,1F767F17239C9B670A39E9B10C3825F4,,,39,0,0,0,39,0,27,7,120
1,0,http://localhost:8080/tienda1/publico/anadir.jsp,81761ACA043B0E6014CA42A4BCD06AB5,,id=3&nombre=Vino+Rioja&precio=100&cantidad=55&...,116,35,5,7,48,24,35,8,117
2,1,http://localhost:8080/tienda1/publico/anadir.jsp,933185092E0B668B90676E0A2B0767AF,68.0,id=3&nombre=Vino+Rioja&precio=100&cantidad=55&...,116,35,5,7,48,24,35,8,117
3,0,http://localhost:8080/tienda1/publico/autentic...,8FA18BA82C5336D03D3A8AFA3E68CBB0,,modo=entrar&login=choong&pwd=d1se3ci%F3n&remem...,115,32,5,3,52,28,39,8,119
4,1,http://localhost:8080/tienda1/publico/autentic...,7104E6C68A6BCF1423DAE990CE49FEE2,63.0,modo=entrar&login=choong&pwd=d1se3ci%F3n&remem...,115,32,5,3,52,28,39,8,119
