In [1]:
import re
import pandas as pd
from urllib import parse
import os

In [2]:
def convert(filename):
    log_pattern = re.compile(
        r'(?P<ip>\S+) - - \[(?P<timestamp>.*?)\] "(?P<method>\S+) (?P<url>.+?) HTTP/(?P<http_version>\d+\.\d+)" (?P<status>\d+) (?P<size>\d+)(?: "(?P<site>[^"]+)")?(?: "(?P<header>[^"]+)")?'
    )
    data = []
    with open(filename, "r") as file:
        for line in file:
            try:
                match = log_pattern.match(line)
                if match:
                    url = match.group("url")
                    data.append({
                        "method": match.group("method"),
                        "url":  re.sub(r'[\x00-\x1F\x7F]', 'I', url),
                        "site": (match.group("site") if match.group("site") else None),
                        "header": (match.group("header") if match.group("header") else None), 
                    })
            except Exception as e:
                print(line)
                pass
    return data

# Generate propositional feature

In [3]:
import base64

def clean_control_chars(text):
    if isinstance(text, str):
        return re.sub(r'[\x00-\x1F\x7F]', '', text)
    return text

def is_base64(s):
    if len(s) % 4 == 0 and re.fullmatch(r'[A-Za-z0-9+/=]+', s):
        try:
            decoded = base64.b64decode(s, validate=True)
            return len(decoded) > 0
        except Exception:
            return False
    return False

def categorize_segment(segment):
    if segment == "<EOS>":
        return segment
    if re.fullmatch(r"[a-fA-F0-9]{32}", segment):
        return "MD5_HASH"
    if re.fullmatch(r"[a-fA-F0-9]{40,64}", segment):
        return "SHA_HASH"
    if re.fullmatch(r"[a-fA-F0-9]{3,}", segment):
        return "HEX"
    if is_base64(segment):
        return "BASE64"
    if re.fullmatch(r"[a-zA-Z0-9+/=]{16,}", segment):
        return "ENCRYPTION"
    if segment in ["<QUERY>", "<EOS>"]:
        return segment
    if segment in "!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~":
        return segment 

    if segment == "I":
        return segment
    return "".join(["W" if i.isalpha() else "D" if i.isdigit() else i for i in segment])

def feature_extract(url):
    return " ".join(categorize_segment(seg) for seg in url.split())

  if segment in "!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~":


#  Generate benign labelling based on non malicious aspect

In [None]:
def classify_logs(df):
    attack_patterns = {
        "SQLI": r"(--|UNION.*SELECT|SELECT.*FROM|DROP\s+TABLE|INSERT\s+INTO|UPDATE\s+SET|DELETE\s+FROM|\s+(?i:OR|AND)\s+)",
        "XSS": r"((<script[^>]*>.*?</script>)|on\w+=['\"]?\w*\(|javascript:|alert\(|confirm\(|prompt\()",
    }
    
    def detect_attack(url):
        for attack, pattern in attack_patterns.items():
            if re.search(pattern, url, re.IGNORECASE):
                return attack
        return "Benign"
    print(df.head())
    df["label"] = df["url"].apply(detect_attack)
    return df

In [5]:
def change_value_char(feature):
    feature = re.sub(r"\?", " <QUERY> ", feature, count=1)
    feature = feature + " <EOS> "
    return feature

In [None]:
def string_segmentation(url):
    url = re.sub(r'%([0-9A-Fa-f]{2})', lambda m: '' if not (0x20 <= int(m.group(1), 16) <= 0x7E) else m.group(0), url)
    url = parse.unquote(url)
    url = re.sub(r'[^\x20-\x7F]+', '', url)
    url = re.sub(r'([!"#$%&\'()*+,\-./?:;<=>@\[\\\]^_`{|}~])', r' \1 ', url)
    url = re.sub(r"\?", "<QUERY>", url, count=1)
    url = url + " <EOS> "
    url = re.sub(r'\s+', ' ', url).strip()
    return url

In [None]:
logs = [f for f in os.listdir(".") if f in ["access_log_Jul95.txt", "acunetix.txt", "xss-fired-2.log","sqli-fired.log"] or f.endswith("-dvwa.log") or f.endswith("-12.log") or f.endswith("-server.log")]
for log in logs:
    print("Processing log: ", log)
    d = convert(log)
    df = pd.DataFrame(d)
    df = classify_logs(df)
    df['url'] = df['url'].apply(string_segmentation)
    df['feature'] = df['url'].apply(feature_extract)
    df['url'].head()
    df['label'].value_counts()
    pd.set_option("display.max_colwidth", None)
    print(df.loc[df["label"] != "Benign", ["url", "label"]])
    excel_filename = log.replace(".log", "")
    excel_filename = excel_filename.replace(".txt", "")
    excel_filename += ".xlsx"
    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].apply(clean_control_chars)
    df.to_excel(excel_filename, index=False)


Processing log:  access_log_Jul95.txt
  method                                                url  site header
0    GET                       /images/MOSAIC-logosmall.gif  None   None
1    GET                          /images/USA-logosmall.gif  None   None
2    GET   /shuttle/technology/images/srb_mod_compare_3.jpg  None   None
3    GET  /shuttle/missions/sts-70/images/KSC-95EC-1019.jpg  None   None
4    GET          /shuttle/resources/orbiters/atlantis.html  None   None
Empty DataFrame
Columns: [url, label]
Index: []
Processing log:  acunetix.txt
  method                           url                       site  \
0   POST      /administrator/index.php  http://192.168.4.161/DVWA   
1   POST  /index.php/component/search/                          -   
2   POST  /index.php/component/search/  http://192.168.4.161/DVWA   
3   POST  /index.php/component/search/  http://192.168.4.161/DVWA   
4   POST  /index.php/component/search/                          -   

                               