In [1]:
import numpy as np
import pandas as pd
import re
import base64

In [2]:
def is_base64(s):
    if len(s) % 4 == 0 and re.fullmatch(r'[A-Za-z0-9+/=]+', s):
        try:
            decoded = base64.b64decode(s, validate=True)
            return len(decoded) > 0
        except Exception:
            return False
    return False

In [3]:
def categorize_segment(segment):
    if segment == "<EOS>":
        return segment
    if re.fullmatch(r"[a-fA-F0-9]{32}", segment):
        return "MD5_HASH"
    if re.fullmatch(r"[a-fA-F0-9]{40,64}", segment):
        return "SHA_HASH"
    if re.fullmatch(r"[a-fA-F0-9]{3,}", segment):
        return "HEX"
    if is_base64(segment):
        return "BASE64"
    if re.fullmatch(r"[a-zA-Z0-9+/=]{16,}", segment):
        return "ENCRYPTION"
    if segment in "!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~":
        return segment 
    if segment == "I":
        return segment
    return "".join(["W" if i.isalpha() else "D" if i.isdigit() else i for i in segment])

def feature_extract(url):
    return " ".join(categorize_segment(seg) for seg in url.split())

  if segment in "!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~":


In [4]:
train_df = pd.read_csv('csv_dataset_preprocessed.csv')
train_df.head()

Unnamed: 0,method,url,site,header,label,feature
0,GET,/ mod / quiz / attempt . php <QUERY> attempt =...,https://cbt.its.ac.id/mod/quiz/attempt.php?att...,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Benign,/ WWW / BASE64 / WWWWWWW . WWW <QUERY> WWWWWWW...
1,GET,/ pluginfile . php / 113836 / question / quest...,https://cbt.its.ac.id/mod/quiz/attempt.php?att...,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Benign,/ WWWWWWWWWW . WWW / HEX / BASE64 / BASE64 / H...
2,GET,/ vulnerabilities / sqli / <QUERY> id = - 7668...,http://localhost:4280/vulnerabilities/sqli/,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Malicious,/ WWWWWWWWWWWWWWW / BASE64 / <QUERY> WW = - HE...
3,GET,/ uploads . rar <EOS>,-,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,Benign,/ WWWWWWW . WWW <EOS>
4,GET,/ vulnerabilities / xss _ d / <QUERY> default ...,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Malicious,/ WWWWWWWWWWWWWWW / WWW _ W / <QUERY> WWWWWWW ...


In [5]:
benign_rows = train_df[train_df['label'] == "Benign"]
print(f"total null {train_df.isna().sum()}")
benign_rows = benign_rows[~benign_rows["url"].str.contains("/ vulnerabilities", na=False)]
print(len(benign_rows))

total null method        0
url           0
site       9636
header     9636
label         0
feature       0
dtype: int64
161730


In [16]:
def cut_before_query(row):
    if row['label'] == "Malicious" and ("/ vuln" in row['url'] or row['url'].startswith("/ <QUERY> ") or row['url'].startswith("/ vulnerabilities")):
        if '<QUERY>' in row['url']:
            url_parts = row['url'].split('<QUERY>', 1)
            row['url'] = ' <QUERY> ' + url_parts[1].strip()
        if '<QUERY>' in row['feature']:
            feat_parts = row['feature'].split('<QUERY>', 1)
            row['feature'] = ' <QUERY> ' + feat_parts[1].strip()
    return row

train_df = train_df.apply(cut_before_query, axis=1)


In [17]:
print(train_df['url'].head())
print(len(train_df[train_df['label'] == "Malicious"]))

0    / mod / quiz / attempt . php <QUERY> attempt =...
1    / pluginfile . php / 113836 / question / quest...
2     <QUERY> id = - 7668 ) UNION ALL SELECT 1839 ,...
3                                / uploads . rar <EOS>
4     <QUERY> default = < bdo + oncut = " alert ( 1...
Name: url, dtype: object
53795


In [18]:
# Generatee randow list of benign without <EOS> in the url
from random import random, choice
benign_sample = benign_rows[~benign_rows['url'].str.contains("/ lib /", na=False)]
print(len(benign_sample))
param_to_change = ['attempt', 'id', 'message', 'page', 'default', 'submit', "Submit", 'ip']
blacklist = set(param_to_change + ["EOS", "eos", "query", "QUERY"])
benign_sample.drop_duplicates(subset=['url'], inplace=True)
for row in train_df.itertuples():
    if row.label == "Malicious" and (row.url.startswith(" <QUERY> ") or row.url.startswith("/ <QUERY> ") or row.url.startswith("/ vulnerabilities")):
        benign_n_sample = benign_sample.sample(n=1, random_state=int(random() * len(benign_sample)))
        while True:
            random_url = benign_sample.sample(n=1, random_state=int(random() * len(benign_sample)))['url'].values[0]
            alpha_words = re.findall(r'[a-zA-Z]+', random_url)
            alpha_words = [word for word in alpha_words if word not in blacklist]
            if len(alpha_words) > 0:
                break
        modified_url = row.url
        for param in param_to_change:
            if param in train_df.at[row.Index, 'url']:
                modified_url = modified_url.replace(param, choice(alpha_words),1)
        benign_n_sample_url = benign_n_sample.iloc[0]['url']
        cleaned_benign = re.sub(r"(?i)<query>.*?<eos>", "", benign_n_sample_url, flags=re.IGNORECASE)
        train_df.at[row.Index, 'url'] = cleaned_benign.replace("<EOS>","") + modified_url
train_df['url'] = train_df['url'].str.replace("<EOS>", "", regex=False, case=False)
train_df['feature'] = train_df['url'].apply(feature_extract)
train_df.drop_duplicates(subset=['url'], inplace=True)

149264


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  benign_sample.drop_duplicates(subset=['url'], inplace=True)


In [19]:
print(train_df['url'].head())
print(len(train_df[train_df['label'] == "Malicious"]))

0    / mod / quiz / attempt . php <QUERY> attempt =...
1    / pluginfile . php / 113836 / question / quest...
2    / mod / quiz / attempt . php  <QUERY> cmid = -...
3                                     / uploads . rar 
4    / mod / quiz / attempt . php  <QUERY> jpg = < ...
Name: url, dtype: object
53795


In [20]:
train_df.to_csv('csv_dataset_preprocessed_2_imbalanced.csv', index=False)