# URL clustering based on similarity

- urls_df = full dataset
- urls_tdf = full dataset transformed
- urls_tsdf = sampled dataset transformed

## Imports & Helper functions

### Save or load jupyter session

### Imports

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below.
warnings.simplefilter("ignore")

# Switch matplotlib to seaborn's default theme
sns.set()

### Helper functions

In [None]:
from polyleven import levenshtein


def levenshtein_pdist(u, v):
    """Levenshtein distance between two observations.

    Each argument may be a plain value or a NumPy array; for an array only
    its first element is compared. The unwrapped values are passed to
    ``levenshtein`` and its result is returned unchanged.
    """
    left = u[0] if isinstance(u, np.ndarray) else u
    right = v[0] if isinstance(v, np.ndarray) else v
    return levenshtein(left, right)

## Preprocessing

In [None]:
# Load the full dataset; the "url" column is read with the pandas string dtype
read_options = {"delimiter": ",", "dtype": {"url": "string"}}
urls_df = pd.read_csv(
    "datasets/kaggle_siddharta_malicious_benign.csv", **read_options
)

urls_df.head()

### Extract domain names from URLs

In [None]:
# Regex for the fully qualified domain name (FQDN): an optional scheme, an
# optional "www." prefix (named group "www"), then the run of word
# characters, dots and hyphens that forms the host (named group "domain")
pattern = r"(?:.*?:\/\/)?(?P<www>[wW]{3}\.)?(?P<domain>[\w\.\-]+)[^\w]*"

# One row per URL, one column per named group
match = urls_df["url"].str.extract(pattern)

# FQDN comes from the "domain" group; has_www flags rows where the "www"
# group matched
urls_df = urls_df.assign(
    FQDN=match["domain"],
    has_www=match["www"].notna(),
)

urls_df.head()

### Remove all addresses without domain (IPs)

In [None]:
# Pattern that matches URLs whose host is a bare IPv4 address: optional
# scheme, optional "www.", four dot-separated groups of 1-3 digits, then
# either the end of the string or a "/" or ":" followed by anything.
# Written as a raw string because "\/" and "\." are regex escapes, not valid
# Python string escapes.
pattern = r"(?:.*?:\/\/)?(?P<www>[wW]{3}\.)?[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}([/:].*)?$"

# Leave only data not containing pure IPv4. Copy the result so that the
# column assignments in later cells write to an independent frame instead of
# a slice of the original one.
urls_df = urls_df[~urls_df["url"].str.match(pattern)].copy()

### Extract features from domains

#### Separate TLD, domain and subdomain

In [None]:
import tldextract


# Function to extract components of domain using tldextract
def extract_domain_components(url):
    """Split ``url`` with ``tldextract`` into its domain parts.

    Returns a four-element ``pd.Series``: subdomain, domain, suffix, and a
    flag that is True when the extracted suffix is the empty string.
    """
    parts = tldextract.extract(url)
    suffix = parts.suffix
    has_no_suffix = suffix == ""
    return pd.Series([parts.subdomain, parts.domain, suffix, has_no_suffix])

In [None]:
# Run the extractor on every URL; the four values it returns per row are
# spread over four new columns
component_columns = ["subdomain", "domain", "TLD", "is_invalid_TLD"]
urls_df[component_columns] = urls_df["url"].apply(extract_domain_components)

In [None]:
# Preview the rows flagged as having an invalid (empty) TLD
urls_df.loc[urls_df["is_invalid_TLD"]].head()

In [None]:
# Remove domains with invalid TLD. Copy the result so that the feature
# columns added in later cells are written to an independent frame instead
# of a slice of the previous one.
urls_df = urls_df[~urls_df["is_invalid_TLD"]].copy()

#### Length of domain, subdomain and TLD

In [None]:
# Character length of each domain part, stored as "<part>_length".
# Series.map is applied per column because DataFrame.applymap is deprecated
# in recent pandas releases.
for part in ["domain", "subdomain", "TLD"]:
    urls_df[f"{part}_length"] = urls_df[part].map(len)
urls_df.head(2)

#### Number of subdomains

I decided to include www in the count of subdomains. This might cause performance issues later; reconsider if needed.

In [None]:
# Number of dot-separated labels in the subdomain; an empty subdomain counts
# as zero
urls_df["num_of_subdomains"] = urls_df["subdomain"].map(
    lambda sub: len(sub.split(".")) if sub != "" else 0
)
urls_df.head(2)

#### Characters frequency & vowel-to-consonant ratio
Characters:
- alphabetical - "a-zA-Z"
- digits - "0-9"
- special - all except alphabetical, digits and dot

Can be changed based on the occurrences of dots. It may be better to remove dots so this information is uncorrelated with `num_of_subdomains`

Try the result with and without dots to analyze the difference

In [None]:
for column in ["domain", "subdomain", "TLD"]:
    # Vowel and consonant counts (ASCII letters only)
    vowel_counts = urls_df[column].str.count(r"[aeiouAEIOU]")
    consonant_counts = urls_df[column].str.count(r"[b-df-hj-np-tv-zB-DF-HJ-NP-TV-Z]")

    # Digits, and "special" characters: anything that is not an ASCII letter,
    # a digit, whitespace or a dot
    numeric_counts = urls_df[column].str.count(r"[0-9]")
    special_counts = urls_df[column].str.count(r"[^A-Za-z0-9\s\.]")
    alpha_counts = vowel_counts + consonant_counts

    # Assign each feature column explicitly instead of relying on a Series
    # of Series being unpacked by a multi-column assignment.
    urls_df[f"{column}_alpha_count"] = alpha_counts
    urls_df[f"{column}_numeric_count"] = numeric_counts
    urls_df[f"{column}_special_count"] = special_counts
    # A value without consonants gives inf (or NaN when it has no vowels either)
    urls_df[f"{column}_vowel_consonant_ratio"] = vowel_counts / consonant_counts

urls_df.head(2)

#### Complexity of domain and subdomain

Using compression algorithm (`smaz` python implementation) to approximate Kolmogorov complexity

In [None]:
import smaz

# Compression ratio (compressed length / original length) of the domain and
# the subdomain, stored as "<part>_complexity"; empty strings give NaN.
# Series.map is applied per column because DataFrame.applymap is deprecated
# in recent pandas releases.
for part in ["domain", "subdomain"]:
    urls_df[f"{part}_complexity"] = urls_df[part].map(
        lambda s: len(smaz.compress(s)) / len(s) if s != "" else np.nan
    )

#### N-grams

In [133]:
# Columns whose values are split into character n-grams
NGRAM_COLUMNS = ["domain", "subdomain"]


def ngrams(string, n):
    """Return every contiguous window of ``n`` items of ``string``, joined.

    The windows are returned left to right as a list of strings. An input
    shorter than ``n`` yields an empty list.
    """
    last_start = len(string) - n
    grams = []
    for start in range(last_start + 1):
        window = string[start : start + n]
        grams.append("".join(window))
    return grams


# Define values for n
n_values = [2, 3]

for n in n_values:
    for col in NGRAM_COLUMNS:
        # Character n-grams of every value, stored as "<col>_n<n>grams".
        # Series.map is used per column because DataFrame.applymap is
        # deprecated in recent pandas releases; n is bound as a default
        # argument so the lambda does not depend on the loop variable.
        urls_df[f"{col}_n{n}grams"] = urls_df[col].map(lambda s, n=n: ngrams(s, n))

urls_df.head(3)

Unnamed: 0,url,type,domain,has_www,subdomain,domain_tldextract,TLD,is_invalid_TLD,domain_regex,domain_length,...,TLD_special_count,domain_vowel_consonant_ratio,subdomain_vowel_consonant_ratio,TLD_vowel_consonant_ratio,domain_complexity,subdomain_complexity,domain_n2grams,subdomain_n2grams,domain_n3grams,subdomain_n3grams
0,br-icloud.com.br,phishing,br-icloud,False,,br-icloud,com.br,False,br-icloud.com.br,9,...,0,0.6,,0.25,0.777778,,"[br, r-, -i, ic, cl, lo, ou, ud]",[],"[br-, r-i, -ic, icl, clo, lou, oud]",[]
1,mp3raid.com/music/krizz_kaliko.html,benign,mp3raid,False,,mp3raid,com,False,mp3raid.com,7,...,0,0.5,,0.5,0.857143,,"[mp, p3, 3r, ra, ai, id]",[],"[mp3, p3r, 3ra, rai, aid]",[]
2,bopsecrets.org/rexroth/cr/1.htm,benign,bopsecrets,False,,bopsecrets,org,False,bopsecrets.org,10,...,0,0.428571,,0.5,0.8,,"[bo, op, ps, se, ec, cr, re, et, ts]",[],"[bop, ops, pse, sec, ecr, cre, ret, ets]",[]


### Encode labels

In [None]:
# Create OneHotEncoded features from type

from sklearn.preprocessing import OneHotEncoder

# One boolean column per distinct value of "type"
ohenc = OneHotEncoder(sparse_output=False)
type_ohenc = pd.DataFrame(
    ohenc.fit_transform(urls_df["type"].values.reshape(-1, 1)),
    columns=ohenc.categories_[0],
    # Reuse the row labels of urls_df: rows were filtered out earlier, so a
    # fresh 0..n-1 index would not line up with them in the concat below.
    index=urls_df.index,
).astype(bool)

# URLs_transformed df: original columns plus the encoded label columns,
# joined row by row on the shared index
urls_tdf = pd.concat([urls_df, type_ohenc], axis=1)

In [None]:
# A row is flagged malicious when its "benign" flag is False
benign_flags = urls_tdf["benign"]
urls_tdf["malicious"] = ~benign_flags
urls_tdf.head()

## Data exploration

### Levenshtein distances

#### Domain unchanged

In [None]:
# Draw two independent random samples of N_SAMPLES rows (different seeds) and
# keep their "domain" values side by side, paired by position
N_SAMPLES = 50000

first_sample = urls_tdf.sample(n=N_SAMPLES, random_state=123).reset_index(drop=True)
second_sample = urls_tdf.sample(n=N_SAMPLES, random_state=545).reset_index(drop=True)

urls_tsdf = pd.DataFrame(
    {
        "sample1": first_sample["domain"],
        "sample2": second_sample["domain"],
    }
)

In [None]:
# Levenshtein distance of every (sample1, sample2) pair, one value per row
urls_tsdf["levenshtein_distance_domain"] = [
    levenshtein(first, second)
    for first, second in zip(urls_tsdf["sample1"], urls_tsdf["sample2"])
]

In [None]:
# Summary statistics of the sampled frame
distance_summary = urls_tsdf.describe()
distance_summary

In [None]:
# Histogram of all sampled distances. The whole frame is passed as data:
# indexing it with the integer distance column would be treated as a
# column-label lookup and raise a KeyError.
sns.displot(
    data=urls_tsdf,
    x="levenshtein_distance_domain",
    binwidth=3,
    height=10,
)
plt.title(
    f"Distribution of levenshtein's distances among domains over {N_SAMPLES} random samples"
)

In [None]:
# Same histogram restricted to pairs whose distance is below 75
below_75 = urls_tsdf["levenshtein_distance_domain"] < 75
sns.displot(
    data=urls_tsdf.loc[below_75],
    x="levenshtein_distance_domain",
    binwidth=3,
    height=10,
)
plt.title(
    f"Distribution of levenshtein's distances among domains over {N_SAMPLES} random samples"
)

## Experiments

### Unchanged URLs

Keep `urls_tdf` intact throughout this section so that the results shown here are for the unchanged URLs.

#### DBSCAN

DBSCAN will not work here: it needs $\mathcal{O}(n^2)$ pairwise distances.

#### Hierarchical clustering

In [None]:
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, fcluster, linkage
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances

In [None]:
# Draw one random sample of N_SAMPLES rows and renumber it from zero
N_SAMPLES = 1000

sampled_rows = urls_tdf.sample(n=N_SAMPLES, random_state=111)
urls_tsdf = sampled_rows.reset_index(drop=True)

In [None]:
# pdist expects a 2-D array, so the domains are laid out as a single column;
# levenshtein_pdist is used as the metric for every pair of rows
domain_values = urls_tsdf["domain"].values
X = domain_values.reshape(-1, 1)

# Condensed distance vector, then its square (n x n) form
distances = pdist(X, metric=levenshtein_pdist)
distances_squareform = squareform(distances)

In [None]:
# Complete-linkage hierarchical clustering on the condensed distances
Z = linkage(distances, method="complete")

# Dendrogram used to choose a cutoff visually; fig1 keeps a handle on the
# figure so that a later cell can draw on it again
fig1, ax = plt.subplots(figsize=(40, 20))
fig = fig1
dendrogram(Z, ax=ax)


plt.show()

In [None]:
CUTOFF = [42, 23, 19, 31]

# Reuse the dendrogram figure kept from the previous cell and its first axes
fig = fig1
ax = fig.axes[0]

# One dashed red horizontal line per cutoff, labelled at the left x-limit
line_style = {"color": "r", "linestyle": "--"}
for cutoff in CUTOFF:
    ax.axhline(y=cutoff, **line_style)
    ax.text(x=ax.get_xlim()[0], y=cutoff, s=f"Cutoff: {cutoff}", va="center")


# Display the annotated figure as the cell output
fig

In [None]:
# Form flat clusters by cutting the linkage tree at a fixed distance
max_d = 15  # distance threshold for the cut
clusters = fcluster(Z, t=max_d, criterion="distance")

In [None]:
# Store the flat-cluster label of every sampled row in a "cluster" column
urls_tsdf = urls_tsdf.assign(cluster=clusters)

#### Evaluation of cluster quality based on cutoff

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score computed from the precomputed square distance matrix
cluster_labels = urls_tsdf["cluster"]
silhouette_avg = silhouette_score(
    distances_squareform, cluster_labels, metric="precomputed"
)
print(f"Silhouette score: {silhouette_avg}")

# Both per-cluster statistics are derived from one grouping
by_cluster = urls_tsdf.groupby("cluster")

# share of malicious domains in each cluster
cluster_prevalence = by_cluster["malicious"].mean()

# number of non-null URLs in each cluster
cluster_counts = by_cluster["url"].count()

# one row per cluster with its size and malicious prevalence
cluster_data = pd.concat(
    [cluster_counts.rename("count"), cluster_prevalence.rename("prevalence")],
    axis=1,
)

# keep only mixed clusters, i.e. drop the purely benign (0) and purely
# malicious (1) ones
is_pure = cluster_data["prevalence"].isin([0, 1])
non_trivial_clusters = cluster_data.loc[~is_pure]

print(f"Total count of samples {len(urls_tsdf)}")
print(f"Total count of clusters {len(cluster_data)}")
print(
    f"Count of samples in perfect clusters {len(urls_tsdf) - non_trivial_clusters['count'].sum()}"
)

print(
    f"Prevalence of non-perfect malicious domains within clusters:\n{non_trivial_clusters}"
)

In [None]:
# reset the index to get the cluster id as a column
cluster_data = cluster_data.reset_index()

# Continuous colormap used to encode the cluster size
colors = sns.color_palette("viridis", as_cmap=True)

# Bar plot of prevalence per cluster; each bar is coloured by the cluster's
# size relative to the largest cluster
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    x="cluster",
    y="prevalence",
    data=cluster_data,
    palette=colors(cluster_data["count"] / cluster_data["count"].max()),
    ax=ax,
    dodge=False,
)

# Set labels and title
ax.set_title("Prevalence of Malicious Domains by Cluster")
ax.set_xlabel("Cluster Number")
ax.set_ylabel("Prevalence")

# Gradient colour bar acting as the legend for cluster size
sm = plt.cm.ScalarMappable(
    cmap=colors, norm=plt.Normalize(vmin=0, vmax=cluster_data["count"].max())
)
sm.set_array([])
# The mappable is not attached to any Axes, so the Axes to take space from
# must be named explicitly.
cbar = plt.colorbar(
    sm,
    ax=ax,
    orientation="horizontal",
    pad=0.1,
    shrink=0.5,
    aspect=15,
)
cbar.ax.set_xlabel("Cluster Size")

plt.subplots_adjust(right=0.8)

plt.tight_layout()

plt.show()

In [None]:
# Non-null value count of every column, per cluster
per_cluster = urls_tsdf.groupby("cluster")
per_cluster.count()