## Artificial Intelligence for Cybersecurity Project
### Dataset used: "Malicious URLs dataset" by Manu Siddhartha

### Candidates: Riccardo Fantasia & Leonardo Pantani

In [1]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
# Load the raw dataset; the path is relative to the notebook's working directory.
# The cells below rely on the "url" and "type" columns of this frame.
df_raw = pd.read_csv("UNIPI-IA-dataset.csv")

In [None]:
# Summary statistics of the raw data, before any de-duplication.
df_raw.describe()

In [None]:
# Label priority: when the same URL is listed under several labels, the label
# that appears first in this list is the one that survives de-duplication.
types = ["malware", "phishing", "defacement", "benign"]

ordered_labels = pd.Categorical(df_raw["type"], categories=types, ordered=True)
df_raw["type"] = ordered_labels

# Sorting on the ordered categorical places the highest-priority label first
# for each URL, so keep="first" retains exactly that row.
df = (
    df_raw
    .sort_values(by="type")
    .drop_duplicates(subset="url", keep="first")
)

df.describe()

In [None]:
from plotly import graph_objects as go

# Number of URLs per label after de-duplication.
count = df['type'].value_counts()
colors = [
    '#FF6633', '#FFB399', '#FF33FF', '#FFFF99', '#00B3E6',
    '#E6B333', '#3366E6', '#999966', '#99FF99', '#B34D4D'
]

bar_trace = go.Bar(x=count.index, y=count, marker=dict(color=colors))
fig = go.Figure(data=[bar_trace])

# Dark theme: black canvas with white text everywhere.
dark_theme = dict(
    plot_bgcolor='black',
    paper_bgcolor='black',
    font=dict(color='white'),
)
fig.update_layout(
    title='Count of Different Types of URLs',
    xaxis_title='Types',
    yaxis_title='Count',
    **dark_theme,
)
white_ticks = dict(color='white')
fig.update_xaxes(tickfont=white_ticks)
fig.update_yaxes(tickfont=white_ticks)
fig.show()

Estraiamo le 23 feature da ogni URL dell'intero dataset.

In [6]:
from utils.features_extractors import (extract_feature_ip_use,extract_feature_url_entropy,extract_feature_num_digits,extract_feature_url_length,extract_feature_num_query_parameters,extract_feature_num_fragments,extract_feature_num_percent20,extract_feature_num_at_signs,extract_feature_hashttp,extract_feature_hashttps,extract_feature_dot_number,extract_feature_num_www, extract_feature_directory_num,extract_feature_embed_domain_number,extract_feature_suspiciousurl,extract_feature_count_percent,extract_feature_count_dash,extract_feature_count_equal,extract_feature_is_shortened,extract_feature_hostname_length,extract_feature_first_directory_length,extract_feature_top_level_domain_length,extract_feature_letter_count)

# Output column -> extractor applied to every URL.
# Insertion order is the order in which the columns are appended to df.
feature_extractors = {
    'feature_ip_use': extract_feature_ip_use,
    'feature_url_entropy': extract_feature_url_entropy,
    'feature_num_digits': extract_feature_num_digits,
    'feature_url_length': extract_feature_url_length,
    'feature_num_query_parameters': extract_feature_num_query_parameters,
    'feature_num_fragments': extract_feature_num_fragments,
    'feature_num_percent20': extract_feature_num_percent20,
    'feature_num_at_signs': extract_feature_num_at_signs,
    'feature_hashttp': extract_feature_hashttp,
    'feature_hashttps': extract_feature_hashttps,
    'feature_dot_number': extract_feature_dot_number,
    'feature_num_www': extract_feature_num_www,
    'feature_directory_num': extract_feature_directory_num,
    'feature_embed_domain_number': extract_feature_embed_domain_number,
    'feature_suspiciousurl': extract_feature_suspiciousurl,
    'feature_count_percent': extract_feature_count_percent,
    'feature_count_dash': extract_feature_count_dash,
    'feature_count_equal': extract_feature_count_equal,
    'feature_is_shortened': extract_feature_is_shortened,
    'feature_hostname_length': extract_feature_hostname_length,
    'feature_first_directory_length': extract_feature_first_directory_length,
    'feature_top_level_domain_length': extract_feature_top_level_domain_length,
    'feature_letter_count': extract_feature_letter_count,
}

for column_name, extractor in feature_extractors.items():
    df[column_name] = df['url'].apply(extractor)

Mostriamo una HeatMap di correlazione tra le feature

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Filtra solo le colonne con prefisso "feature"
# Keep only the engineered columns and strip the "feature_" prefix so the
# heatmap tick labels stay short.
feature_column_names = [name for name in df.columns if name.startswith('feature')]
features_df = df[feature_column_names].rename(
    columns=lambda name: name.replace('feature_', '')
)

# Pairwise correlation between features, drawn as an annotated heatmap.
correlation_matrix = features_df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar=True,
    square=True,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Heatmap delle Correlazioni tra le Feature", fontsize=16)
plt.show()

Data la forte correlazione tra:
- embed_domain_number <-> hashttp
- count_equal <-> num_query_parameters
- url_length <-> letter_count

... e data la stretta somiglianza logica delle funzioni abbiamo deciso di rimuovere:
- embed_domain_number
- count_equal
- url_length

In [None]:
# Scatter plot di count_equal <-> num_query_parameters
# One scatter plot per strongly correlated pair:
# (x column, y column, label shown for the x feature, marker color).
correlated_pairs = [
    ('count_equal', 'num_query_parameters', 'count_equals', 'orange'),
    ('url_length', 'letter_count', 'url_length', 'green'),
]

for x_column, y_column, x_label, marker_color in correlated_pairs:
    scatter_fig, scatter_ax = plt.subplots(figsize=(8, 6))
    scatter_ax.scatter(features_df[x_column], features_df[y_column], alpha=0.7, color=marker_color)
    scatter_ax.set_title(f'Scatter Plot: {x_label} vs {y_column}')
    scatter_ax.set_xlabel(x_label)
    scatter_ax.set_ylabel(y_column)
    scatter_ax.grid(True)
    plt.show()

Facciamo un box plot per ogni feature

In [None]:
# Features for which a box plot is drawn.
features = [
    'feature_url_entropy',
    'feature_num_digits',
    'feature_url_length',
    'feature_num_query_parameters',
    'feature_num_fragments',
    'feature_num_percent20',
    'feature_num_at_signs',
    'feature_dot_number',
    'feature_num_www',
    'feature_directory_num',
    'feature_embed_domain_number',
    'feature_count_percent',
    'feature_count_dash',
    'feature_count_equal',
    'feature_hostname_length',
    'feature_first_directory_length',
    'feature_top_level_domain_length',
    'feature_letter_count',
]

# One thin horizontal box plot per feature, with the mean marked.
for feature in features:
    box_fig, box_ax = plt.subplots(figsize=(12, 1.5))
    box_ax.boxplot(df[feature].dropna(), vert=False, patch_artist=True, showmeans=True)
    box_ax.set_title(f'Box Plot of {feature}', fontsize=10)
    box_ax.set_xlabel(feature, fontsize=8)
    box_ax.yaxis.set_visible(False)
    box_ax.grid(axis='x', linestyle='--', alpha=0.7)
    box_fig.tight_layout()
    plt.show()


Rimuoviamo le seguenti feature:
- embed_domain_number
- count_equal
- url_length

Testeremo successivamente allenando il modello di classificazione sul dataset originale e su quello ripulito dalle suddette feature, per valutare l'effettivo miglioramento in termini di classificazione.

In [10]:
# Remove the three features that are redundant with another feature
# (see the correlation heatmap above).
redundant_features = ["feature_embed_domain_number", "feature_count_equal", "feature_url_length"]
# errors="ignore" keeps this cell idempotent: df is overwritten here, so
# re-running the cell would otherwise raise KeyError on the missing columns.
df = df.drop(columns=redundant_features, errors="ignore")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Filtra solo le colonne con prefisso "feature"
# Rebuild the feature-only view from the current df and strip the "feature_"
# prefix so the heatmap tick labels stay short.
feature_column_names = [name for name in df.columns if name.startswith('feature')]
features_df = df[feature_column_names].rename(
    columns=lambda name: name.replace('feature_', '')
)

# Pairwise correlation between the remaining features, drawn as an annotated heatmap.
correlation_matrix = features_df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar=True,
    square=True,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Heatmap delle Correlazioni tra le Feature", fontsize=16)
plt.show()

Suddividiamo il dataset in training e test set con un rapporto 80% e 20%.

In [None]:
from sklearn.model_selection import train_test_split
from utils.utils import printInfo

train_ratio = 0.80
test_ratio = 0.20

# Model inputs are every column except the label and the raw URL string.
split_features = df.drop(columns=["type", "url"])
split_labels = df["type"]

# Stratify on the label so both partitions keep the same class proportions.
# random_state pins the shuffle so that "Restart & Run All" reproduces the
# same split (same seed as the under-sampler used further down).
x_train_unbalanced, x_test, y_train_unbalanced, y_test = train_test_split(
    split_features,
    split_labels,
    test_size=test_ratio,
    shuffle=True,
    stratify=split_labels,
    random_state=42,
)

# printInfo comes from utils.utils (not visible here); presumably it prints
# the class distribution of the given labels. TODO confirm
printInfo("training", y_train_unbalanced)
printInfo("test", y_test)

Un aspetto che notiamo dall'output soprastante è lo sbilanciamento, in termini di numero di sample, tra le classi malevole e la classe benigna.

In [None]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

def balance_data_undersample_benign(x_train, y_train):
    """Randomly under-sample the "benign" class down to the size of all other classes combined.

    Args:
        x_train: training features, in any form accepted by
            ``RandomUnderSampler.fit_resample``.
        y_train: iterable of class labels aligned with ``x_train``.

    Returns:
        A ``(x_resampled, y_resampled)`` tuple. When "benign" already has fewer
        samples than the other classes combined there is nothing to remove and
        the inputs are returned unchanged.
    """
    class_counts = Counter(y_train)
    benign_count = class_counts.get("benign", 0)
    other_classes_count = sum(
        count for label, count in class_counts.items() if label != "benign"
    )

    # Under-sampling can only remove samples: asking for more benign rows than
    # are available (or for a class that is absent) would make the sampler fail.
    if benign_count < other_classes_count:
        return x_train, y_train

    rus = RandomUnderSampler(sampling_strategy={"benign": other_classes_count}, random_state=42)
    x_train_resampled, y_train_resampled = rus.fit_resample(x_train, y_train)
    return x_train_resampled, y_train_resampled

# Balance only the training partition; the test set keeps its original distribution.
balanced_training_split = balance_data_undersample_benign(x_train_unbalanced, y_train_unbalanced)
x_train, y_train = balanced_training_split
printInfo("training bilanciato", y_train)

Si vorrebbe fare uno scaling dei dati in modo da avere circa media 0 e varianza 1.

Poiché la classificazione viene eseguita mantenendo gli outlier, si procede inizialmente con una normalizzazione tramite RobustScaler, che centra i dati sulla mediana e li scala con il range interquartile, risultando quindi meno sensibile agli outlier. Dopodiché si procederà con una Z-Normalization (StandardScaler).

In [14]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training data only, then apply the same
# transformation to both partitions.
scaler = RobustScaler().fit(x_train)

# Wrap the scaled arrays back into DataFrames so that column names and row
# indices stay aligned with the unscaled data.
x_train_scaled_robust = pd.DataFrame(
    scaler.transform(x_train),
    columns=x_train.columns,
    index=x_train.index,
)
x_test_scaled_robust = pd.DataFrame(
    scaler.transform(x_test),
    columns=x_test.columns,
    index=x_test.index,
)


In [15]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training data only, then apply the same
# transformation to both partitions.
scaler = StandardScaler().fit(x_train)

# Wrap the scaled arrays back into DataFrames so that column names and row
# indices stay aligned with the unscaled data.
x_train_scaled_standard = pd.DataFrame(
    scaler.transform(x_train),
    columns=x_train.columns,
    index=x_train.index,
)
x_test_scaled_standard = pd.DataFrame(
    scaler.transform(x_test),
    columns=x_test.columns,
    index=x_test.index,
)

# CLASSIFICATORI

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Trained on the unscaled, balanced training set.
# A fixed random_state makes the forest, and therefore every metric computed
# from it, reproducible across runs.
classifier_randomforest = RandomForestClassifier(random_state=42)
classifier_randomforest.fit(x_train, y_train)

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import numpy as np

y_pred = classifier_randomforest.predict(x_test)
y_pred_proba = classifier_randomforest.predict_proba(x_test)

# Take the label order from the fitted model instead of hard-coding it:
# scikit-learn keeps classes sorted, so a hand-written list in a different
# order attaches the wrong class name to report rows and matrix axes.
class_labels = [str(label) for label in classifier_randomforest.classes_]

report = classification_report(y_test, y_pred, labels=class_labels, target_names=class_labels)
print(report)

# The columns of predict_proba follow classes_, so pass the same order explicitly.
auc_score = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', labels=class_labels)
print(f"AUC Score: {auc_score:.4f}")

# Rows are the true classes and columns the predicted ones, both in class_labels order.
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Classe predetta')
plt.ylabel('Classe reale')
plt.title('Matrice di Confusione')
plt.show()

## LOGISTIC REGRESSION
Data la natura binaria di questo classificatore, raggruppiamo le classi phishing, defacement e malware in un'unica classe, che considereremo come "maligna" (`malignant`).

In [None]:
# Raggruppa le classi in 'malignant' e 'benign'
# Collapse the three malicious labels into a single 'malignant' class;
# 'benign' maps to itself.
malicious_labels = ("malware", "phishing", "defacement")
binary_label_map = {label: "malignant" for label in malicious_labels}
binary_label_map["benign"] = "benign"

# y_train and y_test are overwritten here, so from this cell onwards they hold
# binary labels and no longer the four original classes.
y_train = y_train.replace(binary_label_map)
y_test = y_test.replace(binary_label_map)

printInfo("training", y_train)
printInfo("test", y_test)

### Con Robust Scaler

In [None]:
from sklearn.linear_model import LogisticRegression

# Binary logistic regression trained on the RobustScaler-transformed features.
classifier_logisticregression = LogisticRegression().fit(x_train_scaled_robust, y_train)
classifier_logisticregression

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import numpy as np

# Display names for the two classes, shared by the report and the matrix axes.
binary_class_names = ['benign', 'malignant']

y_pred = classifier_logisticregression.predict(x_test_scaled_robust)
y_pred_proba = classifier_logisticregression.predict_proba(x_test_scaled_robust)

report = classification_report(y_test, y_pred, target_names=binary_class_names)
print(report)

# The AUC is computed from the scores in the second probability column.
second_class_scores = y_pred_proba[:, 1]
auc_score = roc_auc_score(y_test, second_class_scores)
print(f"AUC Score: {auc_score:.4f}")

conf_matrix = confusion_matrix(y_test, y_pred, labels=np.unique(y_test))
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=binary_class_names,
    yticklabels=binary_class_names,
    ax=cm_ax,
)
cm_ax.set_xlabel('Classe predetta')
cm_ax.set_ylabel('Classe reale')
cm_ax.set_title('Matrice di Confusione [con RobustScaler]')
plt.show()

### Con Standard Scaler (Z-Normalization)

In [None]:
from sklearn.linear_model import LogisticRegression

# Binary logistic regression trained on the StandardScaler-transformed features.
# This replaces the model fitted on the RobustScaler data under the same name.
classifier_logisticregression = LogisticRegression().fit(x_train_scaled_standard, y_train)
classifier_logisticregression

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import numpy as np

# Display names for the two classes, shared by the report and the matrix axes.
binary_class_names = ['benign', 'malignant']

y_pred = classifier_logisticregression.predict(x_test_scaled_standard)
y_pred_proba = classifier_logisticregression.predict_proba(x_test_scaled_standard)

report = classification_report(y_test, y_pred, target_names=binary_class_names)
print(report)

# The AUC is computed from the scores in the second probability column.
second_class_scores = y_pred_proba[:, 1]
auc_score = roc_auc_score(y_test, second_class_scores)
print(f"AUC Score: {auc_score:.4f}")

conf_matrix = confusion_matrix(y_test, y_pred, labels=np.unique(y_test))
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=binary_class_names,
    yticklabels=binary_class_names,
    ax=cm_ax,
)
cm_ax.set_xlabel('Classe predetta')
cm_ax.set_ylabel('Classe reale')
cm_ax.set_title('Matrice di Confusione [con Z-Normalization]')
plt.show()