# Malicious URL CLassification

In [30]:
import pandas as pd
import numpy as np
import re

# from matplotlib.colors import hex2color
# import matplotlib.pyplot as plt

from gensim.models import Word2Vec
from nltk import ngrams

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB

import joblib



## Import the dataset

In [2]:
DATASET_FILE = "./datasets/malicious_phish.csv"

df = pd.read_csv(DATASET_FILE)

df.rename(columns={"type": "target"}, inplace=True)
df['target'] = df['target'].map(lambda x: 'malicious' if x != 'benign' else 'benign')

df["target"].value_counts()


In [5]:
distribution = dict(df["target"].value_counts())

# Set up the figure with a white background
fig, ax = plt.subplots(facecolor='white')

dull_red = hex2color('#E34234')
dull_green = hex2color('#32712C')

# Plotting the pie chart
ax.pie(distribution.values(), labels=distribution.keys(), colors=[dull_green, dull_red], autopct='%1.1f%%', startangle=90)
ax.set_title('Class Distribution')
ax.axis('equal')  # Equal aspect ratio ensures the pie chart is circular.

# Show the pie chart
plt.show()

NameError: name 'plt' is not defined

## URL Tokenization and Word Embedding with Word2Vec

In [8]:
def preprocess_url(url):
    # Remove sequential strings following '#' and '?'
    url = re.sub(r"#.*", "", url)
    url = re.sub(r"\?.*", "", url)

    # Split the URL into protocol and rest (domain, path and file)
    protocol, rest = url.split("://", 1) if "://" in url else ("", url)

    # Split the rest into domain and rest (path and file)
    domain, rest = rest.split("/", 1) if "/" in rest else (rest, "")
    
    # Split the rest into path and file
    path, file = rest.rsplit("/", 1) if "/" in rest else (rest, "")

    return [protocol, domain, path, file]

# Token extraction
tokenized_urls = [preprocess_url(url) for url in df['url'].tolist()]

# Train Word2Vec model
model = Word2Vec(sentences=tokenized_urls, vector_size=100, window=5, min_count=1, workers=4)
model.save("models/word2vec_model.model")

In [9]:
model = Word2Vec.load("models/word2vec_model.model")

In [10]:
# Function to get the aggregated embedding for a URL
def get_url_embedding(url_tokens, model):
    embeddings = [model.wv[token] for token in url_tokens if token in model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

# Generate embeddings for each URL
url_embeddings = [get_url_embedding(url_token, model) for url_token in tokenized_urls]

[['', 'br-icloud.com.br', '', ''], ['', 'mp3raid.com', 'music', 'krizz_kaliko.html'], ['', 'bopsecrets.org', 'rexroth/cr', '1.htm'], ['http', 'www.garage-pirenne.be', 'index.php', ''], ['http', 'adventure-nicaragua.net', 'index.php', ''], ['http', 'buzzfil.net', 'm/show-art', 'ils-etaient-loin-de-s-imaginer-que-le-hibou-allait-faire-ceci-quand-ils-filmaient-2.html'], ['', 'espn.go.com', 'nba/player/_/id/3457', 'brandon-rush'], ['', 'yourbittorrent.com', '', ''], ['http', 'www.pashminaonline.com', 'pure-pashminas', ''], ['', 'allmusic.com', 'album', 'crazy-from-the-heat-r16990']]
[array([-0.02841005, -1.3570445 ,  2.583899  ,  2.4544125 , -2.8078327 ,
        0.2890032 ,  2.982779  ,  0.3699581 ,  0.3969229 , -0.39165783,
       -0.26323265, -0.6882718 , -1.9151151 ,  0.35113522, -1.1075875 ,
       -0.59009653, -2.071853  , -2.0503783 ,  0.27332672,  3.3190815 ,
        2.5255659 ,  1.2138379 ,  0.5392667 ,  3.108008  ,  0.25414407,
        0.9086647 , -0.9959712 , -2.5154858 , -0.9000

In [28]:
X_train, X_test, y_train, y_test = train_test_split(url_embeddings, df['target'].tolist(), test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# Print the basic statistics of the training set
data_counts = {
    "train": pd.Series(y_train).value_counts().to_list(),
    "test": pd.Series(y_test).value_counts().to_list(),
    "validation": pd.Series(y_val).value_counts().to_list(),
    "total": pd.Series(df['target'].tolist()).value_counts().to_list(),
}
pd.DataFrame(data_counts, index=["benign", "malicious"])

Unnamed: 0,train,test,validation,total
benign,342325,42748,43030,428103
malicious,178627,22371,22090,223088


In [12]:
ss = StandardScaler(copy=True, with_mean=True, with_std=True)

# Fit the scaler on the training data
ss.fit(X_train)

# Transform the training, validation, and testing data
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)
X_val = ss.transform(X_val)

## Naive Bayes Classifier

In [31]:
nb_classifier = GaussianNB()

nb_classifier.fit(X_train, y_train)

joblib.dump(nb_classifier, "models/nb_classifier.joblib")

['models/nb_classifier.joblib']

In [32]:
nb_classifier = joblib.load("models/nb_classifier.joblib")

nb_classifier.score(X_test, y_test)

0.8245673305794008

In [33]:
y_pred = nb_classifier.predict(X_test)

print(classification_report(y_test, y_pred, target_names=["benign", "malicious"], zero_division=0, digits=4))

              precision    recall  f1-score   support

      benign     0.8530    0.8854    0.8689     42748
   malicious     0.7638    0.7083    0.7350     22371

    accuracy                         0.8246     65119
   macro avg     0.8084    0.7969    0.8020     65119
weighted avg     0.8223    0.8246    0.8229     65119



## Random Forest Classifier

In [24]:
rf_classifier = RandomForestClassifier(n_estimators=50, n_jobs=10)

rf_classifier.fit(X_train, y_train)

joblib.dump(rf_classifier, "models/rf_classifier.joblib")

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


building tree 1 of 50building tree 2 of 50

building tree 3 of 50
building tree 4 of 50
building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50


[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:  3.1min


building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 50
building tree 45 of 50
building tree 46 of 50
building tree 47 of 50
building tree 48 of 50
building tree 49 of 50
building tree 50 of 50


[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed:  4.7min finished


['models/rf_classifier.joblib']

In [26]:
rf_classifier = joblib.load("models/rf_classifier.joblib")

# Tune hyperparameters using X_val and y_val
rf_classifier.score(X_val, y_val)

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed:    0.2s finished


0.914004914004914

In [27]:
y_pred = rf_classifier.predict(X_test)

print(classification_report(y_test, y_pred, target_names=["benign", "malicious"], zero_division=0, digits=4))

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed:    0.0s finished


              precision    recall  f1-score   support

      benign     0.8866    0.9940    0.9373     42748
   malicious     0.9852    0.7571    0.8562     22371

    accuracy                         0.9126     65119
   macro avg     0.9359    0.8756    0.8967     65119
weighted avg     0.9205    0.9126    0.9094     65119

