In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('C:/Users/angel/OneDrive/Documents/SDP/malicious_phish.csv/urldata.csv/urldata.csv')
df.head(2)

Unnamed: 0.1,Unnamed: 0,url,label,result
0,0,https://www.google.com,benign,0
1,1,https://www.youtube.com,benign,0


In [3]:
df.result.unique()

array([0, 1], dtype=int64)

In [4]:
df.result.value_counts()

result
0    345738
1    104438
Name: count, dtype: int64

In [5]:
%pip install tld




In [6]:
from urllib.parse import urlparse
from tld import get_tld
from tldextract import extract
import tldextract
import re

In [7]:
suspicious_keywords = ["login", "secure", "verify", "account", "update", "bank", "free", "offer", "password", "support"]
trusted_brands = ["paypal", "google", "amazon", "facebook", "microsoft", "apple"]

def makeTokens(url):
    url = url.lower().strip()
    url = re.sub(r"\[\.\]", ".", url)  # Normalize [.] obfuscation

    parsed_url = urlparse(url)
    ext = tldextract.extract(url)

    subdomain, main_domain, suffix = ext.subdomain, ext.domain, ext.suffix
    path = parsed_url.path

    # Tokenize subdomain, domain, and path
    tokens = re.split(r'[/.\-]', subdomain + " " + main_domain + " " + path)
    tokens = [t for t in tokens if t and t not in ["com", "net", "org", "www", suffix]]

    # Flag if subdomain contains a trusted brand but is not the official domain
    if main_domain not in trusted_brands and any(brand in subdomain for brand in trusted_brands):
        tokens.append("fake_brand")

    # Flag URLs containing phishing keywords
    if any(word in path for word in suspicious_keywords):
        tokens.append("suspicious_path")

    return " ".join(set(tokens))

In [8]:
df['url'] = df['url'].astype(str)  # Convert all values to strings
df['url'] = df['url'].str.replace(r"\[\.\]", ".", regex=True).str.strip()  # Replace "[.]" with "."
df = df[df['url'].str.strip() != '.']  # Remove empty or single dot entries
df = df.dropna()  # Drop NaN values

In [9]:
df['url'] = df['url'].apply(makeTokens)

In [10]:
df.tail(3)

Unnamed: 0.1,Unnamed: 0,url,label,result
450173,450173,minicolors plugins faboleena appmanager 018a...,malicious,1
450174,450174,atualizapj,malicious,1
450175,450175,writeassociate bancasaleon bhdi do inicio te...,malicious,1


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['url'])

In [13]:
feature =  X
target = df['result']

In [14]:
print(feature.shape, target.shape)

(450176, 403159) (450176,)


In [15]:
target.value_counts()

result
0    345738
1    104438
Name: count, dtype: int64

In [16]:
print(df['result'].isnull().sum())

0


In [17]:
from imblearn.over_sampling import SMOTE

In [18]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(feature, target)

In [19]:
y_resampled.value_counts()

result
0    345738
1    345738
Name: count, dtype: int64

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
x_train,x_test,y_train,y_test = train_test_split(X_resampled,y_resampled,test_size=0.2,random_state=42)

In [22]:
import xgboost as xgb

In [23]:
model = xgb.XGBClassifier(eval_metric='logloss')
model.fit(x_train, y_train)

accuracy = model.score(x_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9436


In [24]:
print(accuracy)

0.9436498524903106


In [40]:
def predict(url):
    sample_url = pd.Series([url])
    sample_feature = sample_url.apply(makeTokens).tolist() 

    sample_feature = pd.DataFrame(sample_feature)

    sample_feature = vectorizer.transform(sample_feature[0])
    # Make prediction
    predicted_probabilities = model.predict(sample_feature)  
    predicted_label = ["malicious" if p > 0.3 else "benign" for p in predicted_probabilities]
    return predicted_label[0] , predicted_probabilities[0]
predict("http://anuoluwapoegbedayo.org/secure/zigi.securities/")

('malicious', 1)

In [None]:
#model.save_model("malicious.json") 