In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('C:/Users/angel/OneDrive/Documents/SDP/malicious_phish.csv/urldata.csv/urldata.csv')
df.head(2)

Unnamed: 0.1,Unnamed: 0,url,label,result
0,0,https://www.google.com,benign,0
1,1,https://www.youtube.com,benign,0


In [3]:
df.result.unique()

array([0, 1], dtype=int64)

In [4]:
df.result.value_counts()

result
0    345738
1    104438
Name: count, dtype: int64

In [6]:
from urllib.parse import urlparse
from tld import get_tld
from tldextract import extract
import tldextract
import re

In [7]:
suspicious_keywords = [
    "login", "secure", "verify", "account", "update",
    "bank", "free", "offer", "password", "support"
]

trusted_brands = [
    "paypal", "google", "amazon", "facebook",
    "microsoft", "apple", "gmail", "chatgpt"
]

trusted_domains = [
    "google.com", "mail.google.com", "paypal.com",
    "facebook.com", "microsoft.com", "apple.com",
    "gmail.com", "chatgpt.com"
]

def makeTokens(url):
    url = url.lower().strip()
    url = re.sub(r"\[\.\]", ".", url)  # Normalize obfuscation

    parsed_url = urlparse(url)
    ext = tldextract.extract(url)

    subdomain, main_domain, suffix = ext.subdomain, ext.domain, ext.suffix
    path = parsed_url.path

    full_domain = f"{subdomain}.{main_domain}.{suffix}".strip(".")
    base_domain = f"{main_domain}.{suffix}"

    # Tokenize the URL
    tokens = re.split(r'[/.\-]', subdomain + " " + main_domain + " " + path)
    tokens = [t for t in tokens if t and t not in ["com", "net", "org", "www", suffix]]

    # Flag suspicious if not trusted
    if base_domain not in trusted_domains and full_domain not in trusted_domains:
        if any(brand in subdomain for brand in trusted_brands):
            tokens.append("fake_brand")
        if any(word in path for word in suspicious_keywords):
            tokens.append("suspicious_path")

    return " ".join(set(tokens))

In [8]:
df['url'] = df['url'].astype(str)  # Convert all values to strings
df['url'] = df['url'].str.replace(r"\[\.\]", ".", regex=True).str.strip()  # Replace "[.]" with "."
df = df[df['url'].str.strip() != '.']  # Remove empty or single dot entries
df = df.dropna()  # Drop NaN values

In [9]:
df['url'] = df['url'].apply(makeTokens)

In [10]:
df.tail(3)

Unnamed: 0.1,Unnamed: 0,url,label,result
450173,450173,appmanager js portailas infortis assure_somtc=...,malicious,1
450174,450174,atualizapj,malicious,1
450175,450175,portal bancasaleon writeassociate inicio io8...,malicious,1


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['url'])

In [13]:
'''import joblib

joblib.dump(vectorizer, "vectorizer.pkl")'''

'import joblib\n\njoblib.dump(vectorizer, "vectorizer.pkl")'

In [14]:
'''joblib.load("vectorizer.pkl")'''

'joblib.load("vectorizer.pkl")'

In [15]:
feature =  X
target = df['result']

In [16]:
print(feature.shape, target.shape)

(450176, 403159) (450176,)


In [17]:
target.value_counts()

result
0    345738
1    104438
Name: count, dtype: int64

In [18]:
print(df['result'].isnull().sum())

0


In [19]:
from imblearn.over_sampling import SMOTE

In [20]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(feature, target)

In [21]:
y_resampled.value_counts()

result
0    345738
1    345738
Name: count, dtype: int64

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
x_train,x_test,y_train,y_test = train_test_split(X_resampled,y_resampled,test_size=0.2,random_state=42)

In [24]:
import xgboost as xgb

In [25]:
model = xgb.XGBClassifier(eval_metric='logloss')
model.fit(x_train, y_train)

accuracy = model.score(x_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9441


In [26]:
accuracy = model.score(x_test, y_test)
print(accuracy)

0.944062011916469


In [27]:
from sklearn.metrics import classification_report, confusion_matrix

confusion = confusion_matrix(y_test, model.predict(x_test))
print(confusion)

[[64501  4562]
 [ 3174 66059]]


In [39]:
def predict(url):
    sample_url = pd.Series([url])
    sample_feature = sample_url.apply(makeTokens).tolist() 

    sample_feature = pd.DataFrame(sample_feature)

    sample_feature = vectorizer.transform(sample_feature[0])
    # Make prediction
    predicted_probabilities = model.predict(sample_feature)  
    predicted_label = ["malicious" if p > 0.3 else "benign" for p in predicted_probabilities]
    return predicted_label[0] , predicted_probabilities[0]
predict("https://hianimez.to/watch/case-closed-323?ep=6274")

('malicious', 1)

In [29]:
model.save_model("malicious.json") 

In [30]:
loaded_model = xgb.XGBClassifier()
loaded_model.load_model("malicious.json")

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
lr = LogisticRegression(max_iter=1000)
lr.fit(x_train, y_train)
accuracy = lr.score(x_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9742


In [33]:
lr_pred = lr.predict(x_test)

In [34]:
confusion = confusion_matrix(y_test, lr_pred)
print(confusion)

[[66963  2100]
 [ 1462 67771]]


In [41]:
def predict(url):
    sample_url = pd.Series([url])
    sample_feature = sample_url.apply(makeTokens)  # Ensure this outputs a string, not a list

    sample_feature = vectorizer.transform(sample_feature)  # Transform the text

    # Get probabilities
    predicted_probabilities = lr.predict_proba(sample_feature)[:, 1]  # Probability of being "malicious"
    
    # Classify based on threshold
    predicted_label = "malicious" if predicted_probabilities[0] > 0.6 else "benign"


    
    return predicted_label, predicted_probabilities[0]

# Example usage:
predict("https://hianimez.to/watch/case-closed-323?ep=6274")

('benign', 0.31660515835385505)

In [36]:
import joblib
joblib.dump(lr, "logistic_model.pkl")

['logistic_model.pkl']

In [37]:
loaded_model = joblib.load("logistic_model.pkl")
loaded_model