In [2]:
import pandas as pd

# Read the labelled URL dataset (path is relative to the notebook's directory)
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../data/urls_dataset.csv")
df.head(n=5)

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['URL', 'Label'], dtype='object')

In [4]:
# Encode the target as integers: 'bad' -> 1, 'good' -> 0.
# Series.map turns any value missing from the mapping into NaN.
df['Label'] = df['Label'].map(
    {'good': 0, 'bad': 1}
)

In [5]:
# Class balance of the encoded target (1 = 'bad', 0 = 'good').
df['Label'].value_counts()

Label
0    392924
1    156422
Name: count, dtype: int64

In [6]:
# Switch to lower-case column names; the rest of the notebook uses 'url' / 'label'.
df = df.rename(columns={'URL': 'url', 'Label': 'label'})

In [7]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

url      0
label    0
dtype: int64

In [8]:
import re

def extract_features(url):
    """Return five lexical features of ``url`` as a flat list.

    Feature order (the models are trained on this exact order):
        0. total length of the string
        1. number of '.' characters
        2. number of '/' characters
        3. number of special characters from the set ``- @ ? = & #``
        4. 1 if the URL uses the ``https://`` scheme, else 0

    Parameters
    ----------
    url : str
        Raw URL text, with or without a scheme.

    Returns
    -------
    list[int]
        The five features in the order listed above.
    """
    # Match the full "https://" scheme prefix, case-insensitively. A bare
    # startswith("https") would also fire on scheme-less hosts such as
    # "https-login.example.com" and would miss "HTTPS://...".
    uses_https = url.lower().startswith("https://")

    return [
        len(url),
        url.count('.'),
        url.count('/'),
        len(re.findall(r'[-@?=&#]', url)),
        1 if uses_https else 0,
    ]

In [9]:
# One feature list per URL, in row order; the target is the encoded label column.
X = [extract_features(u) for u in df['url']]
y = df['label']

In [10]:
import numpy as np
# Turn the list of per-URL feature lists into a 2-D numeric array.
X = np.asarray(X)

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% for testing; stratify on y so both partitions keep the same
# class ratio, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [23]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: logistic regression with the iteration cap raised to 1000.
lg_model = LogisticRegression(max_iter=1000)
lg_model.fit(X=X_train, y=y_train)

In [24]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Score the logistic-regression baseline on the held-out split.
y_pred = lg_model.predict(X=X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
# Confusion matrix first, then the per-class report, each on its own line.
print(
    confusion_matrix(y_true=y_test, y_pred=y_pred),
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)

Accuracy: 0.7736688814052972
[[75581  3004]
 [21863  9422]]
              precision    recall  f1-score   support

           0       0.78      0.96      0.86     78585
           1       0.76      0.30      0.43     31285

    accuracy                           0.77    109870
   macro avg       0.77      0.63      0.64    109870
weighted avg       0.77      0.77      0.74    109870



In [25]:
def predict_url_safety(url, model=None):
    """Estimate how likely ``url`` is safe vs. malicious.

    Parameters
    ----------
    url : str
        URL to score; it is featurised with ``extract_features``.
    model : object, optional
        Any fitted classifier exposing ``predict_proba``. When omitted, the
        notebook-level ``lg_model`` is used, so existing one-argument calls
        behave as before.

    Returns
    -------
    dict
        ``{"Safe %": float, "Malicious %": float}`` with both values rounded
        to two decimals. Column 0 of ``predict_proba`` is class 0 ('good'),
        column 1 is class 1 ('bad').
    """
    # Look the default up at call time so a model fitted later is picked up.
    clf = lg_model if model is None else model

    # predict_proba expects a 2-D (n_samples, n_features) input.
    features = np.array(extract_features(url)).reshape(1, -1)
    prob = clf.predict_proba(features)[0]

    # Convert to built-in floats so the result displays as plain numbers
    # instead of np.float64(...).
    return {
        "Safe %": round(float(prob[0]) * 100, 2),
        "Malicious %": round(float(prob[1]) * 100, 2)
    }

In [26]:
import joblib

# Persist the fitted logistic-regression model next to the notebook's data folder.
joblib.dump(value=lg_model, filename="../models/url_fraud_lg_model.pkl")

['../models/url_fraud_lg_model.pkl']

In [27]:
import re
import tldextract

def extract_features(url):
    """Return seven lexical features of ``url`` as a flat list.

    Feature order (the models are trained on this exact order):
        0. total length of the string
        1. number of '.' characters
        2. number of '/' characters
        3. number of special characters from the set ``- @ ? = & #``
        4. 1 if the URL uses the ``https://`` scheme, else 0
        5. length of the ``domain`` part reported by ``tldextract``
        6. 1 if ``tldextract`` reports a non-empty subdomain, else 0

    Parameters
    ----------
    url : str
        Raw URL text, with or without a scheme.

    Returns
    -------
    list[int]
        The seven features in the order listed above.
    """
    # Match the full "https://" scheme prefix, case-insensitively. A bare
    # startswith("https") would also fire on scheme-less hosts such as
    # "https-login.example.com" and would miss "HTTPS://...".
    uses_https = url.lower().startswith("https://")

    # NOTE(review): tldextract may fetch and cache the public suffix list on
    # first use -- confirm against its documentation if running offline.
    ext = tldextract.extract(url)

    return [
        len(url),                                   # URL length
        url.count('.'),                             # dots
        url.count('/'),                             # slashes
        len(re.findall(r'[-@?=&#]', url)),          # special chars
        1 if uses_https else 0,                     # https scheme flag
        len(ext.domain),                            # domain length
        1 if ext.subdomain else 0,                  # subdomain presence
    ]

In [28]:
import numpy as np

# Rebuild the feature matrix with the extended extractor (one row per URL);
# the target is unchanged.
X = np.array([extract_features(u) for u in df['url']])
y = df['label']

In [29]:
from sklearn.model_selection import train_test_split

# Same 80/20 stratified split and seed as before, now over the new feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [32]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 200 trees. class_weight='balanced' reweights the classes,
# n_jobs=-1 uses all available cores, and the seed is fixed.
rf_model = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',
    n_estimators=200,
)

rf_model.fit(X=X_train, y=y_train)

In [33]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Score the random forest on the held-out split.
y_pred = rf_model.predict(X=X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
# Confusion matrix first, then the per-class report, each on its own line.
print(
    confusion_matrix(y_true=y_test, y_pred=y_pred),
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)

Accuracy: 0.8538363520524256
[[68157 10428]
 [ 5631 25654]]
              precision    recall  f1-score   support

           0       0.92      0.87      0.89     78585
           1       0.71      0.82      0.76     31285

    accuracy                           0.85    109870
   macro avg       0.82      0.84      0.83    109870
weighted avg       0.86      0.85      0.86    109870



In [34]:
def predict_url_safety(url, model=None):
    """Estimate how likely ``url`` is safe vs. malicious.

    Parameters
    ----------
    url : str
        URL to score; it is featurised with ``extract_features``.
    model : object, optional
        Any fitted classifier exposing ``predict_proba``. When omitted, the
        notebook-level ``rf_model`` is used, so existing one-argument calls
        behave as before.

    Returns
    -------
    dict
        ``{"Safe %": float, "Malicious %": float}`` with both values rounded
        to two decimals. Column 0 of ``predict_proba`` is class 0 ('good'),
        column 1 is class 1 ('bad').
    """
    # Look the default up at call time so a reloaded or refitted model is used.
    clf = rf_model if model is None else model

    # predict_proba expects a 2-D (n_samples, n_features) input.
    features = np.array(extract_features(url)).reshape(1, -1)
    prob = clf.predict_proba(features)[0]

    # Convert to built-in floats so the result displays as plain numbers
    # instead of np.float64(...).
    return {
        "Safe %": round(float(prob[0]) * 100, 2),
        "Malicious %": round(float(prob[1]) * 100, 2)
    }

# Example call: score a single hand-written URL and display the resulting dict.
predict_url_safety("http://example-login-security-update.com")

{'Safe %': np.float64(75.93), 'Malicious %': np.float64(24.07)}

In [41]:
# Persist the fitted random forest alongside the logistic-regression artifact.
joblib.dump(value=rf_model, filename="../models/url_fraud_rf_model.pkl")

['../models/url_fraud_rf_model.pkl']

In [40]:
import joblib
# Load the saved random forest through the same notebook-relative path that
# joblib.dump writes to, rather than an absolute path on one user's machine.
# NOTE: joblib.load unpickles the file -- only load artifacts you trust.
rf_model = joblib.load("../models/url_fraud_rf_model.pkl")

In [42]:
# Reload the forest from disk and confirm the class of the restored object.
rf_model = joblib.load(filename="../models/url_fraud_rf_model.pkl")
print(rf_model.__class__)

<class 'sklearn.ensemble._forest.RandomForestClassifier'>


In [35]:
import numpy as np

# Example URL feature extraction function (same as before)
def extract_features(url):
    """Return seven lexical features of ``url`` as a flat list.

    Feature order: URL length, dot count, slash count, count of characters
    from ``- @ ? = & #``, https-scheme flag, length of the ``domain`` part
    reported by ``tldextract``, and a flag for a non-empty subdomain.

    The result is a plain list, matching the extractor used to build the
    training matrix. Returning a pre-shaped (1, n) array here would shadow
    that definition with a different shape and break the
    ``df['url'].apply(extract_features).tolist()`` cell on re-run.

    Parameters
    ----------
    url : str
        Raw URL text, with or without a scheme.

    Returns
    -------
    list[int]
        The seven features in the order listed above.
    """
    # Imported locally so this cell still runs if executed on its own.
    import re
    import tldextract

    # Match the full "https://" scheme prefix, case-insensitively, so that
    # scheme-less hosts like "https-login.example.com" are not flagged.
    uses_https = url.lower().startswith("https://")
    ext = tldextract.extract(url)

    return [
        len(url),
        url.count('.'),
        url.count('/'),
        len(re.findall(r'[-@?=&#]', url)),
        1 if uses_https else 0,
        len(ext.domain),
        1 if ext.subdomain else 0,
    ]

# Predict safety
url = "http://example-login-update.com"
# predict_proba expects a 2-D (n_samples, n_features) input, so reshape here.
features = np.array(extract_features(url)).reshape(1, -1)
# Use the random forest loaded above; no variable named `model` is defined
# anywhere in this notebook.
prob = rf_model.predict_proba(features)[0]
print("Safe %:", round(prob[0]*100,2))
print("Malicious %:", round(prob[1]*100,2))


Safe %: 22.68
Malicious %: 77.32


In [27]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

# Build and train in one step: 300 boosting stages of depth-3 trees,
# learning rate 0.1, fixed seed. fit() returns the fitted estimator itself.
gb_model = GradientBoostingClassifier(
    random_state=42,
    max_depth=3,
    learning_rate=0.1,
    n_estimators=300,
).fit(X_train, y_train)

# Predictions for the held-out split
y_pred = gb_model.predict(X=X_test)

# Overall accuracy followed by the per-class precision/recall report
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.8272776918176026
              precision    recall  f1-score   support

           0       0.83      0.95      0.89     78585
           1       0.80      0.53      0.64     31285

    accuracy                           0.83    109870
   macro avg       0.81      0.74      0.76    109870
weighted avg       0.82      0.83      0.82    109870



In [31]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only (seeded); X_test / y_test are left untouched.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X=X_train, y=y_train)

In [32]:
from sklearn.ensemble import GradientBoostingClassifier

# Same gradient-boosting configuration as before, this time fitted on the
# SMOTE-resampled training data. Note that this rebinds `gb_model`.
gb_model = GradientBoostingClassifier(
    random_state=42, max_depth=3, learning_rate=0.1, n_estimators=300
)

gb_model.fit(X=X_train_res, y=y_train_res)

In [33]:
# Predict on the original (non-resampled) test split.
y_pred = gb_model.predict(X=X_test)

# Overall accuracy followed by the per-class precision/recall report
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.790770911076727
              precision    recall  f1-score   support

           0       0.90      0.79      0.84     78585
           1       0.60      0.79      0.68     31285

    accuracy                           0.79    109870
   macro avg       0.75      0.79      0.76    109870
weighted avg       0.82      0.79      0.80    109870

