In [2]:
import re
import socket
import urllib.parse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectFromModel
from datetime import datetime
import requests

In [1]:
!pip install flask




[notice] A new release of pip is available: 23.3.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
df = pd.read_csv('dataset_phishing.csv')

In [4]:
df.isnull().sum()

url                0
length_url         0
length_hostname    0
ip                 0
nb_dots            0
                  ..
web_traffic        0
dns_record         0
google_index       0
page_rank          0
status             0
Length: 89, dtype: int64

In [5]:
df.head()

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate


In [7]:
# Handle missing values (if any)
df.fillna(method='ffill', inplace=True)

# Encode categorical variables (if any)
# Encode the target variable
label_encoder = LabelEncoder()
df['status'] = label_encoder.fit_transform(df['status'])

# Split the data into features and target
# Select features and target
features = df[['length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at', 'nb_qm', 'nb_and', 'nb_or',
                 'domain_in_title', 'domain_with_copyright', 'whois_registered_domain', 'domain_registration_length',
                 'domain_age', 'web_traffic', 'dns_record', 'google_index', 'page_rank']]
target = df['status']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [8]:
# Create the XGBoost model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss',random_state=42)

model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

Parameters: { "use_label_encoder" } are not used.



In [9]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.952755905511811
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      1157
           1       0.95      0.95      0.95      1129

    accuracy                           0.95      2286
   macro avg       0.95      0.95      0.95      2286
weighted avg       0.95      0.95      0.95      2286



In [10]:
from sklearn.metrics import confusion_matrix

# Ensure predictions and true labels are from the same test set
y_pred = model.predict(X_test)

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)


[[1102   55]
 [  53 1076]]


In [11]:
# Define the parameter grid
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f'Best parameters: {best_params}')
print(f'Best cross-validation score: {best_score}')

Parameters: { "use_label_encoder" } are not used.



Best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300}
Best cross-validation score: 0.9492563429571304


In [12]:
def extract_features(url):
    parsed_url = urllib.parse.urlparse(url)
    hostname = parsed_url.netloc
    path = parsed_url.path

    # Example feature extraction logic
    features = {
        'length_url': len(url),
        'length_hostname': len(hostname),
        'ip': 1 if re.match(r"^\d{1,3}(\.\d{1,3}){3}$", hostname) else 0,
        'nb_dots': url.count('.'),
        'nb_hyphens': url.count('-'),
        'nb_at': url.count('@'),
        'nb_qm': url.count('?'),
        'nb_and': url.count('&'),
        'nb_or': url.count('|'),
        'domain_in_title': 1,  # Placeholder
        'domain_with_copyright': 1,  # Placeholder
        'whois_registered_domain': 1,  # Placeholder
        'domain_registration_length': 365,  # Placeholder
        'domain_age': 365,  # Placeholder
        'web_traffic': 1,  # Placeholder
        'dns_record': 1 if socket.gethostbyname(hostname) else 0,
        'google_index': 1,  # Placeholder
        'page_rank': 1,  # Placeholder
    }

    # Add missing features with default values (0)
    required_features = [
        'length_url', 'length_hostname', 'ip', 'nb_dots',
        # Add all other feature names here
        'web_traffic', 'dns_record', 'google_index', 'page_rank'
    ]

    for feature in required_features:
        if feature not in features:
            features[feature] = 0

    return features


In [13]:
# Extract features from the URL
url = 'https://shadetreetechnology.com/V4/validation/a111aedc8ae390eabcfa130e041a10a4'
features = extract_features(url)

# Convert the features dictionary to a DataFrame
features_algo = pd.DataFrame([features])

# Reorder columns to match the training data format
required_columns = ['length_url', 'length_hostname', 'ip', 'nb_dots',
                    # Add all other feature names in order
                    'web_traffic', 'dns_record', 'google_index', 'page_rank']

features_df = features_algo[required_columns]

features_df


Unnamed: 0,length_url,length_hostname,ip,nb_dots,web_traffic,dns_record,google_index,page_rank
0,78,23,0,1,1,1,1,1


In [14]:
y_pred = model.predict(features_algo)

if y_pred[0] == 0:
  print("This is a fraud URL.")
else:
  print("This is a genuine URL.")


This is a genuine URL.


In [15]:
# prompt: save this model as pkl file

import pickle
# Save the model to a file
filename = 'phishing_detection_model.pkl'
pickle.dump(model, open(filename, 'wb'))
