In [18]:
import requests
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import warnings
from urllib.parse import urlparse
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
import ssl
import socket

In [19]:
# Load the URL dataset and drop exact duplicate rows. The result is rebound
# instead of mutated in place so re-running the cell always starts from the
# file contents.
df = pd.read_csv('dataset200.csv').drop_duplicates()

# Encode the string labels as integers (legitimate -> 0, phishing -> 1).
# An explicit lookup replaces Series.replace, whose silent object -> int
# downcast is deprecated in recent pandas. Values missing from the mapping
# pass through unchanged, and the int64 cast then fails loudly on anything
# that is not a usable label instead of leaving a mixed object column.
label_codes = {'legitimate': 0, 'phishing': 1}
df['label'] = df['label'].map(lambda value: label_codes.get(value, value)).astype('int64')


In [20]:
# Separate the target column from the remaining columns of the frame.
y = df['label']
x = df.drop(columns=['label'])


In [21]:
# Stratified 80/20 split so both classes keep the same proportion in the train
# and test partitions. random_state pins the shuffle so the split (and every
# metric reported further down) is reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

In [22]:
# Show the loaded frame as the cell output to eyeball the URLs and their 0/1 labels.
df

Unnamed: 0,url,label
0,https://www.google.com/,0
1,https://www.wikipedia.org/,0
2,https://www.amazon.com/,0
3,https://www.youtube.com/,0
4,https://www.facebook.com/,0
...,...,...
90,http://eden.it-guys.net.nz/wp-content/language...,1
91,http://gruposdefreefire.000webhostapp.com/,1
92,https://onedrive.live.com/?authkey=%21AG7v3K%5...,1
93,http://ruasepicas.pt/paypal.com.update.account...,1


In [23]:

def url_length(url):
    """Return the number of characters in ``url``."""
    n_chars = len(url)
    return n_chars

In [24]:
def suspicious_words(url):
    """Count phishing-related keywords in the host and path of ``url``.

    The netloc and path components are concatenated, split into runs of word
    characters (letters, digits, underscore) and each token is compared
    case-insensitively against a fixed keyword set. A keyword that occurs
    several times is counted each time. The query string and fragment are
    not inspected.

    Args:
        url: URL string to inspect.

    Returns:
        int: number of tokens that match a suspicious keyword.
    """
    suspicious = {'login', 'signin', 'verify', 'banking', 'account', 'secure', 'facbok', 'faceb0ok'}
    # Parse once instead of twice.
    parsed = urlparse(url)
    # Raw string: "\w" in a plain literal is an invalid str escape sequence.
    tokens = re.findall(r'\w+', parsed.netloc + parsed.path)
    return sum(token.lower() in suspicious for token in tokens)

In [25]:
# Punctuation treated as "special"; compiled once at module level so every
# call reuses the same pattern object.
special_char_pattern = re.compile('[!@#$%^&*(),.?":{}|<>]')


def special_characters(url):
    """Return how many characters of ``url`` match the special-character set."""
    return sum(1 for _ in special_char_pattern.finditer(url))

In [26]:
def has_valid_ssl(url, timeout=5.0):
    """Check whether the host of ``url`` completes a verified TLS handshake on port 443.

    The hostname is taken from the parsed URL (port and credentials are
    stripped), a TCP connection to port 443 is opened and wrapped with the
    default verifying SSL context.

    Args:
        url: URL string whose host should be probed.
        timeout: Seconds to wait for the TCP connection and handshake. Without
            a timeout a single unresponsive host can block the whole
            ``DataFrame.apply`` indefinitely.

    Returns:
        bool: True if the handshake succeeded; False if the URL has no usable
        hostname, or the connection or handshake failed or timed out.
    """
    try:
        hostname = urlparse(url).hostname
    except ValueError:
        # Malformed netloc (e.g. unbalanced IPv6 brackets): nothing to probe.
        return False
    if not hostname:
        # No scheme/netloc in the string, so there is no host to connect to.
        return False

    try:
        context = ssl.create_default_context()
        with socket.create_connection((hostname, 443), timeout=timeout) as sock:
            with context.wrap_socket(sock, server_hostname=hostname):
                return True  # SSL handshake succeeded, SSL is valid
    except (ssl.SSLError, OSError, UnicodeError):
        # OSError covers DNS failures, refused connections and timeouts;
        # UnicodeError covers hostnames that cannot be IDNA-encoded.
        return False  # SSL handshake failed, SSL is invalid

In [27]:
    
def domain_length(url):
    """Return the length of the netloc component of ``url`` (0 if it has none)."""
    return len(urlparse(url).netloc)


In [28]:
# Derive the numeric URL features for the training split. Each entry maps the
# new column name to the extractor applied to the raw 'url' string; the dict
# order fixes the order in which the columns are added.
train_feature_extractors = {
    'url_length': url_length,
    'suspicious_words': suspicious_words,
    'special_characters': special_characters,
    'has_valid_ssl': has_valid_ssl,
    'domain_length': domain_length,
}
for column_name, extractor in train_feature_extractors.items():
    x_train[column_name] = x_train['url'].apply(extractor)


In [29]:

# Feature scaling: the scaler is fitted on the training features only and is
# reused (transform only) for the test and new data further down.
feature_columns = ['url_length', 'suspicious_words', 'special_characters', 'has_valid_ssl', 'domain_length']
scaler = StandardScaler()
scaled_features = scaler.fit_transform(x_train[feature_columns])
x_train_scaled = pd.DataFrame(scaled_features, columns=feature_columns)

# Convert y_train to a 1D array. np.asarray accepts both a Series and an
# ndarray, so this cell can be re-run safely; `.values` raised AttributeError
# on the second run because y_train had already been turned into an ndarray.
y_train = np.asarray(y_train).ravel()

# 5-fold grid search over the inverse regularisation strength C.
# error_score='raise' surfaces fit failures instead of scoring them as NaN.
params = {'C': [0.001, 0.01, 0.1, 1, 10]}
log_reg = LogisticRegression(max_iter=100)
grid_search = GridSearchCV(log_reg, params, cv=5, error_score='raise')
grid_search.fit(x_train_scaled, y_train)


In [30]:
# Derive the same URL features for the test split. Each entry maps the new
# column name to the extractor applied to the raw 'url' string; the dict order
# fixes the order in which the columns are added.
test_feature_extractors = {
    'url_length': url_length,
    'suspicious_words': suspicious_words,
    'special_characters': special_characters,
    'has_valid_ssl': has_valid_ssl,
    'domain_length': domain_length,
}
for column_name, extractor in test_feature_extractors.items():
    x_test[column_name] = x_test['url'].apply(extractor)


In [31]:
# Scale the test features with the already-fitted scaler (transform only).
test_feature_columns = ['url_length', 'suspicious_words', 'special_characters', 'has_valid_ssl', 'domain_length']
scaled_features = scaler.transform(x_test[test_feature_columns])
x_test_scaled = pd.DataFrame(scaled_features, columns=test_feature_columns)

y_pred = grid_search.predict(x_test_scaled[test_feature_columns])

# Report each classification metric on the held-out split.
for metric_label, metric_fn in (
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1 Score: ', f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

print("Best hyperparameters:", grid_search.best_params_)


Accuracy:  0.8947368421052632
Precision:  1.0
Recall:  0.6666666666666666
F1 Score:  0.8
Best hyperparameters: {'C': 1}


In [32]:
from sklearn.metrics import confusion_matrix

# Specificity (true-negative rate) of the test predictions.
# labels=[0, 1] forces a 2x2 matrix even when one class is absent from both
# y_test and y_pred, so the four-way unpacking below cannot fail.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
negatives = tn + fp
# With no negative samples specificity is undefined; report NaN instead of
# dividing by zero.
specificity = tn / negatives if negatives else float('nan')
print('Specificity: ', specificity)

Specificity:  1.0


In [33]:
# Build a small frame of unseen URLs and derive the same features for it.
new_urls = ['https://www.facebok.com/', 'http://shadetreetechnology.com/V4/validation/a111aedc8ae390eabcfa130e041a10a4']
new_data = pd.DataFrame({'URL': new_urls})

# Column name -> extractor applied to the raw 'URL' string; the dict order
# fixes the order in which the columns are added.
new_feature_extractors = {
    'url_length': url_length,
    'suspicious_words': suspicious_words,
    'special_characters': special_characters,
    'has_valid_ssl': has_valid_ssl,
    'domain_length': domain_length,
}
for column_name, extractor in new_feature_extractors.items():
    new_data[column_name] = new_data['URL'].apply(extractor)


In [34]:
# Scale the new URLs' features with the already-fitted scaler (transform only).
new_feature_columns = ['url_length', 'suspicious_words', 'special_characters', 'has_valid_ssl', 'domain_length']
scaled_features = scaler.transform(new_data[new_feature_columns])
new_data_scaled = pd.DataFrame(scaled_features, columns=new_feature_columns)

predictions = grid_search.predict(new_data_scaled[new_feature_columns])

# Translate the numeric classes back to readable names (1 -> phishing,
# anything else -> legitimate).
prediction_labels = []
for predicted_class in predictions:
    if predicted_class == 1:
        prediction_labels.append("phishing")
    else:
        prediction_labels.append("legitimate")

results = pd.DataFrame({'URL': new_data['URL'], 'Prediction': prediction_labels})

# Hand-assigned ground truth for the two URLs above (phishing: 1, legitimate: 0).
actual_labels = [0, 1]
new_accuracy = accuracy_score(actual_labels, predictions)
print('Accuracy for new data:', new_accuracy)
print(results)

Accuracy for new data: 1.0
                                                 URL  Prediction
0                           https://www.facebok.com/  legitimate
1  http://shadetreetechnology.com/V4/validation/a...    phishing


In [35]:
import pickle

# Persist the fitted grid-search object; the file name and contents are
# unchanged so existing loaders keep working.
with open('phishing_detection_model.pkl', 'wb') as file:
    pickle.dump(grid_search, file)

# The model was trained on StandardScaler-transformed features, so predictions
# on fresh URLs need the same fitted scaler. It is saved to a separate file so
# the model pickle above stays backward-compatible.
with open('phishing_detection_scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)