## Imports

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from ipaddress import *
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, Dropout, Flatten, SpatialDropout1D, LSTM,Input, concatenate, Reshape, InputLayer, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.regularizers import l1, l2, l1_l2
import re
import tldextract
import warnings
warnings.filterwarnings("ignore")
sns.set_style(style='white') 
from sklearn.svm import LinearSVC

## Data Transforming

In [3]:

data=pd.read_csv('cleaned_data.csv')
df=data[['url','status']]

In [4]:
data

Unnamed: 0,url,status,url_length,class,num_special_chars,num_digits,num_dots,num_slash,num_and,num_percent,...,num_digits_in_domain,subdomain,num_subdomains,pcount,top_domain,ptld,spl,is_HTTPS,depth,subdomain_digits
0,0000111servicehelpdesk.godaddysites.com,0,39,phishing,0,7,2,0,0,0,...,0,0000111servicehelpdesk,1,2,com,1,3,0,1,7
1,000011accesswebform.godaddysites.com,0,36,phishing,0,6,2,0,0,0,...,0,000011accesswebform,1,2,com,1,3,0,1,6
2,00003.online,0,12,phishing,0,5,1,0,0,0,...,5,,0,1,online,1,2,0,1,0
3,0009servicedeskowa.godaddysites.com,0,35,phishing,0,4,2,0,0,0,...,0,0009servicedeskowa,1,2,com,1,3,0,1,4
4,000n38p.wcomhost.com,0,20,phishing,0,5,2,0,0,0,...,0,000n38p,1,2,com,1,3,0,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
798458,zzufg.com,0,9,phishing,0,0,1,0,0,0,...,0,,0,1,com,1,2,0,1,0
798459,zzu.li,0,6,phishing,0,0,1,0,0,0,...,0,,0,1,li,1,2,0,1,0
798460,zzz.co.uk,0,9,phishing,0,0,2,0,0,0,...,0,,0,2,co.uk,2,3,0,1,0
798461,zzzoolight.co.za,0,16,phishing,0,0,2,0,0,0,...,0,,0,2,co.za,2,3,0,1,0


In [5]:
# def sp(text):
#     res=[]
#     res[:]=text+' '
#     # return res
#     return " ".join(res)
# df.url=df.url.apply(sp)


In [6]:
df.url

0            0000111servicehelpdesk.godaddysites.com
1               000011accesswebform.godaddysites.com
2                                       00003.online
3                0009servicedeskowa.godaddysites.com
4                               000n38p.wcomhost.com
                             ...                    
798458                                     zzufg.com
798459                                        zzu.li
798460                                     zzz.co.uk
798461                              zzzoolight.co.za
798462    zzzoolight.co.za0-i-fdik.000webhostapp.com
Name: url, Length: 798463, dtype: object

In [7]:
from sklearn.preprocessing import LabelEncoder


label_encoder = LabelEncoder()
data['top_domain_encoded'] = label_encoder.fit_transform(data['top_domain'])
data['domain_encoded'] = label_encoder.fit_transform(data['domain'])
data['subdomain_encoded'] = label_encoder.fit_transform(data['subdomain'])
data=data.drop(columns=['url','domain','class','ptld','spl','subdomain','top_domain','pcount'])


In [8]:
a=data.top_domain_encoded.nunique()
data.top_domain_encoded.nunique()

1232

In [9]:
b=data.domain_encoded.nunique()
data.domain_encoded.nunique()

308783

In [10]:
c=data.subdomain_encoded.nunique()
data.subdomain_encoded.nunique()

111204

In [11]:
d=a+b+c
d

421219

In [12]:
data.head()

Unnamed: 0,status,url_length,num_special_chars,num_digits,num_dots,num_slash,num_and,num_percent,domain_length,num_digits_in_domain,num_subdomains,is_HTTPS,depth,subdomain_digits,top_domain_encoded,domain_encoded,subdomain_encoded
0,0,39,0,7,2,0,0,0,12,0,1,0,1,7,225,111707,17
1,0,36,0,6,2,0,0,0,12,0,1,0,1,6,225,111707,18
2,0,12,0,5,1,0,0,0,5,5,0,0,1,0,878,10,111203
3,0,35,0,4,2,0,0,0,12,0,1,0,1,4,225,111707,25
4,0,20,0,5,2,0,0,0,8,0,1,0,1,5,225,291146,29


In [13]:

y=data['status']
X=data.drop('status', axis=1)


In [14]:
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=504)

In [15]:
X_train.shape

(638770, 16)

In [16]:
X_test.shape

(159693, 16)

## Traditional Models

In [17]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))
# trains in around 2 minutes and has 0.95 accross the board

              precision    recall  f1-score   support

           0       0.95      0.94      0.95     74314
           1       0.95      0.96      0.95     85379

    accuracy                           0.95    159693
   macro avg       0.95      0.95      0.95    159693
weighted avg       0.95      0.95      0.95    159693



In [18]:
vectorizer = TfidfVectorizer(analyzer='char')
X=vectorizer.fit_transform(df.url)
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
y1=df['status']
X1=tfidf_df
X_train1, X_test1, y_train1,y_test1 = train_test_split(X,y,test_size=0.2,random_state=504)


In [19]:
# rf = RandomForestClassifier()
# rf.fit(X_train1, y_train1)
# y_pred1 = rf.predict(X_test1)
# print(classification_report(y_test1, y_pred1))
# took 32 minutes to train on unstructured data, was less accurate for the most part 

In [20]:
# clf = svm.SVC(kernel='linear') 

# clf.fit(X_train, y_train)

# y_pred = clf.predict(X_test)
# print(classification_report(y_test, y_pred))
# ran for 5 hours and did not finish training

In [21]:
clf = LinearSVC() 

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
# ran for 5 minutes and did not finish training

              precision    recall  f1-score   support

           0       0.47      0.99      0.64     74314
           1       0.78      0.03      0.06     85379

    accuracy                           0.48    159693
   macro avg       0.63      0.51      0.35    159693
weighted avg       0.64      0.48      0.33    159693



In [22]:

clf = SGDClassifier() 

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
# took  4 minuted but terrible results

              precision    recall  f1-score   support

           0       0.48      0.86      0.61     74314
           1       0.60      0.18      0.28     85379

    accuracy                           0.50    159693
   macro avg       0.54      0.52      0.45    159693
weighted avg       0.54      0.50      0.44    159693



# Neural Networks

In [23]:
tld_num=data['top_domain_encoded'].nunique()
ld_num=data['domain_encoded'].nunique()
sub_num=data['subdomain_encoded'].nunique()



## LSTM

In [24]:
tf.keras.backend.clear_session()
tf.random.set_seed(50)


In [25]:
Lstm  = Sequential()
Lstm.add(Embedding(d,8))
Lstm.add(Dropout(0.4))
Lstm.add(LSTM(128,return_state=False))
Lstm.add(Dense(24,activation='relu'))
Lstm.add(Dense(1,activation ='sigmoid',kernel_regularizer=l1(0.01), bias_regularizer=l2(0.01)))

In [26]:
Lstm.compile(loss ='binary_crossentropy',optimizer='adam', metrics=['accuracy', 'precision', 'recall'])

In [27]:
num_epochs =30
early_stop = EarlyStopping(monitor='val_loss',patience=3, verbose=1, restore_best_weights=True)
historyL=Lstm.fit(X_train, y_train,epochs = num_epochs,validation_data=(X_test,y_test),callbacks =[early_stop],verbose =2)
# around8 minutes an epeoch when split 

Epoch 1/30


19962/19962 - 289s - 14ms/step - accuracy: 0.9207 - loss: 0.1927 - precision: 0.9156 - recall: 0.9381 - val_accuracy: 0.9519 - val_loss: 0.1234 - val_precision: 0.9691 - val_recall: 0.9400
Epoch 2/30
19962/19962 - 273s - 14ms/step - accuracy: 0.9746 - loss: 0.0732 - precision: 0.9756 - recall: 0.9769 - val_accuracy: 0.9548 - val_loss: 0.1153 - val_precision: 0.9623 - val_recall: 0.9528
Epoch 3/30
19962/19962 - 265s - 13ms/step - accuracy: 0.9844 - loss: 0.0480 - precision: 0.9857 - recall: 0.9851 - val_accuracy: 0.9512 - val_loss: 0.1263 - val_precision: 0.9577 - val_recall: 0.9507
Epoch 4/30
19962/19962 - 274s - 14ms/step - accuracy: 0.9877 - loss: 0.0388 - precision: 0.9889 - recall: 0.9881 - val_accuracy: 0.9514 - val_loss: 0.1319 - val_precision: 0.9588 - val_recall: 0.9499
Epoch 5/30
19962/19962 - 268s - 13ms/step - accuracy: 0.9896 - loss: 0.0336 - precision: 0.9907 - recall: 0.9899 - val_accuracy: 0.9486 - val_loss: 0.1282 - val_precision: 0.9580 - val_recall: 0.9454
Epoch 5: ea

## CNN

In [28]:
tf.keras.backend.clear_session()
tf.random.set_seed(50)

In [29]:
cnn  = Sequential()
cnn.add(Embedding(d,8))
cnn.add(Dropout(0.4))
cnn.add(Conv1D(128, 5, activation='relu'))
cnn.add(MaxPooling1D(pool_size=2))
cnn.add(Flatten())
cnn.add(Dense(24,activation='relu'))
cnn.add(Dense(1,activation ='sigmoid',kernel_regularizer=l1(0.01), bias_regularizer=l2(0.01)))

In [30]:
cnn.compile(loss ='binary_crossentropy',optimizer='adam', metrics=['accuracy', 'precision', 'recall'])

In [31]:
num_epochs =30
early_stop = EarlyStopping(monitor='val_loss',patience=3, verbose=1, restore_best_weights=True)
historyC=cnn.fit(X_train, y_train,epochs = num_epochs,validation_data=(X_test,y_test),callbacks =[early_stop],verbose =2)
# around 6 minutes an epoch when split 

Epoch 1/30


19962/19962 - 155s - 8ms/step - accuracy: 0.9299 - loss: 0.1779 - precision: 0.9233 - recall: 0.9475 - val_accuracy: 0.9521 - val_loss: 0.1211 - val_precision: 0.9656 - val_recall: 0.9442
Epoch 2/30
19962/19962 - 152s - 8ms/step - accuracy: 0.9739 - loss: 0.0770 - precision: 0.9750 - recall: 0.9762 - val_accuracy: 0.9498 - val_loss: 0.1209 - val_precision: 0.9623 - val_recall: 0.9430
Epoch 3/30
19962/19962 - 153s - 8ms/step - accuracy: 0.9849 - loss: 0.0485 - precision: 0.9868 - recall: 0.9849 - val_accuracy: 0.9461 - val_loss: 0.1283 - val_precision: 0.9580 - val_recall: 0.9404
Epoch 4/30
19962/19962 - 151s - 8ms/step - accuracy: 0.9884 - loss: 0.0387 - precision: 0.9901 - recall: 0.9883 - val_accuracy: 0.9466 - val_loss: 0.1314 - val_precision: 0.9561 - val_recall: 0.9436
Epoch 5/30
19962/19962 - 150s - 8ms/step - accuracy: 0.9898 - loss: 0.0342 - precision: 0.9912 - recall: 0.9898 - val_accuracy: 0.9475 - val_loss: 0.1303 - val_precision: 0.9542 - val_recall: 0.9473
Epoch 5: early s