## Imports

In [20]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from ipaddress import *
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, Dropout, Flatten, SpatialDropout1D, LSTM,Input, concatenate, Reshape, InputLayer, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.regularizers import l1, l2, l1_l2
import keras
import re
import tldextract
import warnings
warnings.filterwarnings("ignore")
sns.set_style(style='white') 

## Data Transforming

In [21]:

data=pd.read_csv('cleaned_data.csv')
df=data[['url','status']]

In [22]:
data

Unnamed: 0,url,status,url_length,class,num_special_chars,num_digits,num_dots,num_slash,num_and,num_percent,...,num_digits_in_domain,subdomain,num_subdomains,pcount,top_domain,ptld,spl,is_HTTPS,depth,subdomain_digits
0,0000111servicehelpdesk.godaddysites.com,0,39,phishing,0,7,2,0,0,0,...,0,0000111servicehelpdesk,1,2,com,1,3,0,1,7
1,000011accesswebform.godaddysites.com,0,36,phishing,0,6,2,0,0,0,...,0,000011accesswebform,1,2,com,1,3,0,1,6
2,00003.online,0,12,phishing,0,5,1,0,0,0,...,5,0,0,1,online,1,2,0,1,0
3,0009servicedeskowa.godaddysites.com,0,35,phishing,0,4,2,0,0,0,...,0,0009servicedeskowa,1,2,com,1,3,0,1,4
4,000n38p.wcomhost.com,0,20,phishing,0,5,2,0,0,0,...,0,000n38p,1,2,com,1,3,0,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
798458,zzufg.com,0,9,phishing,0,0,1,0,0,0,...,0,0,0,1,com,1,2,0,1,0
798459,zzu.li,0,6,phishing,0,0,1,0,0,0,...,0,0,0,1,li,1,2,0,1,0
798460,zzz.co.uk,0,9,phishing,0,0,2,0,0,0,...,0,0,0,2,co.uk,2,3,0,1,0
798461,zzzoolight.co.za,0,16,phishing,0,0,2,0,0,0,...,0,0,0,2,co.za,2,3,0,1,0


In [23]:
# def sp(text):
#     res=[]
#     res[:]=text+' '
#     # return res
#     return " ".join(res)
# df.url=df.url.apply(sp)


In [24]:
df.url

0            0000111servicehelpdesk.godaddysites.com
1               000011accesswebform.godaddysites.com
2                                       00003.online
3                0009servicedeskowa.godaddysites.com
4                               000n38p.wcomhost.com
                             ...                    
798458                                     zzufg.com
798459                                        zzu.li
798460                                     zzz.co.uk
798461                              zzzoolight.co.za
798462    zzzoolight.co.za0-i-fdik.000webhostapp.com
Name: url, Length: 798463, dtype: object

In [25]:
from sklearn.preprocessing import LabelEncoder


label_encoder = LabelEncoder()
data['top_domain_encoded'] = label_encoder.fit_transform(data['top_domain'])
data['domain_encoded'] = label_encoder.fit_transform(data['domain'])
data['subdomain_encoded'] = label_encoder.fit_transform(data['subdomain'])
data=data.drop(columns=['url','domain','class','ptld','spl','subdomain','top_domain','pcount'])


In [26]:
a=data.top_domain_encoded.nunique()
data.top_domain_encoded.nunique()

1232

In [27]:
b=data.domain_encoded.nunique()
data.domain_encoded.nunique()

308783

In [28]:
c=data.subdomain_encoded.nunique()
data.subdomain_encoded.nunique()

111203

In [29]:
d=a+b+c
d

421218

In [30]:
data.head()

Unnamed: 0,status,url_length,num_special_chars,num_digits,num_dots,num_slash,num_and,num_percent,domain_length,num_digits_in_domain,num_subdomains,is_HTTPS,depth,subdomain_digits,top_domain_encoded,domain_encoded,subdomain_encoded
0,0,39,0,7,2,0,0,0,12,0,1,0,1,7,225,111707,17
1,0,36,0,6,2,0,0,0,12,0,1,0,1,6,225,111707,18
2,0,12,0,5,1,0,0,0,5,5,0,0,1,0,878,10,0
3,0,35,0,4,2,0,0,0,12,0,1,0,1,4,225,111707,25
4,0,20,0,5,2,0,0,0,8,0,1,0,1,5,225,291146,29


In [31]:

y=data['status']
X=data.drop('status', axis=1)


In [32]:
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=54)

In [33]:
X_train.shape

(558924, 16)

In [34]:
X_test.shape

(239539, 16)

## CNN

In [35]:
tf.keras.backend.clear_session()
tf.random.set_seed(50)

In [36]:
cnn  = Sequential()
cnn.add(Embedding(d,8))
cnn.add(Dropout(0.3))
cnn.add(Conv1D(128, 5, activation='relu'))
cnn.add(MaxPooling1D(pool_size=2))
cnn.add(Flatten())
cnn.add(Dense(24,activation='relu'))
cnn.add(Dense(1,activation ='sigmoid',kernel_regularizer=l1(0.03), bias_regularizer=l2(0.03)))
# maybe consider increaseing dropout as this model has a high starting accuracy so combat overfitting
# look into handling the categorical features and concatenating with the numeric features to see if that makes a difference for embedding layers

In [37]:
cnn.compile(loss ='binary_crossentropy',optimizer='adam', metrics=['accuracy', 'precision', 'recall'])

In [38]:
num_epochs =30
early_stop = EarlyStopping(monitor='val_loss',patience=3, verbose=1, restore_best_weights=True)
historyC=cnn.fit(X_train, y_train,epochs = num_epochs,validation_data=(X_test,y_test),callbacks =[early_stop],verbose =2)
# around 6 minutes an epoch when split 

Epoch 1/30


17467/17467 - 395s - 23ms/step - accuracy: 0.9321 - loss: 0.1792 - precision: 0.9257 - recall: 0.9493 - val_accuracy: 0.9567 - val_loss: 0.1142 - val_precision: 0.9447 - val_recall: 0.9761
Epoch 2/30
17467/17467 - 465s - 27ms/step - accuracy: 0.9829 - loss: 0.0588 - precision: 0.9830 - recall: 0.9850 - val_accuracy: 0.9522 - val_loss: 0.1194 - val_precision: 0.9466 - val_recall: 0.9649
Epoch 3/30
17467/17467 - 396s - 23ms/step - accuracy: 0.9927 - loss: 0.0295 - precision: 0.9934 - recall: 0.9930 - val_accuracy: 0.9474 - val_loss: 0.1254 - val_precision: 0.9422 - val_recall: 0.9606
Epoch 4/30
17467/17467 - 414s - 24ms/step - accuracy: 0.9949 - loss: 0.0208 - precision: 0.9955 - recall: 0.9950 - val_accuracy: 0.9488 - val_loss: 0.1213 - val_precision: 0.9543 - val_recall: 0.9498
Epoch 4: early stopping
Restoring model weights from the end of the best epoch: 1.
