## Imports

In [82]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from ipaddress import *
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, Dropout, Flatten, SpatialDropout1D, LSTM,Input, Concatenate, Reshape, InputLayer, concatenate
from tensorflow.keras.regularizers import l1, l2, l1_l2
import keras
import re
import tldextract
import warnings
warnings.filterwarnings("ignore")
sns.set_style(style='white') 

## Data Transforming

In [83]:

data=pd.read_csv('cleaned_data.csv')
df=data[['url','status']]

In [84]:
data

Unnamed: 0,url,status,url_length,class,num_special_chars,num_digits,num_dots,num_slash,num_and,num_percent,domain,domain_length,num_digits_in_domain,subdomain,num_subdomains,pcount,top_domain,ptld,spl,is_HTTPS,depth,subdomain_digits
0,0000111servicehelpdesk.godaddysites.com,0,39,phishing,0,7,2,0,0,0,godaddysites,12,0,0000111servicehelpdesk,1,2,com,1,3,0,1,7
1,000011accesswebform.godaddysites.com,0,36,phishing,0,6,2,0,0,0,godaddysites,12,0,000011accesswebform,1,2,com,1,3,0,1,6
2,00003.online,0,12,phishing,0,5,1,0,0,0,00003,5,5,,0,1,online,1,2,0,1,0
3,0009servicedeskowa.godaddysites.com,0,35,phishing,0,4,2,0,0,0,godaddysites,12,0,0009servicedeskowa,1,2,com,1,3,0,1,4
4,000n38p.wcomhost.com,0,20,phishing,0,5,2,0,0,0,wcomhost,8,0,000n38p,1,2,com,1,3,0,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
798458,zzufg.com,0,9,phishing,0,0,1,0,0,0,zzufg,5,0,,0,1,com,1,2,0,1,0
798459,zzu.li,0,6,phishing,0,0,1,0,0,0,zzu,3,0,,0,1,li,1,2,0,1,0
798460,zzz.co.uk,0,9,phishing,0,0,2,0,0,0,zzz,3,0,,0,2,co.uk,2,3,0,1,0
798461,zzzoolight.co.za,0,16,phishing,0,0,2,0,0,0,zzzoolight,10,0,,0,2,co.za,2,3,0,1,0


In [85]:
# def sp(text):
#     res=[]
#     res[:]=text+' '
#     # return res
#     return " ".join(res)
# df.url=df.url.apply(sp)


In [86]:
df.url

0            0000111servicehelpdesk.godaddysites.com
1               000011accesswebform.godaddysites.com
2                                       00003.online
3                0009servicedeskowa.godaddysites.com
4                               000n38p.wcomhost.com
                             ...                    
798458                                     zzufg.com
798459                                        zzu.li
798460                                     zzz.co.uk
798461                              zzzoolight.co.za
798462    zzzoolight.co.za0-i-fdik.000webhostapp.com
Name: url, Length: 798463, dtype: object

In [87]:
from sklearn.preprocessing import LabelEncoder


label_encoder = LabelEncoder()
data['top_domain_encoded'] = label_encoder.fit_transform(data['top_domain'])
data['domain_encoded'] = label_encoder.fit_transform(data['domain'])
data['subdomain_encoded'] = label_encoder.fit_transform(data['subdomain'])
data=data.drop(columns=['url','domain','class','ptld','spl','subdomain','top_domain','pcount'])


In [88]:
a=data.top_domain_encoded.nunique()
data.top_domain_encoded.nunique()

1232

In [89]:
b=data.domain_encoded.nunique()
data.domain_encoded.nunique()

308783

In [90]:
c=data.subdomain_encoded.nunique()
data.subdomain_encoded.nunique()

111204

In [91]:
d=a+b+c
d

421219

In [92]:
data.head()

Unnamed: 0,status,url_length,num_special_chars,num_digits,num_dots,num_slash,num_and,num_percent,domain_length,num_digits_in_domain,num_subdomains,is_HTTPS,depth,subdomain_digits,top_domain_encoded,domain_encoded,subdomain_encoded
0,0,39,0,7,2,0,0,0,12,0,1,0,1,7,225,111707,17
1,0,36,0,6,2,0,0,0,12,0,1,0,1,6,225,111707,18
2,0,12,0,5,1,0,0,0,5,5,0,0,1,0,878,10,111203
3,0,35,0,4,2,0,0,0,12,0,1,0,1,4,225,111707,25
4,0,20,0,5,2,0,0,0,8,0,1,0,1,5,225,291146,29


In [93]:

y=data['status']
X=data.drop('status', axis=1)


In [94]:
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=504)

In [95]:
X_train.shape

(638770, 16)

In [96]:
X_test.shape

(159693, 16)

## LSTM

In [97]:


# Input layers for categorical and numeric features
input_top = Input(shape=(1,), name='top_domain_encoded')
input_domain = Input(shape=(1,), name='domain_encoded')
input_sub = Input(shape=(1,), name='subdomain_encoded')
input_https = Input(shape=(1,), name='is_HTTPS')
input_numeric = Input(shape=(1,12), name='numeric_input')


embedding_dim = 12
embedded_tld = Embedding(input_dim=a, output_dim=embedding_dim)(input_top)
embedded_domain = Embedding(input_dim=b, output_dim=embedding_dim)(input_domain)
embedded_sub = Embedding(input_dim=c, output_dim=embedding_dim)(input_sub)
embedded_https = Embedding(input_dim=2, output_dim=embedding_dim)(input_https)

embedded_tld = Reshape((1, embedding_dim))(embedded_tld)
embedded_domain = Reshape((1, embedding_dim))(embedded_domain)
embedded_sub = Reshape((1, embedding_dim))(embedded_sub)
embedded_https = Reshape((1, embedding_dim))(embedded_https)


numeric_reshaped = Reshape((1, 12))(input_numeric)
concat_embeds = Concatenate(axis=1)([embedded_tld, embedded_domain, embedded_sub, embedded_https, numeric_reshaped])

lstm_out = LSTM(32, activation='relu')(concat_embeds)
dropout = Dropout(0.4)(lstm_out)
output = Dense(1, activation='sigmoid', kernel_regularizer='l1', bias_regularizer='l2')(dropout)

model = Model(inputs=[input_top, input_domain, input_sub, input_https, input_numeric], outputs=output)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

numeric = X_train.drop(['top_domain_encoded', 'domain_encoded', 'subdomain_encoded', 'is_HTTPS'], axis=1)

numeric_reshaped_train = np.expand_dims(numeric, axis=1)
numeric_reshaped_test = np.expand_dims(X_test.drop(['top_domain_encoded', 'domain_encoded', 'subdomain_encoded', 'is_HTTPS'], axis=1), axis=1)

# Fit the model
num_epochs = 30
historyL = model.fit([X_train['top_domain_encoded'], X_train['domain_encoded'], 
                      X_train['subdomain_encoded'], X_train['is_HTTPS'], numeric_reshaped_train], 
                      y_train, epochs=num_epochs, 
                      validation_data=([X_test['top_domain_encoded'], X_test['domain_encoded'], 
                                        X_test['subdomain_encoded'], X_test['is_HTTPS'], numeric_reshaped_test], 
                                        y_test), 
                      callbacks=[early_stop], verbose=2)


Epoch 1/30


19962/19962 - 241s - 12ms/step - accuracy: 0.9289 - loss: 0.1823 - val_accuracy: 0.9546 - val_loss: 0.1207
Epoch 2/30
19962/19962 - 237s - 12ms/step - accuracy: 0.9946 - loss: 0.0289 - val_accuracy: 0.9321 - val_loss: 0.2387
Epoch 3/30
19962/19962 - 246s - 12ms/step - accuracy: 0.9973 - loss: 0.0163 - val_accuracy: 0.9250 - val_loss: 0.1765
Epoch 4/30
19962/19962 - 244s - 12ms/step - accuracy: 0.9979 - loss: 0.0124 - val_accuracy: 0.9496 - val_loss: 0.1359
Epoch 4: early stopping
Restoring model weights from the end of the best epoch: 1.


In [107]:
# tf.keras.backend.clear_session()
# tf.random.set_seed(50)
Lstm  = Sequential()
Lstm.add(Embedding(b,8))
Lstm.add(Dropout(0.2))
Lstm.add(LSTM(128,return_state=False))
Lstm.add(Dense(24,activation='relu'))
Lstm.add(Dense(1,activation ='sigmoid',kernel_regularizer=l1(0.01), bias_regularizer=l2(0.01)))

In [108]:
Lstm.compile(loss ='binary_crossentropy',optimizer='adam', metrics=['accuracy', 'precision', 'recall'])

In [109]:
num_epochs =30
early_stop = EarlyStopping(monitor='val_loss',patience=3, verbose=1, restore_best_weights=True)
historyL=Lstm.fit(X_train, y_train,epochs = num_epochs,validation_data=(X_test,y_test),callbacks =[early_stop],verbose =2)
# around8 minutes an epeoch when split 

Epoch 1/30


19962/19962 - 251s - 13ms/step - accuracy: 0.9294 - loss: 0.1654 - precision: 0.9236 - recall: 0.9461 - val_accuracy: 0.9572 - val_loss: 0.1067 - val_precision: 0.9679 - val_recall: 0.9515
Epoch 2/30
19962/19962 - 251s - 13ms/step - accuracy: 0.9820 - loss: 0.0513 - precision: 0.9835 - recall: 0.9828 - val_accuracy: 0.9538 - val_loss: 0.1148 - val_precision: 0.9584 - val_recall: 0.9550
Epoch 3/30
