## Imports

In [71]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from ipaddress import *
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, Dropout, Flatten, SpatialDropout1D, LSTM,Input, concatenate, Reshape, InputLayer, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.regularizers import l1, l2, l1_l2
import keras
import re
import tldextract
import warnings
warnings.filterwarnings("ignore")
sns.set_style(style='white') 

## Data Transforming

In [72]:

data=pd.read_csv('cleaned_data.csv')
df=data[['url','status']]
data=data.drop('url',axis=1)

In [73]:
data

Unnamed: 0,status,url_length,num_special_chars,num_digits,num_dots,num_slash,num_and,num_percent,domain_length,num_digits_in_domain,num_subdomains,subdomain_digits,is_HTTPS,depth,top_domain_encoded,domain_encoded,subdomain_encoded
0,0,39,0,7,2,0,0,0,12,0,1,7,0,1,225,111707,17
1,0,36,0,6,2,0,0,0,12,0,1,6,0,1,225,111707,18
2,0,12,0,5,1,0,0,0,5,5,0,0,0,1,878,10,0
3,0,35,0,4,2,0,0,0,12,0,1,4,0,1,225,111707,25
4,0,20,0,5,2,0,0,0,8,0,1,5,0,1,225,291146,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
798458,0,9,0,0,1,0,0,0,5,0,0,0,0,1,225,308751,0
798459,0,6,0,0,1,0,0,0,3,0,0,0,0,1,690,308750,0
798460,0,9,0,0,2,0,0,0,3,0,0,0,0,1,215,308771,0
798461,0,16,0,0,2,0,0,0,10,0,0,0,0,1,219,308776,0


In [74]:
a=data.top_domain_encoded.nunique()
data.top_domain_encoded.nunique()

1232

In [75]:
b=data.domain_encoded.nunique()
data.domain_encoded.nunique()

308783

In [76]:
c=data.subdomain_encoded.nunique()
data.subdomain_encoded.nunique()

111203

In [77]:
d=a+b+c
d

421218

In [78]:
data.head()

Unnamed: 0,status,url_length,num_special_chars,num_digits,num_dots,num_slash,num_and,num_percent,domain_length,num_digits_in_domain,num_subdomains,subdomain_digits,is_HTTPS,depth,top_domain_encoded,domain_encoded,subdomain_encoded
0,0,39,0,7,2,0,0,0,12,0,1,7,0,1,225,111707,17
1,0,36,0,6,2,0,0,0,12,0,1,6,0,1,225,111707,18
2,0,12,0,5,1,0,0,0,5,5,0,0,0,1,878,10,0
3,0,35,0,4,2,0,0,0,12,0,1,4,0,1,225,111707,25
4,0,20,0,5,2,0,0,0,8,0,1,5,0,1,225,291146,29


In [79]:

y=data['status']
X=data.drop('status', axis=1)
col=X.columns
X=StandardScaler().fit_transform(X)
X=pd.DataFrame(X,columns=col)

In [80]:
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=54)

In [81]:
X_train.shape

(638770, 16)

In [82]:
X_test.shape

(159693, 16)

## CNN

In [83]:
tf.keras.backend.clear_session()
tf.random.set_seed(50)

In [84]:
cnn  = Sequential()
cnn.add(Embedding(d,8))
cnn.add(Dropout(0.2))
cnn.add(Conv1D(128, 5, activation='relu'))
cnn.add(MaxPooling1D(pool_size=2))
cnn.add(Flatten())
cnn.add(Dense(24,activation='relu'))
cnn.add(Dense(1,activation ='sigmoid',kernel_regularizer=l1(0.01), bias_regularizer=l2(0.01)))
# cnn.add(Dense(1,activation ='sigmoid'))
# maybe consider increaseing dropout as this model has a high starting accuracy so combat overfitting
# look into handling the categorical features and concatenating with the numeric features to see if that makes a difference for embedding layers

In [85]:
cnn.compile(loss ='binary_crossentropy',optimizer='adam', metrics=['accuracy', 'precision', 'recall'])

In [86]:
num_epochs =30
early_stop = EarlyStopping(monitor='val_loss',patience=3, verbose=1, restore_best_weights=True)
historyC=cnn.fit(X_train, y_train,epochs = num_epochs,validation_data=(X_test,y_test),callbacks =[early_stop],verbose =2)
# around 6 minutes an epoch when split 
# reached around 88

Epoch 1/30


19962/19962 - 223s - 11ms/step - accuracy: 0.8591 - loss: 0.3496 - precision: 0.8273 - recall: 0.9309 - val_accuracy: 0.8734 - val_loss: 0.3224 - val_precision: 0.8416 - val_recall: 0.9399
Epoch 2/30
19962/19962 - 477s - 24ms/step - accuracy: 0.8686 - loss: 0.3293 - precision: 0.8386 - recall: 0.9341 - val_accuracy: 0.8746 - val_loss: 0.3166 - val_precision: 0.8407 - val_recall: 0.9443
Epoch 3/30
19962/19962 - 426s - 21ms/step - accuracy: 0.8708 - loss: 0.3243 - precision: 0.8400 - recall: 0.9368 - val_accuracy: 0.8754 - val_loss: 0.3141 - val_precision: 0.8413 - val_recall: 0.9449
Epoch 4/30
19962/19962 - 432s - 22ms/step - accuracy: 0.8719 - loss: 0.3219 - precision: 0.8411 - recall: 0.9376 - val_accuracy: 0.8757 - val_loss: 0.3129 - val_precision: 0.8438 - val_recall: 0.9415
Epoch 5/30
19962/19962 - 427s - 21ms/step - accuracy: 0.8725 - loss: 0.3203 - precision: 0.8419 - recall: 0.9378 - val_accuracy: 0.8762 - val_loss: 0.3114 - val_precision: 0.8420 - val_recall: 0.9457
Epoch 6/30
