## Imports

In [26]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from ipaddress import *
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score,f1_score,auc,roc_curve,roc_auc_score, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, Dropout, Flatten, SpatialDropout1D, LSTM,Input, concatenate, Reshape, InputLayer, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.regularizers import l1, l2, l1_l2
import re
import tldextract
import warnings
warnings.filterwarnings("ignore")
sns.set_style(style='white') 
from sklearn.svm import LinearSVC

## Data Transforming

In [27]:

data=pd.read_csv('cleaned_data.csv')
df=data[['url','status']]
data=data.drop('url',axis=1)

In [None]:
data

In [None]:
a=data.top_domain_encoded.nunique()
data.top_domain_encoded.nunique()

In [None]:
b=data.domain_encoded.nunique()
data.domain_encoded.nunique()

In [None]:
d=a+b
d

In [None]:
data.head()

In [33]:

y=data['status']
X=data.drop('status', axis=1)
col=X.columns
X=StandardScaler().fit_transform(X)
X=pd.DataFrame(X,columns=col)

In [34]:
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=54)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
X_train

## Traditional Models

In [38]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(f' The Accuracy  :{accuracy_score(y_test,y_pred)}')
print(f' The Precision :{precision_score(y_test,y_pred)}')
print(f' The Recall :{recall_score(y_test,y_pred)}')
print(f' The F1 Score model is :{f1_score(y_test,y_pred)}')
print(f' The ROC_AUC Score model is :{roc_auc_score(y_test,y_pred)}')
# trains in around 2 minutes and has 0.95 across the board

 The Accuracy  :0.9509934687181029
 The Precision :0.9480720745726644
 The Recall :0.9608964741185296
 The F1 Score model is :0.9544411973593825
 The ROC_AUC Score model is :0.9502657979955255


In [39]:
clf = LinearSVC() 

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(f' The Accuracy  :{accuracy_score(y_test,y_pred)}')
print(f' The Precision :{precision_score(y_test,y_pred)}')
print(f' The Recall :{recall_score(y_test,y_pred)}')
print(f' The F1 Score model is :{f1_score(y_test,y_pred)}')
print(f' The ROC_AUC Score model is :{roc_auc_score(y_test,y_pred)}')
# ran for 5 minutes and did not finish training

 The Accuracy  :0.7498137050465581
 The Precision :0.7267745902868742
 The Recall :0.8519786196549137
 The F1 Score model is :0.7844119123035167
 The ROC_AUC Score model is :0.7423066489328736


# Neural Networks

## LSTM

In [40]:
tf.keras.backend.clear_session()
tf.random.set_seed(50)


In [41]:
Lstm  = Sequential()
Lstm.add(Embedding(d,8))
Lstm.add(Dropout(0.2))
Lstm.add(LSTM(128,return_state=False))
Lstm.add(Dense(24,activation='relu'))
# Lstm.add(Dense(1,activation ='sigmoid',kernel_regularizer=l1(0.01), bias_regularizer=l2(0.01)))
Lstm.add(Dense(1,activation ='sigmoid'))

In [42]:
Lstm.compile(loss ='binary_crossentropy',optimizer='adam', metrics=['accuracy', 'precision', 'recall'])

In [43]:
num_epochs =30
early_stop = EarlyStopping(monitor='val_loss',patience=3, verbose=1, restore_best_weights=True)
historyL=Lstm.fit(X_train, y_train,epochs = num_epochs,validation_data=(X_test,y_test),callbacks =[early_stop],verbose =2)
# around8 minutes an epoch when split 

Epoch 1/30
19962/19962 - 226s - 11ms/step - accuracy: 0.8544 - loss: 0.3488 - precision: 0.8220 - recall: 0.9288 - val_accuracy: 0.8736 - val_loss: 0.3112 - val_precision: 0.8491 - val_recall: 0.9284
Epoch 2/30
19962/19962 - 229s - 11ms/step - accuracy: 0.8735 - loss: 0.3117 - precision: 0.8426 - recall: 0.9388 - val_accuracy: 0.8778 - val_loss: 0.3016 - val_precision: 0.8530 - val_recall: 0.9318
Epoch 3/30
19962/19962 - 229s - 11ms/step - accuracy: 0.8761 - loss: 0.3051 - precision: 0.8449 - recall: 0.9410 - val_accuracy: 0.8784 - val_loss: 0.2995 - val_precision: 0.8530 - val_recall: 0.9331
Epoch 4/30
19962/19962 - 228s - 11ms/step - accuracy: 0.8773 - loss: 0.3017 - precision: 0.8455 - recall: 0.9427 - val_accuracy: 0.8793 - val_loss: 0.2979 - val_precision: 0.8513 - val_recall: 0.9379
Epoch 5/30
19962/19962 - 229s - 11ms/step - accuracy: 0.8782 - loss: 0.2993 - precision: 0.8461 - recall: 0.9439 - val_accuracy: 0.8795 - val_loss: 0.2973 - val_precision: 0.8516 - val_recall: 0.9378


In [44]:
results = Lstm.evaluate(X_test, y_test, verbose=2)
y_pred_prob = Lstm.predict(X_test)  # Get the predicted probabilities
y_pred = (y_pred_prob > 0.5).astype(int)  
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)  
# Assuming you compiled with accuracy, precision, and recall
print(f"Restored Model Test Loss: {results[0]}")
print(f"Restored Model Test Accuracy: {results[1]}")
print(f"Restored Model Test Precision: {results[2]}")
print(f"Restored Model Test Recall: {results[3]}")
print(f"F1 Score: {f1}")
print(f"ROC-AUC Score: {roc_auc}")

4991/4991 - 13s - 3ms/step - accuracy: 0.8803 - loss: 0.2951 - precision: 0.8500 - recall: 0.9421
[1m4991/4991[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step
Restored Model Test Loss: 0.29514002799987793
Restored Model Test Accuracy: 0.8802577257156372
Restored Model Test Precision: 0.84999680519104
Restored Model Test Recall: 0.9421183466911316
F1 Score: 0.8936898871407127
ROC-AUC Score: 0.9316299602540842


## CNN

In [45]:
tf.keras.backend.clear_session()
tf.random.set_seed(50)

In [46]:
cnn  = Sequential()
cnn.add(Embedding(d,8))
cnn.add(Dropout(0.2))
cnn.add(Conv1D(128, 5, activation='relu'))
cnn.add(MaxPooling1D(pool_size=2))
cnn.add(Flatten())
cnn.add(Dense(24,activation='relu'))
# cnn.add(Dense(1,activation ='sigmoid',kernel_regularizer=l1(0.01), bias_regularizer=l2(0.01)))
cnn.add(Dense(1,activation ='sigmoid'))

In [47]:
cnn.compile(loss ='binary_crossentropy',optimizer='adam', metrics=['accuracy', 'precision', 'recall'])

In [48]:
num_epochs =30
early_stop = EarlyStopping(monitor='val_loss',patience=3, verbose=1, restore_best_weights=True)
historyC=cnn.fit(X_train, y_train,epochs = num_epochs,validation_data=(X_test,y_test),callbacks =[early_stop],verbose =2)
# around 6 minutes an epoch when split 

Epoch 1/30
19962/19962 - 135s - 7ms/step - accuracy: 0.8662 - loss: 0.3274 - precision: 0.8349 - recall: 0.9347 - val_accuracy: 0.8765 - val_loss: 0.3078 - val_precision: 0.8405 - val_recall: 0.9489
Epoch 2/30
19962/19962 - 133s - 7ms/step - accuracy: 0.8732 - loss: 0.3138 - precision: 0.8420 - recall: 0.9390 - val_accuracy: 0.8778 - val_loss: 0.3034 - val_precision: 0.8499 - val_recall: 0.9367
Epoch 3/30
19962/19962 - 133s - 7ms/step - accuracy: 0.8744 - loss: 0.3101 - precision: 0.8434 - recall: 0.9395 - val_accuracy: 0.8781 - val_loss: 0.3015 - val_precision: 0.8510 - val_recall: 0.9357
Epoch 4/30
19962/19962 - 134s - 7ms/step - accuracy: 0.8751 - loss: 0.3081 - precision: 0.8443 - recall: 0.9397 - val_accuracy: 0.8784 - val_loss: 0.3005 - val_precision: 0.8505 - val_recall: 0.9371
Epoch 5/30
19962/19962 - 133s - 7ms/step - accuracy: 0.8757 - loss: 0.3070 - precision: 0.8449 - recall: 0.9401 - val_accuracy: 0.8782 - val_loss: 0.3007 - val_precision: 0.8500 - val_recall: 0.9373
Epoch

In [49]:
results = cnn.evaluate(X_test, y_test, verbose=2)
y_pred_prob = cnn.predict(X_test)  # Get the predicted probabilities
y_pred = (y_pred_prob > 0.5).astype(int)  
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)  
# Assuming you compiled with accuracy, precision, and recall
print(f"Restored Model Test Loss: {results[0]}")
print(f"Restored Model Test Accuracy: {results[1]}")
print(f"Restored Model Test Precision: {results[2]}")
print(f"Restored Model Test Recall: {results[3]}")
print(f"F1 Score: {f1}")
print(f"ROC-AUC Score: {roc_auc}")

4991/4991 - 6s - 1ms/step - accuracy: 0.8789 - loss: 0.2980 - precision: 0.8512 - recall: 0.9372
[1m4991/4991[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step
Restored Model Test Loss: 0.29803353548049927
Restored Model Test Accuracy: 0.8789364695549011
Restored Model Test Precision: 0.851187527179718
Restored Model Test Recall: 0.9372421503067017
F1 Score: 0.8921444471098863
ROC-AUC Score: 0.930766005305423
