### Import Libraries

In [1]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix,accuracy_score

### Load the Data

In [2]:
df = pd.read_csv("dataset/phishing_domain_dataset.csv")
df.head()

Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,...,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened,phishing
0,3,0,0,1,0,0,0,0,0,0,...,1,2,0,892,0,0,0,0,0,1
1,5,0,1,3,0,3,0,2,0,0,...,1,2,1,9540,1,0,0,0,0,1
2,2,0,0,1,0,0,0,0,0,0,...,1,2,3,589,1,0,0,0,0,0
3,4,0,2,5,0,0,0,0,0,0,...,1,2,0,292,1,0,0,0,0,1
4,2,0,0,0,0,0,0,0,0,0,...,1,2,1,3597,0,1,0,0,0,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88647 entries, 0 to 88646
Columns: 112 entries, qty_dot_url to phishing
dtypes: float64(1), int64(111)
memory usage: 75.7 MB


### Process the Data

In [4]:
X = df.drop("phishing",axis=1)
y = df["phishing"]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

print(f"Train Size : {X_test.shape},{y_train.shape},\nTest Size : {X_test.shape},{y_test.shape}")

Train Size : (17730, 111),(70917,),
Test Size : (17730, 111),(17730,)


In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Utility Code

In [6]:
def return_model(model,X_train,y_train,X_test,y_test):

    model.fit(X_train,y_train)

    y_pred = model.predict(X_test)

    return accuracy_score(y_test,y_pred),confusion_matrix(y_test,y_pred)


### Logistic Regression

In [7]:
model= LogisticRegression()

accuracy,matrix = return_model(model,X_train,y_train,X_test,y_test)

print(f"Accuracy: {accuracy} \nConfusion Matrix: \n {matrix}")

Accuracy: 0.9330513254371122 
Confusion Matrix: 
 [[11033   637]
 [  550  5510]]


### Decision Tree Classifier

In [8]:
decision_tree_model = DecisionTreeClassifier()

accuracy,matrix = return_model(decision_tree_model,X_train,y_train,X_test,y_test)

print(f"Accuracy: {accuracy} \nConfusion Matrix: \n {matrix}")

Accuracy: 0.9546531302876481 
Confusion Matrix: 
 [[11274   396]
 [  408  5652]]


### Random Forest Classifier

In [9]:
random_forest_model = RandomForestClassifier()

accuracy,matrix = return_model(random_forest_model,X_train,y_train,X_test,y_test)

print(f"Accuracy: {accuracy} \nConfusion Matrix: \n {matrix}")

Accuracy: 0.9715736040609138 
Confusion Matrix: 
 [[11418   252]
 [  252  5808]]


### Building a Deep Learning Model

In [10]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential


deep_model = Sequential([
    layers.InputLayer(input_shape=(111,)),
    layers.Dense(128,activation="relu"),
    layers.Dense(64,activation="relu"),
    layers.Dense(32,activation="relu"),
    layers.Dense(16,activation="relu"),
    layers.Dense(1,activation="sigmoid")
])

deep_model.summary()

In [11]:
deep_model.compile(
    loss = tf.keras.losses.BinaryCrossentropy(),
    optimizer = "adam",
    metrics = ["accuracy"]
)

In [12]:
history = deep_model.fit(
    X_train,y_train,
    epochs=50,
    batch_size=64,
    validation_data = [X_test,y_test]
)

Epoch 1/50
[1m1109/1109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.9210 - loss: 0.1939 - val_accuracy: 0.9438 - val_loss: 0.1422
Epoch 2/50
[1m1109/1109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 950us/step - accuracy: 0.9511 - loss: 0.1277 - val_accuracy: 0.9489 - val_loss: 0.1307
Epoch 3/50
[1m1109/1109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 890us/step - accuracy: 0.9527 - loss: 0.1218 - val_accuracy: 0.9540 - val_loss: 0.1191
Epoch 4/50
[1m1109/1109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 886us/step - accuracy: 0.9554 - loss: 0.1162 - val_accuracy: 0.9548 - val_loss: 0.1184
Epoch 5/50
[1m1109/1109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 933us/step - accuracy: 0.9585 - loss: 0.1090 - val_accuracy: 0.9550 - val_loss: 0.1154
Epoch 6/50
[1m1109/1109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 892us/step - accuracy: 0.9614 - loss: 0.1049 - val_accuracy: 0.9544 - val_loss: 0.1196
Epoch 

### Accuracy and Loss

In [15]:
loss,accuracy = deep_model.evaluate(X_test,y_test)
print(f"Accuracy : {accuracy}")
print(f"Loss : {loss}")

[1m555/555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 513us/step - accuracy: 0.9620 - loss: 0.1549
Accuracy : 0.9628877639770508
Loss : 0.1416003257036209


### Save the Model

In [17]:
import joblib
joblib.dump(deep_model, 'models/classifier-v1')
print("Model Saved Successfully")

Model Saved Successfully
