<a href="https://colab.research.google.com/github/JamesTan44181/DLI-Group-Assignment/blob/main/DLI_GROUP_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Step 1: Data preprocessing

In [1]:
# Imports for the preprocessing stage, grouped at the top of the cell.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset directly from the raw CSV hosted in the GitHub repository.
url = "https://raw.githubusercontent.com/JamesTan44181/DLI-Group-Assignment/main/cleaned_balanced_dataset.csv"
read_file = pd.read_csv(url)

# Sanity check: show the first rows to confirm the file was read and parsed.
print(read_file.head())

# Every column except the last one is a feature; the last column is the label.
x = read_file.iloc[:, :-1].values
y = read_file.iloc[:, -1].values

# Echo both arrays (features first, then labels) to confirm the separation.
print(x, y, sep="\n")

# Hold out 20% of the rows as the test set. `stratify=y` makes the split
# stratified on the label, and the fixed random_state makes it repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Feature scaling: the scaler is fitted on the training features only, and the
# fitted scaler is then applied to both the training and the test features.
sc = StandardScaler().fit(x_train)
X_train = sc.transform(x_train)
X_test = sc.transform(x_test)

   url_length  has_ip_address  dot_count  https_flag  url_entropy  \
0          47               0          2           1     4.250669   
1          40               0          2           1     4.196439   
2          23               0          1           0     3.936180   
3          30               0          1           0     3.894740   
4          47               0          1           1     4.143127   

   token_count  subdomain_count  query_param_count  tld_length  path_length  \
0            6                1                  1           3            1   
1            6                1                  1           3            1   
2            5                0                  1           2            1   
3            5                0                  1           3            1   
4            7                0                  1           3           18   

   has_hyphen_in_domain  number_of_digits  tld_popularity  \
0                     1                 0        

Step 2: Model

In [None]:
import os
import random
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import Dense, Input
from keras.models import Sequential

# Seed every random number generator this cell can touch (Python's own
# `random`, NumPy and TensorFlow) so repeated runs start from the same state.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Define the Keras model: two ReLU hidden layers (64 and 32 units) followed by
# a single sigmoid unit for binary classification.
# The input width is declared with an explicit Input layer rather than an
# `input_shape=` argument on the first Dense layer, which Keras 3 warns about.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Hyperparameter
learning_rate = 0.001
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Compile the model and print its layer summary.
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Train the compiled model on the scaled training data and time the run.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments the way time.time() can.
# NOTE(review): the test split is also passed as validation_data, so the
# per-epoch val_* metrics are computed on the same rows used for the final
# evaluation. Treat them as monitoring only; do not tune on them.
start_time = time.perf_counter()
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32,verbose=1)
end_time = time.perf_counter()
print(f"Training time: {end_time - start_time:.2f} seconds")

Step 3: Evaluation

In [None]:
# Evaluation imports, grouped at the top of the cell.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, precision_score, recall_score, f1_score

# Score the test features with the trained model, then turn the predicted
# probabilities into 0/1 class labels using a 0.5 cut-off.
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Evaluation metrics. Accuracy is the fraction of flattened predictions that
# match the true labels; the ROC curve is built from the raw probabilities.
acc = (y_pred.ravel() == y_test).mean()
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

# Print the metric report, one metric per line with four decimals.
print(
    "\n📊 Evaluation Results:",
    f"Accuracy:  {acc:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall:    {recall:.4f}",
    f"F1-score:  {f1:.4f}",
    f"ROC-AUC:   {roc_auc:.4f}",
    sep="\n",
)

# ROC curve, with the dashed diagonal as the chance-level reference.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc="lower right")
plt.show()

# Confusion matrix drawn as an annotated heatmap of integer counts.
cm = confusion_matrix(y_test, y_pred)
cm_fig, cm_ax = plt.subplots(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=cm_ax)
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
cm_ax.set_title("Confusion Matrix")
plt.show()

Step 4: Save the model and report the verdict

In [None]:
# Save the trained model to disk in the .keras format.
model.save("phishing_model.keras")
print("Model saved as phishing_model.keras")

# Export the test features (as produced by the scaler) and the test labels to
# CSV files, without the DataFrame index column.
for split_array, csv_name in ((X_test, "X_test.csv"), (y_test, "y_test.csv")):
    pd.DataFrame(split_array).to_csv(csv_name, index=False)
print("Test data saved as X_test.csv and y_test.csv")

# Verdict: the run counts as a success when the F1-score (harmonic mean of
# precision and recall) reaches at least 0.9.
print(
    f"Success: Achieved F1 = {f1:.4f}, target met."
    if f1 >= 0.9
    else f"F1 = {f1:.4f}, target not met."
)