In [None]:
import re
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Multiply, Add
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Mount Google Drive so the dataset stored there is reachable from the runtime.
drive.mount('/content/drive')

# Report whether TensorFlow can see a GPU before any training starts.
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU Available: ", gpu_devices)

# The CSV is read without a header row; the two columns are named here.
file_path = '/content/drive/MyDrive/malicious_phish_CSV.csv'
column_names = ['url', 'type']
df = pd.read_csv(file_path, names=column_names, header=None)

print(df.columns)

Mounted at /content/drive
GPU Available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Index(['url', 'type'], dtype='object')


In [None]:
# Map the string class in 'type' to integer ids, keep them on the frame,
# then expand the ids to one-hot rows for the softmax output.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['type'])
df['label_encoded'] = encoded_labels
y = to_categorical(df['label_encoded'])

In [None]:
# Settings for the text branch.
max_sequence_length = 200
max_num_words = 10000
embedding_dim = 128

urls = df['url'].values

# Fit the vocabulary on every URL; words outside the num_words cap map to "<OOV>".
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_num_words)
tokenizer.fit_on_texts(urls)

# Convert each URL to integer ids and pad/truncate to a fixed length.
seq_data = pad_sequences(
    tokenizer.texts_to_sequences(urls),
    maxlen=max_sequence_length,
)


In [None]:
# Patterns are compiled once here rather than on every row of the loop.
_IP_RE = re.compile(r'[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+')
_TLD_RE = re.compile(r'\.[a-zA-Z]+')
_SPECIAL_CHAR_RE = re.compile('[^A-Za-z0-9]')
_PARAM_RE = re.compile(r'\?|\&')
_ABNORMAL_RE = re.compile(r'[^a-zA-Z0-9\-\.\/\?\=\&]')
# Alternation of suspicious keywords; matching is case-sensitive, like the
# other keyword flags below.
_SUSPICIOUS_RE = re.compile(
    'PayPal|login|signin|bank|account|update|free|lucky|service|bonus|ebayisapi|webscr'
)

# Keywords that each become a 0/1 "substring present" column, as (column, needle).
_KEYWORD_FLAGS = (
    ('php', '.php'),
    ('index', 'index'),
    ('option', 'option'),
    ('article', 'article'),
    ('content', 'content'),
    ('html', '.html'),
    ('view', 'view'),
    ('component', 'component'),
)

# Characters that each become a plain occurrence-count column, as (column, char).
_CHAR_COUNTS = (
    ('slash_count', '/'),
    ('hyphen_count', '-'),
    ('underscore_count', '_'),
    ('dot_count', '.'),
    ('equals_count', '='),
    ('question_count', '?'),
    ('percent_count', '%'),
)


def _count_dirs(url):
    """Return the number of '/'-separated path segments minus one."""
    try:
        path = urlparse(url).path
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. an unbalanced '[').
        # Fall back to the raw URL with query/fragment removed so one bad row
        # does not abort the whole extraction.
        path = re.split(r'[?#]', url, maxsplit=1)[0]
    return len(path.split('/')) - 1


def extract_features(df):
    """Build a table of hand-crafted lexical features, one row per URL.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'url'`` column holding the URL strings.

    Returns
    -------
    pandas.DataFrame
        One row per input URL, in input order, with integer count columns and
        0/1 flag columns. Column order matches the order the features are
        assigned below.
    """
    features_list = []

    for url in df['url']:
        features = {}
        features['url_length'] = len(url)
        features['has_ip'] = 1 if _IP_RE.search(url) else 0
        features['is_https'] = 1 if url.startswith('https') else 0
        features['digit_count'] = sum(c.isdigit() for c in url)
        features['letter_count'] = sum(c.isalpha() for c in url)
        features['tld_count'] = len(_TLD_RE.findall(url))
        features['special_chars_count'] = len(_SPECIAL_CHAR_RE.findall(url))
        # Fixed: the keywords used to sit in ONE pipe-joined string that was
        # tested with `in`, so the flag was effectively never set.
        features['sus_url'] = 1 if _SUSPICIOUS_RE.search(url) else 0

        for column, needle in _KEYWORD_FLAGS:
            features[column] = 1 if needle in url else 0

        for column, char in _CHAR_COUNTS:
            features[column] = url.count(char)

        features['param_count'] = len(_PARAM_RE.findall(url))
        features['count_dir'] = _count_dirs(url)
        features['abnormal_url'] = 1 if _ABNORMAL_RE.search(url) else 0
        features['count_www'] = url.count('www')
        features['at_symbol'] = url.count('@')

        features_list.append(features)

    return pd.DataFrame(features_list)


# Hand-crafted lexical features, one row per URL.
df_features = extract_features(df)

# Standardise every feature column to zero mean / unit variance.
# NOTE(review): the scaler is fitted on all rows, before any train/test split,
# so held-out rows contribute to the statistics — confirm this is acceptable.
scaler = StandardScaler().fit(df_features)
X_scaled = scaler.transform(df_features)

In [None]:
# Split sequences, numeric features and labels together so rows stay aligned.
# Fixed: the numeric input was `num_data`, which no cell defines; the scaled
# feature matrix `X_scaled` is what the preceding cell produces.
assert len(seq_data) == len(X_scaled) == len(y), "inputs and labels must be row-aligned"

# 80 % train+val / 20 % test.
X_seq_train, X_seq_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    seq_data, X_scaled, y, test_size=0.2, random_state=42)

# 25 % of the remaining 80 % becomes validation -> 60 / 20 / 20 overall.
X_seq_train, X_seq_val, X_num_train, X_num_val, y_train, y_val = train_test_split(
    X_seq_train, X_num_train, y_train, test_size=0.25, random_state=42)


In [None]:
# Gated fusion model: a dense branch over the numeric URL features and an
# LSTM branch over the tokenised URL, blended element-wise by a learned gate.

# Seed the Python, NumPy and backend RNGs so weight initialisation is
# repeatable when this cell is re-run.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Fixed mixing ratio applied on top of the learned gate.
# Ratios tried so far (numeric:sequence): 6:4, 5:5, 4:6, 8:2 — change the two
# constants below instead of duplicating the fusion code.
NUM_BRANCH_WEIGHT = 0.6
SEQ_BRANCH_WEIGHT = 0.4

# Shapes are taken from the data rather than hard-coded.
num_classes = y_train.shape[1]
seq_len = X_seq_train.shape[1]

# When the tokenizer has a num_words cap it only emits ids below that cap,
# so the embedding table never needs more rows than that.
vocab_size = len(tokenizer.word_index) + 1
if tokenizer.num_words is not None:
    vocab_size = min(vocab_size, tokenizer.num_words)

# Numeric branch.
input_num = Input(shape=(X_num_train.shape[1],), name='Numerical_Input')
x_num = Dense(64, activation='relu')(input_num)
x_num = Dense(32, activation='relu')(x_num)
num_output = Dense(16, activation='relu')(x_num)

# Sequence branch. `input_length` is omitted: Keras 3 deprecates it and the
# length is already fixed by the Input shape.
input_seq = Input(shape=(seq_len,), name='Sequence_Input')
x_seq = Embedding(input_dim=vocab_size, output_dim=64)(input_seq)
x_seq = LSTM(64, return_sequences=True)(x_seq)
x_seq = LSTM(32)(x_seq)
seq_output = Dense(16, activation='relu')(x_seq)

# The gate is computed from the numeric branch; its complement weights the
# sequence branch.
gate_num = Dense(16, activation='sigmoid')(num_output)
weighted_num = Multiply()([NUM_BRANCH_WEIGHT * gate_num, num_output])
weighted_seq = Multiply()([SEQ_BRANCH_WEIGHT * (1 - gate_num), seq_output])
merged = Add()([weighted_num, weighted_seq])

# Classification head.
x = Dense(64, activation='relu')(merged)
output = Dense(num_classes, activation='softmax', name='Output')(x)

model = Model(inputs=[input_num, input_seq], outputs=output)

In [None]:
# Categorical cross-entropy over the one-hot targets, optimised with Adam.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

# Stop when validation loss has not improved for 3 epochs and restore the
# best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Inputs are passed as [numeric, sequence], the order used in Model(inputs=...).
train_inputs = [X_num_train, X_seq_train]
val_inputs = [X_num_val, X_seq_val]
test_inputs = [X_num_test, X_seq_test]

history = model.fit(
    train_inputs,
    y_train,
    validation_data=(val_inputs, y_val),
    epochs=20,
    batch_size=128,
    callbacks=[early_stopping],
)

# Final score on the held-out test split.
test_loss, test_accuracy = model.evaluate(test_inputs, y_test, verbose=1)

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


Epoch 1/20
[1m3053/3053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 41ms/step - accuracy: 0.9019 - loss: 0.2837 - val_accuracy: 0.9760 - val_loss: 0.0746
Epoch 2/20
[1m3053/3053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 40ms/step - accuracy: 0.9795 - loss: 0.0638 - val_accuracy: 0.9801 - val_loss: 0.0608
Epoch 3/20
[1m3053/3053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 40ms/step - accuracy: 0.9840 - loss: 0.0500 - val_accuracy: 0.9816 - val_loss: 0.0588
Epoch 4/20
[1m3053/3053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 40ms/step - accuracy: 0.9860 - loss: 0.0432 - val_accuracy: 0.9814 - val_loss: 0.0576
Epoch 5/20
[1m3053/3053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 40ms/step - accuracy: 0.9874 - loss: 0.0384 - val_accuracy: 0.9821 - val_loss: 0.0580
Epoch 6/20
[1m3053/3053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 40ms/step - accuracy: 0.9886 - loss: 0.0346 - val_accuracy: 0.9819 - val_loss: 0.058