In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Step 1: Load Necessary Libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('/kaggle/input/balanced-dataset-with-features/balanced_data_with_features.csv')
print(f"Dataset Loaded! Number of samples: {len(df)}")
df.head()

Dataset Loaded! Number of samples: 514966


Unnamed: 0,url,type,label,url_length,contains_https,contains_ip,contains_suspicious_keyword,num_digits,num_subdomains,num_special_chars
0,cybermuse.gallery.ca/cybermuse/search/artist_e...,benign,1,65,False,False,False,4,3,9
1,fanpix.net/gallery/paul-gleason-pictures.htm,benign,1,44,False,False,False,0,2,6
2,biogs.com/strictlycomedancing/bellingham.html,benign,1,45,False,False,False,0,2,4
3,mylife.com/c-1090135275,benign,1,23,False,False,False,10,1,3
4,mcshane-construction.com/leadership.aspx,benign,1,40,False,False,False,0,2,4


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# Step 3: Data Preparation
# Convert boolean columns to numerical values
df['contains_https'] = df['contains_https'].astype(int)
df['contains_ip'] = df['contains_ip'].astype(int)
df['contains_suspicious_keyword'] = df['contains_suspicious_keyword'].astype(int)

# Define structural features
structural_features = df[['url_length', 'contains_https', 'contains_ip', 'contains_suspicious_keyword',
                          'num_digits', 'num_subdomains', 'num_special_chars']]

# Normalize structural features
scaler = StandardScaler()
X_structural = scaler.fit_transform(structural_features)

# Extract URLs for TF-IDF vectorization
urls = df['url']

# Step 3.1: Apply TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=500)  # Limit to 500 features for efficiency
X_tfidf = tfidf_vectorizer.fit_transform(urls)

# Step 3.2: Combine Structural Features and TF-IDF Features
X_combined = hstack([X_structural, X_tfidf]).toarray()  # Convert to array for compatibility

# Define labels
y = df['label']

# Step 3.3: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Step 3.4: Reshape data for CNN
X_train_reshaped = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_reshaped = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

print(f"Combined Feature Shape (Training): {X_train_reshaped.shape}")
print(f"Combined Feature Shape (Testing): {X_test_reshaped.shape}")

Combined Feature Shape (Training): (411972, 507, 1)
Combined Feature Shape (Testing): (102994, 507, 1)


In [4]:
# Enhanced CNN Model for Combined Features
input_shape = X_train_reshaped.shape[1]  # 507 (7 structural features + 500 TF-IDF features)

model = keras.Sequential([
    layers.InputLayer(input_shape=(input_shape, 1)),
    layers.Conv1D(64, kernel_size=3, activation='relu'),
    layers.BatchNormalization(),
    layers.Conv1D(128, kernel_size=3, activation='relu'),
    layers.BatchNormalization(),
    layers.Conv1D(256, kernel_size=3, activation='relu'),
    layers.BatchNormalization(),
    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(5, activation='softmax')  # Output layer for 5 classes
])

# Compile the model
model.compile(optimizer=keras.optimizers.AdamW(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Introduce Early Stopping
early_stopping = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)





In [None]:
from sklearn.metrics import accuracy_score, f1_score
history = model.fit(
    X_train_reshaped, y_train,
    epochs=20,
    batch_size=256,
    validation_split=0.2,
    verbose=1  # Suppress training output
)

Epoch 1/20
[1m1288/1288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 90ms/step - accuracy: 0.6484 - loss: 0.9719 - val_accuracy: 0.7374 - val_loss: 0.5376
Epoch 2/20
[1m1288/1288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 89ms/step - accuracy: 0.7372 - loss: 0.4892 - val_accuracy: 0.6948 - val_loss: 5.2423
Epoch 3/20
[1m1288/1288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 88ms/step - accuracy: 0.7544 - loss: 0.4384 - val_accuracy: 0.3972 - val_loss: 4.5904
Epoch 4/20
[1m1288/1288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 88ms/step - accuracy: 0.7635 - loss: 0.4153 - val_accuracy: 0.3480 - val_loss: 6.9195
Epoch 5/20
[1m1288/1288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 87ms/step - accuracy: 0.7711 - loss: 0.3924 - val_accuracy: 0.5931 - val_loss: 21.7684
Epoch 7/20
[1m1288/1288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 86ms/step - accuracy: 0.7725 - loss: 0.3859 - val_accuracy: 0.7821 - val_loss: 0.36

In [6]:
# Step 6: Evaluate the Model
test_loss, test_accuracy = model.evaluate(X_test_reshaped, y_test, verbose=0)
print(f"\n✅ Test Accuracy: {test_accuracy * 100:.2f}%")
print(f"✅ Test Loss: {test_loss:.4f}")


✅ Test Accuracy: 67.79%
✅ Test Loss: 0.6073


NEW CODE LSTM

In [51]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
from scipy.sparse import hstack

In [52]:
# Step 1: Load the Dataset
df = pd.read_csv('/kaggle/input/balanced-dataset-with-features/balanced_data_with_features.csv')
df

Unnamed: 0,url,type,label,url_length,contains_https,contains_ip,contains_suspicious_keyword,num_digits,num_subdomains,num_special_chars
0,cybermuse.gallery.ca/cybermuse/search/artist_e...,benign,1,65,False,False,False,4,3,9
1,fanpix.net/gallery/paul-gleason-pictures.htm,benign,1,44,False,False,False,0,2,6
2,biogs.com/strictlycomedancing/bellingham.html,benign,1,45,False,False,False,0,2,4
3,mylife.com/c-1090135275,benign,1,23,False,False,False,10,1,3
4,mcshane-construction.com/leadership.aspx,benign,1,40,False,False,False,0,2,4
...,...,...,...,...,...,...,...,...,...,...
514961,http://1f4h4ih79y.biz,spam,4,21,False,False,False,5,1,4
514962,http://fx71o0gh9k.org,spam,4,21,False,False,False,4,1,4
514963,https://k1okr8hui7.biz,spam,4,22,True,False,False,3,1,4
514964,https://1vknhlr6fk.biz,spam,4,22,True,False,False,2,1,4


In [53]:
# Step 2: Prepare Structural Features
df['contains_https'] = df['contains_https'].astype(int)
df['contains_ip'] = df['contains_ip'].astype(int)
df['contains_suspicious_keyword'] = df['contains_suspicious_keyword'].astype(int)

structural_features = df[['url_length', 'contains_https', 'contains_ip', 
                          'contains_suspicious_keyword', 'num_digits', 
                          'num_subdomains', 'num_special_chars']]

scaler = StandardScaler()
X_structural = scaler.fit_transform(structural_features)

In [54]:
# Step 3: Prepare TF-IDF Features
urls = df['url']
tfidf_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=500)
X_tfidf = tfidf_vectorizer.fit_transform(urls)

In [55]:
# Step 4: Use Pretrained LSTM-based Model (Universal Sentence Encoder - USE)
import tensorflow_hub as hub
from tqdm import tqdm  # To show progress bar

# Load the Universal Sentence Encoder
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Define a function to process URLs in batches
def process_in_batches(urls, batch_size=246):
    embeddings = []
    for i in tqdm(range(0, len(urls), batch_size)):
        batch = urls[i:i + batch_size]
        batch_embeddings = embed(batch).numpy()
        embeddings.append(batch_embeddings)

    # print("done!")
    return np.vstack(embeddings)

# Process URLs in batches
X_use = process_in_batches(urls)


100%|██████████| 2094/2094 [00:19<00:00, 108.67it/s]


In [56]:
# Step 5: Combine All Features
X_combined = np.hstack([X_structural, X_tfidf.toarray(), X_use])

In [57]:
# Step 6: Define Labels
y = df['label']

In [58]:
# Step 7: Split Data
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

In [59]:
# Step 8: Define Model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(5, activation='softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [60]:
# Step 9: Train the Model
history = model.fit(X_train, y_train, epochs=500, batch_size=256, verbose=1)

Epoch 1/500
[1m1610/1610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.7145 - loss: 0.5577
Epoch 2/500
[1m1610/1610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7675 - loss: 0.4001
Epoch 3/500
[1m1610/1610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7719 - loss: 0.3817
Epoch 4/500
[1m1610/1610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7767 - loss: 0.3683
Epoch 5/500
[1m1610/1610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7807 - loss: 0.3604
Epoch 6/500
[1m1610/1610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7810 - loss: 0.3547
Epoch 7/500
[1m1610/1610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7829 - loss: 0.3507
Epoch 8/500
[1m1610/1610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7828 - loss: 0.3456
Epoch 9/500
[1m

In [67]:
model_path = '/kaggle/working/malicious_url_classification_model.h5'
model.save(model_path)
print(f"\n✅ Model saved successfully at {model_path}")


✅ Model saved successfully at /kaggle/working/malicious_url_classification_model.h5
