In [1]:
pip install pandas numpy scikit-learn tensorflow nltk scipy




In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import hstack, csr_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import gc
import nltk
from nltk.tokenize import word_tokenize

# Download NLTK data
nltk.download('punkt')

# Load the dataset
file_path = '/content/PhiUSIIL_Phishing_URL_Dataset (updated 3-3-24 ).csv'
df = pd.read_csv(file_path)

# Handle missing values
# Fill missing numerical values with the median value of each column
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
numerical_cols = numerical_cols.drop('label')  # Exclude the label column
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

# Fill missing categorical values with the mode value of each column
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# Check for and handle missing values in the target column
if df['label'].isnull().any():
    print("Found NaN values in the target column. Filling with mode value.")
    df['label'] = df['label'].fillna(df['label'].mode()[0])

# Encode categorical variables
# Example: Encoding the 'TLD' column
le = LabelEncoder()
df['TLD'] = le.fit_transform(df['TLD'])

# Tokenization and Hashing Vectorizer for text columns
text_cols = ['FILENAME', 'URL', 'Domain', 'Title']
hashing_vectorizer = HashingVectorizer(n_features=2**6, alternate_sign=False)  # Further reduced number of features for efficiency

# Apply tokenization and vectorization
text_features = [hashing_vectorizer.transform(df[col].apply(lambda x: ' '.join(word_tokenize(x)))) for col in text_cols]

# Combine text features into a single sparse matrix
text_features_combined = hstack(text_features)

# Separate numerical features and target
X_numerical = df[numerical_cols].values
y = df['label'].values

# Standardize numerical features
scaler = StandardScaler()
X_numerical = scaler.fit_transform(X_numerical)

# Combine numerical and text features
X = hstack([csr_matrix(X_numerical), text_features_combined])

# Reduce dimensionality with Truncated SVD
svd = TruncatedSVD(n_components=30, random_state=42)  # Further reduced number of components for efficiency
X_reduced = svd.fit_transform(X)

# Adjust labels for one-hot encoding
y_adjusted = y - 1

# 10-fold cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
classification_reports = []

for fold, (train_index, test_index) in enumerate(skf.split(X_reduced, y)):
    X_train, X_test = X_reduced[train_index], X_reduced[test_index]
    y_train, y_test = y_adjusted[train_index], y_adjusted[test_index]

    print(f"Fold {fold + 1}:")
    print(f"Training data size: {len(train_index)}")
    print(f"Testing data size: {len(test_index)}")

    # Define the neural network model
    model = Sequential([
        Dense(64, input_dim=X_train.shape[1], activation='relu'),
        Dropout(0.5),
        Dense(32, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)

    # Predict using the trained model
    y_pred_test = model.predict(X_test).flatten()

    # Adjust predicted labels back to original range (1 and 2)
    y_pred_test_labels_adjusted = np.round(y_pred_test).astype(int) + 1
    y_test_labels_adjusted = y_test + 1

    # Evaluate the model
    accuracy = accuracy_score(y_test_labels_adjusted, y_pred_test_labels_adjusted)
    precision = precision_score(y_test_labels_adjusted, y_pred_test_labels_adjusted, average='weighted')
    recall = recall_score(y_test_labels_adjusted, y_pred_test_labels_adjusted, average='weighted')
    f1 = f1_score(y_test_labels_adjusted, y_pred_test_labels_adjusted, average='weighted')
    report = classification_report(y_test_labels_adjusted, y_pred_test_labels_adjusted, output_dict=True)

    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    classification_reports.append(report)

    print(f"Fold {fold + 1} Results:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}\n")

    # Free up memory
    del X_train, X_test, y_train, y_test, model, y_pred_test
    gc.collect()

# Aggregate the classification reports
average_classification_report = {}
for key in classification_reports[0].keys():
    if isinstance(classification_reports[0][key], dict):
        average_classification_report[key] = {}
        for sub_key in classification_reports[0][key].keys():
            average_classification_report[key][sub_key] = np.mean([report[key][sub_key] for report in classification_reports])
    else:
        average_classification_report[key] = np.mean([report[key] for report in classification_reports])

# Calculate average metrics
average_accuracy = np.mean(accuracy_scores)
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)
average_f1_score = np.mean(f1_scores)

print(f"Average Model Accuracy: {average_accuracy}")
print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average F1 Score: {average_f1_score}")
print("Average Classification Report:")
for key, value in average_classification_report.items():
    if isinstance(value, dict):
        print(f"  {key}:")
        for sub_key, sub_value in value.items():
            print(f"    {sub_key}: {sub_value}")
    else:
        print(f"  {key}: {value}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Fold 1:
Training data size: 212215
Testing data size: 23580


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2ms/step - accuracy: 0.9794 - loss: 0.0577
Epoch 2/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2ms/step - accuracy: 0.9991 - loss: 0.0038
Epoch 3/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9993 - loss: 0.0028
Epoch 4/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0023
Epoch 5/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0018
Epoch 6/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0015
Epoch 7/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0017
Epoch 8/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0015
Epoch 9/10
[1m6

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - accuracy: 0.9763 - loss: 0.0581
Epoch 2/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.9991 - loss: 0.0039
Epoch 3/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0026
Epoch 4/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0022
Epoch 5/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.9997 - loss: 0.0012
Epoch 6/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 3ms/step - accuracy: 0.9994 - loss: 0.0023
Epoch 7/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0016
Epoch 8/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0023
Epoch 9/10
[1m6632/6632[0

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.9775 - loss: 0.0616
Epoch 2/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.9991 - loss: 0.0040
Epoch 3/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0025
Epoch 4/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0019
Epoch 5/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.9997 - loss: 0.0016
Epoch 6/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0017
Epoch 7/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0014
Epoch 8/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0011
Epoch 9/10
[1m6632/6632[0

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.9824 - loss: 0.0488
Epoch 2/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.9991 - loss: 0.0034
Epoch 3/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9993 - loss: 0.0027
Epoch 4/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0022
Epoch 5/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0017
Epoch 6/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 3ms/step - accuracy: 0.9995 - loss: 0.0017
Epoch 7/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - accuracy: 0.9997 - loss: 0.0014
Epoch 8/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0013
Epoch 9/10
[1m6632/6632[0

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - accuracy: 0.9824 - loss: 0.0498
Epoch 2/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9991 - loss: 0.0044
Epoch 3/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0027
Epoch 4/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0023
Epoch 5/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0017
Epoch 6/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0018
Epoch 7/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0020
Epoch 8/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9997 - loss: 0.0020
Epoch 9/10
[1m6632/6632[0

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.9834 - loss: 0.0462
Epoch 2/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.9991 - loss: 0.0041
Epoch 3/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.9993 - loss: 0.0028
Epoch 4/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0024
Epoch 5/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0016
Epoch 6/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0019
Epoch 7/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0017
Epoch 8/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0017
Epoch 9/10
[1m6632/6632[0

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.9769 - loss: 0.0611
Epoch 2/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.9990 - loss: 0.0041
Epoch 3/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.9993 - loss: 0.0028
Epoch 4/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0024
Epoch 5/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0021
Epoch 6/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0019
Epoch 7/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0016
Epoch 8/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0015
Epoch 9/10
[1m6632/6632[0

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.9797 - loss: 0.0588
Epoch 2/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.9990 - loss: 0.0040
Epoch 3/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9993 - loss: 0.0028
Epoch 4/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0028
Epoch 5/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0021
Epoch 6/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0020
Epoch 7/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0015
Epoch 8/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0017
Epoch 9/10
[1m6632/6632[0

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.9801 - loss: 0.0553
Epoch 2/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.9992 - loss: 0.0028
Epoch 3/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 3ms/step - accuracy: 0.9995 - loss: 0.0023
Epoch 4/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0017
Epoch 5/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0018
Epoch 6/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.9997 - loss: 0.0018
Epoch 7/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0013
Epoch 8/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0022
Epoch 9/10
[1m6632/6632[0

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.9825 - loss: 0.0504
Epoch 2/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.9992 - loss: 0.0030
Epoch 3/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9993 - loss: 0.0029
Epoch 4/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0025
Epoch 5/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0021
Epoch 6/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0020
Epoch 7/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0014
Epoch 8/10
[1m6632/6632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0016
Epoch 9/10
[1m6632/6632[0