In [22]:
# Step 1: Import Necessary Libraries
import numpy as np
import pandas as pd
import torch
import learn2learn as l2l
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import torch.nn as nn
import torch.optim as optim

In [23]:
# Step 2: Load the Dataset
# The CSV is read from the current working directory.
df = pd.read_csv(filepath_or_buffer="phishing_site_urls.csv")

In [28]:
# Preview the first five rows. This cell ran as In [28], i.e. after the
# feature-engineering cell (In [27]), so the engineered columns already appear.
df.head(n=5)

Unnamed: 0,URL,Label,url_length,num_digits,num_special_chars,num_subdomains,has_https
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad,225,58,24,6,0
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad,81,1,12,5,0
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad,177,47,19,7,0
3,mail.printakid.com/www.online.americanexpress....,bad,60,0,8,6,0
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad,116,21,12,1,0


In [27]:
# Step 3: Feature Engineering (lexical features computed from the raw URL string)
# Number of characters in the URL.
df['url_length'] = df['URL'].map(len)
# Number of digit characters in the URL.
df['num_digits'] = df['URL'].map(lambda url: sum(map(str.isdigit, url)))
# Combined number of '.', '/', '-' and '_' characters.
df['num_special_chars'] = df['URL'].map(lambda url: sum(url.count(ch) for ch in './-_'))
# NOTE(review): this counts every '.' in the whole string (path included),
# so it is a dot count rather than a true subdomain count.
df['num_subdomains'] = df['URL'].map(lambda url: url.count('.'))
# NOTE(review): 1 when the substring 'https' occurs anywhere in the URL
# (case-insensitive), not only when it is the scheme.
df['has_https'] = df['URL'].map(lambda url: int('https' in url.lower()))

In [29]:
import whois
from datetime import datetime

def get_domain_age(url):
    """Return the age of ``url``'s domain in whole days, or 0 if it is unknown.

    The WHOIS record is fetched with ``whois.whois(url)`` and the age is the
    time elapsed since the record's ``creation_date``.

    Args:
        url: URL or domain name passed straight to ``whois.whois``.

    Returns:
        int: Days since the creation date, never negative. 0 is returned when
        the lookup fails or the record carries no usable creation date.

    Raises:
        KeyboardInterrupt, SystemExit: deliberately not swallowed, so a long
        run over many URLs can still be interrupted.
    """
    try:
        domain_info = whois.whois(url)
    except Exception:
        # Best-effort feature: any lookup failure counts as "age unknown".
        # `Exception` (rather than a bare `except:`) lets Ctrl-C through.
        return 0

    creation_date = getattr(domain_info, "creation_date", None)
    if isinstance(creation_date, (list, tuple)):
        # Take the first date if multiple exist; an empty list means unknown.
        creation_date = creation_date[0] if creation_date else None
    if not isinstance(creation_date, datetime):
        # Missing, or left as an unparsed value such as a string.
        return 0

    # Use a "now" with the same tzinfo as the record: subtracting a
    # timezone-aware datetime from a naive one raises TypeError, which
    # previously ended up as an age of 0.
    now = datetime.now(creation_date.tzinfo)
    # Clamp so a creation date in the future never yields a negative age.
    return max((now - creation_date).days, 0)

# One WHOIS lookup is issued per row of the URL column.
df["domain_age"] = df["URL"].map(get_domain_age)


In [31]:
# Replace any missing domain ages with 0.
df['domain_age'] = df['domain_age'].fillna(value=0)

In [32]:
# Preview the first five rows, now including the domain_age column.
df.head(n=5)

Unnamed: 0,URL,Label,url_length,num_digits,num_special_chars,num_subdomains,has_https,domain_age
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad,225,58,24,6,0,0
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad,81,1,12,5,0,0
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad,177,47,19,7,0,0
3,mail.printakid.com/www.online.americanexpress....,bad,60,0,8,6,0,0
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad,116,21,12,1,0,0


In [33]:
# Step 4: Convert Labels to Numerical
# LabelEncoder assigns integer codes in sorted class order, so 'bad'
# (malicious) becomes 0 and the remaining class (presumably 'good', i.e.
# legitimate) becomes 1. Inspect `label_encoder.classes_` to confirm:
# position i in that array is the original label for code i.
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Label'])  # 0 = 'bad' (malicious), 1 = the other class

In [34]:
# Step 5: Select Features & Labels
# Feature matrix: the five lexical features plus the WHOIS-based domain age.
X = df.loc[
    :,
    [
        'url_length',
        'num_digits',
        'num_special_chars',
        'num_subdomains',
        'has_https',
        'domain_age',
    ],
]
# Target vector: the integer-encoded label column.
y = df.loc[:, 'Label']

In [35]:
# Step 6: Train-Test Split
# 80/20 split, stratified on the label so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [36]:
# Step 7: Apply SMOTE for Class Balancing
# Only the training split is oversampled; the test split is left untouched.
# NOTE(review): resampling happens on the unscaled features, because
# standardisation is applied in the following step.
smote = SMOTE(random_state=42, sampling_strategy='auto')
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [37]:
# Step 8: Standardize Features
# The scaler is fitted on the resampled training data only and then reused
# for the test split.
# NOTE(review): X_train_resampled and X_test are overwritten with their
# scaled versions, so running this cell twice scales the data twice.
scaler = StandardScaler().fit(X_train_resampled)
X_train_resampled = scaler.transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Convert Data to PyTorch Tensors (float32 features, int64 class labels)
X_train_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train_resampled, X_test)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(labels.values, dtype=torch.long)
    for labels in (y_train_resampled, y_test)
)

In [38]:
# Step 9: Define MAML-Based Model
class MAMLModel(nn.Module):
    """Fully connected classifier: input_size -> 64 -> 32 -> 2.

    Both hidden layers are followed by a ReLU; the final layer returns the
    two raw class scores without any activation.
    """

    def __init__(self, input_size):
        """Create the three linear layers for ``input_size`` input features."""
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.fc3 = nn.Linear(in_features=32, out_features=2)  # Output layer (Binary Classification)

    def forward(self, x):
        """Return the two class scores for every row of ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = layer(hidden).relu()
        return self.fc3(hidden)

In [39]:
# Step 10: Initialize Model & MAML Algorithm
# Seed torch before the layers are created so the initial weights, and
# therefore the training run, are repeatable. 42 matches the random_state
# already used for the train/test split and SMOTE.
torch.manual_seed(42)

input_size = X_train_tensor.shape[1]  # number of feature columns
model = MAMLModel(input_size)
# Wrap the network in learn2learn's MAML. `lr` here is the wrapper's
# adaptation learning rate; first_order=False keeps second-order gradients.
maml = l2l.algorithms.MAML(model, lr=0.001, first_order=False)
# Adam updates the wrapped model's parameters from the training loss.
optimizer = optim.Adam(maml.parameters(), lr=0.001)
loss_function = nn.CrossEntropyLoss()

In [40]:
# Step 11: Train the MAML Model
epochs = 1000
for epoch in range(epochs):
    # Forward pass through a fresh clone of the wrapped model on the whole
    # training set. No learner.adapt() call is made, so every iteration is a
    # single full-batch gradient step on the shared parameters.
    learner = maml.clone()
    predictions = learner(X_train_tensor)
    loss = loss_function(predictions, y_train_tensor)

    # Clear old gradients, backpropagate through the clone, update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the loss every 100th epoch (0, 100, ..., 900).
    if not epoch % 100:
        print(f"Epoch {epoch}/{epochs}, Loss: {loss.item()}")

Epoch 0/1000, Loss: 0.6949577927589417
Epoch 100/1000, Loss: 0.5836813449859619
Epoch 200/1000, Loss: 0.5506035685539246
Epoch 300/1000, Loss: 0.5351392030715942
Epoch 400/1000, Loss: 0.5247899889945984
Epoch 500/1000, Loss: 0.5168629884719849
Epoch 600/1000, Loss: 0.5111380815505981
Epoch 700/1000, Loss: 0.5070642232894897
Epoch 800/1000, Loss: 0.5044757127761841
Epoch 900/1000, Loss: 0.5026086568832397


In [42]:
# Step 12: Evaluate the Model
# Gradients are not needed for inference. The predicted class is the index
# of the larger of the two output scores.
with torch.no_grad():
    test_predictions = model(X_test_tensor).argmax(dim=1).numpy()

In [43]:
# Step 13: Print Model Performance
# Classes 0 and 1 in the report are the integer codes stored in the Label column.
accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)
print(f"Accuracy: {accuracy:.4f}")

print(
    "Classification Report:",
    classification_report(y_true=y_test, y_pred=test_predictions),
    sep="\n",
)

Accuracy: 0.7680
Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.64      0.61     31285
           1       0.85      0.82      0.83     78585

    accuracy                           0.77    109870
   macro avg       0.72      0.73      0.72    109870
weighted avg       0.78      0.77      0.77    109870

