In [40]:
# Smoke test: confirms the kernel is alive and executing cells.
print("Hello world")

Hello world


In [41]:
import torch
# Report the installed PyTorch build and which accelerator (if any) is visible.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    device_label = torch.cuda.get_device_name(0)
else:
    device_label = "CPU"

print("PyTorch version:", torch.__version__)
print("CUDA available:", cuda_ready)
print("Device:", device_label)

PyTorch version: 2.5.1+cu121
CUDA available: True
Device: NVIDIA GeForce RTX 4050 Laptop GPU


In [42]:
import pandas as pd
import numpy as np

In [43]:
# Location of the full feature table on the author's machine.
# NOTE(review): the recorded output shows pandas emitting a warning for this
# call (presumably a mixed-dtype warning) -- confirm, and consider passing an
# explicit dtype. The absolute path is machine-specific.
dataset_csv_path = "C:\\Users\\meetj\\OneDrive\\Desktop\\API_Project\\dataset\\dataset_full.csv"
df = pd.read_csv(dataset_csv_path)


  df = pd.read_csv("C:\\Users\\meetj\\OneDrive\\Desktop\\API_Project\\dataset\\dataset_full.csv")


In [44]:
# Quick look at the raw table: dimensions, leading column names, null counts.
first_ten_columns = df.columns.tolist()[:10]
missing_per_column = df.isnull().sum()

print(f"Original shape: {df.shape}")
print(f"Columns (first 10): {first_ten_columns}")
print(f"Missing values:\n{missing_per_column}")

Original shape: (88647, 112)
Columns (first 10): ['qty_dot_url', 'qty_hyphen_url', 'qty_underline_url', 'qty_slash_url', 'qty_questionmark_url', 'qty_equal_url', 'qty_at_url', 'qty_and_url', 'qty_exclamation_url', 'qty_space_url']
Missing values:
qty_dot_url             0
qty_hyphen_url          0
qty_underline_url       0
qty_slash_url           0
qty_questionmark_url    0
                       ..
qty_redirects           0
url_google_index        0
domain_google_index     0
url_shortened           0
phishing                0
Length: 112, dtype: int64


In [45]:
def clean_qty_dot_url(val):
    """Coerce a raw ``qty_dot_url`` cell to a single numeric value.

    Non-string values are returned unchanged. A string is split on
    whitespace; every token that looks like a (possibly signed, possibly
    decimal) number is converted to ``float`` and the mean of those numbers
    is returned.

    Parameters
    ----------
    val : object
        The raw cell value.

    Returns
    -------
    object
        ``val`` itself when it is not a string, the mean of the numeric
        tokens when at least one token parses, otherwise ``0``.
    """
    if not isinstance(val, str):
        return val

    nums = []
    for token in val.split():
        # Cheap pre-filter: digits with at most one '.' and one '-' removed.
        if not token.replace('.', '', 1).replace('-', '', 1).isdigit():
            continue
        try:
            nums.append(float(token))
        except ValueError:
            # The pre-filter strips a '-' from anywhere in the token, so
            # things like "1-2" or "3-" get through but are not valid
            # floats. Skip them instead of aborting the whole column.
            continue

    return np.mean(nums) if nums else 0

# Normalise every cell of the qty_dot_url column through the cleaner above.
raw_dot_counts = df['qty_dot_url']
df['qty_dot_url'] = raw_dot_counts.apply(clean_qty_dot_url)

In [46]:
# Separate the label column from the feature matrix.
target_column = "phishing"
y = df[target_column]
X = df.drop(columns=[target_column])


In [47]:
# Any column that is not numeric is reported and then removed from X.
non_numeric_cols = list(X.select_dtypes(exclude=['number']).columns)
if len(non_numeric_cols) > 0:
    print(f"Dropping non-numeric columns: {non_numeric_cols}")
    X = X.select_dtypes(include=['number'])

In [48]:
print(f"Original target distribution:\n{y.value_counts()}")

# Keep only rows whose label is 0 or 1, renumber the index from zero so X
# and y stay aligned, and store the label as an integer.
valid_mask = y.isin([0, 1])
X = X.loc[valid_mask].reset_index(drop=True)
y = y.loc[valid_mask].reset_index(drop=True).astype(int)

print(f"\nCleaned shape: X={X.shape}, y={y.shape}")
print(f"Target distribution:\n{y.value_counts()}")

Original target distribution:
phishing
0    58000
1    30647
Name: count, dtype: int64

Cleaned shape: X=(88647, 111), y=(88647,)
Target distribution:
phishing
0    58000
1    30647
Name: count, dtype: int64


In [50]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# 80/20 hold-out split, stratified on the label, with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [52]:
import torch

# Pick the compute device up front: CUDA when available, otherwise the CPU.
device_kind = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_kind)
print("Device:", device)


Device: cuda


In [53]:
# Standardise features: statistics are learned on the training split only
# and then reused, unchanged, for the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Materialise everything as tensors on the chosen device
# (float32 features, int64 class labels).
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32, device=device)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32, device=device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long, device=device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long, device=device)

# Wrap the tensors in datasets and mini-batch loaders; only the training
# loader reshuffles between epochs.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=64)

In [54]:
class PhishingNet(nn.Module):
    """Feed-forward binary classifier over tabular features.

    Three hidden stages (input_dim -> 256 -> 128 -> 64), each being
    Linear -> BatchNorm1d -> ReLU -> Dropout(0.3), followed by a final
    Linear layer that emits two logits per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layers are registered as fc1/bn1, fc2/bn2, fc3/bn3 in that order,
        # so state_dict keys and the printed architecture are fixed.
        widths = (input_dim, 256, 128, 64)
        for stage, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"fc{stage}", nn.Linear(n_in, n_out))
            setattr(self, f"bn{stage}", nn.BatchNorm1d(n_out))
        self.fc4 = nn.Linear(64, 2)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return the two-class logits for a batch ``x``."""
        hidden_stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        for linear, norm in hidden_stages:
            x = self.dropout(F.relu(norm(linear(x))))
        return self.fc4(x)

In [55]:
# The network's input width is the number of scaled feature columns.
_, input_dim = X_train_scaled.shape
model = PhishingNet(input_dim)
model = model.to(device)
print(f"\nModel architecture:\n{model}")


Model architecture:
PhishingNet(
  (fc1): Linear(in_features=111, out_features=256, bias=True)
  (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=128, out_features=64, bias=True)
  (bn3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc4): Linear(in_features=64, out_features=2, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [56]:
# Cross-entropy over the two logits; Adam with light weight decay; the
# learning rate is halved once the monitored value stops decreasing for
# more than 5 consecutive scheduler steps.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    weight_decay=1e-5,
)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=5,
)

In [57]:
def train_epoch(model, loader, criterion, optimizer):
    """Run one optimisation pass over ``loader``.

    Puts ``model`` in training mode and, for every mini-batch, performs a
    forward pass, back-propagates ``criterion`` and takes one optimizer step.

    Returns
    -------
    tuple
        ``(mean of the per-batch losses, accuracy in percent)``. An empty
        loader results in a division by zero.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for features, targets in loader:
        optimizer.zero_grad()
        logits = model(features)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        n_seen += targets.size(0)
        # Predicted class = index of the largest logit in each row.
        predicted = logits.detach().argmax(dim=1)
        n_correct += int((predicted == targets).sum())

    return running_loss / len(loader), 100 * n_correct / n_seen

In [58]:
def evaluate(model, loader, criterion):
    """Score ``model`` on ``loader`` without updating any weights.

    Switches ``model`` to eval mode and iterates the loader with gradient
    tracking disabled.

    Returns
    -------
    tuple
        ``(mean of the per-batch losses, accuracy in percent)``. An empty
        loader results in a division by zero.
    """
    model.eval()
    loss_sum = 0
    hits = 0
    seen = 0

    with torch.no_grad():
        for features, targets in loader:
            logits = model(features)
            loss_sum += criterion(logits, targets).item()
            seen += targets.size(0)
            # Predicted class = index of the largest logit in each row.
            hits += int((logits.argmax(dim=1) == targets).sum())

    return loss_sum / len(loader), 100 * hits / seen

In [59]:
# Train for a fixed number of epochs, checkpointing whenever the evaluation
# accuracy improves and logging every tenth epoch.
# NOTE(review): the "validation" numbers come from test_loader, and they drive
# both the LR schedule and checkpoint selection, so the best accuracy reported
# below is chosen on the same data it is measured on. Consider a separate
# validation split.
num_epochs = 50
best_val_acc = 0

print("\nTraining started...")
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    val_loss, val_acc = evaluate(model, test_loader, criterion)

    # Let the plateau scheduler react to this epoch's evaluation loss.
    scheduler.step(val_loss)

    improved = val_acc > best_val_acc
    if improved:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'best_phishing_model.pth')

    if (epoch + 1) % 10 != 0:
        continue
    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
    print(f"  Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

print(f"\nBest validation accuracy: {best_val_acc:.2f}%")


Training started...
Epoch [10/50]
  Train Loss: 0.1311, Train Acc: 94.95%
  Val Loss: 0.1188, Val Acc: 95.34%
Epoch [20/50]
  Train Loss: 0.1186, Train Acc: 95.44%
  Val Loss: 0.1197, Val Acc: 95.44%
Epoch [30/50]
  Train Loss: 0.1093, Train Acc: 95.77%
  Val Loss: 0.1071, Val Acc: 96.05%
Epoch [40/50]
  Train Loss: 0.1034, Train Acc: 96.08%
  Val Loss: 0.1045, Val Acc: 96.27%
Epoch [50/50]
  Train Loss: 0.1019, Train Acc: 96.15%
  Val Loss: 0.1178, Val Acc: 95.35%

Best validation accuracy: 96.33%


In [60]:
import joblib
# Export the fitted scaler and the model weights, both next to the notebook
# and into the API folder.
print("\nSaving scaler and model...")
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(scaler, 'C:/Users/meetj/OneDrive/Desktop/API_Project/API/scaler.pkl')

# `model` currently holds the weights from the LAST epoch, which is not
# necessarily the best one. The best-scoring weights were checkpointed to
# 'best_phishing_model.pth' by the training loop, so export that checkpoint
# instead of the live model. Fall back to the live weights only if no
# checkpoint file exists.
try:
    best_state_dict = torch.load('best_phishing_model.pth', weights_only=True)
except FileNotFoundError:
    print("! No best_phishing_model.pth checkpoint found; exporting current model weights")
    best_state_dict = model.state_dict()
torch.save(best_state_dict, 'C:/Users/meetj/OneDrive/Desktop/API_Project/API/best_phishing_model.pth')

print("✓ Scaler saved to scaler.pkl")
print("✓ Scaler saved to API/scaler.pkl")
print("✓ Model saved to best_phishing_model.pth")
print("✓ Model saved to API/best_phishing_model.pth")
print(f"✓ Number of features: {input_dim}")


Saving scaler and model...
✓ Scaler saved to scaler.pkl
✓ Scaler saved to API/scaler.pkl
✓ Model saved to best_phishing_model.pth
✓ Model saved to API/best_phishing_model.pth
✓ Number of features: 111
