In [None]:
import re
import pandas as pd
import torch
from transformers import BertTokenizer

# Tokenizer for the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Where the URL dataset lives on disk
file_path = "./malicious_phish_1000.csv"

# Read the CSV into a DataFrame
df = pd.read_csv(file_path)

# Show the header so the file layout can be checked by eye
print("Column Names in CSV:", df.columns.tolist())

# The rest of the pipeline reads both 'url' and 'type'; stop early if either is absent
if not {"url", "type"}.issubset(df.columns):
    raise ValueError(f"CSV file must contain 'url' and 'type' columns. Found: {df.columns.tolist()}")

# Convert 'type' column into numerical labels
label_mapping = {
    "benign": 0,
    "phishing": 1,
    "defacement": 2,
    "malware": 3
}
df["label"] = df["type"].map(label_mapping)

# Drop rows that cannot be used for training:
#  - a 'type' value outside label_mapping maps to NaN in 'label';
#  - a missing 'url' would otherwise become the literal string "nan" when
#    cast to str below and be tokenized as if it were a real URL.
rows_before = len(df)
df = df.dropna(subset=["url", "label"])
rows_dropped = rows_before - len(df)
if rows_dropped:
    # Make the data loss visible instead of discarding rows silently
    print(f"Dropped {rows_dropped} rows with a missing URL or unrecognised type")

# Extract URLs and Labels
urls = df["url"].astype(str).tolist()  # Convert to string
labels = df["label"].astype(int).tolist()  # map() yields floats once NaN is involved; cast back to int

# Function to clean and tokenize URLs
def preprocess_url(url):
    """Normalise a URL string into space-separated, lower-case text.

    Every occurrence of an ``http://`` or ``https://`` prefix (optionally
    followed by ``www.``) is removed, then each ``.``, ``/``, ``-``, ``_`` and
    ``=`` is replaced by a single space. The result is lower-cased and
    stripped of leading/trailing whitespace.

    Args:
        url: The URL as a string.

    Returns:
        The cleaned string.
    """
    # URL schemes and host names are case-insensitive, so match "HTTP://WWW."
    # as well; a case-sensitive pattern would leave it in and it would end up
    # in the output as "http:  www".
    url = re.sub(r"https?://(www\.)?", "", url, flags=re.IGNORECASE)  # Remove protocol and www
    url = re.sub(r"[\./\-_=]", " ", url)  # Replace special characters
    return url.lower().strip()

# Function to tokenize URLs for BERT
def tokenize_urls(urls, max_length=32):
    """Clean each URL with ``preprocess_url`` and run the batch through ``tokenizer``.

    Args:
        urls: Iterable of URL strings.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Whatever ``tokenizer`` returns for the cleaned batch when asked for
        PyTorch tensors (``return_tensors="pt"``).
    """
    cleaned = list(map(preprocess_url, urls))
    return tokenizer(
        cleaned,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

# Run every URL through the cleaning + tokenization helper
tokens = tokenize_urls(urls)

# Wrap the integer labels in a PyTorch tensor
labels_tensor = torch.tensor(labels)

# Show what was produced
print(f"Tokenized Input IDs: {tokens['input_ids']!s}")
print(f"Attention Mask: {tokens['attention_mask']!s}")
print(f"Labels Tensor: {labels_tensor!s}")


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report

# Define Custom Dataset Class
class URLDataset(Dataset):
    """Dataset pairing tokenized URLs with their class labels.

    Holds the three sequences it is given and serves one sample per index as
    a dict with the keys ``input_ids``, ``attention_mask`` and ``label``.
    """

    def __init__(self, input_ids, attention_mask, labels):
        """Keep references to the token ids, attention masks and labels."""
        self.input_ids, self.attention_mask, self.labels = input_ids, attention_mask, labels

    def __len__(self):
        """Return the number of samples, taken from the length of ``labels``."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of its three fields."""
        return dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_mask[idx],
            label=self.labels[idx],
        )

# Seed torch's global RNG and use a dedicated seeded generator for the split,
# so the train/validation partition is the same on every run.
SEED = 42
torch.manual_seed(SEED)

# Create Dataset
dataset = URLDataset(tokens["input_ids"], tokens["attention_mask"], labels_tensor)

# Split into Train & Validation sets (80% Train, 20% Validation)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size  # remainder, so the two sizes always sum to len(dataset)
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SEED),
)

# Create DataLoaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Load Pretrained BERT Model for Classification
# num_labels=4 matches the four classes in label_mapping
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Training Loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Named batch_labels (not `labels`) so the full label list built in
        # the preprocessing cell is not overwritten by a single batch tensor.
        batch_labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, batch_labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}")

# Evaluation
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():  # no gradients are needed for inference
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().tolist())
        # The labels are only compared after the loop, so they are read
        # straight from the batch instead of being moved to the device and
        # back. This also avoids overwriting the `labels` list from the
        # preprocessing cell.
        all_labels.extend(batch["label"].tolist())

# Print Classification Report
class_names = ["benign", "phishing", "defacement", "malware"]
print("Validation Accuracy:", accuracy_score(all_labels, all_preds))
# Pass the label ids explicitly: without `labels=`, classification_report
# infers the classes from the data and raises ValueError when a class is
# missing from both the validation labels and the predictions, because the
# number of target_names no longer matches.
print(classification_report(
    all_labels,
    all_preds,
    labels=list(range(len(class_names))),
    target_names=class_names,
    zero_division=0,  # report 0 (without a warning) for classes that were never predicted
))


In [6]:
#Code to pick top 1000 from a shuffled csv file
import pandas as pd

# NOTE: this cell writes ./malicious_phish_1000.csv, which is the file the
# preprocessing cell reads, so it has to be run before that cell when the
# subset file does not exist yet.

# Number of rows to keep in the subset
SUBSET_SIZE = 1000

# Load your CSV file into a DataFrame
file_path = "./malicious_phish.csv"  # Replace with your file path
df = pd.read_csv(file_path)

# Shuffle the rows of the DataFrame (fixed random_state keeps the subset reproducible)
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Select the first SUBSET_SIZE rows; head() returns fewer when the file is shorter
df_subset = df_shuffled.head(SUBSET_SIZE)

# Save the selected subset to a new CSV file
output_path = "./malicious_phish_1000.csv"
df_subset.to_csv(output_path, index=False)

# Report the number of rows actually written rather than the requested size
print(f"Shuffled and selected {len(df_subset)} rows. Saved to {output_path}")

Shuffled and selected 1000 rows. Saved to ./malicious_phish_1000.csv
