In [None]:
import pandas as pd
import torch
import nltk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Download necessary NLTK data.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases
# load the tokenizer tables used by word_tokenize from 'punkt_tab', while
# older releases keep reading 'punkt'. Requesting both keeps the notebook
# runnable on either.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# English-only stop-word set consulted by preprocess_text.
stop_words = set(nltk.corpus.stopwords.words('english'))

# Preprocessing function (NOTE: stop-word removal is English-only)
def preprocess_text(text):
    """Lower-case, tokenize and filter *text*, returning a space-joined string.

    The text is lower-cased and split with ``nltk.word_tokenize``; tokens are
    kept only when they are purely alphanumeric and not in the module-level
    ``stop_words`` set. That set holds English stop words only, so text in
    other languages is tokenized but loses just the words that happen to
    match English stop words.

    Args:
        text: The raw text. ``None`` and float NaN (as pandas uses for missing
            cells) are treated as empty; any other non-string value is
            converted with ``str()``.

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when the
        input is missing, empty or whitespace-only.
    """
    # Missing cells arrive as None or NaN; NaN is the only float that is
    # not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Nothing to tokenize, so skip the tokenizer entirely.
    if not text.strip():
        return ''

    tokens = nltk.word_tokenize(text.lower())
    return ' '.join(
        word for word in tokens
        if word.isalnum() and word not in stop_words
    )

# Read the three source CSVs, in this order, and stack them into one frame
csv_paths = [
    '/Users/ishikaupadhyay/Desktop/advanced phishing detection/dataset_phishing.csv',
    '/Users/ishikaupadhyay/Desktop/advanced phishing detection/emails.csv',
    '/Users/ishikaupadhyay/Desktop/advanced phishing detection/phishing_Legitimate_full.csv',
]
data1, data2, data3 = (pd.read_csv(path) for path in csv_paths)

# ignore_index=True discards the per-file row indices in favour of a fresh one
combined_data = pd.concat([data1, data2, data3], ignore_index=True)

# Derive labels first so that rows without a usable status are dropped
# before the (comparatively expensive) text preprocessing runs on them.
label_map = {'legitimate': 0, 'phishing': 1}
combined_data['status'] = combined_data['status'].fillna('unknown')
# Values outside label_map pass through unchanged, mirroring what
# Series.replace did, without relying on its implicit dtype downcasting.
combined_data['label'] = combined_data['status'].map(lambda status: label_map.get(status, status))

# Filter unknown statuses if any; .copy() detaches the result so the column
# assignments below write to a real frame rather than a view.
combined_data = combined_data[combined_data['label'].isin([0, 1])].copy()
# Store labels as integers instead of a mixed object column.
combined_data['label'] = combined_data['label'].astype(int)

# Check and clean missing values in the text inputs
combined_data['message'] = combined_data['message'].fillna('')
combined_data['url'] = combined_data['url'].fillna('')

# Prepare model inputs: preprocessed email body followed by the raw URL
combined_data['text'] = [
    f"Email: {preprocess_text(message)} URL: {url}"
    for message, url in zip(combined_data['message'], combined_data['url'])
]

# Hold out 20% of the rows for testing, stratified on the label, with a
# fixed seed so the split is reproducible
train_df, test_df = train_test_split(
    combined_data[['text', 'label']],
    test_size=0.2,
    random_state=42,
    stratify=combined_data['label'],
)

# Wrap both frames as Hugging Face datasets under their split names
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (("train", train_df), ("test", test_df))
})

# Tokenizer and model setup (smaller multilingual model)
# NOTE(review): this is a *cased* checkpoint, while preprocess_text lower-cases
# the email text before it reaches the tokenizer — confirm that is intended.
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=2 matches the two label values used in this notebook
# (0 = legitimate, 1 = phishing).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of *examples* with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens. Whatever
    the tokenizer call returns is handed back unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=256,
    )
    return encoded

# Tokenize both splits in batches, drop the raw text column and rename the
# label column to "labels"
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
# Switch the output format to torch (in place)
tokenized_datasets.set_format("torch")

# Metrics function
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    *pred* must expose ``label_ids`` (the true labels) and ``predictions``
    (per-class scores). The predicted class is the argmax over the last
    axis. Precision, recall and F1 use ``average="binary"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prf = precision_recall_fscore_support(y_true, y_pred, average="binary")
    accuracy = accuracy_score(y_true, y_pred)
    # prf is (precision, recall, f1, support)
    return {
        "accuracy": accuracy,
        "f1": prf[2],
        "precision": prf[0],
        "recall": prf[1],
    }

# Training arguments
# Evaluation and checkpointing both run once per epoch, and the best
# checkpoint is reloaded when training ends.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
# NOTE(review): no `metric_for_best_model` is given, so "best" is whatever the
# library defaults to — presumably evaluation loss; verify.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,  # learning rate for fine-tuning
    per_device_train_batch_size=16,  # evaluation batch size is left at the library default
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=1,
    load_best_model_at_end=True,
)

# Trainer setup
# NOTE(review): the "test" split is passed as eval_dataset, and training_args
# sets load_best_model_at_end=True, so the same split is used both to select
# the checkpoint and to report final metrics. A separate validation split
# would keep the reported test numbers unbiased.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning, bracketed by progress messages
print("Starting model training...")
trainer.train()
print("Model training completed.")

# Write the fine-tuned model, then the tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./distilbert_phishing_model")

# Evaluate the model on the test set.
# A single predict() pass returns the raw predictions, the true labels and
# the metrics from compute_metrics, so the test set is only run through the
# model once. metric_key_prefix="eval" keeps the metric names identical to
# those returned by trainer.evaluate() ("eval_accuracy", "eval_f1", ...).
predictions = trainer.predict(tokenized_datasets["test"], metric_key_prefix="eval")
test_results = predictions.metrics
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"F1 Score: {test_results['eval_f1']:.4f}")

# Predicted and true labels for the confusion matrix
predicted_labels = predictions.predictions.argmax(-1)
true_labels = predictions.label_ids

# Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 in label order, so it always lines up
# with the two tick labels below, even if one class is absent from both the
# true and the predicted labels.
class_names = ["Legitimate", "Phishing"]
cm = confusion_matrix(true_labels, predicted_labels, labels=[0, 1])
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

# Prediction function (uses the same preprocess_text as training, so its
# English-only stop-word filtering applies to every language)
def classify_email(email, url, threshold=0.5):
    """Classify one email/URL pair as ``"Phishing"`` or ``"Legitimate"``.

    The input is formatted exactly like the training text
    (``"Email: <preprocessed body> URL: <url>"``), tokenized to 256 tokens
    and scored by the module-level ``model``.

    Args:
        email: The email body; it is passed through ``preprocess_text``.
        url: The URL associated with the email, inserted verbatim.
        threshold: The phishing-class probability must be strictly greater
            than this value for the result to be ``"Phishing"``.

    Returns:
        ``"Phishing"`` or ``"Legitimate"``.

    Note:
        The module-level model is put into evaluation mode as a side effect.
    """
    input_text = f"Email: {preprocess_text(email)} URL: {url}"
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=256,
    ).to(model.device)

    # Disable dropout so repeated calls on the same input give the same
    # answer, whatever mode the model was left in.
    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    probabilities = torch.softmax(outputs.logits, dim=1)
    # Index 1 is the phishing class (label 1 in training).
    phishing_probability = probabilities[0][1].item()
    return "Phishing" if phishing_probability > threshold else "Legitimate"


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ishikaupadhyay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ishikaupadhyay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Hand-written smoke-test inputs, grouped by language. Each entry carries the
# "email" body and "url" that are passed to classify_email.
# French and Italian contain a single entry each.
examples = {
    "English": [
        {"email": "Dear customer, your order has been shipped.", "url": "https://www.legitstore.com/track"},
        {"email": "Urgent: Account suspended. Verify now!", "url": "http://phishing-scam.com/login"},
    ],
    "Spanish": [
        {"email": "Estimado cliente, su pedido ha sido enviado.", "url": "https://www.tiendalegitima.es/rastreo"},
        {"email": "¡Alerta! Su cuenta está bloqueada. Verifique ahora.", "url": "http://phishing-estafa.es/login"},
    ],
    "German": [
        {"email": "Sehr geehrter Kunde, Ihre Bestellung wurde versandt.", "url": "https://www.echterladen.de/verfolgung"},
        {"email": "Achtung: Ihr Konto wurde gesperrt. Jetzt verifizieren!", "url": "http://phishing-betrug.de/einloggen"},
    ],
    "French": [
        {"email": "Urgent : Compte suspendu. Vérifiez maintenant !", "url": "http://phishing-arnaque.fr/login"},
    ],
    "Italian": [
        {"email": "Urgente: Account sospeso. Verifica ora!", "url": "http://phishing-truffa.it/accesso"},
    ],
}


# Classify every example, language by language, and print the verdicts
for language, emails in examples.items():
    print(f"Testing {language} examples:")
    for example in emails:
        email, url = example["email"], example["url"]
        result = classify_email(email, url)
        print(f"Email: {email}\nURL: {url}\nResult: {result}\n")
