In [None]:
import pandas as pd
import torch
import nltk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Fetch the NLTK resources used by the preprocessing step: the stop-word
# corpus and the 'punkt' tokenizer data that nltk.word_tokenize relies on.
for _nltk_resource in ('stopwords', 'punkt'):
    nltk.download(_nltk_resource)

# Only the English stop-word list is loaded; it is kept as a set so that
# membership tests during preprocessing are O(1).
stop_words = set(nltk.corpus.stopwords.words('english'))

# Preprocessing: lowercase, tokenize, drop non-alphanumeric tokens and English stop words
def preprocess_text(text):
    """Normalize raw text into a space-separated string of kept tokens.

    The text is lowercased and split with ``nltk.word_tokenize``. A token is
    kept only when it is purely alphanumeric (``str.isalnum``) and is not in
    the module-level ``stop_words`` set, which holds English stop words only.

    Args:
        text: The string to clean; it must provide ``.lower()``.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty
        string).
    """
    lowered = text.lower()
    candidates = nltk.word_tokenize(lowered)
    return ' '.join(
        token
        for token in candidates
        if token.isalnum() and token not in stop_words
    )

# Read the three source CSVs (in this order) and stack them row-wise into one
# frame with a fresh 0..n-1 index. Columns that a file lacks are filled with
# NaN by the concatenation.
data1, data2, data3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/ishikaupadhyay/Desktop/cyber project/dataset_phishing.csv',
        '/Users/ishikaupadhyay/Desktop/cyber project/emails.csv',
        '/Users/ishikaupadhyay/Desktop/cyber project/phishing_Legitimate_full.csv',
    )
)

combined_data = pd.concat([data1, data2, data3], ignore_index=True)

# Check and clean missing values. Rows that came from a file without one of
# these columns have NaN there after the concat; they become '' / 'unknown'.
combined_data['message'] = combined_data['message'].fillna('')
combined_data['url'] = combined_data['url'].fillna('')
combined_data['status'] = combined_data['status'].fillna('unknown')

# Map the textual statuses to class ids; any other value (including the
# 'unknown' placeholder) passes through unchanged and is filtered out below.
status_labels = combined_data['status'].replace({'legitimate': 0, 'phishing': 1})

# Drop rows without a usable label *before* building the text column: the
# NLTK preprocessing is by far the most expensive step here, so there is no
# point running it on rows that are discarded anyway. The .copy() makes the
# filtered frame independent of the original so the column assignments below
# do not write into a slice.
has_label = status_labels.isin([0, 1])
combined_data = combined_data[has_label].copy()

# Prepare inputs and labels. Each model input combines the cleaned email body
# with the raw URL in a fixed "Email: ... URL: ..." template.
combined_data['text'] = [
    f"Email: {preprocess_text(message)} URL: {url}"
    for message, url in zip(combined_data['message'], combined_data['url'])
]
# Cast explicitly: the replaced column has a mixed/object dtype whenever any
# non-mapped status was present, and the downstream code expects integer ids.
combined_data['label'] = status_labels[has_label].astype(int)

# Train-test split: 80/20, reproducible via the fixed seed, and stratified on
# the label so both splits keep the same class ratio.
train_df, test_df = train_test_split(
    combined_data[['text', 'label']],
    test_size=0.2,
    random_state=42,
    stratify=combined_data['label'],
)

# Convert to Hugging Face Dataset. After filtering and shuffling, the frames
# carry a non-trivial pandas index; preserve_index=False keeps it from being
# stored as an extra "__index_level_0__" feature next to text/label.
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "test": Dataset.from_pandas(test_df, preserve_index=False),
})

# Tokenizer and model setup (smaller multilingual model)
# The tokenizer and the classifier are loaded from the same checkpoint name.
# num_labels=2 matches the two label ids used in this script
# (0 = legitimate, 1 = phishing).
# NOTE(review): this is a *cased* checkpoint, while preprocess_text lowercases
# the email body before it reaches the tokenizer — confirm that is intended.
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the module-level tokenizer.

    Every text is truncated and padded to a fixed length of 256 tokens.

    Args:
        examples: Mapping with a ``"text"`` entry, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever the tokenizer returns for the given texts.
    """
    texts = examples["text"]
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 256,
    }
    return tokenizer(texts, **encoding_options)

# Tokenize both splits in batches, then drop the raw text column and rename
# "label" to "labels" before switching the output format to torch tensors.
tokenized_datasets = (
    dataset.map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")

# Metrics function
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        Dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``. Precision, recall and F1 use ``average="binary"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

# Training arguments
# Evaluation and checkpointing both happen once per epoch, and the best
# checkpoint is reloaded when training ends (load_best_model_at_end=True).
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",  # checkpoints are written here
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,  
    per_device_train_batch_size=16, 
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,  # log every 50 optimizer steps
    save_total_limit=1,  # cap on the number of checkpoints kept on disk
    load_best_model_at_end=True,
)

# Trainer setup
# NOTE(review): the "test" split is used as the evaluation set, so the
# per-epoch metrics (and, with load_best_model_at_end=True, the choice of the
# best checkpoint) are computed on the same data that serves as the held-out
# test set — consider a separate validation split.
# NOTE(review): newer transformers releases deprecate `tokenizer=` in favor of
# `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Training
print("Starting model training...")
# Run the fine-tuning loop. Without this call the model saved below is only
# the pretrained checkpoint with an untrained classification head, and the
# "completed" message would be misleading.
trainer.train()
print("Model training completed.")

# Save the fine-tuned model together with its tokenizer so the directory can
# be reloaded with from_pretrained().
model.save_pretrained("./distilbert_phishing_model")
tokenizer.save_pretrained("./distilbert_phishing_model")

# Prediction function with multilingual support
def classify_email(email, url, threshold=0.5):
    """Classify one email/URL pair as ``"Phishing"`` or ``"Legitimate"``.

    The input is formatted with the same "Email: ... URL: ..." template used
    for the training texts, tokenized to a fixed length of 256, and scored by
    the module-level ``model``.

    Args:
        email: Raw email body; it is cleaned with ``preprocess_text``.
        url: URL associated with the email, inserted as-is.
        threshold: The result is ``"Phishing"`` only when the softmax
            probability of class 1 is strictly greater than this value.

    Returns:
        ``"Phishing"`` or ``"Legitimate"``.
    """
    input_text = f"Email: {preprocess_text(email)} URL: {url}"
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=256,
    ).to(model.device)

    # Switch to inference mode so dropout is disabled (the model can be left
    # in training mode after fine-tuning, which makes predictions
    # non-deterministic), and skip autograd bookkeeping that is not needed
    # for a forward-only pass.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # Index 1 is the phishing class (labels: legitimate=0, phishing=1).
    phishing_probability = torch.softmax(logits, dim=1)[0, 1].item()
    return "Phishing" if phishing_probability > threshold else "Legitimate"


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ishikaupadhyay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ishikaupadhyay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Spot checks: run classify_email on hand-written (email, url) pairs in
# several languages and print the predicted class. The class each pair is
# meant to represent is stated in the comment above it.

# Example 6: Phishing attempt in Spanish
email6 = "¡Urgente! Su cuenta bancaria ha sido bloqueada. Haga clic en el enlace para verificar su identidad y desbloquear su cuenta inmediatamente."
url6 = "http://banco-seguro-verificacion.com/desbloquear"
result6 = classify_email(email6, url6)
print(f"Example 6 classification: {result6}")

# Example 7: Legitimate email in German
email7 = "Sehr geehrter Kunde, vielen Dank für Ihren Einkauf bei uns. Ihre Bestellung wurde versandt und wird in 3-5 Werktagen geliefert."
url7 = "https://www.echterversand.de/sendungsverfolgung"
result7 = classify_email(email7, url7)
print(f"Example 7 classification: {result7}")

# Example 8: Phishing attempt in Chinese
email8 = "紧急通知：您的账户安全受到威胁。请立即点击以下链接更新您的密码和个人信息，以确保账户安全。"
url8 = "http://secure-account-update.cn/login"
result8 = classify_email(email8, url8)
print(f"Example 8 classification: {result8}")

# Example 9: Legitimate email in Russian
email9 = "Уважаемый клиент, ваш заказ успешно обработан и отправлен. Вы можете отследить вашу посылку, перейдя по ссылке ниже."
url9 = "https://www.pochta.ru/tracking"
result9 = classify_email(email9, url9)
print(f"Example 9 classification: {result9}")

# Example 10: Phishing attempt in Arabic
email10 = "تحذير عاجل: تم اختراق حسابك. انقر على الرابط أدناه لتأكيد هويتك وتأمين حسابك على الفور!"
url10 = "http://secure-account-verify.ae/login"
result10 = classify_email(email10, url10)
print(f"Example 10 classification: {result10}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

def test_language_accuracy(model, tokenizer, language_samples):
    """Measure per-language classification accuracy of ``classify_email``.

    Args:
        model: Unused here; predictions come from ``classify_email``, which
            reads the module-level model. Kept for interface compatibility.
        tokenizer: Unused here, for the same reason as ``model``.
        language_samples: Mapping of language name to a sequence of
            ``(email, url, true_label)`` triples. ``true_label`` is compared
            case-insensitively with the string returned by
            ``classify_email``.

    Returns:
        Dict mapping each language name to the fraction of its samples that
        were classified correctly. A language with no samples is reported as
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    results = {}
    for language, samples in tqdm(language_samples.items(), desc="Testing languages"):
        total = len(samples)
        if total == 0:
            # Nothing to score; avoid dividing by zero.
            results[language] = 0.0
            continue
        correct = sum(
            classify_email(email, url).lower() == true_label.lower()
            for email, url, true_label in samples
        )
        results[language] = correct / total
    return results

# Prepare test samples for different languages
# Each language gets two hand-written (email, url) pairs, named
# email_<lang><n> / url_<lang><n>. No expected label is attached to the pairs
# at this point.
# English
email_en1 = "Congratulations! You've won a free iPhone. Click here to claim your prize now!"
url_en1 = "http://free-iphone-giveaway.com/claim"
email_en2 = "Your package has been delayed. Please verify your address by clicking this link."
url_en2 = "http://shipping-update.net/verify"

# Spanish
email_es1 = "¡Alerta de seguridad! Su cuenta de PayPal ha sido suspendida. Haga clic aquí para reactivarla."
url_es1 = "http://paypal-cuenta-segura.es/reactivar"
email_es2 = "Felicidades, has ganado un viaje gratis. Confirma tus datos para reclamar tu premio."
url_es2 = "http://viajes-gratis-ganador.es/confirmar"

# German
email_de1 = "Wichtige Mitteilung: Ihre Kreditkarte wurde gesperrt. Klicken Sie hier, um sie zu entsperren."
url_de1 = "http://kreditkarte-entsperren.de/verifizieren"
email_de2 = "Exklusive Rabatte nur für Sie! Melden Sie sich jetzt an, um 70% Rabatt zu erhalten."
url_de2 = "http://super-rabatte-heute.de/anmelden"

# French
email_fr1 = "Votre compte Netflix a été suspendu. Cliquez ici pour mettre à jour vos informations de paiement."
url_fr1 = "http://netflix-compte-reactiver.fr/paiement"
email_fr2 = "Offre exclusive : Doublez vos points de fidélité en mettant à jour vos préférences maintenant."
url_fr2 = "http://points-fidelite-bonus.fr/mettre-a-jour"

# Chinese
email_zh1 = "紧急通知：您的银行账户已被冻结。请立即点击此处验证您的身份。"
url_zh1 = "http://银行账户验证.cn/解冻"
email_zh2 = "恭喜您获得独家优惠！点击此处领取您的专属折扣码。"
url_zh2 = "http://特别优惠.cn/领取"

# Russian
email_ru1 = "Внимание! Ваш аккаунт социальной сети заблокирован. Нажмите здесь, чтобы восстановить доступ."
url_ru1 = "http://восстановить-аккаунт.ru/разблокировать"
email_ru2 = "Специальное предложение: получите бесплатную консультацию по инвестициям. Регистрируйтесь сейчас!"
url_ru2 = "http://инвестиции-консультация.ru/регистрация"


# Group the samples defined above by language, as the
# (email, url, true_label) triples that test_language_accuracy iterates over.
# Labels use the same strings classify_email returns.
# NOTE(review): every sample above reads like a phishing/scam lure, so all are
# labelled "Phishing" — confirm. With only positive samples, the reported
# "accuracy" is effectively recall on the phishing class.
language_samples = {
    "English": [
        (email_en1, url_en1, "Phishing"),
        (email_en2, url_en2, "Phishing"),
    ],
    "Spanish": [
        (email_es1, url_es1, "Phishing"),
        (email_es2, url_es2, "Phishing"),
    ],
    "German": [
        (email_de1, url_de1, "Phishing"),
        (email_de2, url_de2, "Phishing"),
    ],
    "French": [
        (email_fr1, url_fr1, "Phishing"),
        (email_fr2, url_fr2, "Phishing"),
    ],
    "Chinese": [
        (email_zh1, url_zh1, "Phishing"),
        (email_zh2, url_zh2, "Phishing"),
    ],
    "Russian": [
        (email_ru1, url_ru1, "Phishing"),
        (email_ru2, url_ru2, "Phishing"),
    ],
}

# Test the model's accuracy for each language
accuracy_results = test_language_accuracy(model, tokenizer, language_samples)

# Bar chart of per-language accuracy, one bar per key of accuracy_results.
languages = [*accuracy_results.keys()]
accuracies = [*accuracy_results.values()]

plt.figure(figsize=(12, 6))
bars = plt.bar(languages, accuracies, color='skyblue')
plt.title('Phishing Detection Accuracy Across Languages')
plt.xlabel('Languages')
plt.ylabel('Accuracy')
plt.ylim(0, 1)  # accuracy is a fraction, so pin the y-axis to [0, 1]

# Annotate each bar with its value, centred horizontally just above the top.
for bar in bars:
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width()/2.
    plt.text(x_center, height, f'{height:.2f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()