In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# --- Data Preparation and Tokenization (Part 1) ---
print("Starting data preparation and tokenization...")

try:
    df_text = pd.read_csv("/kaggle/input/text-phishing-dataset-csv/text_Phishing_dataset.csv")
except FileNotFoundError:
    print("Error: The file 'phishing_email.csv' was not found. Please check the file path.")
    exit()

if 'Unnamed: 0' in df_text.columns:
    df_text = df_text.drop(columns=['Unnamed: 0'])

df_text.rename(columns={'Email Text': 'text', 'Email Type': 'labels'}, inplace=True)
df_text['labels'] = df_text['labels'].map({'Safe Email': 0, 'Phishing Email': 1})

df_text.dropna(subset=['text'], inplace=True)
df_text['text'] = df_text['text'].astype(str)

dataset = Dataset.from_pandas(df_text)
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

print("Data preparation complete. Starting model training...")

# --- Model Training and Evaluation (Part 2) ---
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

eval_results = trainer.evaluate()
print(eval_results)
print("Model training and evaluation complete.")

2025-08-09 16:22:40.899354: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754756561.255199      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754756561.360135      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Starting data preparation and tokenization...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/18634 [00:00<?, ? examples/s]

Data preparation complete. Starting model training...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.04822,0.981755,0.964865,0.98892,0.976744
2,0.120200,0.055921,0.982023,0.961771,0.993075,0.977172
3,0.036600,0.052471,0.983633,0.968814,0.989612,0.979102








{'eval_loss': 0.048219744116067886, 'eval_accuracy': 0.9817547625436007, 'eval_precision': 0.9648648648648649, 'eval_recall': 0.9889196675900277, 'eval_f1': 0.9767441860465117, 'eval_runtime': 73.4991, 'eval_samples_per_second': 50.708, 'eval_steps_per_second': 1.592, 'epoch': 3.0}
Model training and evaluation complete.


In [2]:
trainer.save_model("my_phishing_detector")

In [3]:
!ls /kaggle/working/my_phishing_detector

config.json  model.safetensors	training_args.bin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [4]:
# After trainer.train() and trainer.evaluate() have completed
# This saves the model weights and config
trainer.save_model("./my_phishing_detector")

# This is the new, crucial step: save the tokenizer
# The 'tokenizer' variable was defined earlier when you loaded it from 'bert-base-uncased'
tokenizer.save_pretrained("./my_phishing_detector")

('./my_phishing_detector/tokenizer_config.json',
 './my_phishing_detector/special_tokens_map.json',
 './my_phishing_detector/vocab.txt',
 './my_phishing_detector/added_tokens.json',
 './my_phishing_detector/tokenizer.json')

In [5]:
from transformers import pipeline

# Load the pipeline from the directory where you saved both the model and tokenizer
classifier = pipeline("text-classification", model="./my_phishing_detector")

# Example 1: A suspicious email
suspicious_email = "Click here to update your password. Your account has been suspended."
result_suspicious = classifier(suspicious_email)
print("Suspicious email result:", result_suspicious)

# Example 2: A safe email
safe_email = "Hello, I wanted to confirm our meeting for tomorrow. See you then."
result_safe = classifier(safe_email)
print("Safe email result:", result_safe)

Device set to use cuda:0


Suspicious email result: [{'label': 'LABEL_1', 'score': 0.9948375821113586}]
Safe email result: [{'label': 'LABEL_0', 'score': 0.9964616894721985}]


In [6]:
from transformers import pipeline

# This loads your trained model and tokenizer from the directory
classifier = pipeline("text-classification", model="./my_phishing_detector")

# Create a list of emails to test
emails_to_test = [
    "You have won a free iPhone! Click this link to claim your prize.",
   
]

# Run the predictions on the entire list at once
predictions = classifier(emails_to_test)

# Map the raw labels to single words
label_mapping = {'LABEL_0': 'Safe', 'LABEL_1': 'Phishing'}

# Create a list of just the predicted labels
predicted_labels = [label_mapping[p['label']] for p in predictions]

# Count the occurrences of each label
phishing_count = predicted_labels.count('Phishing')
safe_count = predicted_labels.count('Safe')

# Calculate the total number of emails
total_emails = len(emails_to_test)

# Print a single, overall prediction summary
print("--- Overall Prediction Summary ---")
print(f"Total Emails Tested: {total_emails}")
print(f"Emails Classified as Phishing: {phishing_count} ({phishing_count/total_emails:.1%})")

print("----------------------------------")

Device set to use cuda:0


--- Overall Prediction Summary ---
Total Emails Tested: 1
Emails Classified as Phishing: 1 (100.0%)
----------------------------------


In [7]:
import os
print(os.listdir('/kaggle/working/'))

['my_phishing_detector', '__notebook__.ipynb', 'results']


In [None]:
import os
os.chdir('/kaggle/working/')
!zip -r all_output.zip .