<a href="https://colab.research.google.com/github/MahmoudDev2/landpage/blob/main/%D9%8FEmailSavingphishing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SMS Phishing Fine-tuning (Google Colab)

هذا الدفتر جاهز للتشغيل على **Google Colab**. يقوم بتحضير البيانات من CSV، تقسيمها، إجراء الـ tokenization، ثم **fine-tune** لنموذج كشف رسائل الـ SMS الاحتيالية وحفظ النموذج النهائي في المجلد **`urlbert_phishing_model`**.

**ملاحظات قبل التشغيل**:
- فعِّل GPU: `Runtime -> Change runtime type -> GPU`.
- إن لم ترفع ملف `sms_phishing_dataset.csv` إلى `/content` فسيُنشئ الدفتر عيّنة تجريبية تلقائيًا (20 صفًا).
- بعد التدريب سيُحفظ النموذج والـ tokenizer في مجلد `urlbert_phishing_model` داخل مساحة العمل `/content`.


In [None]:
# Cell 1: install requirements (run once)
!pip install -q transformers datasets evaluate scikit-learn torch accelerate


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
# Cell 2: make sure a dataset CSV is available; when none was uploaded, write a
# small bilingual (English/Arabic) sample with 10 phishing and 10 legitimate rows.
import os
csv_path = "/content/sms_phishing_dataset.csv"

if os.path.exists(csv_path):
    print(f"Found existing CSV at {csv_path}")
else:
    import pandas as pd

    # Messages labelled 1 (phishing)
    phishing_messages = [
        "URGENT: Your account has been suspended. Verify here: http://bit.ly/verify_now",
        "مهم: تم تجميد حسابك، اضغط على الرابط لاستعادته https://secure-bank.example/login",
        "You won a $1000 gift card! Click to claim: http://claims.example/win",
        "رمز التحقق الخاص بك هو 123456. لا تشاركه مع أي أحد. للتحقق: http://verify.example",
        "Action required: Unusual login detected. Confirm: http://secure.example/confirm",
        "تهانينا! ربحت جائزة. ادخل بياناتك لاستلامها: http://prize.example/collect",
        "Final notice: Your subscription will be cancelled. Reactivate here: http://subs.example/reactivate",
        "Please update your payment details: http://billing.example/update",
        "Call us immediately at +1-800-FAKE to avoid account closure.",
        "Free voucher for you! Visit http://free.example before it expires",
    ]
    # Messages labelled 0 (legitimate)
    legitimate_messages = [
        "Reminder: Your dentist appointment is tomorrow at 10:00 AM. Reply C to confirm.",
        "شكرًا لتسجيلك معنا. رمز التفعيل الخاص بك: 482019",
        "Promo: 20% off at our store this weekend. Show this message in-store.",
        "Your package (tracking #12345) is out for delivery today.",
        "تنبيه: تم سحب مبلغ 50 ريال من حسابك في المتجر المحلي.",
        "Bank: Your statement is ready to view in the mobile app.",
        "Welcome to ExampleApp! Use code WELCOME10 for a discount on your first order.",
        "خدمة العملاء: تم استقبال طلبك وسيتم الرد خلال 24 ساعة.",
        "Your friend sent you a photo via ExampleChat.",
        "Reminder: Subscription renews on 2025-10-01. No action required if you wish to continue.",
    ]

    # (text, target) pairs: all phishing rows first, then all legitimate rows.
    data = [(message, 1) for message in phishing_messages]
    data += [(message, 0) for message in legitimate_messages]

    df = pd.DataFrame(data, columns=['text','target'])
    # utf-8-sig writes a BOM; Cell 3 reads the file back with the same encoding.
    df.to_csv(csv_path, index=False, encoding='utf-8-sig')
    print(f"Sample CSV created at {csv_path} (rows: {len(df)})")


Found existing CSV at /content/sms_phishing_dataset.csv


In [3]:
# Cell 3: Inspect CSV and show sample rows & schema
import pandas as pd
df = pd.read_csv("/content/sms_phishing_dataset.csv", encoding="utf-8-sig")

# The later cells address the text column as 'tml' and the label column as '1'.
# The sample written by Cell 2 uses 'text'/'target' instead, so map that schema
# onto the downstream names; otherwise the notebook fails with a KeyError when
# no CSV was uploaded. Frames that already use 'tml'/'1' are left untouched.
if {'text', 'target'}.issubset(df.columns) and not ({'tml', '1'} & set(df.columns)):
    df = df.rename(columns={'text': 'tml', 'target': '1'})

print("Total rows:", len(df))
print("\nColumns and types:")
print(df.dtypes)

# Fail with an explicit message (same exception type as the bare lookup) when the
# label column is absent, after the schema above has been printed for diagnosis.
if '1' not in df.columns:
    raise KeyError(f"Label column '1' not found; available columns: {list(df.columns)}")

print("\nClass distribution:")
print(df['1'].value_counts())
print("\nFirst 6 rows:")
display(df.head(6))

Total rows: 394899

Columns and types:
tml     object
1      float64
dtype: object

Class distribution:
1
1.0    352715
0.0     42183
Name: count, dtype: int64

First 6 rows:


Unnamed: 0,tml,1
0,www.lencom.com/SurfVCR.html,1.0
1,www.syvum.com/click/download.html,1.0
2,www.1tabview.com/index.htm,1.0
3,www.toplang.com/popupadkiller.htm,1.0
4,www.internetwatcher.com/Eng/Index.htm,1.0
5,www.icon.co.za/~pak/,1.0


In [6]:
# Cell 4: clean the frame, split it into train/test, save both splits as datasets on disk
from datasets import Dataset
from sklearn.model_selection import train_test_split
import os

# Ensure the text ('tml') and label ('1') columns exist
assert 'tml' in df.columns and '1' in df.columns, "CSV must contain 'tml' and '1' columns."

# Drop rows with a missing label (needed for stratification) and rows with a
# missing text value.
# NOTE(review): a missing text value would presumably make the tokenizer fail in a
# later cell — confirm; dropping it here keeps the failure out of the training run.
df_cleaned = df.dropna(subset=['tml', '1']).copy()

# Stratified 80/20 split with a fixed seed so the split is repeatable.
train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42, stratify=df_cleaned['1'])

train_ds = Dataset.from_pandas(train_df.reset_index(drop=True), preserve_index=False)
test_ds  = Dataset.from_pandas(test_df.reset_index(drop=True), preserve_index=False)

os.makedirs("hf_datasets", exist_ok=True)
train_ds.save_to_disk("hf_datasets/train_dataset")
test_ds.save_to_disk("hf_datasets/test_dataset")

print("Train size:", len(train_ds), "Test size:", len(test_ds))

Saving the dataset (0/1 shards):   0%|          | 0/315918 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/78980 [00:00<?, ? examples/s]

Train size: 315918 Test size: 78980


In [None]:
# Cell 5: model & tokenizer and tokenization
from transformers import AutoTokenizer

base_model_name = "CrabInHoney/urlbert-tiny-v4-phishing-classifier"  # change if needed

tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=True)

def preprocess_function(examples):
    """Tokenize a batch of rows.

    Args:
        examples: batch passed by `Dataset.map(..., batched=True)`; its "tml"
            entry (the text column checked in Cell 4) is given to the tokenizer.

    Returns:
        The tokenizer output for the batch (truncated, not padded).
    """
    return tokenizer(examples["tml"], truncation=True, padding=False)

# The results are stored under separate names: Cell 6 maps over `train_ds` /
# `test_ds` again and reads their raw 'tml' and '1' columns, so the raw splits
# produced by Cell 4 are left unmodified here.
train_ds_tokenized = train_ds.map(preprocess_function, batched=True)
test_ds_tokenized  = test_ds.map(preprocess_function, batched=True)

# Convert label column to 'labels' name expected by Trainer if needed
def rename_label(example):
    """Add an integer "labels" entry copied from the example's '1' (label) column.

    Args:
        example: a single row passed by `Dataset.map(..., batched=False)`.

    Returns:
        The same row with "labels" set to `int(example["1"])`.
    """
    # The label column is read as float64 from the CSV; cast it to int, as Cell 6 does.
    example["labels"] = int(example["1"])
    return example

train_ds_tokenized = train_ds_tokenized.map(rename_label, batched=False)
test_ds_tokenized  = test_ds_tokenized.map(rename_label, batched=False)

train_ds_tokenized.set_format(type="torch", columns=["input_ids","attention_mask","labels"])
test_ds_tokenized.set_format(type="torch", columns=["input_ids","attention_mask","labels"])

print("Tokenization and formatting done.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Tokenization and formatting done.


In [11]:
# Cell 6: training with Trainer
import torch
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    AutoTokenizer,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Checkpoint used as the starting point for fine-tuning (change if needed).
base_model_name = "CrabInHoney/urlbert-tiny-v4-phishing-classifier"

# Fast tokenizer belonging to the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=True)

# Sequence-classification model loaded from the same checkpoint
# (downloaded if it is not already present locally).
model = AutoModelForSequenceClassification.from_pretrained(base_model_name)

# If model.num_labels != 2, load the model with an adjusted config instead
# (uncomment if needed):
# from transformers import AutoConfig
# config = AutoConfig.from_pretrained(base_model_name, num_labels=2)
# model = AutoModelForSequenceClassification.from_pretrained(base_model_name, config=config)

# Tokenization and formatting from Cell 5
def preprocess_function(examples):
    """Tokenize a batch of rows taken from the 'tml' text column.

    Args:
        examples: batch passed by `Dataset.map(..., batched=True)`; its "tml"
            entry is handed to the tokenizer.

    Returns:
        The tokenizer output for the batch (truncated, not padded).
    """
    # No padding at map time: with padding=True every map batch was padded to its
    # own longest row and stored that way. The DataCollatorWithPadding created in
    # this cell pads each batch when it is assembled, so padding here is redundant.
    return tokenizer(examples["tml"], truncation=True, padding=False)

def rename_label(example):
    """Attach an integer "labels" entry derived from the example's '1' column.

    Args:
        example: a single row (mapping) holding the label under the key "1".

    Returns:
        The same mapping object, with "labels" set to `int(example["1"])`.
    """
    # The label is cast to a plain int before the dataset is formatted as torch tensors.
    label_as_int = int(example["1"])
    example["labels"] = label_as_int
    return example

# Step 1: tokenize both splits in batches (train first, then test).
train_ds_processed, test_ds_processed = (
    _split.map(preprocess_function, batched=True)
    for _split in (train_ds, test_ds)
)

# Step 2: add the integer "labels" column row by row (train first, then test).
train_ds_processed, test_ds_processed = (
    _split.map(rename_label, batched=False)
    for _split in (train_ds_processed, test_ds_processed)
)

# Step 3: return only the listed columns, formatted as torch tensors.
_model_columns = ["input_ids","attention_mask","labels"]
for _split in (train_ds_processed, test_ds_processed):
    _split.set_format(type="torch", columns=_model_columns)


# Collator built from the tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: a pair that unpacks into `(logits, labels)`.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predicted_classes = logits.argmax(-1)
    prf_scores = precision_recall_fscore_support(
        labels, predicted_classes, average='binary', zero_division=0
    )
    results = {"accuracy": accuracy_score(labels, predicted_classes)}
    # zip stops after the three names, so the trailing fourth value is ignored.
    results.update(zip(("precision", "recall", "f1"), prf_scores))
    return results

import inspect

save_dir = "/content/urlbert_phishing_model"  # <--- final save directory

# Evaluate and checkpoint once per epoch; after training, reload the checkpoint
# with the highest F1 (load_best_model_at_end + metric_for_best_model).
training_args = TrainingArguments(
    output_dir=save_dir,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_strategy="epoch", # Changed from evaluation_strategy
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    learning_rate=5e-5,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA device is present
)

# Newer transformers releases take the tokenizer through `processing_class`; the
# old `tokenizer=` keyword is deprecated there. Use whichever keyword the
# installed Trainer accepts so the cell runs without the deprecation warning on
# new releases and still works on older ones.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds_processed,
    eval_dataset=test_ds_processed,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()

Map:   0%|          | 0/315918 [00:00<?, ? examples/s]

Map:   0%|          | 0/78980 [00:00<?, ? examples/s]

Map:   0%|          | 0/315918 [00:00<?, ? examples/s]

Map:   0%|          | 0/78980 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0421,0.052288,0.986275,0.98811,0.996626,0.99235
2,0.0481,0.046886,0.989048,0.990828,0.996966,0.993888
3,0.0221,0.04054,0.990884,0.992968,0.996853,0.994907


TrainOutput(global_step=59235, training_loss=0.048030960413562786, metrics={'train_runtime': 2030.1023, 'train_samples_per_second': 466.85, 'train_steps_per_second': 29.178, 'total_flos': 1308987965357568.0, 'train_loss': 0.048030960413562786, 'epoch': 3.0})

In [13]:
# Cell 7: evaluate on the test split, write the metrics to a text file, then save
# the model and the tokenizer into the final folder
import os

eval_result = trainer.evaluate(eval_dataset=test_ds_processed)
print("Eval results:", eval_result)

# One "name = value" line per metric.
os.makedirs("/content/outputs", exist_ok=True)
with open("/content/outputs/eval_results.txt", "w") as results_file:
    results_file.writelines(
        f"{metric} = {value}\n" for metric, value in eval_result.items()
    )

# Model first, then the tokenizer, both into the same directory.
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"Model, tokenizer and config saved to {save_dir}")
print("Listing saved files:")
print(os.listdir(save_dir))

Eval results: {'eval_loss': 0.04054044559597969, 'eval_accuracy': 0.9908837680425424, 'eval_precision': 0.9929679888165606, 'eval_recall': 0.9968529832867896, 'eval_f1': 0.9949066934536863, 'eval_runtime': 39.8396, 'eval_samples_per_second': 1982.452, 'eval_steps_per_second': 61.974, 'epoch': 3.0}
Model, tokenizer and config saved to /content/urlbert_phishing_model
Listing saved files:
['runs', 'checkpoint-39490', 'checkpoint-59235', 'config.json', 'model.safetensors', 'vocab.txt', 'checkpoint-19745', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json', 'training_args.bin']


In [14]:
# Cell 8: load the saved model into a text-classification pipeline and try it on new inputs
from transformers import pipeline
import torch

# Device index passed to the pipeline: 0 when CUDA is available, -1 otherwise.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

classifier = pipeline("text-classification", model=save_dir, tokenizer=save_dir, device=device)

# NOTE(review): these are SMS-style sentences, while the 'tml' column shown in the
# Cell 3 output holds URLs, and both examples come back as LABEL_0 in the recorded
# run — confirm the examples match the data the model was fine-tuned on.
examples = [
    "مهم: تم تجميد حسابك، اضغط على الرابط لاستعادته https://secure-bank.example/login",
    "Reminder: Your dentist appointment is tomorrow at 10:00 AM. Reply C to confirm.",
]

predictions = classifier(examples)
print(predictions)


Device set to use cuda:0


[{'label': 'LABEL_0', 'score': 0.99979168176651}, {'label': 'LABEL_0', 'score': 0.9996912479400635}]
