In [1]:
!pip install transformers datasets torch




In [2]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch


In [3]:
import pandas as pd

# Load the dataset (replace 'your_file.csv' with the uploaded file name)
df = pd.read_csv("spam_or_not_spam.csv")

# Show the first few rows to understand the dataset
df.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [4]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Save the split datasets as CSV files
train_df.to_csv("train.csv", index=False)
test_df.to_csv("test.csv", index=False)


In [5]:
from datasets import load_dataset

# Load the dataset files (train and test CSV files)
dataset = load_dataset("csv", data_files={"train": "train.csv", "test": "test.csv"})

# Show the dataset to ensure it's loaded correctly
print(dataset)


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['email', 'label'],
        num_rows: 2400
    })
    test: Dataset({
        features: ['email', 'label'],
        num_rows: 600
    })
})


In [6]:
print(dataset["train"].column_names)

['email', 'label']


In [7]:
# Convert the 'text' column to string type if needed
dataset = dataset.map(lambda x: {"email": [str(t) for t in x["email"]]}, batched=True)

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [8]:
from transformers import RobertaTokenizer

# Load the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenization function
def tokenize_function(batch):
    return tokenizer(batch["email"], padding="max_length", truncation=True, max_length=128)

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Remove the "text" column as it's no longer needed
tokenized_dataset = tokenized_dataset.remove_columns(["email"])

# Set the dataset format to torch for PyTorch compatibility
tokenized_dataset.set_format("torch")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [9]:
# Step 6: Print the final tokenized dataset structure
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 2400
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 600
    })
})


In [10]:
from transformers import RobertaForSequenceClassification

# Load the pre-trained RoBERTa model
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # Where to save the results
    evaluation_strategy="epoch",     # Evaluate at the end of each epoch
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    num_train_epochs=3,              # Number of training epochs
    weight_decay=0.01,               # Weight decay for regularization
)



In [12]:
# Define the Trainer
trainer = Trainer(
    model=model,                     # The pre-trained model
    args=training_args,              # The training arguments
    train_dataset=tokenized_dataset["train"],  # The training dataset
    eval_dataset=tokenized_dataset["test"],    # The evaluation dataset
    tokenizer=tokenizer,             # The tokenizer
)

# Start training
trainer.train()


  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33miammahnoor513[0m ([33miammahnoor513-karachi-institute-of-economics-and-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.066503
2,No log,0.003837
3,No log,0.004989


TrainOutput(global_step=450, training_loss=0.059129524230957034, metrics={'train_runtime': 201.1312, 'train_samples_per_second': 35.798, 'train_steps_per_second': 2.237, 'total_flos': 473599899648000.0, 'train_loss': 0.059129524230957034, 'epoch': 3.0})

In [13]:
# Save the trained model and tokenizer
output_dir = "./spam_roberta_model"
trainer.save_model(output_dir)  # Saves the model, configuration, and tokenizer
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")


Model and tokenizer saved to ./spam_roberta_model


In [14]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Load the model and tokenizer
model_path = "./spam_roberta_model"
model = RobertaForSequenceClassification.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained(model_path)

print("Model and tokenizer loaded successfully.")


Model and tokenizer loaded successfully.


In [15]:
import torch

def predict_spam_or_not_spam(message):
    # Tokenize the input message
    inputs = tokenizer(message, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Run the message through the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Get prediction probabilities
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)

    # Predicted class (0 = not spam, 1 = spam)
    prediction = torch.argmax(probabilities, dim=1).item()
    confidence = probabilities[0, prediction].item()

    return "Spam" if prediction == 1 else "Not Spam", confidence

In [17]:
# Test a single message
test_message = "Congratulations! You've won a free iPhone. Click here to claim your prize."

# Get the prediction
label, confidence = predict_spam_or_not_spam(test_message)

# Print the result
print(f"Message: {test_message}")
print(f"Prediction: {label} (Confidence: {confidence:.2f})")


Message: Congratulations! You've won a free iPhone. Click here to claim your prize.
Prediction: Spam (Confidence: 1.00)
