# Load email data & preprocess

In [7]:
from datasets import Dataset, DatasetDict, ClassLabel, Value
import pandas as pd

df = pd.read_csv("extracted_emails.csv")
df = pd.DataFrame(df)

dataset = Dataset.from_pandas(df)
print(dataset)

Dataset({
    features: ['id', 'label', 'subject', 'text'],
    num_rows: 1000
})


In [4]:
# define class label

new_features = dataset.features.copy()
new_features["label"] = ClassLabel(names_file="names.txt")
dataset = dataset.cast(new_features)
dataset.features

# behind the scenes: class label is int, so no need to preprocess the feature 'label' in this dataset.
# the mapping of integers to label name is stored in the names.txt file

Casting the dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'id': Value(dtype='string', id=None),
 'label': ClassLabel(names=['Investments', 'Investments/Al Meezan', 'Credit Account', 'Job Search', 'Job Search/applications', 'Coursework', 'Insurance', 'DOCS', 'Bank Accounts/RBC', 'Bills', 'FlyWire', 'Flights', 'Google', 'Bank Accounts', 'Promo', 'Google/Play Console', 'Bank Accounts/Wise', 'Receipts', 'Security', 'Fido', 'Ameen', 'Marium', 'Bank Accounts/Meezan', 'Opus 6', 'Action required', 'Communauto', 'Google/Cloud Platform', 'Investments/WealthSimple', 'Fizz', 'Job Search/rejections', 'Bank Accounts/TD', 'Hotel bookings', 'Healthcare', 'Ayra', 'Bank Accounts/Scotia'], id=None),
 'subject': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None)}

In [6]:
# Step 1: Split into train + validation and test (10% test)
train_val_split = dataset.train_test_split(test_size=0.1)
train_val_dataset = train_val_split["train"]
test_dataset = train_val_split["test"]

# Step 2: Split train_val into train and validation (10% of remaining goes to validation)
train_val_split = train_val_dataset.train_test_split(test_size=0.1)
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]

# Combine into a final DatasetDict
final_splits = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

final_splits

# get DatasetDict( {train: Dataset(), validation: Dataset(), test: Dataset()})

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'subject', 'text'],
        num_rows: 810
    })
    validation: Dataset({
        features: ['id', 'label', 'subject', 'text'],
        num_rows: 90
    })
    test: Dataset({
        features: ['id', 'label', 'subject', 'text'],
        num_rows: 100
    })
})

In [7]:
# define tokenization logic
# This function takes a dictionary (like the items of our dataset) and returns
# a new dictionary with the keys input_ids, attention_mask, and token_type_ids

def safe_str(x):
    if x is None:
        return ""
    return str(x)

def tokenize_function(examples):
    subjects = [safe_str(s) for s in examples["subject"]]
    texts = [safe_str(t) for t in examples["text"]]
    return tokenizer(subjects, texts, truncation=True)

In [8]:
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize the dataset
tokenized_datasets = final_splits.map(tokenize_function, batched=True)
print(tokenized_datasets)

# Our tokenize_function returns a dictionary with the keys input_ids, attention_mask,
# and token_type_ids, so those three fields are added to all splits of our dataset.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/810 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'subject', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 810
    })
    validation: Dataset({
        features: ['id', 'label', 'subject', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 90
    })
    test: Dataset({
        features: ['id', 'label', 'subject', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100
    })
})


In [9]:
# test sample batch
# Remove the columns id, subject, and text as they won’t be needed and contain strings (and we can’t create tensors with strings)
samples = tokenized_datasets["train"].select(range(8))
samples = [{k: v for k, v in sample.items() if k in ["input_ids", "attention_mask", "token_type_ids", "label"]} for sample in samples]

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
batch = data_collator(samples)
print({k: v.shape for k, v in batch.items()})

{'input_ids': torch.Size([8, 512]), 'token_type_ids': torch.Size([8, 512]), 'attention_mask': torch.Size([8, 512]), 'labels': torch.Size([8])}


Now that we’ve gone from raw text to batches our model can deal with, we’re ready to fine-tune it!

# Fine-tune

In [10]:
# The first step before we can define our Trainer is to define a TrainingArguments class that will contain all the hyperparameters
# the Trainer will use for training and evaluation. The only argument you have to provide is a directory where the
# trained model will be saved, as well as the checkpoints along the way. For all the rest, you can leave the defaults,
# which should work pretty well for a basic fine-tuning.

from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

In [11]:
# The second step is to define our model. As in the previous chapter, we will use the
# AutoModelForSequenceClassification class, with two labels:

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=35)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import Trainer
# Note that when you pass a tokenizer as the processing_class, as we did here, the default data_collator
# used by the Trainer will be a DataCollatorWithPadding if the processing_class is a tokenizer or feature extractor,
# so you can skip the line data_collator=data_collator
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
)

This will start the fine-tuning (which should take a couple of minutes on a GPU) and report the training loss every 500 steps. It won’t, however, tell you how well (or badly) your model is performing. This is because:

We didn’t tell the Trainer to evaluate during training by setting eval_strategy in TrainingArguments to either "steps" (evaluate every eval_steps) or "epoch" (evaluate at the end of each epoch).
We didn’t provide the Trainer with a compute_metrics() function to calculate a metric during said evaluation (otherwise the evaluation would just have printed the loss, which is not a very intuitive number).

In [13]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmusabumair005[0m ([33mmusabumair0191[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


TrainOutput(global_step=306, training_loss=1.2892151876212725, metrics={'train_runtime': 253.2032, 'train_samples_per_second': 9.597, 'train_steps_per_second': 1.209, 'total_flos': 636268692884676.0, 'train_loss': 1.2892151876212725, 'epoch': 3.0})

# Evaluate

Let’s see how we can build a useful compute_metrics() function and use it the next time we train. The function must take an EvalPrediction object (which is a named tuple with a predictions field and a label_ids field) and will return a dictionary mapping strings to floats (the strings being the names of the metrics returned, and the floats their values).

In [15]:
# To get some predictions from our model, we can use the Trainer.predict() command

predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

(90, 35) (90,)


The output of the predict() method above is another named tuple with three fields: predictions, label_ids, and metrics. The metrics field will just contain the loss on the dataset passed, as well as some time metrics (how long it took to predict, in total and on average). Once we complete our compute_metrics() function and pass it to the Trainer, that field will also contain the metrics returned by compute_metrics()

In [19]:
# As you can see, predictions is a two-dimensional array with shape 90 x 35 (90 being the number of elements in the dataset we used).
# Those are the logits for each element of the dataset we passed to predict() (as you saw in the previous chapter,
# all Transformer models return logits). To transform them into predictions that we can compare to our labels,
# we need to take the index with the maximum value on the second axis:

import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)

In [17]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [20]:
# We can now compare those preds to the labels. To build our compute_metric() function,
# we will rely on the metrics from the Evaluate library.

import evaluate
import numpy as np

def compute_metrics(eval_preds):

    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")

    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"]
    }

In [21]:
# And to see it used in action to report metrics at the end of each epoch,
# here is how we define a new Trainer with this compute_metrics() function:

training_args = TrainingArguments("test-trainer", eval_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=35)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.456702,0.655556,0.5717
2,No log,0.851676,0.8,0.744194
3,No log,0.705214,0.866667,0.839263


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

TrainOutput(global_step=306, training_loss=1.2334603702320772, metrics={'train_runtime': 255.7934, 'train_samples_per_second': 9.5, 'train_steps_per_second': 1.196, 'total_flos': 636268692884676.0, 'train_loss': 1.2334603702320772, 'epoch': 3.0})

In [23]:
preds = trainer.predict(tokenized_datasets["test"])
print(preds.metrics)  # accuracy, F1, etc.
print(preds.predictions.argmax(axis=-1))  # predicted class labels

{'test_loss': 0.601922333240509, 'test_accuracy': 0.87, 'test_f1': 0.8440752001621568, 'test_runtime': 4.0966, 'test_samples_per_second': 24.411, 'test_steps_per_second': 3.173}
[34 18  4  4 25 27 17  4 17 17  4 34 29 17 17 18 25  4 29  4 17  4 34 24
  4 29  5  4  4 27 29  4 27 29 34  4  4 34  4 18  4  4  4 29 34 10  5 29
 29 18 17  4  4 23 34  4 18  4  2 34 24 27  4 29  4 10  4  4  4 10 29 27
  4  4  4  4 27 23 10 17 17  4 22  4 34 27 18 17 24 18  4 18 17 18 34 24
 34 17 34 18]


In [26]:
predictions = trainer.predict(tokenized_datasets["test"])

# Convert logits to predicted class indices
predicted_labels = np.argmax(predictions.predictions, axis=-1)

# Get the true labels
true_labels = predictions.label_ids

label_names = tokenized_datasets["train"].features["label"].int2str

for i in range(20):
    sample_id = tokenized_datasets["test"][i]["id"]
    true_label = int(true_labels[i])
    pred_label = int(predicted_labels[i])
    print(f"ID: {sample_id} | True label: {label_names(true_label):<20} | Predicted label: {label_names(pred_label)}")

ID: 194eb515033c7657 | True label: Bank Accounts/Scotia | Predicted label: Bank Accounts/Scotia
ID: 1953cf013ae78cba | True label: Security             | Predicted label: Security
ID: 19414e4c0902ccfd | True label: Job Search/applications | Predicted label: Job Search/applications
ID: 1947b64b7345a315 | True label: Job Search/applications | Predicted label: Job Search/applications
ID: 193b1f25620d78c6 | True label: Communauto           | Predicted label: Communauto
ID: 19276f24a81b38f5 | True label: Investments/WealthSimple | Predicted label: Investments/WealthSimple
ID: 192fe99d029ecdf3 | True label: Receipts             | Predicted label: Receipts
ID: 193f482d9972984a | True label: Job Search/applications | Predicted label: Job Search/applications
ID: 19438f6185ab01ef | True label: Receipts             | Predicted label: Receipts
ID: 19562b13562bdf3c | True label: Receipts             | Predicted label: Receipts
ID: 1950a7ec98cdfd28 | True label: Job Search/applications | Predicted l

# All together

In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [10]:
from datasets import Dataset, DatasetDict, ClassLabel
import pandas as pd
import torch
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np

# Load your larger dataset
df_large = pd.read_csv("extracted_emails.csv")

# Convert to Hugging Face Dataset
dataset_large = Dataset.from_pandas(df_large)

# Define ClassLabel feature - use the same names.txt file you used earlier
new_features = dataset_large.features.copy()
new_features["label"] = ClassLabel(names_file="names.txt")
dataset_large = dataset_large.cast(new_features)

# Split dataset: 10% test
splits_large = dataset_large.train_test_split(test_size=0.1)
train_val_large = splits_large["train"]
test_large = splits_large["test"]

# Split train_val into train and validation (10% validation)
train_val_split_large = train_val_large.train_test_split(test_size=0.1)
train_large = train_val_split_large["train"]
val_large = train_val_split_large["test"]

# Final DatasetDict
final_splits_large = DatasetDict({
    "train": train_large,
    "validation": val_large,
    "test": test_large
})

# Initialize tokenizer
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def safe_str(x):
    if x is None:
        return ""
    return str(x)

def tokenize_function(examples):
    subjects = [safe_str(s) for s in examples["subject"]]
    texts = [safe_str(t) for t in examples["text"]]
    return tokenizer(subjects, texts, truncation=True)

# Tokenize datasets
tokenized_datasets_large = final_splits_large.map(tokenize_function, batched=True)

# Define data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define metrics
def compute_metrics(eval_preds):
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")

    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"]
    }

# Training arguments (you can adjust epochs and batch size)
training_args = TrainingArguments(
    "fine_tuned_gmail_sorter",
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# Load model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=35)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets_large["train"],
    eval_dataset=tokenized_datasets_large["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train!
trainer.train()

# Evaluate on test set
test_results = trainer.evaluate(tokenized_datasets_large["test"])
print("Test set results:", test_results)

RuntimeError: Failed to import transformers.models.auto.tokenization_auto because of the following error (look up to see its traceback):
Failed to import transformers.generation.utils because of the following error (look up to see its traceback):
module 'sympy' has no attribute 'core'

In [1]:
import torch
import numpy as np
from scipy.special import softmax  # or use torch.nn.functional.softmax
from datasets import Dataset

# Run prediction on the test set
predictions = trainer.predict(tokenized_datasets_large["test"])

# Extract raw logits and true labels
logits = predictions.predictions
true_labels = predictions.label_ids

# Apply softmax to get probabilities
probs = softmax(logits, axis=1)  # shape: (num_samples, num_classes)


NameError: name 'trainer' is not defined

In [None]:
label_names = tokenized_datasets_large["test"].features["label"].names

for i in range(5):
    print(f"Email ID: {final_splits_large['test'][i]['id']}")
    print(f"True label: {label_names[true_labels[i]]}")
    print("Predicted probabilities:")
    for label_idx, prob in enumerate(probs[i]):
        print(f"  {label_names[label_idx]}: {prob:.4f}")
    print("-" * 40)
