# FT-Transformer Binary Text Classifier

Install required packages:



1.   pip install -U sentence-transformers
2.   pip install torch transformers
3.   pip install sentencepiece
4.   pip install pytorch_frame[full]
5.   pip install voyageai
6.   pip install openai -U







Before importing the components, upload the files listed below to your Colab environment folder.



*   fttransformermodel.py
*   text_embedder.py

Simply do:



```
from google.colab import files

files.upload()
```




In [3]:
import torch
import torch_frame
import pandas as pd
import torch.nn as nn
from openai import OpenAI
import text_embedder as te
import torch.optim as optim
import torch.nn.functional as F
import fttransformermodel as ftt

from tqdm import tqdm
from typing import List
from torch import Tensor
from torch_frame import stype
from torch_frame.config import ModelConfig
from torch_frame.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from torch_frame.config.text_embedder import TextEmbedderConfig
from torch_frame.config.text_tokenizer import TextTokenizerConfig
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

## Auxiliary Functions

In [43]:
def plot_confusion_matrix(labels, preds, class_names=('non-toxic', 'toxic'),
                          save_path='YOUR-PATH/confusion_matrix.png'):
    """Plot, optionally save, and display a confusion-matrix heatmap.

    Args:
        labels: Ground-truth labels, forwarded to
            ``sklearn.metrics.confusion_matrix`` as the true values.
        preds: Predicted labels, forwarded as the predicted values.
        class_names: Tick labels used on both axes. ``confusion_matrix``
            orders classes by sorted label value, so the names must be
            given in that order.
        save_path: File the figure is written to before it is shown.
            Pass ``None`` (or an empty string) to skip saving.
    """
    # Imported locally: the notebook's import cell does not bring these names
    # into scope, so the function would otherwise fail with a NameError.
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import confusion_matrix

    # Copy into a fresh list so the (immutable) default is never shared or
    # mutated between calls.
    tick_labels = list(class_names)

    cm = confusion_matrix(labels, preds)
    plt.figure(figsize=(10, 7))

    # Customize the heatmap
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',  # integer counts in every cell
        cmap='Blues',
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        annot_kws={"size": 24},  # Annotation font size
        cbar_kws={"shrink": 0.75},  # Color bar size
    )

    plt.xlabel('Predicted', fontsize=18)
    plt.ylabel('True', fontsize=18)
    plt.title('Confusion Matrix', fontsize=20)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)

    # Save (when a path is given) and show the plot
    if save_path:
        plt.savefig(save_path)
    plt.show()

## Load Dataset Splits

In [11]:
# Read the train / dev / test CSV splits of the ToLD sets from Google Drive,
# one DataFrame per split (in that order).
train_data, val_data, test_data = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/LLMeFT-Transformer/Binary Datasets/RafaAnchieta - ToLD Sets/train.csv',
        '/content/drive/MyDrive/LLMeFT-Transformer/Binary Datasets/RafaAnchieta - ToLD Sets/dev.csv',
        '/content/drive/MyDrive/LLMeFT-Transformer/Binary Datasets/RafaAnchieta - ToLD Sets/test.csv',
    ),
)

## SET YOUR API KEYS

In [None]:
# Set your OpenAI and Voyage AI API keys.
# Each key is read from its environment variable when that is set, so real
# keys do not have to be stored in the notebook; otherwise replace the
# 'YOUR-API-KEY' placeholder.
import os

import voyageai  # used below but missing from the notebook's import cell

openai_key = os.environ.get('OPENAI_API_KEY', 'YOUR-API-KEY')
client = OpenAI(api_key=openai_key) # or simply client = OpenAI(api_key='YOUR-API-KEY')

voyageai_key = os.environ.get('VOYAGE_API_KEY', 'YOUR-API-KEY')
voyageai.api_key = voyageai_key # or simply voyageai.api_key = 'YOUR-API-KEY'

## Load your Data or Splits


In [None]:
# Read the three splits (train, validation, test) into DataFrames.
# Change each entry to the actual file path.
train_data, val_data, test_data = map(
    pd.read_csv,
    ('train_split.csv', 'val_split.csv', 'test_split.csv'),
)

## Set your Text Embedder

List of Text Embedders:



*   BertTextEncoder
*   AlbertinaTextEncoder (same for DeBERTa V2 models)
*   SentenceTransformerTextEncoder
*   GPTTextEncoder
*   VoyageAIEmbedding

In [None]:
# Name of the embedding model handed to the text encoder.
em_model = 'chosen_model_name' # Example: em_model = 'rufimelo/bert-large-portuguese-cased-sts'

# Run the encoder on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The original call passed the undefined names `model` and `device`; the model
# name chosen above is what is meant to be loaded.
# NOTE(review): argument order (model name, device) is kept from the original
# call — confirm against text_embedder.py.
text_encoder = te.SentenceTransformerTextEncoder(em_model, device)

96.0

In [None]:
# Wrap the encoder in a PyTorch Frame text-embedder config; the embedding
# batch size is read from the encoder's `text_embedder_batch_size` attribute.
text_embedder_cfg = TextEmbedderConfig(
    text_embedder=text_encoder,
    batch_size=text_encoder.text_embedder_batch_size,
)

## Dataset Materialization and DataLoaders

In [None]:
# Semantic type of every column: "text" is embedded by the text embedder,
# "toxic" is a categorical column.
col_to_stype = {
    "text": stype.text_embedded,
    "toxic": stype.categorical,
}

In [None]:
# Build one PyTorch Frame Dataset per split. All three share the same column
# stypes, use "toxic" as the target column, have no split column, and use the
# same text-embedder configuration.
train_dataset, val_dataset, test_dataset = (
    Dataset(
        split_frame,
        col_to_stype=col_to_stype,
        target_col="toxic",
        split_col=None,
        col_to_text_embedder_cfg=text_embedder_cfg,
    )
    for split_frame in (train_data, val_data, test_data)
)

In [None]:
# Materialize every split, in train / validation / test order.
# Use the `path` parameter to cache the generated tensors, e.g.
# dataset.materialize(path='your-path/data.pt')
# NOTE(review): each split is materialized on its own, so presumably each one
# derives its own column statistics — confirm the label encoding is the same
# across splits.
for split_dataset in (train_dataset, val_dataset, test_dataset):
    split_dataset.materialize()

In [None]:
# Shuffle the rows of each split.
# `Dataset.shuffle()` does not reorder the dataset in place: it returns a new,
# shuffled dataset. The result therefore has to be bound back to the name,
# otherwise the call has no effect.
train_dataset = train_dataset.shuffle()
val_dataset = val_dataset.shuffle()
test_dataset = test_dataset.shuffle()

In [None]:
# Set up data loaders
from types import SimpleNamespace

# Training hyper-parameters. `args` is read by this cell and by the model,
# optimizer and training-loop cells, but the notebook never defines it.
# The values below are placeholders: tune them for your dataset and hardware.
args = SimpleNamespace(
    batch_size=32,   # rows per mini-batch in every loader
    lr=1e-4,         # learning rate handed to the optimizer
    epochs=10,       # number of passes of the training loop
    compile=False,   # set to True to wrap the model with torch.compile
)

# Materialized tensor representation of each split.
train_tensor_frame = train_dataset.tensor_frame
val_tensor_frame = val_dataset.tensor_frame
test_tensor_frame = test_dataset.tensor_frame

# Only the training loader reshuffles its rows; validation and test keep a
# fixed order.
train_loader = DataLoader(train_tensor_frame, batch_size=args.batch_size, shuffle=True)
val_loader = DataLoader(val_tensor_frame, batch_size=args.batch_size)
test_loader = DataLoader(test_tensor_frame, batch_size=args.batch_size)

## Model preparations and Training

In [None]:
# Number of different labels found in the target column (y = toxic).
output_channels = train_dataset.num_classes

# Flag telling the training loop whether the dataset's task type is a
# classification task.
is_classification = train_dataset.task_type.is_classification

In [None]:
# Configuration dictionary handed to ftt.FTTransformerModel.
# The original referenced an undefined `tensor_frame`. Column statistics live
# on the materialized Dataset, and the column-name mapping on its TensorFrame,
# so both are read from the training split.
parser = {
    'output_channels': train_dataset.num_classes,
    'col_stats': train_dataset.col_stats,
    'col_names_dict': train_dataset.tensor_frame.col_names_dict,
    # NOTE(review): presumably this should name the same model as the text
    # encoder configured earlier — confirm and keep the two in sync.
    'em_model': 'rufimelo/bert-large-portuguese-cased-sts'
}

In [None]:
# Create and Compile FTT model
ftt_model = ftt.FTTransformerModel(parser)

# Optionally wrap the model with torch.compile (dynamic shapes enabled).
ftt_model = torch.compile(ftt_model, dynamic=True) if args.compile else ftt_model

# Setting AdamW with Scheduler
# The original passed `model.parameters()`, but no `model` exists in the
# notebook; the parameters to optimize are those of `ftt_model`.
# NOTE(review): assumes FTTransformerModel exposes `parameters()` like an
# nn.Module — confirm in fttransformermodel.py.
optimizer = torch.optim.AdamW(ftt_model.parameters(), lr=args.lr)
# Linear decay without warm-up over a fixed budget of 1000 scheduler steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=1000)

In [None]:
# Training Loop
# Trains for `args.epochs` epochs, evaluates all three splits after every
# epoch, and keeps the validation/test results of the epoch with the best
# validation score.

# Classification is scored by accuracy (higher is better); anything else is
# scored by RMSE (lower is better). The label and result key follow the task
# so that regression runs are no longer reported as "Acc".
metric = "Acc" if is_classification else "RMSE"
result_key = "accuracy" if is_classification else "rmse"

# Start from the worst possible score so the first epoch always becomes the
# best one. The original started at 0, which an RMSE can never go below, so
# regression runs never recorded a best epoch.
best_val_metric = float("-inf") if is_classification else float("inf")
best_test_metric = best_val_metric
best_val_report = None
best_test_report = None

for epoch in range(1, args.epochs + 1):
    train_loss = ftt_model.train(epoch)
    train_results = ftt_model.test(train_loader, "Train")
    val_results = ftt_model.test(val_loader, "Validation")
    test_results = ftt_model.test(test_loader, "Test")

    train_metric = train_results[result_key]
    val_metric = val_results[result_key]
    test_metric = test_results[result_key]

    # Model selection is driven by the validation score only; the test score
    # of that same epoch is simply recorded alongside it.
    if is_classification:
        improved = val_metric > best_val_metric
    else:
        improved = val_metric < best_val_metric

    if improved:
        best_val_metric = val_metric
        best_test_metric = test_metric
        best_val_report = val_results
        best_test_report = test_results

    print(f"Train Loss: {train_loss:.4f}, Train {metric}: {train_metric:.4f}, "
          f"Val {metric}: {val_metric:.4f}, Test {metric}: {test_metric:.4f}")

print(f"Best Val {metric}: {best_val_metric:.4f}, "
      f"Best Test {metric}: {best_test_metric:.4f}")

## Evaluation and Prediction Results

In [21]:
# Print the scikit-learn classification report (4 decimal digits) for the
# validation and test results kept from the best epoch, in that order.
for heading, report in (
    ("\nValidation Classification Report:", best_val_report),
    ("\nTest Classification Report:", best_test_report),
):
    print(heading)
    print(classification_report(report["labels"], report["preds"], digits=4))

## Plot Confusion Matrix

In [22]:
# Confusion matrix of the test labels and predictions stored in the best
# test report.
plot_confusion_matrix(
    labels=best_test_report["labels"],
    preds=best_test_report["preds"],
)

Dataset()