# SwanBERT Pretraining on Labelled Finance Sentiment Classification Data (Financial PhraseBank)

### Import Libraries

In [1]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig, DataCollatorWithPadding, Trainer, TrainingArguments
from datasets import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import torch

  from .autonotebook import tqdm as notebook_tqdm


### Prepare Data for Training

In [2]:
# Using sentneces will all levels of agreeance (>50%)
file_path = './FinancialPhraseBank/Data/Sentences_50Agree.txt'

texts = []
labels = []

with open(file_path, 'r', encoding='latin-1') as f:
    for line in f:
        line = line.strip()  # remove newline characters
        if line:  # skip empty lines
            text, label = line.rsplit('@', 1)
            texts.append(text.strip())
            labels.append(label.strip())


# Convert labels to numerical values
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
numeric_labels = [label_map[label] for label in labels]

train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, numeric_labels, 
    test_size=0.2, 
    stratify=numeric_labels,
    random_state=42
)

# Create Dataset objects with correct structure
train_dataset = Dataset.from_dict({
    'text': train_texts,
    'label': train_labels
})
test_dataset = Dataset.from_dict({
    'text': test_texts,
    'label': test_labels
})

# 3. Tokenization with proper format
tokenizer = DistilBertTokenizer.from_pretrained("./financial-corpus-distilbert")

def tokenize_function(examples):
    # Return as lists, not tensors
    return tokenizer(
        examples['text'], 
        truncation=True, 
        max_length=128,
        padding=False  # Let collator handle padding
    )

# Apply tokenization
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# 4. Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 5. Verify one batch works
try:
    sample_batch = next(iter(DataLoader(
        tokenized_train, 
        batch_size=2, 
        collate_fn=data_collator
    )))
    print("Batch verification successful!")
    print({k: v.shape for k, v in sample_batch.items()})
except Exception as e:
    print("Batch verification failed:", str(e))
    # Debug the dataset structure
    print("\nSample dataset item:", tokenized_train[0])
    print("Tokenized keys:", list(tokenized_train.features.keys()))

Map: 100%|██████████| 3876/3876 [00:02<00:00, 1770.80 examples/s]
Map: 100%|██████████| 970/970 [00:00<00:00, 1760.67 examples/s]

Batch verification failed: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`text` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

Sample dataset item: {'text': 'The major breweries increased their domestic beer sales by 4.5 per cent last year , to 256.88 million litres from 245.92 million litres in 2004 .', 'label': 2, 'input_ids': [101, 1996, 2350, 18710, 3111, 3445, 2037, 4968, 5404, 4341, 2011, 1018, 1012, 1019, 2566, 9358, 2197, 2095, 1010, 2000, 17273, 1012, 6070, 2454, 25783, 2013, 21005, 1012, 6227, 2454, 25783, 1999, 2432, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Tokenized keys: ['text', 'label', 'input_ids', 'attention_mask']





### Model Configuration
SwanBERT was trained as a masked language model and needs to be reconfigured for sentiment classification (sequence classification)

In [3]:
config = DistilBertConfig.from_pretrained(
    "./financial-corpus-distilbert",
    num_labels=3,
    id2label={0: "negative", 1: "neutral", 2: "positive"},
    label2id=label_map,
    architectures=["DistilBertForSequenceClassification"]
)

model = DistilBertForSequenceClassification.from_pretrained(
    "./financial-corpus-distilbert",
    config=config,
    ignore_mismatched_sizes=True
)

Some weights of the model checkpoint at ./financial-corpus-distilbert were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ./financial-corpus-distilbert and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_c

### Train and Save SwanBERT Using Trainer

In [4]:
training_args = TrainingArguments(
    output_dir="./results/FinancialPhraseBank",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    processing_class=tokenizer,
    data_collator=data_collator,
)

# Phased training
print("\nPhase 1: Train only classifier head")
for param in model.distilbert.parameters():
    param.requires_grad = False

trainer.train()

print("\nPhase 2: Full fine-tuning")
for param in model.distilbert.parameters():
    param.requires_grad = True

trainer.train()

# Save model
model.save_pretrained('./sentiment_model_simple')
tokenizer.save_pretrained('./sentiment_model_simple')



Phase 1: Train only classifier head


Epoch,Training Loss,Validation Loss
1,0.7826,0.809082
2,0.8079,0.739848
3,0.6887,0.703707
4,0.6391,0.673638
5,0.6228,0.654245
6,0.6402,0.640753
7,0.6085,0.630886
8,0.6771,0.626066
9,0.6325,0.62207
10,0.657,0.620878



Phase 2: Full fine-tuning


Epoch,Training Loss,Validation Loss
1,0.6351,0.630836
2,0.6775,0.630835
3,0.6129,0.630836
4,0.6069,0.630835
5,0.6024,0.630834
6,0.6184,0.630834
7,0.5878,0.630833
8,0.6433,0.630833
9,0.6125,0.630833
10,0.6624,0.630833


('./sentiment_model_simple\\tokenizer_config.json',
 './sentiment_model_simple\\special_tokens_map.json',
 './sentiment_model_simple\\vocab.txt',
 './sentiment_model_simple\\added_tokens.json')