# Pre-Training the DistilBERT Language Model on Financial News (PROJECT: SwanBERT)

### Import libraries

In [None]:
from transformers import DistilBertTokenizer, DistilBertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

### Ensure we can train our model using our GPU (much faster than CPU)

In [None]:
# Report whether CUDA is usable. The device name is only queried when a GPU
# is actually present: torch.cuda.get_device_name() raises on a machine
# without CUDA, which would crash this diagnostic cell instead of informing.
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
else:
    print("GPU name: n/a (no CUDA device detected)")

GPU available: True
GPU name: NVIDIA GeForce GTX 1080 Ti


### DistilBERT Pre-Training on Unlabelled Financial Corpora
In the FinBERT paper, the authors pre-trained on a filtered subset of Reuters' TRC-2 corpus containing 46,143 documents (29M+ words, 400K sentences). I could not get access to TRC-2, so I'm using https://github.com/Kriyszig/financial-news-data instead. This is a collection of financial news articles scraped from the Reuters website. It contains 106,521 files (word and sentence counts unknown).

In [None]:
# Load the scraped Reuters financial news articles from the gzip-compressed
# parquet file into a DataFrame
financial_data_df = pd.read_parquet(
    path='financial-data-to-dataframe/financial_data.parquet.gzip'
)

In [26]:
# Build one training string per row: the headline followed by the article body.
# Missing values are replaced with empty strings first, because a NaN in either
# column would turn the concatenated value into NaN, and the tokenizer only
# accepts strings.
financial_texts = (
    financial_data_df['Headline'].fillna('')
    + ' '
    + financial_data_df['Article'].fillna('')
)

# Tokenize the texts
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# The encodings are intentionally left on the CPU. The Trainer moves each batch
# to the training device itself, so keeping the whole tokenized corpus on the
# GPU would only occupy memory that training needs.
encodings = tokenizer(
    financial_texts.to_list(),  # List of strings
    truncation=True,            # Cut longer texts down to max_length tokens
    padding=True,               # Pad shorter texts to the longest sequence
    max_length=128,             # Satisfactory length to extract entire sentences
    return_tensors='pt'
)

In [27]:
# Using torch's Dataset and DataLoader for efficiency
class TextDataset(Dataset):
    """Map-style dataset exposing pre-tokenized encodings one example at a time.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``,
            ``attention_mask``) to an indexable batch whose first dimension
            is the example index. It must contain an ``input_ids`` entry.
            Both a tokenizer ``BatchEncoding`` and a plain ``dict`` work.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return a dict holding the ``idx``-th row of every encoding field."""
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of examples, i.e. the number of ``input_ids`` rows."""
        # Key lookup rather than attribute access, so that plain dicts are
        # accepted as well as BatchEncoding objects.
        return len(self.encodings['input_ids'])

# Wrap the tokenized corpus in the Dataset, then build a shuffled loader that
# yields 8 examples per batch.
# NOTE(review): `dataloader` does not appear to be handed to the Trainer in the
# training cell, which receives `dataset` directly - confirm whether it is needed.
dataset = TextDataset(encodings=encodings)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=8)

In [None]:
# Load the pre-trained DistilBERT checkpoint with its masked-LM head and
# place it on the GPU
model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
model = model.to('cuda')

# Masked language modeling (MLM) collator: applies dynamic masking while
# assembling batches, masking 15% of the tokens
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Training configuration
training_args = TrainingArguments(
    # Output locations
    output_dir="./financial-corpus-distilbert",
    logging_dir="./logs",
    # Training schedule
    num_train_epochs=3,
    per_device_train_batch_size=8,  # Tune to fit GPU memory (e.g., 8, 16, 32)
    fp16=True,                      # Mixed-precision training (faster, less memory)
    # Logging / checkpoint cadence, in optimizer steps
    logging_steps=100,
    save_steps=10_000,
    report_to=[],                   # No reporting integrations listed
)

# Wire the model, training arguments, dataset and MLM collator into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Kept as the cell's final expression so the returned TrainOutput is displayed
trainer.train()

Step,Training Loss
100,2.5763
200,2.4111
300,2.2418
400,2.24
500,2.231
600,2.202
700,2.1202
800,2.1485
900,2.1008
1000,2.0609


TrainOutput(global_step=39936, training_loss=1.6094325996744328, metrics={'train_runtime': 4054.0376, 'train_samples_per_second': 78.806, 'train_steps_per_second': 9.851, 'total_flos': 1.0587725817965568e+16, 'train_loss': 1.6094325996744328, 'epoch': 3.0})

In [None]:
# Persist the trained model and the tokenizer files together in one directory,
# so that both can later be reloaded from the same path
model.save_pretrained(save_directory='./financial-corpus-distilbert')
tokenizer.save_pretrained(save_directory='./financial-corpus-distilbert')

('/ssfinancial-corpus-distilbert\\tokenizer_config.json',
 '/ssfinancial-corpus-distilbert\\special_tokens_map.json',
 '/ssfinancial-corpus-distilbert\\vocab.txt',
 '/ssfinancial-corpus-distilbert\\added_tokens.json')