In [None]:
!pip install --upgrade transformers datasets accelerate evaluate


In [2]:
!pip install transformers datasets scikit-learn --upgrade
!pip install --upgrade transformers datasets accelerate evaluate --quiet


Successfully installed scikit-learn-1.7.2


In [None]:
#Install libs and disable W&B

!pip install --upgrade transformers datasets accelerate evaluate

import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

import transformers
print("Transformers version:", transformers.__version__)


In [4]:
from google.colab import files

print("Upload FinancialPhraseBank-v1.0.zip:")
uploaded = files.upload()

print("Upload tokenizer folder (upload all files inside /tokenizer):")
uploaded = files.upload()


Upload FinancialPhraseBank-v1.0.zip:


Saving FinancialPhraseBank-v1.0.zip to FinancialPhraseBank-v1.0.zip
Upload tokenizer folder (upload all files inside /tokenizer):


Saving vocab.txt to vocab.txt
Saving tokenizer_config.json to tokenizer_config.json
Saving tokenizer.json to tokenizer.json
Saving added_tokens.json to added_tokens.json
Saving special_tokens_map.json to special_tokens_map.json


In [5]:
# Create tokenizer directory
os.makedirs("tokenizer", exist_ok=True)

# Move uploaded tokenizer files into folder
import shutil

for fname in uploaded.keys():
    if fname.startswith("tokenizer") or fname in [
        "tokenizer.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
        "vocab.txt",
    ]:
        shutil.move(fname, f"tokenizer/{fname}")


In [6]:
import shutil
import os

src_dir = "tokenizer"
dst_dir = "finbert_financial_tokenizer"

os.makedirs(dst_dir, exist_ok=True)

for fname in ["vocab.txt", "tokenizer.json", "tokenizer_config.json", "special_tokens_map.json"]:
    src = os.path.join(src_dir, fname)
    dst = os.path.join(dst_dir, fname)
    if not os.path.exists(src):
        raise FileNotFoundError(f"Missing {src}, please upload it.")
    shutil.copy(src, dst)

print("Copied tokenizer files successfully.")


Copied tokenizer files successfully.


In [7]:
import zipfile

zip_path = "FinancialPhraseBank-v1.0.zip"

with zipfile.ZipFile(zip_path, "r") as z:
    z.extractall("FinancialPhraseBank")

print("Unzipped to FinancialPhraseBank/")

Unzipped to FinancialPhraseBank/


In [8]:
import os
import pandas as pd

sentences_path = "FinancialPhraseBank/FinancialPhraseBank-v1.0/Sentences_50Agree.txt"

if not os.path.exists(sentences_path):
    raise FileNotFoundError(
        f"{sentences_path} not found. Check your folder name and upload it correctly."
    )

data = pd.read_csv(
    sentences_path,
    sep='.@',
    names=['text', 'label'],
    engine='python',
    encoding='latin-1',
    on_bad_lines='skip'
)

print("Loaded rows:", len(data))
data.head()


Loaded rows: 4846


Unnamed: 0,text,label
0,"According to Gran , the company has no plans t...",neutral
1,Technopolis plans to develop in stages an area...,neutral
2,The international electronic industry company ...,negative
3,With the new production plant the company woul...,positive
4,According to the company 's updated strategy f...,positive


In [9]:
from transformers import BertTokenizerFast

tokenizer_path = "finbert_financial_tokenizer"

tokenizer = BertTokenizerFast.from_pretrained(
    tokenizer_path,
    local_files_only=True
)

print("Loaded custom tokenizer. Vocab size:", len(tokenizer))


Loaded custom tokenizer. Vocab size: 30861


In [10]:
import zipfile
from pathlib import Path
from sklearn.model_selection import train_test_split
from datasets import Dataset

zip_path = "FinancialPhraseBank-v1.0.zip"

assert os.path.exists(zip_path), f"Zip file not found: {zip_path}"

extract_dir = "FinancialPhraseBank"
with zipfile.ZipFile(zip_path, "r") as z:
    z.extractall(extract_dir)

# find Sentences_50Agree.txt inside the extracted folder
base = Path(extract_dir)
txt_files = list(base.rglob("Sentences_50Agree.txt"))
assert len(txt_files) > 0, "Could not find Sentences_50Agree.txt inside the unzipped folder."

sentences_path = str(txt_files[0])
print("Found Sentences_50Agree.txt at:", sentences_path)

out_dir = "data/sentiment_data"
os.makedirs(out_dir, exist_ok=True)

data = pd.read_csv(
    sentences_path,
    sep='.@',
    names=['text', 'label'],
    engine='python',
    encoding='latin-1',
    on_bad_lines='skip'
)

print("Total rows in raw PhraseBank file:", len(data))
print("Sample:")
display(data.head())

train, test = train_test_split(data, test_size=0.2, random_state=0)
train, valid = train_test_split(train, test_size=0.1, random_state=0)

train_path = os.path.join(out_dir, "train.csv")
valid_path = os.path.join(out_dir, "validation.csv")
test_path  = os.path.join(out_dir, "test.csv")

train.to_csv(train_path, sep='\t', index=False)
valid.to_csv(valid_path, sep='\t', index=False)
test.to_csv(test_path,  sep='\t', index=False)

print("Saved splits to:")
print("  ", train_path)
print("  ", valid_path)
print("  ", test_path)

for p in [train_path, valid_path, test_path]:
    print("\nPreview:", p)
    display(pd.read_csv(p, sep='\t').head())

def load_split_csv(path: str) -> Dataset:
    df = pd.read_csv(path, sep='\t')
    return Dataset.from_pandas(df)

train_ds = load_split_csv(train_path)
valid_ds = load_split_csv(valid_path)
test_ds  = load_split_csv(test_path)

print(train_ds[0])



Found Sentences_50Agree.txt at: FinancialPhraseBank/FinancialPhraseBank-v1.0/Sentences_50Agree.txt
Total rows in raw PhraseBank file: 4846
Sample:


Unnamed: 0,text,label
0,"According to Gran , the company has no plans t...",neutral
1,Technopolis plans to develop in stages an area...,neutral
2,The international electronic industry company ...,negative
3,With the new production plant the company woul...,positive
4,According to the company 's updated strategy f...,positive


Saved splits to:
   data/sentiment_data/train.csv
   data/sentiment_data/validation.csv
   data/sentiment_data/test.csv

Preview: data/sentiment_data/train.csv


Unnamed: 0,text,label
0,"After the reporting period , BioTie North Amer...",positive
1,They will cover all Forest Industry 's units a...,negative
2,"( ADP News ) - Nov 28 , 2008 - Finnish power-s...",positive
3,"Following the transaction , Lundbeck has world...",positive
4,A few employees would remain at the Oulu plant...,neutral



Preview: data/sentiment_data/validation.csv


Unnamed: 0,text,label
0,Our in-depth expertise extends to the fields o...,neutral
1,"Profit for the period was EUR 9.8 mn , up from...",positive
2,Favourable currency rates also contributed to ...,positive
3,"Upgrades include a five megapixel camera , voi...",positive
4,"Making matters more difficult , the company sa...",negative



Preview: data/sentiment_data/test.csv


Unnamed: 0,text,label
0,The Bristol Port Company has sealed a one mill...,positive
1,A paper mill in the central Maine town of Madi...,neutral
2,"ALEXANDRIA , Va. , Oct. 23 -- Hans-Otto Scheck...",neutral
3,Altona stated that the private company of Alto...,neutral
4,Registration is required,neutral


{'text': 'After the reporting period , BioTie North American licensing partner Somaxon Pharmaceuticals announced positive results with nalmefene in a pilot Phase 2 clinical trial for smoking cessation ', 'label': 'positive'}


In [11]:
# Load CSVs as datasets and map labels to ids

from datasets import Dataset, DatasetDict

train_df = pd.read_csv("data/sentiment_data/train.csv", sep="\t")
valid_df = pd.read_csv("data/sentiment_data/validation.csv", sep="\t")
test_df  = pd.read_csv("data/sentiment_data/test.csv", sep="\t")

print("Train label sample:", train_df["label"].unique())

# Build label mapping from the actual data
label_list = sorted(train_df["label"].unique().tolist())
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}
print("Label2id:", label2id)

train_df["labels"] = train_df["label"].map(label2id)
valid_df["labels"] = valid_df["label"].map(label2id)
test_df["labels"]  = test_df["label"].map(label2id)

# Keep only text and labels columns in the dataset
train_ds = Dataset.from_pandas(train_df[["text", "labels"]])
valid_ds = Dataset.from_pandas(valid_df[["text", "labels"]])
test_ds  = Dataset.from_pandas(test_df[["text", "labels"]])

dataset = DatasetDict({
    "train": train_ds,
    "validation": valid_ds,
    "test": test_ds,
})

dataset


Train label sample: ['positive' 'negative' 'neutral']
Label2id: {'negative': 0, 'neutral': 1, 'positive': 2}


DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 3488
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 388
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 970
    })
})

In [12]:
from transformers import BertTokenizerFast

tokenizer_path = "finbert_financial_tokenizer"

tokenizer = BertTokenizerFast.from_pretrained(
    tokenizer_path,
    local_files_only=True
)

print("Loaded custom tokenizer. Vocab size:", len(tokenizer))


Loaded custom tokenizer. Vocab size: 30861


In [13]:
# Tokenize using the custom tokenizer

max_length = 128

def tokenize_batch(batch):
    enc = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    enc["labels"] = batch["labels"]
    return enc

tokenized_dataset = dataset.map(tokenize_batch, batched=True, remove_columns=["text"])
tokenized_dataset.set_format(type="torch")

tokenized_dataset


Map:   0%|          | 0/3488 [00:00<?, ? examples/s]

Map:   0%|          | 0/388 [00:00<?, ? examples/s]

Map:   0%|          | 0/970 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3488
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 388
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 970
    })
})

In [14]:
# Load FinBERT and adapt to custom tokenizer

from transformers import AutoModelForSequenceClassification

model_name = "ProsusAI/finbert"

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

# Make embeddings match tokenizer vocab size
model.resize_token_embeddings(len(tokenizer))

model


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30861, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [15]:
# Define TrainingArguments


from transformers import TrainingArguments


training_args = TrainingArguments(
   output_dir="finbert_custom_with_my_tokenizer",


   # train a bit longer but with a smaller LR
   num_train_epochs=8,
   learning_rate=2e-5,


   # smaller per-device batch, but keep similar effective batch via grad accumulation
   per_device_train_batch_size=8,
   per_device_eval_batch_size=32,
   gradient_accumulation_steps=2,


   # regularization / stability
   weight_decay=0.001,
   max_grad_norm=1.0,
   warmup_steps=500,
   label_smoothing_factor=0.05,


   # logging / saving
   logging_dir="./logs",
   logging_steps=50,
   save_steps=500,
   save_total_limit=2,


   # make sure Trainer actually trains and evaluates
   do_train=True,
   do_eval=True,
)


training_args


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=True,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.NO,
eval_use_gather_object=False,
fp1

In [16]:
# Create Trainer

from transformers import Trainer

def compute_metrics(eval_pred):
    import numpy as np
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    accuracy = (preds == labels).astype(float).mean().item()
    return {"accuracy": accuracy}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer


  trainer = Trainer(


<transformers.trainer.Trainer at 0x79e5fcac0d40>

In [17]:
#Train

train_result = trainer.train()
train_result


Step,Training Loss
50,3.1947
100,1.784
150,0.9402
200,0.6933
250,0.572
300,0.4854
350,0.4315
400,0.4248
450,0.3638
500,0.337


TrainOutput(global_step=1744, training_loss=0.4144154781595283, metrics={'train_runtime': 653.8733, 'train_samples_per_second': 42.675, 'train_steps_per_second': 2.667, 'total_flos': 1835479202070528.0, 'train_loss': 0.4144154781595283, 'epoch': 8.0})

In [18]:
# Evaluate on test set

metrics = trainer.evaluate(tokenized_dataset["test"])
print(metrics)


{'eval_loss': 0.5976389050483704, 'eval_accuracy': 0.856701030927835, 'eval_runtime': 6.6292, 'eval_samples_per_second': 146.323, 'eval_steps_per_second': 4.676, 'epoch': 8.0}


In [19]:
# Inference helper

import torch
import numpy as np

def predict_sentiment(text):
    model.eval()
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )

    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)[0].cpu().numpy()
        pred_id = int(np.argmax(probs))

    return {
        "text": text,
        "prediction_id": pred_id,
        "prediction_label": id2label[pred_id],
        "probs": probs,
    }

example = "The company reported strong earnings and raised its full-year guidance."
predict_sentiment(example)


{'text': 'The company reported strong earnings and raised its full-year guidance.',
 'prediction_id': 2,
 'prediction_label': 'positive',
 'probs': array([0.01681711, 0.01568846, 0.9674944 ], dtype=float32)}