In [1]:
import os
import random
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# 1. Load raw data
def load_financial_phrasebank(filepath):
    """Parse a ``<sentence>@<label>`` text file into sentence/label records.

    Each non-blank line is split on its *last* ``@`` so that sentences which
    themselves contain ``@`` are kept intact. Blank or whitespace-only lines
    (for example a trailing newline at the end of the file) are skipped.

    Args:
        filepath: Path to the text file; it is decoded as ISO-8859-1.

    Returns:
        A list of ``{"sentence": str, "label": str}`` dicts in file order,
        with surrounding whitespace stripped from both fields.

    Raises:
        ValueError: If a non-blank line contains no ``@`` separator.
    """
    records = []
    with open(filepath, encoding="iso-8859-1") as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                # Nothing to parse; unpacking such a line used to crash.
                continue
            sentence, separator, label = line.rpartition("@")
            if not separator:
                raise ValueError(
                    f"{filepath}:{line_number}: expected '<sentence>@<label>', got {line!r}"
                )
            records.append({
                "sentence": sentence.strip(),
                "label": label.strip(),
            })
    return records

# Pieces from which the dataset file name is assembled.
path_to_files = "data/FinancialPhraseBank-v1.0/"
files_base_name = "Sentences_"
files_ends_with = ".txt"
possible_datasets = ["50Agree", "66Agree", "75Agree", "AllAgree"]

# Variant to load; any entry of possible_datasets can be used here.
selected_dataset = "50Agree"
selected_file_name = files_base_name + selected_dataset + files_ends_with
file_path = os.path.join(path_to_files, selected_file_name)
data = load_financial_phrasebank(file_path)

# 2. Convert to DataFrame
df = pd.DataFrame(data)

# 3. Label encoding
label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {v: k for k, v in label2id.items()}

# Lower-case the labels once, keep only rows whose label is a known class,
# then replace the label text with its integer id. Every step of the chain
# returns a new frame, so nothing is ever assigned into a filtered slice of
# another frame (the pattern behind pandas' SettingWithCopyWarning).
df = (
    df.assign(label=df["label"].str.lower())
    .loc[lambda frame: frame["label"].isin(list(label2id))]
    .assign(label=lambda frame: frame["label"].map(label2id))
)

# 4. Train/test split
# Stratify on the encoded label so both splits keep the class proportions;
# the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

# 5. Tokenization
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the ``"sentence"`` entry of a batch of examples.

    Args:
        examples: Mapping whose ``"sentence"`` value holds the text(s) to
            encode.

    Returns:
        The tokenizer output, with every sequence truncated and padded to
        exactly 128 tokens.
    """
    encoded = tokenizer(
        examples["sentence"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded

def _to_model_dataset(frame):
    """Convert one split DataFrame into a tokenized, torch-formatted Dataset."""
    dataset = Dataset.from_pandas(frame)
    dataset = dataset.map(tokenize_function, batched=True)
    dataset = dataset.rename_column("label", "labels")
    # set_format mutates the dataset in place; only these three columns are
    # returned (as torch tensors) when the dataset is indexed.
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return dataset


train_dataset = _to_model_dataset(train_df)
test_dataset = _to_model_dataset(test_df)

# 6. Load model
# The classification head is not stored in the "bert-base-uncased" checkpoint
# and is therefore randomly initialised when the model is built. Seed every
# RNG *before* instantiating the model so repeated runs start from the same
# head weights.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label2id),  # stays in sync with the label mapping
    id2label=id2label,
    label2id=label2id,
)

# 7. Define training arguments
training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation schedule
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Checkpointing and logging cadence
    save_steps=500,
    save_total_limit=1,
    logging_steps=500,
)


# 8. Metrics function
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each row
            is the argmax of ``logits`` along axis 1.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"macro_f1"``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    # Pin the label set instead of letting scikit-learn infer it from the
    # data: if a class occurs in neither the targets nor the predictions, the
    # inferred classes would no longer line up with the three class names.
    class_ids = sorted(id2label)
    report = classification_report(
        labels,
        preds,
        labels=class_ids,
        target_names=[id2label[class_id] for class_id in class_ids],
        output_dict=True,
    )
    return {
        # Computed directly so the value does not depend on whether the
        # report dict exposes an "accuracy" entry for this label set.
        "accuracy": float(np.mean(np.asarray(preds) == np.asarray(labels))),
        "macro_f1": report["macro avg"]["f1-score"],
    }

# 9. Trainer
trainer = Trainer(
    # What to train and how
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    # Data splits
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # Metrics reported by evaluate()
    compute_metrics=compute_metrics,
)

# 10. Train
trainer.train()

# 11. Evaluate
results = trainer.evaluate()
print("Final evaluation:", results)

# Optional: detailed classification report
preds_output = trainer.predict(test_dataset)
preds = np.argmax(preds_output.predictions, axis=1)
y_true = preds_output.label_ids

# Take the class order and names from id2label (the mapping handed to the
# model) and pass labels= explicitly, so each name stays attached to its id
# even if a class is missing from both the targets and the predictions.
class_ids = sorted(id2label)
print(
    classification_report(
        y_true,
        preds,
        labels=class_ids,
        target_names=[id2label[class_id] for class_id in class_ids],
    )
)


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 3876/3876 [00:01<00:00, 3150.53 examples/s]
Map: 100%|██████████| 970/970 [00:00<00:00, 3154.54 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
500,0.4156




Final evaluation: {'eval_loss': 0.6137823462486267, 'eval_accuracy': 0.8422680412371134, 'eval_macro_f1': 0.8327633154306349, 'eval_runtime': 19.3197, 'eval_samples_per_second': 50.208, 'eval_steps_per_second': 3.157, 'epoch': 4.0}




              precision    recall  f1-score   support

    negative       0.79      0.89      0.84       121
     neutral       0.90      0.84      0.87       576
    positive       0.76      0.83      0.79       273

    accuracy                           0.84       970
   macro avg       0.82      0.85      0.83       970
weighted avg       0.85      0.84      0.84       970

