In [2]:
!pip install transformers datasets accelerate -q

In [20]:
!pip install -U transformers



In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import kagglehub
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification


In [2]:
# Download latest version
path = kagglehub.dataset_download("ankurzing/sentiment-analysis-for-financial-news")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/ankurzing/sentiment-analysis-for-financial-news?dataset_version_number=5...


100%|██████████| 903k/903k [00:00<00:00, 17.5MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/ankurzing/sentiment-analysis-for-financial-news/versions/5





In [3]:
df = pd.read_csv(f"{path}/all-data.csv", header=None, encoding="latin1")
df.columns = ["label", "headline"]

print(df.head())
print("Dataset size:", len(df))


      label                                           headline
0   neutral  According to Gran , the company has no plans t...
1   neutral  Technopolis plans to develop in stages an area...
2  negative  The international electronic industry company ...
3  positive  With the new production plant the company woul...
4  positive  According to the company 's updated strategy f...
Dataset size: 4846


In [4]:
label_map = {"negative": 0, "neutral": 1, "positive": 2}
df["label_id"] = df["label"].map(label_map)

# train/val split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label_id"])


In [5]:
model_name = "yiyanghkust/finbert-tone"


In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3  # positive/neutral/negative
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [7]:
train_ds = Dataset.from_pandas(train_df[["headline", "label_id"]])
val_ds   = Dataset.from_pandas(val_df[["headline", "label_id"]])


In [8]:
def tokenize(batch):
    return tokenizer(
        batch["headline"],
        padding="max_length",
        truncation=True,
        max_length=64
    )

train_ds = train_ds.map(tokenize, batched=True)
val_ds   = val_ds.map(tokenize, batched=True)

train_ds = train_ds.rename_column("label_id", "labels")
val_ds   = val_ds.rename_column("label_id", "labels")

train_ds.set_format("torch")
val_ds.set_format("torch")


Map:   0%|          | 0/3876 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Map:   0%|          | 0/970 [00:00<?, ? examples/s]

In [9]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    acc  = accuracy_score(labels, preds)
    f1   = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

training_args = TrainingArguments(
    output_dir="./finbert_sentiment_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    report_to="none",
)



In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.441348,0.831959,0.832647
2,No log,0.464854,0.837113,0.837509
3,0.558300,0.520079,0.850515,0.850771


TrainOutput(global_step=729, training_loss=0.4264220589636448, metrics={'train_runtime': 261.7873, 'train_samples_per_second': 44.418, 'train_steps_per_second': 2.785, 'total_flos': 382435352667648.0, 'train_loss': 0.4264220589636448, 'epoch': 3.0})

In [11]:
import numpy as np
from sklearn.metrics import confusion_matrix

pred_output = trainer.predict(val_ds)

y_true = pred_output.label_ids
y_pred = np.argmax(pred_output.predictions, axis=1)

cm = confusion_matrix(y_true, y_pred)
print(cm)

[[105  12   4]
 [ 28 497  51]
 [ 16  52 205]]


In [20]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [21]:
import os
os.makedirs("/content/drive/Shared drives/comp 576/finbert_model", exist_ok=True)

save_path = "/content/drive/Shared drives/comp 576/finbert_model"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print("Model saved to:", save_path)


Model saved to: /content/drive/Shared drives/comp 576/finbert_model
