#### Library

In [1]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, EarlyStoppingCallback
from transformers import Trainer
from datasets import Dataset
import pandas as pd
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


#### Model

In [2]:
MODEL_NAME = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2, ignore_mismatched_sizes=True)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
df = pd.read_csv("G:\Malk\Qafza\Final_Project\data\cleaned_news.csv") 
df.drop(columns=['date', 'news', 'neg', 'neu', 'pos', 'compound'],inplace=True)
df = df.rename(columns={'sentiment': 'label', 'clean_text':'text'})

df=df[:2000]
label_map = {"NEGATIVE": 0, "POSITIVE": 1}
df["label"] = df["label"].map(label_map)
dataset = Dataset.from_pandas(df)

In [13]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 2000/2000 [00:00<00:00, 4129.16 examples/s]


#### Training

In [14]:
from datasets import DatasetDict

dataset = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = dataset["train"]
val_dataset = dataset["test"]

In [15]:
training_args = TrainingArguments(
    output_dir="./finbert_results",
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Save model at the end of each epoch
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    num_train_epochs=5,  # Set a high number of epochs to allow early stopping to take effect
    weight_decay=0.01,
    logging_dir="./finbert_logs",
    load_best_model_at_end=True,  # Load best model when early stopping is triggered
    metric_for_best_model="eval_loss",  # Use evaluation loss for stopping
    greater_is_better=False  # Lower loss is better
)



In [17]:
# Define a function to compute metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

for param in model.base_model.parameters():
    param.requires_grad = False  

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Stops if no improvement for 3 epochs
)

In [18]:
# Start training
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.486126,0.785,0.612903,0.204301,0.306452
2,No log,0.485298,0.7825,0.6,0.193548,0.292683
3,No log,0.486652,0.7925,0.613636,0.290323,0.394161
4,0.535200,0.48764,0.795,0.634146,0.27957,0.38806
5,0.535200,0.487882,0.795,0.634146,0.27957,0.38806


TrainOutput(global_step=800, training_loss=0.52701322555542, metrics={'train_runtime': 6991.4494, 'train_samples_per_second': 1.144, 'train_steps_per_second': 0.114, 'total_flos': 2104888442880000.0, 'train_loss': 0.52701322555542, 'epoch': 5.0})

In [19]:
model.save_pretrained('Model')
tokenizer.save_pretrained('Model')

('Model\\tokenizer_config.json',
 'Model\\special_tokens_map.json',
 'Model\\vocab.txt',
 'Model\\added_tokens.json',
 'Model\\tokenizer.json')