In [2]:
!pip install opencv-python datasets evaluate transformers

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [3

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from evaluate import load
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import torch

In [4]:
df = pd.read_csv('/content/all-data.csv', encoding='ISO-8859-1')
df.columns = ['Sentiment', 'Text']
df.head()

Unnamed: 0,Sentiment,Text
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [5]:
# Checking for any nul values
df.isnull().sum()

Unnamed: 0,0
Sentiment,0
Text,0


In [6]:
# Map sentiment labels to numerical values
label_mapping = {"negative": 0, "neutral": 1, "positive": 2}
df["label"] = df["Sentiment"].str.lower().map(label_mapping)

In [7]:
# Split into training & testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

In [8]:
# Convert pandas DataFrames to HuggingFace datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

In [9]:
# Tokenization
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["Text"], padding="max_length", truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Map:   0%|          | 0/3876 [00:00<?, ? examples/s]

Map:   0%|          | 0/969 [00:00<?, ? examples/s]

In [11]:
# Fine-tune FinBERT
# Load pre-trained FinBERT model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Training arguments
training_args = TrainingArguments(
    output_dir="/content/finbert-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)



In [12]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

In [13]:
# Train the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhhassanhack123[0m ([33mhhassanhack123-home[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.4371,0.425014
2,0.2833,0.404974
3,0.2359,0.464021


TrainOutput(global_step=729, training_loss=0.42152969099693666, metrics={'train_runtime': 331.1027, 'train_samples_per_second': 35.119, 'train_steps_per_second': 2.202, 'total_flos': 764870705335296.0, 'train_loss': 0.42152969099693666, 'epoch': 3.0})

In [14]:
# Save the final model and tokenizer
trainer.save_model("/content/finbert-finetuned")
tokenizer.save_pretrained("/content/finbert-finetuned")

('/content/finbert-finetuned/tokenizer_config.json',
 '/content/finbert-finetuned/special_tokens_map.json',
 '/content/finbert-finetuned/vocab.txt',
 '/content/finbert-finetuned/added_tokens.json',
 '/content/finbert-finetuned/tokenizer.json')

In [15]:
# Evaluate the fine-tuned model on the test dataset
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

Evaluation Results: {'eval_loss': 0.4640210270881653, 'eval_runtime': 6.9634, 'eval_samples_per_second': 139.156, 'eval_steps_per_second': 8.76, 'epoch': 3.0}


In [16]:
from sklearn.metrics import accuracy_score

# Get predictions
predictions = trainer.predict(tokenized_datasets["test"])
preds = predictions.predictions.argmax(axis=-1)  # Convert logits to class labels

# Compute accuracy
accuracy = accuracy_score(tokenized_datasets["test"]["label"], preds)
print("Accuracy:", accuracy)


Accuracy: 0.8596491228070176
