<a href="https://colab.research.google.com/github/NastasiaMazur/Finance-Sentiment-Analysis/blob/main/albert_xlarge_v2_FinanceIncauditor_sentiment_UPDATED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Downstream Task: Sentiment Analysis

Model: **albert-xlarge-v2**

Dataset: **FinanceInc/auditor_sentiment**

# 1. Activate GPU and Install Dependencies

In [None]:
# Check whether PyTorch can see a CUDA GPU; the boolean result is shown as the cell output.
import torch
torch.cuda.is_available()

True

In [None]:
# Install required libraries
!pip install transformers
!pip install datasets
!pip install bertviz transformers
!pip install transformers[torch]



In [None]:
# Connect Google Drive to store data (mounted under /content/drive/)
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# 2. Preprocess data

In [None]:
# Load data
from datasets import DatasetDict, load_dataset

finance_dataset = load_dataset("FinanceInc/auditor_sentiment")

# Create a smaller training dataset for faster training times.
# The original 'train' split is shuffled ONCE with a fixed seed and three disjoint
# index ranges are carved out of that single permutation. This yields exactly the
# same rows as shuffling three times with the same seed, without repeating the work.
# Note: train, val and test are all drawn from the original 'train' split.
shuffled_train = finance_dataset['train'].shuffle(seed=24)

small_finance_dataset = DatasetDict(
    train=shuffled_train.select(range(500)),       # rows 0-499: training
    val=shuffled_train.select(range(500, 600)),    # rows 500-599: validation
    test=shuffled_train.select(range(600, 700)),   # rows 600-699: testing
)

In [None]:
# Display the split names, features and row counts of the reduced dataset.
small_finance_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 500
    })
    val: Dataset({
        features: ['sentence', 'label'],
        num_rows: 100
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 100
    })
})

In [None]:
# Peek at the first five raw training rows (sentences and their integer labels).
small_finance_dataset['train'][:5]

{'sentence': ['FCC Chairman Kevin Martin said that fair play required extending the same deregulatory rules to the digital subscriber lines that telecom providers use for broadband networks .',
  'Metso Foundries Jyvaskyla Oy will discontinue production on this line by 30 September 2008 , the company said .',
  'Finnish business software group AffectoGenimap Oyj said its net profit halved to 1.2 mln euro ( $ 1.5 mln ) in the first nine months of 2006 from 2.2 mln euro ( $ 2.8 mln ) in the same period of 2005 .',
  'Finnish financial software developer Basware Oyj said today it will provide its invoice automation ( IA ) solution to an unnamed major retail company in the USA in a deal , worth more than EUR300 ,000 .',
  'According to the company , a decision in the issue will be made in the summer of 2010 , at the earliest , and in the summer of 2011 , at the latest .'],
 'label': [1, 1, 0, 2, 1]}

In [None]:
# Set the ALBERT (albert-xlarge-v2) tokenizer; print its configuration and vocabulary size
from transformers import AlbertTokenizer                         #
tokenizer = AlbertTokenizer.from_pretrained('albert-xlarge-v2')    #
print(tokenizer)
print(len(tokenizer))

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

AlbertTokenizer(name_or_path='albert-xlarge-v2', vocab_size=30000, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '<unk>', 'sep_token': '[SEP]', 'pad_token': '<pad>', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}
30000


In [None]:
def tokenize_function(examples):
    """Tokenize the ``"sentence"`` field of a batch with the notebook-level ``tokenizer``.

    Args:
        examples: mapping that provides the raw texts under the key ``"sentence"``.

    Returns:
        The tokenizer output for those texts, produced with padding and
        truncation both switched on.
    """
    sentences = examples["sentence"]
    encoded = tokenizer(sentences, truncation=True, padding=True)
    return encoded

# Tokenize every split in batches of 16, then drop the raw text and rename the
# target column to "labels". After this cell `small_tokenized_dataset` no longer
# has the "sentence" or "label" columns; the raw values remain available in
# `small_finance_dataset`.
small_tokenized_dataset = (
    small_finance_dataset
    .map(tokenize_function, batched=True, batch_size=16)
    .remove_columns(["sentence"])
    .rename_column("label", "labels")
)
# Return torch tensors when indexing the dataset.
small_tokenized_dataset.set_format("torch")

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Inspect the first three tokenized training rows (labels plus padded input ids).
small_tokenized_dataset['train'][0:3]


{'labels': tensor([1, 1, 0]),
 'input_ids': tensor([[    2, 18213,  1757,  3480,  1189,    87,    30,  1768,   418,  1390,
           8176,    14,   205,   121,    99, 15628,  7496,  1761,    20,    14,
           1888,    13, 20330,   139,  1560,    30, 18712, 13488,   275,    26,
          22831,  5540,    13,     9,     3,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0],
         [    2,   798,   656,   216,  2829,   487,    93,  1385,  2397,   531,
             13,  7452,   129,  1460, 13391,  4185,   637,    27,    48,   293,
             34,   712,   299,   570,    13,    15,    14,   237,    87,    13,
              9,     3,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0, 

In [None]:
import torch
from torch.utils.data import DataLoader

# Build one unshuffled DataLoader (batch size 16) per split: train first, then validation.
train_dataloader, eval_dataloader = (
    DataLoader(small_tokenized_dataset[split_name], batch_size=16)
    for split_name in ('train', 'val')
)

# 3. Training the model

In [None]:
# `transformers.AdamW` is deprecated and no longer shipped by recent transformers
# releases. PyTorch's own AdamW is the replacement and accepts the same
# `params`, `lr` and `weight_decay` arguments used in this notebook.
# It is imported under the same name so the cells below run unchanged.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.notebook import tqdm

In [None]:
# Define ALBERT (albert-xlarge-v2) with a 3-class classification head as our base model.
# Note: the Trainer cell further down loads the same checkpoint again and rebinds `model`.
from transformers import AlbertForSequenceClassification #
model = AlbertForSequenceClassification.from_pretrained("albert-xlarge-v2", num_labels=3)    #


pytorch_model.bin:   0%|          | 0.00/236M [00:00<?, ?B/s]

In [None]:
# Optimizer and linear LR schedule for a manual training loop.
# The schedule length is derived from `num_epochs` rather than a repeated literal,
# so changing the epoch count keeps the schedule in sync.
# Note: the Trainer cell later rebinds `optimizer` and passes `None` as the scheduler,
# so the objects created here are not the ones the Trainer uses.
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)



In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU, and move the
# model there. The last expression is left bare so the cell displays the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

AlbertModel(
  (embeddings): AlbertEmbeddings(
    (word_embeddings): Embedding(30000, 128, padding_idx=0)
    (position_embeddings): Embedding(512, 128)
    (token_type_embeddings): Embedding(2, 128)
    (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0, inplace=False)
  )
  (encoder): AlbertTransformer(
    (embedding_hidden_mapping_in): Linear(in_features=128, out_features=2048, bias=True)
    (albert_layer_groups): ModuleList(
      (0): AlbertLayerGroup(
        (albert_layers): ModuleList(
          (0): AlbertLayer(
            (full_layer_layer_norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
            (attention): AlbertAttention(
              (query): Linear(in_features=2048, out_features=2048, bias=True)
              (key): Linear(in_features=2048, out_features=2048, bias=True)
              (value): Linear(in_features=2048, out_features=2048, bias=True)
              (attention_dropout): Dropout(p=0, inplace=False

In [None]:
import numpy as np
# Define a new Trainer with all the objects we constructed so far
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from transformers import AutoModelForSequenceClassification
#model = AlbertModel.from_pretrained("albert-xlarge-v2", num_labels=3)         #
from transformers import AlbertForSequenceClassification

# Seed all RNGs *before* loading the model. The classification head is newly
# initialized at load time (see the "newly initialized" message printed by this cell),
# so its starting weights would otherwise differ between runs even though the
# training arguments carry a fixed seed.
from transformers import set_seed

SEED = 224
set_seed(SEED)

# Fresh ALBERT encoder with a 3-class classification head.
model = AlbertForSequenceClassification.from_pretrained("albert-xlarge-v2", num_labels=3)

# Evaluate and checkpoint once per epoch; at the end reload the checkpoint with the
# lowest validation loss (metric 'loss', lower is better).
arguments = TrainingArguments(
    output_dir="store_the_checkpoints_albert-xlarge_1",                           #ADJUST FOLDER !!!   #
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    lr_scheduler_type='cosine',
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    greater_is_better=False,
    seed=SEED
)


# Stop when the monitored metric has not improved for 3 consecutive evaluations.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0
)


# Take the learning rate from `arguments` so the optimizer and the training
# configuration cannot drift apart (both are 2e-5).
optimizer = AdamW(model.parameters(), lr=arguments.learning_rate, weight_decay=0.01)
def compute_metrics(eval_pred):
    """Compute classification accuracy at the end of an evaluation pass.

    Args:
        eval_pred: object exposing ``predictions`` (per-class scores, classes on
            the last axis) and ``label_ids`` (the reference class ids).

    Returns:
        A dict with the single key ``"accuracy"``: the fraction of examples whose
        highest-scoring class equals the reference label.
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=-1)
    is_correct = predicted_classes == eval_pred.label_ids
    return {"accuracy": np.mean(is_correct)}

# Assemble the Trainer from the objects built so far.
# The scheduler slot of `optimizers` is None, so the Trainer builds its own schedule
# for the supplied optimizer.
trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=small_tokenized_dataset['train'],
    eval_dataset=small_tokenized_dataset['val'], # change to test when you do your final evaluation!
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),
    # Register the early-stopping callback. Without this argument the callback object
    # is never consulted and training always runs for the full number of epochs.
    callbacks=[early_stopping_callback]
)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-xlarge-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune the model; per-epoch validation loss and accuracy are reported in the output table.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.007041,0.6
2,No log,0.911098,0.66
3,No log,0.835303,0.59
4,No log,0.802033,0.7
5,No log,0.779806,0.69
6,No log,0.830615,0.7
7,No log,0.93638,0.62
8,No log,0.837807,0.7
9,No log,0.849479,0.71
10,No log,0.849135,0.71


TrainOutput(global_step=320, training_loss=0.6130453109741211, metrics={'train_runtime': 966.0597, 'train_samples_per_second': 5.176, 'train_steps_per_second': 0.331, 'total_flos': 137846390229072.0, 'train_loss': 0.6130453109741211, 'epoch': 10.0})

In [None]:
# Run the trained model on the validation split.
# `print` dumps the whole PredictionOutput, i.e. the raw per-class scores of every example.
results = trainer.predict(small_tokenized_dataset['val'])
print(results)

PredictionOutput(predictions=array([[-1.2687045e+00,  4.9133632e-01, -8.8280616e-03],
       [-1.8051389e+00,  1.1618570e+00, -6.8173957e-01],
       [-1.8005830e+00,  9.3178982e-01, -4.9899712e-01],
       [-1.8641576e+00,  1.0019222e+00, -5.5984575e-01],
       [ 1.5214349e-02, -3.8031203e-01,  7.0843291e-01],
       [-1.9080950e+00,  1.0053567e+00, -6.1798245e-01],
       [-1.7820301e+00,  1.1851892e+00, -7.1042240e-01],
       [-5.3204542e-01,  2.7635562e-01,  3.2854167e-01],
       [-1.7244110e+00,  9.5054191e-01, -5.2591228e-01],
       [-3.0176294e-01,  1.2035610e-01,  5.6523800e-01],
       [-1.8952618e+00,  1.8559334e+00, -8.7545097e-01],
       [-1.8558878e+00,  1.5203997e+00, -8.1018150e-01],
       [-1.6970119e+00,  1.1197305e+00, -6.2665087e-01],
       [-1.7606114e+00,  1.2967255e+00, -7.3641062e-01],
       [-1.3944485e+00,  8.8771647e-01, -3.6871853e-01],
       [-1.8867441e+00,  8.3135152e-01, -4.7588664e-01],
       [-1.8342746e+00,  1.3713988e+00, -7.3573476e-01],
  

In [None]:
test_str = "The company's strong quarterly earnings report resulted in a surge in stock prices, reflecting investor confidence in its future prospects."

# Class names in label-id order. The training samples printed earlier in this notebook
# carry label 0 for the "net profit halved" sentence, label 2 for the newly won deal
# and label 1 for purely factual statements. So id 1 is NEUTRAL and id 2 is POSITIVE.
SENTIMENT_LABELS = ["NEGATIVE", "NEUTRAL", "POSITIVE"]

fine_tuned_model = AutoModelForSequenceClassification.from_pretrained("store_the_checkpoints_albert-xlarge_1/checkpoint-160") #ADJUST FOLDER !!!  500/600/700-folder320'  #
model_inputs = tokenizer(test_str, return_tensors="pt")
# Inference only: skip building the autograd graph.
with torch.no_grad():
    logits = fine_tuned_model(**model_inputs).logits
prediction = torch.argmax(logits)
print(SENTIMENT_LABELS[prediction.item()])

NEUTRAL


In [None]:
# Run the trained model on the held-out test split (this rebinds `results`).
# `print` dumps the whole PredictionOutput, i.e. the raw per-class scores of every example.
results = trainer.predict(small_tokenized_dataset['test'])
print(results)

PredictionOutput(predictions=array([[-1.7898917 ,  1.2717791 , -0.73453075],
       [-1.7857322 ,  1.1703728 , -0.70032316],
       [-1.7824513 ,  1.3856863 , -0.76789224],
       [-1.7889098 ,  1.039094  , -0.5949826 ],
       [ 0.29243526, -0.62839067,  0.77413535],
       [-1.8629041 ,  1.1701026 , -0.68270564],
       [-0.01064318, -0.29362682,  0.7441394 ],
       [-1.835591  ,  1.037164  , -0.61298805],
       [-1.8741895 ,  1.1864417 , -0.75199157],
       [-0.06206595, -0.11877836,  0.6562964 ],
       [-1.776291  ,  1.1106576 , -0.63820314],
       [-1.8795663 ,  1.1252416 , -0.6492351 ],
       [-1.7404974 ,  1.270117  , -0.72318393],
       [-1.8734955 ,  1.0295188 , -0.60557294],
       [-1.7741578 ,  0.924975  , -0.5779422 ],
       [-1.783206  ,  1.1811452 , -0.67916834],
       [-1.7462403 ,  1.4445971 , -0.7219101 ],
       [-1.8247559 ,  1.0730048 , -0.6574823 ],
       [-1.8187633 ,  0.9693945 , -0.53051215],
       [-1.7814837 ,  0.95908225, -0.5231291 ],
       [-0.

In [None]:
# Look up the raw sentence and gold label of test example 5.
small_finance_dataset['test'][5]

{'sentence': "The government has instead proposed an exchange of the state 's stake in LMT to TeliaSonera 's stake in Lattelecom .",
 'label': 1}

In [None]:
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained("store_the_checkpoints_albert-xlarge_1/checkpoint-160") #ADJUST FOLDER !!!  500/600/700-folder320     #

# The raw text has to come from `small_finance_dataset`: the "sentence" column was
# removed from `small_tokenized_dataset` right after tokenization, so looking it up
# there fails.
test_sentences = list(small_finance_dataset['test']['sentence'])
model_inputs = tokenizer(test_sentences, padding=True, truncation=True, return_tensors='pt')

# Inference only: without no_grad the autograd graph for every layer's hidden states
# of the whole test batch would be kept in memory.
with torch.no_grad():
    outputs = fine_tuned_model(**model_inputs, output_hidden_states=True)

In [None]:
from torch.utils.tensorboard import SummaryWriter
import os
import re
import torch
import tensorflow as tf
import tensorboard as tb

In [None]:
import torch

# Export one embedding per test sentence and per layer for the TensorBoard projector.
path = "store_the_checkpoints_distilgpt2_1/results_vis_albert-xlarge_2"                #
# NOTE(review): the parent folder still says "distilgpt2" while the checkpoints are
# written to "store_the_checkpoints_albert-xlarge_1" — confirm this is the intended location.
# makedirs also creates a missing parent folder and tolerates re-runs.
os.makedirs(path, exist_ok=True)

# Metadata rows (sentence, gold label as text). They are read from the untokenized
# dataset because `small_tokenized_dataset` no longer has a "sentence" column and its
# target column is called "labels". Built once, since it is the same for every layer.
test_split = small_finance_dataset['test']
labels = [
    [sentence, str(label)]
    for sentence, label in zip(test_split['sentence'], test_split['label'])
]

# Position of the last real (non-padding) token of every example. The tokenizer pads
# on the right, so that is "number of attended tokens - 1". Using this index instead
# of the first padding position also covers the longest sentence, which has no
# padding at all and previously produced no embedding (leaving fewer embeddings than
# metadata rows).
attention_mask = model_inputs['attention_mask']
last_token_positions = attention_mask.sum(dim=1) - 1
example_indices = torch.arange(attention_mask.shape[0])

for layer, hidden_state in enumerate(outputs['hidden_states']):
    layer_dir = path + '/layer_' + str(layer)
    os.makedirs(layer_dir, exist_ok=True)

    # One vector per example: the hidden state at that example's last real token.
    tensors = hidden_state[example_indices, last_token_positions].detach()

    writer = SummaryWriter(layer_dir)
    writer.add_embedding(tensors, metadata=labels, metadata_header=['Sentence','Emotion'])
    # Flush and release the event file before moving on to the next layer.
    writer.close()


TensorFlow Embedding Projector API.

# 4. Analyzing new data with the model

You can also use pipeline (uncomment a cell with huggingface above):

In [None]:
# Upload the model to the Hub
# NOTE(review): presumably requires being logged in to the Hugging Face Hub first;
# no login step is visible in this notebook — confirm before running.
trainer.push_to_hub()

In [None]:
# Run inferences with your new model using Pipeline
from transformers import pipeline

sentiment_model = pipeline(model="") # Add your saved model in "" (placeholder: the empty string must be replaced before running)

# Classify one example sentence with the loaded pipeline.
sentiment_model(["The company's strong quarterly earnings report resulted in a surge in stock prices, reflecting investor confidence in its future prospects."])