<a href="https://colab.research.google.com/github/NastasiaMazur/Finance-Sentiment-Analysis/blob/main/distilbert_base_uncased_FinanceInc_auditor_sentiment__UPDATED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Downstream Task: Sentiment Analysis

Model: **distilbert-base-uncased**

Dataset: **FinanceInc/auditor_sentiment**

# 1. Activate GPU and Install Dependencies

In [1]:
#check if GPU is available
import torch
torch.cuda.is_available()

True

In [2]:
# Install required libraries
!pip install datasets
!pip install transformers
!pip install bertviz transformers
!pip install transformers[torch]
#!apt-get install git-lfs



In [3]:
# Conntext Google Drive so data can be stored there
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# 2. Preprocess data

In [4]:
# Load data
from datasets import load_dataset
finance_dataset = load_dataset("FinanceInc/auditor_sentiment")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
# Create a smaller training dataset for faster training times
from datasets import DatasetDict

small_finance_dataset = DatasetDict(
    train=finance_dataset['train'].shuffle(seed=24).select(range(500)), # for training
    val=finance_dataset['train'].shuffle(seed=24).select(range(500, 600)), #  for validation
    test=finance_dataset['train'].shuffle(seed=24).select(range(600, 700)) # for testing
)

In [6]:
small_finance_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 500
    })
    val: Dataset({
        features: ['sentence', 'label'],
        num_rows: 100
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 100
    })
})

In [7]:
small_finance_dataset['train'][:5]

{'sentence': ['FCC Chairman Kevin Martin said that fair play required extending the same deregulatory rules to the digital subscriber lines that telecom providers use for broadband networks .',
  'Metso Foundries Jyvaskyla Oy will discontinue production on this line by 30 September 2008 , the company said .',
  'Finnish business software group AffectoGenimap Oyj said its net profit halved to 1.2 mln euro ( $ 1.5 mln ) in the first nine months of 2006 from 2.2 mln euro ( $ 2.8 mln ) in the same period of 2005 .',
  'Finnish financial software developer Basware Oyj said today it will provide its invoice automation ( IA ) solution to an unnamed major retail company in the USA in a deal , worth more than EUR300 ,000 .',
  'According to the company , a decision in the issue will be made in the summer of 2010 , at the earliest , and in the summer of 2011 , at the latest .'],
 'label': [1, 1, 0, 2, 1]}

In [8]:
# Set DistilBERT tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print(tokenizer)
print(len(tokenizer))

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
30522


In [9]:
def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding=True, truncation=True)

small_tokenized_dataset = small_finance_dataset.map(tokenize_function, batched=True, batch_size=16)
small_tokenized_dataset = small_tokenized_dataset.remove_columns(["sentence"])
small_tokenized_dataset = small_tokenized_dataset.rename_column("label", "labels")
small_tokenized_dataset.set_format("torch")

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [10]:
small_tokenized_dataset['train'][0:2]


{'labels': tensor([1, 1]),
 'input_ids': tensor([[  101, 14420,  3472,  4901,  3235,  2056,  2008,  4189,  2377,  3223,
           8402,  1996,  2168,  4315, 13910, 20350,  2100,  3513,  2000,  1996,
           3617,  4942, 29234,  2099,  3210,  2008, 18126, 11670,  2224,  2005,
          19595,  6125,  1012,   102,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0],
         [  101, 15253,  2080,  2179,  5134,  1046,  2100, 12044,  4801,  2721,
           1051,  2100,  2097, 12532, 16778, 11231,  2063,  2537,  2006,  2023,
           2240,  2011,  2382,  2244,  2263,  1010,  1996,  2194,  2056,  1012,
            102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,    

In [11]:
import torch
from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_tokenized_dataset['train'], batch_size=16)
eval_dataloader = DataLoader(small_tokenized_dataset['val'], batch_size=16)

# 3. Training the model

In [12]:

from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import pipeline
from tqdm.notebook import tqdm

In [13]:
# Define DistilBERT as our base model:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
num_epochs = 3
num_training_steps = 3 * len(train_dataloader)
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)



In [15]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

Uncomment if you want to log in to your Hugging Face account:

In [17]:
# Log in to your Hugging Face account
# Get your API token here https://huggingface.co/settings/token
#from huggingface_hub import notebook_login

#notebook_login()

In [18]:
# Define a new Trainer with all the objects we constructed so far
import numpy as np
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained('distilroberta-base', num_labels=3)

arguments = TrainingArguments(
    output_dir="store_the_checkpoints_distilbert_3",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=10,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    lr_scheduler_type='cosine',
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    greater_is_better=False,
    seed=224
    #push_to_hub=True, # uncomment to push into hugginhface hub
)


early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0
)


optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)


def compute_metrics(eval_pred):
    """Called at the end of validation. Gives accuracy"""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # calculates the accuracy
    return {"accuracy": np.mean(predictions == labels)}

trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=small_tokenized_dataset['train'],
    eval_dataset=small_tokenized_dataset['val'], # change to test when you do your final evaluation!
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None)
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [19]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.965448,0.66
2,No log,0.889382,0.66
3,No log,0.891198,0.63
4,No log,0.96048,0.64
5,No log,0.983399,0.65
6,No log,1.065786,0.67
7,No log,1.002142,0.65
8,No log,1.020725,0.66
9,No log,1.040846,0.65
10,No log,1.04017,0.65


Checkpoint destination directory store_the_checkpoints_distilbert_3/checkpoint-32 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory store_the_checkpoints_distilbert_3/checkpoint-64 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory store_the_checkpoints_distilbert_3/checkpoint-96 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory store_the_checkpoints_distilbert_3/checkpoint-128 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory store_the_checkpoints_distilbert_3/checkpoint-160 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory store_the_checkpoints_distilbert_3/checkpoint-192 already exists and is non-empty. Saving will proceed but saved results

TrainOutput(global_step=320, training_loss=0.6518671035766601, metrics={'train_runtime': 114.069, 'train_samples_per_second': 43.833, 'train_steps_per_second': 2.805, 'total_flos': 93543314763096.0, 'train_loss': 0.6518671035766601, 'epoch': 10.0})

In [20]:
results = trainer.predict(small_tokenized_dataset['val'])
print(results)

PredictionOutput(predictions=array([[-0.9883296 ,  1.0663089 , -0.19555055],
       [-1.0075653 ,  1.0238501 , -0.20102467],
       [-0.9743034 ,  0.8903042 , -0.10181137],
       [-1.0141464 ,  1.1176709 , -0.28669825],
       [-0.80501634,  0.5868265 ,  0.04448523],
       [-1.0918534 ,  1.1999589 , -0.31343073],
       [-1.1213168 ,  1.2949274 , -0.3600805 ],
       [-1.0289601 ,  1.1206353 , -0.22335055],
       [-1.0982958 ,  1.2857779 , -0.3293508 ],
       [-0.7942334 ,  0.5759672 ,  0.04039393],
       [-1.099818  ,  1.2173591 , -0.32070878],
       [-1.0547999 ,  1.1025351 , -0.31002665],
       [-0.96267444,  0.90857536, -0.12384115],
       [-0.864079  ,  0.84005016, -0.1228763 ],
       [-0.84884995,  0.7809959 , -0.10800491],
       [-1.1136881 ,  1.2661328 , -0.30569682],
       [-1.0974951 ,  1.2130656 , -0.2755209 ],
       [-1.074966  ,  1.1995877 , -0.31873593],
       [-1.0170207 ,  1.0441893 , -0.21396542],
       [-0.97989994,  1.0250355 , -0.16959293],
       [-1.

In [21]:
test_str = "The company's strong quarterly earnings report resulted in a surge in stock prices, reflecting investor confidence in its future prospects."

fine_tuned_model = AutoModelForSequenceClassification.from_pretrained("store_the_checkpoints_distilbert_3/checkpoint-320") # pass checkpoint to the model
model_inputs = tokenizer(test_str, return_tensors="pt")

prediction = torch.argmax(fine_tuned_model(**model_inputs).logits)
print(["NEGATIVE", "POSITIVE", "NEUTRAL"][prediction])

NEUTRAL


In [22]:
results = trainer.predict(small_tokenized_dataset['test'])
print(results)

PredictionOutput(predictions=array([[-1.0096958 ,  1.1633741 , -0.36124346],
       [-0.91207147,  0.8892056 , -0.13056034],
       [-1.1081475 ,  1.2072246 , -0.31708384],
       [-1.029619  ,  1.1084139 , -0.2846068 ],
       [-0.83741546,  0.6304093 ,  0.07402465],
       [-1.0062007 ,  1.0679936 , -0.21069936],
       [-0.85467976,  0.6351199 ,  0.03737538],
       [-1.0081426 ,  1.1209158 , -0.26125833],
       [-0.89479905,  0.77574944, -0.06085769],
       [-1.0068668 ,  1.0513003 , -0.17720805],
       [-1.031467  ,  1.1454219 , -0.25403574],
       [-1.059274  ,  1.2410553 , -0.30050936],
       [-1.0226493 ,  1.153878  , -0.3029826 ],
       [-1.0365709 ,  1.0695225 , -0.21671854],
       [-0.8722071 ,  0.746572  , -0.0494629 ],
       [-1.0736469 ,  1.0528276 , -0.2051871 ],
       [-0.8821162 ,  0.7152684 , -0.02819435],
       [-1.1030569 ,  1.3090581 , -0.35124192],
       [-1.0420303 ,  1.0877094 , -0.24052937],
       [-0.93578243,  0.97898924, -0.19884661],
       [-0.

In [23]:
small_finance_dataset['test'][5]

{'sentence': "The government has instead proposed an exchange of the state 's stake in LMT to TeliaSonera 's stake in Lattelecom .",
 'label': 1}

In [24]:
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained("store_the_checkpoints_distilbert_3/checkpoint-320")

model_inputs = tokenizer(small_tokenized_dataset['test']['sentence'], padding=True, truncation=True, return_tensors='pt')

outputs = fine_tuned_model(**model_inputs, output_hidden_states=True)

In [25]:
import os
import torch
from torch.utils.tensorboard import SummaryWriter

path = "/content/drive/MyDrive/results_vis_distilbert_2"
layer = 0

if not os.path.exists(path):
    os.mkdir(path)

while layer in range(len(outputs['hidden_states'])):
    if not os.path.exists(path+'/layer_' + str(layer)):
        os.mkdir(path+'/layer_' + str(layer))

    tensors = []
    labels = []

    for example in range(len(outputs['hidden_states'][layer])):
        sp_token_position = 0
        for token in model_inputs['input_ids'][example]:
            if token != 0:
                sp_token_position += 1
            else:
                tensor = outputs['hidden_states'][layer][example][sp_token_position]
                tensors.append(tensor)
                label = [small_tokenized_dataset['test']['sentence'][example], str(small_tokenized_dataset['test']['label'][example])]
                labels.append(label)
                break

    #print("Length of tensors:", len(tensors))
    #print("Length of labels:", len(labels))

    global_step = layer  # Set global_step to the current layer number
    writer = SummaryWriter(path+'/layer_' + str(layer))
    writer.add_embedding(torch.stack(tensors), metadata=labels, metadata_header=['Sentence','Emotion'], global_step=global_step)

    layer += 1




 TensorFlow Embedding Projector [API](https://projector.tensorflow.org/).

____________________



To avoid AssertionError:(#labels should equal with #data points) and ensure that labels are extracted consistently with the corresponding tensors ensure that you're extracting labels consistently with how you're extracting tensors.

To avoid a warning: (Embedding dir exists, did you set global_step for add_embedding()? ) add "global_step = layer"

# 4. Analyzing new data with the model

You can also use pipeline (uncomment a cell with huggingface above):

In [26]:
# Upload the model to the Hub
trainer.push_to_hub()

ValueError: Token is required (write-access action) but no token found. You need to provide a token or be logged in to Hugging Face with `huggingface-cli login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.

In [None]:
# Run inferences with your new model using Pipeline
from transformers import pipeline

sentiment_model = pipeline(model="") # Add your saved mode in ""

sentiment_model(["The company's strong quarterly earnings report resulted in a surge in stock prices, reflecting investor confidence in its future prospects."])