<a href="https://colab.research.google.com/github/LordLean/Extracting-Green-Bonds-Use-of-Proceeds/blob/main/QA_FinBert_Finetuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Weights and Biases

In [None]:
!pip install wandb

import wandb
wandb.login()

In [3]:
# Environment variables for the Weights & Biases integration. They are set
# before the Trainer is created in a later cell.
# Name of the W&B project for this notebook's runs.
%env WANDB_PROJECT=squad_question_answering
# NOTE(review): presumably asks the integration to upload the trained model —
# confirm against the transformers W&B callback documentation.
%env WANDB_LOG_MODEL=true
# NOTE(review): presumably enables logging of both gradients and parameters —
# confirm against the transformers W&B callback documentation.
%env WANDB_WATCH=all

env: WANDB_PROJECT=squad_question_answering
env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all


# Load Model 

In [5]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 8.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 63.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 76.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2


In [6]:
import transformers
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Hub identifier of the pretrained FinBERT checkpoint used as the starting point.
model_checkpoint = "yiyanghkust/finbert-pretrain"

# Load the pretrained weights into a question-answering architecture; the QA
# head on top is untrained until the fine-tuning below.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

# Tokenizer published with the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading config.json:   0%|          | 0.00/359 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/421M [00:00<?, ?B/s]

Some weights of the model checkpoint at yiyanghkust/finbert-pretrain were not used when initializing BertForQuestionAnswering: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at yiyanghk

Downloading vocab.txt:   0%|          | 0.00/221k [00:00<?, ?B/s]

# Load Dataset

In [7]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 6.9 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 77.4 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 64.7 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 80.3 MB/s 
Installing collected packages: urllib3, xxhash, responses, multiprocess, datasets
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.

In [8]:
# Maximum length, in tokens, of a single feature (question + context window).
# Passed to the tokenizer as `max_length` in preprocess_dataset.
max_length = 384
# Overlap between consecutive features when a long context is split.
# Passed to the tokenizer as `stride` in preprocess_dataset.
doc_stride = 128

In [9]:
def preprocess_dataset(samples):
  """
  Tokenize a batch of question/context pairs and label every resulting feature.

  A context that does not fit into one feature is split into several
  overlapping features. Each feature receives the token indices of the answer
  span under "start_positions" and "end_positions". Features without an
  answer, or whose context window does not fully contain the answer, are
  labelled with the index of the CLS token instead.

  Relies on the notebook-level `tokenizer`, `max_length` and `doc_stride`.

  Args:
    samples: batch with "question", "context" and "answers" columns, where each
      answers entry provides "answer_start" and "text" lists. The "question"
      column is overwritten with its left-stripped version.

  Returns:
    The tokenizer output without "overflow_to_sample_mapping" and
    "offset_mapping", extended with "start_positions" and "end_positions".
  """
  samples["question"] = [q.lstrip() for q in samples["question"]]

  # Only the context (second sequence) may be truncated. Overflowing tokens are
  # kept as additional features that overlap by `doc_stride`.
  encoded = tokenizer(
    samples["question"],
    samples["context"],
    truncation="only_second",
    max_length=max_length,
    stride=doc_stride,
    padding="max_length",
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
  )

  # Feature index -> index of the sample the feature was cut from.
  feature_to_sample = encoded.pop("overflow_to_sample_mapping")
  # Per feature: character (start, end) of every token.
  spans_per_feature = encoded.pop("offset_mapping")

  start_positions = []
  end_positions = []

  for feature_idx, char_spans in enumerate(spans_per_feature):
    input_ids = encoded["input_ids"][feature_idx]
    cls_index = input_ids.index(tokenizer.cls_token_id)
    # Tells question tokens apart from context tokens (context is sequence 1).
    sequence_ids = encoded.sequence_ids(feature_idx)

    answers = samples["answers"][feature_to_sample[feature_idx]]

    # Unanswerable sample: point both labels at the CLS token.
    if len(answers["answer_start"]) == 0:
      start_positions.append(cls_index)
      end_positions.append(cls_index)
      continue

    # Character span of the first listed answer.
    answer_begin = answers["answer_start"][0]
    answer_finish = answer_begin + len(answers["text"][0])

    # First and last token of the context inside this feature.
    ctx_first = 0
    while sequence_ids[ctx_first] != 1:
      ctx_first += 1
    ctx_last = len(input_ids) - 1
    while sequence_ids[ctx_last] != 1:
      ctx_last -= 1

    # The answer is not fully contained in this window: label with CLS.
    if char_spans[ctx_first][0] > answer_begin or char_spans[ctx_last][1] < answer_finish:
      start_positions.append(cls_index)
      end_positions.append(cls_index)
      continue

    # Walk forward past every token starting at or before the answer start;
    # the token just before the stopping point is the answer's first token.
    while ctx_first < len(char_spans) and char_spans[ctx_first][0] <= answer_begin:
      ctx_first += 1
    start_positions.append(ctx_first - 1)

    # Walk backward past every token ending at or after the answer end;
    # the token just after the stopping point is the answer's last token.
    while char_spans[ctx_last][1] >= answer_finish:
      ctx_last -= 1
    end_positions.append(ctx_last + 1)

  encoded["start_positions"] = start_positions
  encoded["end_positions"] = end_positions
  return encoded

In [10]:
from datasets import load_dataset
from transformers import DefaultDataCollator

# Fetch the SQuAD dataset.
dataset = load_dataset("squad")

# Run the preprocessing over every split in batches. The raw columns are
# dropped so that only the tokenizer output and answer positions remain.
raw_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    preprocess_dataset,
    batched=True,
    remove_columns=raw_columns,
)

# Splits used for fine-tuning and for evaluation.
train_dataset, eval_dataset = tokenized_dataset["train"], tokenized_dataset["validation"]

# Collator that batches the already padded features together.
data_collator = DefaultDataCollator()

Downloading builder script:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.63 MiB, post-processed: Unknown size, total: 119.14 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

# Training Stage 

## Training Args

In [11]:
# Training hyperparameters, consumed by TrainingArguments in a later cell.
# Optimizer and scheduler are not configured here: the Trainer defaults to an
# instance of AdamW and a scheduler given by get_linear_schedule_with_warmup().
batch_size = 8  # used for both the per-device train and eval batch size
learning_rate = 1e-4
num_train_epochs = 2
weight_decay = 0.05

In [12]:
import string
from random import choice

# Alphabet for the run identifier: decimal digits only.
chars = string.digits

# Draw seven random digits and join them into the name of this run.
run_name = ''.join([choice(chars) for _ in range(7)])
print(run_name)

1589230


In [13]:
# Checkpoint name without its hub namespace; used to name the output directory.
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Configuration for the fine-tuning run; metrics are reported to W&B under
# the run name generated in the previous cell.
args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-squad",
    run_name=run_name,
    report_to="wandb",
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    # load_best_model_at_end=True,
)

## Train

In [14]:
# Bundle the model, training configuration, data and tokenizer into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

Using cuda_amp half precision backend


In [None]:
# Run the fine-tuning with the Trainer configured above.
trainer.train()

# Close the Weights & Biases run once training has returned.
wandb.finish()

In [16]:
from google.colab import files

# Write the fine-tuned model into a "model" folder inside the training
# output directory.
save_dir = f"{model_name}-finetuned-squad/model"
trainer.save_model(save_dir)

Saving model checkpoint to finbert-pretrain-finetuned-squad/model
Configuration saved in finbert-pretrain-finetuned-squad/model/config.json
Model weights saved in finbert-pretrain-finetuned-squad/model/pytorch_model.bin
tokenizer config file saved in finbert-pretrain-finetuned-squad/model/tokenizer_config.json
Special tokens file saved in finbert-pretrain-finetuned-squad/model/special_tokens_map.json


In [20]:
# Show the identifier of the current run; the exported archive is named after it.
print(run_name)

1892258


In [18]:
!zip -r /content/1892258.zip /content/finbert-pretrain-finetuned-squad/model

  adding: content/finbert-pretrain-finetuned-squad/model/ (stored 0%)
  adding: content/finbert-pretrain-finetuned-squad/model/vocab.txt (deflated 50%)
  adding: content/finbert-pretrain-finetuned-squad/model/tokenizer.json (deflated 70%)
  adding: content/finbert-pretrain-finetuned-squad/model/pytorch_model.bin (deflated 7%)
  adding: content/finbert-pretrain-finetuned-squad/model/training_args.bin (deflated 48%)
  adding: content/finbert-pretrain-finetuned-squad/model/special_tokens_map.json (deflated 42%)
  adding: content/finbert-pretrain-finetuned-squad/model/config.json (deflated 47%)
  adding: content/finbert-pretrain-finetuned-squad/model/tokenizer_config.json (deflated 45%)


In [19]:
# Send the run's archive to the browser. The path is relative, so this assumes
# the working directory is /content, where the zip cell writes — TODO confirm.
files.download(f"{run_name}.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>