In [75]:
%pip install transformers datasets tokenizers seqeval -q
%pip install torch torchvision torchaudio
%pip install transformers[torch]
%pip install accelerate==0.20.1

Collecting accelerate==0.20.1
  Downloading accelerate-0.20.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.5/227.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.21.0
    Uninstalling accelerate-0.21.0:
      Successfully uninstalled accelerate-0.21.0
Successfully installed accelerate-0.20.1


In [78]:
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast, DataCollatorForTokenClassification, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer

In [10]:
import json

with open("annotations.json", "r") as json_file:
  data = json.load(json_file)

In [11]:
data['annotations'][0][1]['entities'][0:10]

[[18, 26, 'B-ORG'],
 [27, 34, 'I-ORG'],
 [35, 41, 'B-MISC'],
 [42, 46, 'I-MISC'],
 [47, 51, 'I-MISC'],
 [52, 55, 'I-MISC'],
 [58, 62, 'B-LOC'],
 [63, 68, 'I-LOC'],
 [69, 74, 'I-LOC'],
 [75, 78, 'I-LOC']]

## Transform annotations to huggingface datadict

In [12]:
text = data["annotations"][0][0]
tags = data["annotations"][0][1]["entities"]
classes = data["classes"]

classes_dict = {}
for i in range(len(classes)):
  classes_dict[classes[i]] = i

# print(classes_dict)

tokens = []
named_tags = []

for tag in tags:
  tokens.append(text[tag[0]:tag[1]])
  named_tags.append(tag[2])


anno_dict = dict(zip(tokens, named_tags))
print(anno_dict)

all_tokens = text.split(" ")
named_labels = []
numbered_labels = []

for token in all_tokens:
  if token not in anno_dict:
    named_labels.append("NONE")
  else:
    named_labels.append(anno_dict[token])

for label in named_labels:
  numbered_labels.append(classes_dict[label])

print(all_tokens)
# print(named_labels)
print(numbered_labels)

{'Inveniam': 'B-ORG', 'Private': 'I-ORG', 'Equity': 'B-MISC', 'Fund': 'I-ORG', 'Demo': 'I-MISC', 'IV.': 'I-MISC', '8500': 'B-LOC', 'World': 'I-LOC', 'Trade': 'I-LOC', 'New': 'B-LOC', 'York': 'I-LOC', '10022': 'I-LOC', 'January': 'B-DATE', '1': 'I-DATE', '2019': 'I-DATE', 'Maria': 'B-PER', 'Sharapova': 'B-PER', '31': 'B-LOC', 'Blandford': 'I-LOC', 'Street': 'I-LOC', 'London': 'I-LOC', 'IV': 'I-MISC', 'Delaware': 'B-ORG', 'limited': 'B-MISC', 'partnership': 'I-AGRE', 'letter': 'B-AGRE', 'agreement': 'I-AGRE', '"Letter': 'B-AGRE', 'Agreement"': 'I-AGRE', '"Investor"': 'B-MISC', 'Amended': 'B-AGRE', 'Restated': 'I-AGRE', 'Limited': 'B-MISC', 'Partnership': 'I-AGRE', 'Agreement': 'I-AGRE', 'August': 'B-DATE', '27': 'I-DATE', '2021': 'I-DATE', '"Limited': 'B-AGRE', 'Subscription': 'I-AGRE', 'Investor': 'B-MISC', '"Subscription': 'B-AGRE', '$': 'B-CURR', '50,000,000': 'I-CURR', '"Capital': 'B-AGRE', 'Commitment"': 'I-AGRE', 'Capital': 'I-MISC', 'Partners': 'I-MISC', 'GP': 'I-ORG', 'L.L.C.': '

In [13]:
all_tokens_array = []
numbered_labels_array = []

temp1 = []
temp2 = []
for i in range(len(all_tokens)):
  temp1.append(all_tokens[i])
  temp2.append(numbered_labels[i])
  if i%10 == 0 and i != 0:
    all_tokens_array.append(temp1)
    numbered_labels_array.append(temp2)
    temp1 = []
    temp2 = []
all_tokens_array.append(temp1)
numbered_labels_array.append(temp2)


print(all_tokens_array)
print(numbered_labels_array)

[['EXECUTION', 'VERSION', 'Inveniam', 'Private', 'Equity', 'Fund', 'Demo', 'IV.', '.', '8500', 'World'], ['Trade', 'New', 'York,', 'New', 'York', '10022', 'January', '1,', '2019', 'Maria'], ['Sharapova', '31', 'Blandford', 'Street', 'London,', 'W1U', '3DN', 'Ladies', 'and', 'Gentlemen:'], ['RE:', 'Inveniam', 'Private', 'Equity', 'Fund', 'Demo', 'IV,', 'a', 'Delaware', 'limited'], ['partnership', '(the', '"Fund")', 'This', 'letter', 'agreement', '(this', '"Letter', 'Agreement")', 'is'], ['entered', 'into', 'in', 'connection', 'with', 'the', 'purchase', 'by', 'Sharapova', '(the'], ['"Investor")', 'of', 'a', 'limited', 'partnership', 'interest', 'in', 'the', 'Fund.', 'Capitalized'], ['terms', 'used', 'and', 'not', 'defined', 'herein', 'shall', 'have', 'the', 'meanings'], ['given', 'to', 'them', 'in', 'the', 'Fourth', 'Amended', 'and', 'Restated', 'Limited'], ['Partnership', 'Agreement', 'of', 'the', 'Fund', 'dated', 'as', 'of', 'August', '27,'], ['2021', '(the', '"Limited', 'Partnership',

In [14]:
print(classes_dict)

{'NONE': 0, 'B-ORG': 1, 'I-ORG': 2, 'B-DATE': 3, 'I-DATE': 4, 'B-TIME': 5, 'I-TIME': 6, 'B-PER': 7, 'I-PER': 8, 'B-CURR': 9, 'I-CURR': 10, 'B-LOC': 11, 'I-LOC': 12, 'B-AGRE': 13, 'I-AGRE': 14, 'B-MISC': 15, 'I-MISC': 16}


In [15]:
data_dict_train = {
    "tokens" : all_tokens_array[0:200],
    "labels" : numbered_labels_array[0:200]
}

data_dict_val = {
    "tokens" : all_tokens_array[200:225],
    "labels" : numbered_labels_array[200:225]
}

data_dict_test = {
    "tokens" : all_tokens_array[225:],
    "labels" : numbered_labels_array[225:]
}

dataset_train = Dataset.from_dict(data_dict_train)
dataset_val = Dataset.from_dict(data_dict_val)
dataset_test = Dataset.from_dict(data_dict_test)

dataset_dict = DatasetDict({"train": dataset_train, "validate": dataset_val, "test": dataset_test})

print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 200
    })
    validate: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 25
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 25
    })
})


In [16]:
dataset_dict['train'][11]

{'tokens': ['Investor',
  'and',
  'the',
  'Fund',
  'dated',
  'as',
  'of',
  'the',
  'date',
  'hereof'],
 'labels': [15, 0, 0, 2, 0, 0, 0, 0, 0, 0]}

### Modeling

In [44]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [45]:
example_tokens = dataset_dict["train"][11]
inputs = tokenizer(example_tokens["tokens"], is_split_into_words=True,)
print("Miss Matched Length!!")
print(len(inputs.tokens()))
print(len(example_tokens["labels"]))

Miss Matched Length!!
13
10


In [46]:
inputs = tokenizer(dataset_dict["train"][1]["tokens"], is_split_into_words=True,)
print(inputs.tokens())
print(dataset_dict["train"][1]["tokens"])
print(dataset_dict['train'][1]['labels'])
print(inputs.word_ids())

['[CLS]', 'trade', 'new', 'york', ',', 'new', 'york', '100', '##22', 'january', '1', ',', '2019', 'maria', '[SEP]']
['Trade', 'New', 'York,', 'New', 'York', '10022', 'January', '1,', '2019', 'Maria']
[12, 11, 0, 11, 12, 12, 3, 0, 4, 7]
[None, 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, None]


In [47]:
def align_labels_with_tokens(labels, word_ids):
    for label in labels:
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(labels[word_idx])
            else:
                label_ids.append(labels[word_idx])
            previous_word_idx = word_idx

        return label_ids

In [48]:
labels = dataset_dict["train"][11]["labels"]
inputs = tokenizer(dataset_dict["train"][11]["tokens"], is_split_into_words=True,)
word_ids = inputs.word_ids()
print(dataset_dict["train"][11]["tokens"])
print(align_labels_with_tokens(labels, word_ids))
print(labels)

['Investor', 'and', 'the', 'Fund', 'dated', 'as', 'of', 'the', 'date', 'hereof']
[-100, 15, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, -100]
[15, 0, 0, 2, 0, 0, 0, 0, 0, 0]


In [49]:
example_tokens = dataset_dict["train"][11]
inputs = tokenizer(example_tokens["tokens"], is_split_into_words=True,)
print("Miss Matched Length!!")
print(len(inputs.tokens()))
print(len(example_tokens["labels"]))
print("Label matching successful!!")
print(len(align_labels_with_tokens(example_tokens["labels"], inputs.word_ids())))

Miss Matched Length!!
13
10
Label matching successful!!
13


In [50]:
def tokenizer_function(dataset):
  tokenized_dataset = tokenizer(dataset['tokens'],truncation=True,is_split_into_words=True,)
  tokenized_dataset['labels'] = align_labels_with_tokens(dataset['labels'],tokenized_dataset.word_ids())
  return tokenized_dataset

In [51]:
dataset_dict['train'][15]

{'tokens': ['$50,000,000',
  'as',
  'of',
  'the',
  'date',
  'hereof',
  '(the',
  '"Capital',
  'Commitment"),',
  'Inveniam'],
 'labels': [0, 0, 0, 0, 0, 0, 0, 13, 0, 1]}

In [52]:
# print(dataset_dict)
tokenized_dataset = dataset_dict.map(tokenizer_function, remove_columns=['tokens'])

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [53]:
# print(len(tokenized_dataset['train'][15]['input_ids']))
# print(len(tokenized_dataset['train'][15]['labels']))

In [66]:
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=17)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

In [79]:
args = TrainingArguments(
    "test-ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

ImportError: ignored