----------------------
**Author**: Gunnvant

**Description**: Prepare NER data with Hugging Face `datasets` (parsing, tokenization, label alignment) and walk through the token-classification model training flow.

----------------------

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the preprocessed CSV. No split is named here, and the rows are read back
# through raw_data['train'] in the cells that follow.
raw_data = load_dataset("csv",data_files="../preprocessed.csv")

In [3]:
raw_data['train'][0]

{'Word': "['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']",
 'ner_tags': '[11, 11, 11, 11, 11, 11, 7, 11, 11, 11, 11, 11, 7, 11, 11, 11, 11, 11, 16, 11, 11, 11, 11, 11]'}

In [4]:
import json

# Tag name -> integer id mapping; the integers are the values stored in `ner_tags`.
with open("tag_mapping.json", "r") as f:
    # Parse straight from the file handle instead of reading the text first.
    label2id = json.load(f)

In [5]:
# Tokenizer for the same checkpoint the model is later built from. It is the
# uncased variant: the token dumps in this notebook are all lower-cased.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [6]:
import ast
example = raw_data['train'][0]
tokenized_input = tokenizer(ast.literal_eval(example["Word"]), is_split_into_words=True)

In [7]:
tokenized_input

{'input_ids': [101, 5190, 1997, 28337, 2031, 9847, 2083, 2414, 2000, 6186, 1996, 2162, 1999, 5712, 1998, 5157, 1996, 10534, 1997, 2329, 3629, 2013, 2008, 2406, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])

['[CLS]',
 'thousands',
 'of',
 'demonstrators',
 'have',
 'marched',
 'through',
 'london',
 'to',
 'protest',
 'the',
 'war',
 'in',
 'iraq',
 'and',
 'demand',
 'the',
 'withdrawal',
 'of',
 'british',
 'troops',
 'from',
 'that',
 'country',
 '.',
 '[SEP]']

In [9]:
### Need to treat each word and ner tag entry in data as a list
def convert_list(example):
    """Turn the stringified ``Word`` and ``ner_tags`` columns of one row into lists.

    The CSV stores both columns as the ``repr`` of a Python list, so each value
    is parsed with ``ast.literal_eval``. Values that are not strings (for
    example lists produced by an earlier run of this function) are left
    untouched, which makes it safe to re-run ``raw_data.map(convert_list)``.

    Args:
        example: A single dataset row (dict) containing ``Word`` and ``ner_tags``.

    Returns:
        The same ``example`` dict, updated in place.

    Raises:
        ValueError / SyntaxError: If a string value is not a valid Python literal.
    """
    for column in ("Word", "ner_tags"):
        value = example[column]
        # literal_eval only accepts source text; an already-parsed list would
        # raise "malformed node or string", so only strings are parsed.
        if isinstance(value, str):
            example[column] = ast.literal_eval(value)
    return example

In [10]:
raw_data = raw_data.map(convert_list)

In [11]:
raw_data['train'][0]

{'Word': ['Thousands',
  'of',
  'demonstrators',
  'have',
  'marched',
  'through',
  'London',
  'to',
  'protest',
  'the',
  'war',
  'in',
  'Iraq',
  'and',
  'demand',
  'the',
  'withdrawal',
  'of',
  'British',
  'troops',
  'from',
  'that',
  'country',
  '.'],
 'ner_tags': [11,
  11,
  11,
  11,
  11,
  11,
  7,
  11,
  11,
  11,
  11,
  11,
  7,
  11,
  11,
  11,
  11,
  11,
  16,
  11,
  11,
  11,
  11,
  11]}

In [12]:
### Repeat the preprocessing step again on a single example
example = raw_data['train'][0]
tokenized_input = tokenizer(example["Word"], is_split_into_words=True)
tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])

['[CLS]',
 'thousands',
 'of',
 'demonstrators',
 'have',
 'marched',
 'through',
 'london',
 'to',
 'protest',
 'the',
 'war',
 'in',
 'iraq',
 'and',
 'demand',
 'the',
 'withdrawal',
 'of',
 'british',
 'troops',
 'from',
 'that',
 'country',
 '.',
 '[SEP]']

In [13]:
### tokenization and alignment logic
examples = raw_data['train'][0:10]
tokenized_inputs = tokenizer(examples["Word"], truncation=True, is_split_into_words=True)

In [14]:
tokenizer.convert_ids_to_tokens(tokenized_inputs['input_ids'][2]) ### guns and ##hips are one word

['[CLS]',
 'helicopter',
 'guns',
 '##hips',
 'saturday',
 'pounded',
 'militant',
 'hideout',
 '##s',
 'in',
 'the',
 'or',
 '##ak',
 '##zai',
 'tribal',
 'region',
 ',',
 'where',
 'many',
 'taliban',
 'militants',
 'are',
 'believed',
 'to',
 'have',
 'fled',
 'to',
 'avoid',
 'an',
 'earlier',
 'military',
 'offensive',
 'in',
 'nearby',
 'south',
 'wa',
 '##zi',
 '##rist',
 '##an',
 '.',
 '[SEP]']

In [15]:
tokenized_inputs.word_ids(batch_index=2) ## this is able to find out parts of words for single word (1,1 refers to gunship)

[None,
 0,
 1,
 1,
 2,
 3,
 4,
 5,
 5,
 6,
 7,
 8,
 8,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 30,
 30,
 30,
 31,
 None]

In [16]:
examples['ner_tags'][0:2]

[[11,
  11,
  11,
  11,
  11,
  11,
  7,
  11,
  11,
  11,
  11,
  11,
  7,
  11,
  11,
  11,
  11,
  11,
  16,
  11,
  11,
  11,
  11,
  11],
 [16,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  12,
  11,
  11,
  11,
  8,
  11,
  11,
  11,
  11,
  11]]

In [17]:
### Label Alignment 
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word-level tags to tokens.

    Args:
        examples: A batch with ``Word`` (list of word lists) and ``ner_tags``
            (list of per-word tag-id lists).

    Returns:
        The tokenizer output with an added ``labels`` entry holding one list
        per sentence. Special tokens and every sub-token after the first of a
        word get ``-100``; the first sub-token of each word gets that word's tag.
    """
    tokenized_inputs = tokenizer(examples["Word"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        # word_ids maps each token position to the index of its source word
        # (None for special tokens).
        word_ids = tokenized_inputs.word_ids(batch_index=row)
        # Shift by one so each token can be compared with the token before it;
        # the first token has no predecessor.
        predecessors = [None] + list(word_ids[:-1])
        aligned_labels.append([
            -100 if (current is None or current == before) else word_tags[current]
            for current, before in zip(word_ids, predecessors)
        ])

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

In [18]:
# batched=True: tokenize_and_align_labels receives a batch whose "Word" and
# "ner_tags" entries are lists of rows, and the fields it returns (including
# "labels") are stored alongside the existing columns.
raw_data = raw_data.map(tokenize_and_align_labels,batched=True)

In [19]:
raw_data['train'][2]

{'Word': ['Helicopter',
  'gunships',
  'Saturday',
  'pounded',
  'militant',
  'hideouts',
  'in',
  'the',
  'Orakzai',
  'tribal',
  'region',
  ',',
  'where',
  'many',
  'Taliban',
  'militants',
  'are',
  'believed',
  'to',
  'have',
  'fled',
  'to',
  'avoid',
  'an',
  'earlier',
  'military',
  'offensive',
  'in',
  'nearby',
  'South',
  'Waziristan',
  '.'],
 'ner_tags': [11,
  11,
  12,
  11,
  11,
  11,
  11,
  11,
  7,
  11,
  11,
  11,
  11,
  11,
  8,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  7,
  6,
  11],
 'input_ids': [101,
  7739,
  4409,
  19801,
  5095,
  13750,
  16830,
  29588,
  2015,
  1999,
  1996,
  2030,
  4817,
  25290,
  8807,
  2555,
  1010,
  2073,
  2116,
  16597,
  17671,
  2024,
  3373,
  2000,
  2031,
  6783,
  2000,
  4468,
  2019,
  3041,
  2510,
  5805,
  1999,
  3518,
  2148,
  11333,
  5831,
  15061,
  2319,
  1012,
  102],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,

In [20]:
from transformers import DataCollatorForTokenClassification

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [21]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [23]:
import evaluate
seqeval = evaluate.load("seqeval")

Downloading builder script: 100%|████████████████████████████████| 6.34k/6.34k [00:00<00:00, 6.97MB/s]


In [29]:
label2id

{'I-org': 0,
 'I-art': 1,
 'I-gpe': 2,
 'B-art': 3,
 'I-nat': 4,
 'I-tim': 5,
 'I-geo': 6,
 'B-geo': 7,
 'B-org': 8,
 'B-nat': 9,
 'B-eve': 10,
 'O': 11,
 'B-tim': 12,
 'I-per': 13,
 'I-eve': 14,
 'B-per': 15,
 'B-gpe': 16}

In [31]:
# compute_metrics indexes label_list by integer tag id, so position i must hold
# the tag whose id is i. Order explicitly by id instead of trusting the key
# order of the JSON file. NOTE(review): assumes ids are contiguous from 0, as
# in the mapping displayed above.
label_list = [tag for tag, _ in sorted(label2id.items(), key=lambda item: item[1])]

In [32]:
import numpy as np
def compute_metrics(p):
    """Score token-classification predictions with seqeval.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` is reduced with
            ``argmax`` over axis 2; ``labels`` holds tag ids, with ``-100``
            marking positions to ignore.

    Returns:
        Dict with the overall ``precision``, ``recall``, ``f1`` and ``accuracy``
        reported by seqeval.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions with a real label; -100 marks ignored tokens.
        kept = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [33]:
# Invert the tag-name -> id mapping so the model config can go from id to tag name.
id2label = {tag_id: tag for tag, tag_id in label2id.items()}

In [37]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Build the token-classification head on top of the same checkpoint the
# tokenizer came from. The number of output classes is taken from the tag
# mapping rather than hard-coded, so it stays correct if the tag set changes.
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# evaluation_strategy="steps" makes the Trainer evaluate every `eval_steps`
# steps, which needs an evaluation dataset. The CSV provides a single split,
# so hold out a fixed 10% of it (seeded for reproducibility).
split_data = raw_data["train"].train_test_split(test_size=0.1, seed=42)

training_args = TrainingArguments(
    output_dir="my_awesome_ner_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=10,
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_data["train"],
    eval_dataset=split_data["test"],  # consumed by the periodic evaluation and compute_metrics
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()