In [None]:
!pip install transformers datasets

In [None]:
# Import nltk and the brown module
import nltk
from nltk.corpus import brown

In [None]:
# Download the necessary data for the dataset
nltk.download('brown')
nltk.download('universal_tagset')

In [None]:
# Load the corpus using the universal tag-set. The dataset is a list of
# sentences, each sentence being a list of (token, tag) tuples.
# NOTE: the keyword is `tagset` (singular); `tagsets` is not accepted.
corpus = brown.tagged_sents(tagset='universal')
corpus

In [None]:
# Separate the inputs (tokens) and targets (tags) to make the dataset usable by Hugging Face
inputs = []
targets = []

for sentence_tag_pairs in corpus:
  tokens = []
  target = []
  for token, tag in sentence_tag_pairs:
    tokens.append(token)
    target.append(tag)
  inputs.append(tokens)
  targets.append(target)

In [None]:
# Save the data in JSON Lines format: one {"inputs": [...], "targets": [...]} object per line
import json

# The file name must match the one passed to load_dataset ('data.json')
with open('data.json', 'w') as f:
  for x, y in zip(inputs, targets):
    j = {'inputs': x, 'targets': y}
    s = json.dumps(j)
    f.write(f"{s}\n")


In [None]:
# Import function
from datasets import load_dataset

In [None]:
# Call load_dataset
data = load_dataset("json", data_files='data.json')

In [None]:
# Inspect the data
data

In [None]:
# 60k samples is too much: shuffle the dataset and take the first 20k samples
# (20_000 -- the previous literal 20_00 is only 2,000)
small = data["train"].shuffle(seed=42).select(range(20_000))
small

In [None]:
# Do a train test split
data = small.train_test_split(seed=42)

In [None]:
# Check out the first sample of the dataset, the input is a list of words, the target is a list of tags
data["train"][0]

In [None]:
# Check out the features attribute of our dataset
data["train"].features

In [None]:
# Collect every distinct tag that occurs in the targets (they are mapped to ints next)
target_set = {tag for target in targets for tag in target}
target_set


In [None]:
# Create id2label and label2id.
# Sort the tags first: the iteration order of a set of strings can change
# between Python processes (hash randomization), which would give each run a
# different tag <-> id mapping.
target_list = sorted(target_set)
id2label = {k: v for k, v in enumerate(target_list)}
label2id = {v: k for k, v in id2label.items()}

In [None]:
# Create the auto-tokenizer; you can also try BERT if you wish and compare the results
from transformers import AutoTokenizer

# Also try using Bert
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Test the tokenizer on the first sample of our dataset
idx = 0
t = tokenizer(data["train"][idx]["inputs"], is_split_into_words=True)
t

In [None]:
# The output is not a dict
type(t)

In [None]:
# the batchencoding object has a tokens method
t.tokens()

In [None]:
# Value of i indicates it is the i'th word
# In the input sentence (counting from 0)
t.word_ids()

In [None]:
# Take a list of string labels and the word ids, and return one int label per token
def align_targets(labels, word_ids):
  """Build the per-token label ids for one tokenized sentence.

  Each entry of `word_ids` is either None or an index into `labels`.
  None entries (e.g. a token like [CLS]) get -100; every other entry gets
  `label2id[labels[word]]`.
  """
  return [
      -100 if word is None else label2id[labels[word]]
      for word in word_ids
  ]

In [None]:
# Try our function
labels = data['train'][idx]['targets']
word_ids = t.word_ids()
aligned_targets = align_targets(labels, word_ids)
aligned_targets

In [None]:
# Print out the aligned labels with the tokenized inputs
aligned_labels = [id2label[i] if i >= 0 else None for i in aligned_targets]
for x, y in zip(t.tokens(), aligned_labels):
  print(f"{x}\t{y}")

In [None]:
# Tokenize both inputs and targets
def tokenize_fn(batch):
  """Tokenize a batch of pre-split sentences and attach aligned labels.

  Reads batch['inputs'] and batch['targets'] and returns the tokenizer
  output with an extra 'labels' entry holding one aligned label list per
  sentence in the batch.
  """
  # tokenize the input sequence first
  # this populates input_ids, attention_mask, etc
  tokenized_inputs = tokenizer(
      batch['inputs'], truncation=True, is_split_into_words=True
  )

  labels_batch = batch['targets'] # original targets
  aligned_labels_batch = []
  for i, labels in enumerate(labels_batch):
    word_ids = tokenized_inputs.word_ids(i)
    aligned_labels_batch.append(align_targets(labels, word_ids))

  # recall: the 'target' must be stored in key called 'labels'
  # (assigned after the loop so every sentence of the batch is included)
  tokenized_inputs['labels'] = aligned_labels_batch

  return tokenized_inputs

In [None]:
# List the original columns; they get removed from the model inputs because they are neither inputs nor targets
data["train"].column_names

In [None]:
# Map the tokenize function over every sample in our dataset, batching to make it more efficient, and remove the original columns
tokenized_datasets = data.map(
    tokenize_fn,
    batched=True,
    remove_columns=data["train"].column_names,
)

In [None]:
# Check the outputs
tokenized_datasets

In [None]:
# Create the data collator
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Flattens a list of lists
def flatten(list_of_lists):
  """Return a single list holding the items of every sublist, in order."""
  flattened = [val for sublist in list_of_lists for val in sublist]
  return flattened

In [None]:
# Compute metrics function
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(logits_and_labels):
  """Compute macro F1 and accuracy over the positions whose label is not -100.

  Args:
    logits_and_labels: pair (logits, labels); the prediction for each
      position is the argmax of the logits over the last axis.

  Returns:
    dict with the keys 'f1' and 'accuracy'.
  """
  logits, labels = logits_and_labels
  preds = np.argmax(logits, axis=-1)

  # remove -100 from the labels (the comparison must be against -100, not 100,
  # otherwise labels and predictions end up with different lengths)
  labels_jagged = [[t for t in label if t != -100] for label in labels]

  # drop the predictions at the same positions, i.e. wherever the true label is -100
  preds_jagged = [
      [p for p, t in zip(ps, ts) if t != -100]
      for ps, ts in zip(preds, labels)
  ]

  # flatten labels and preds
  labels_flat = flatten(labels_jagged)
  preds_flat = flatten(preds_jagged)

  acc = accuracy_score(labels_flat, preds_flat)
  f1 = f1_score(labels_flat, preds_flat, average='macro')

  return {
      'f1':f1,
      'accuracy':acc,
  }


In [None]:
# Test our compute metrics function
# Every row must hold exactly three scores (one per class); "0,8" with a comma
# would create a four-element row and a ragged array.
labels = [[-100, 0, 0, 1, 2, 1, -100]]
logits = np.array([[
    [0.8, 0.1, 0.1],
    [0.8, 0.1, 0.1],
    [0.8, 0.1, 0.1],
    [0.1, 0.8, 0.1],
    [0.1, 0.8, 0.1],
    [0.1, 0.8, 0.1],
    [0.1, 0.8, 0.1],
]])
compute_metrics((logits, labels))

In [None]:
# Load up our pre-trained model
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Create the training-arguments object
from transformers import TrainingArguments

# The first positional argument is presumably the output directory -- confirm.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
training_args = TrainingArguments(
    "distilbert-finetuned-ner",
    evaluation_strategy="epoch",
    num_train_epochs=2,
)

In [None]:
# Create the trainer object, passing in what we've created above
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train() # Begins the fine-tuning process

In [None]:
# Save our model
trainer.save_model('my_saved_model')

In [None]:
# Load up our model as a pipeline object
from transformers import pipeline

pipe = pipeline(
    "token-classification",
    model='my_saved_model',
    device=0,  # NOTE(review): presumably selects the first GPU -- confirm; a CPU-only machine needs a different value
)

In [None]:
# Test the pipeline on a simple sentence (the pipeline object is named `pipe`)
s = "Bill Gates was the CEO of Microsoft in Seattle, Washington."
pipe(s)

In [None]:
# Second example sentence (the pipeline object is named `pipe`)
s = "Bruce Wayne livs in Gotham City, he goes under the name Batman."
pipe(s)

In [None]:
# Longer, multi-sentence example (the pipeline object is named `pipe`)
s = "Peter Parker is an American teenager .He lives in New York.What is he like ? He has got short straight brown hair, brown eyes, a long noseand a square face. He has got glasses. He is tall and slim but he isn't very strong !"
pipe(s)