In [None]:
!pip install transformers datasets

In [None]:
# Import nltk and the brown module
import nltk
from nltk.corpus import brown

In [None]:
# Download the necessary data: the 'brown' corpus and the
# 'universal_tagset' resource
nltk.download('brown')
nltk.download('universal_tagset')

In [None]:
# Load in our corpus using the universal tag-set. The dataset is in the form
# of a list of lists of tuples.
# Note: the keyword argument is `tagset` (singular).
corpus = brown.tagged_sents(tagset='universal')
corpus

In [None]:
# Separate the inputs (tokens) and targets (tags) to make the dataset usable
# by Hugging Face: one list of tokens and one list of tags per sentence
inputs = []
targets = []

for sentence_tag_pairs in corpus:
  tokens = []
  target = []
  for token, tag in sentence_tag_pairs:
    tokens.append(token)
    target.append(tag)
  inputs.append(tokens)
  targets.append(target)

In [None]:
# Save data in JSON Lines format: one {"inputs": ..., "targets": ...} object
# per line. The file name must match the one passed to load_dataset.
import json

with open('data.json', 'w') as f:
  for x, y in zip(inputs, targets):
    j = {'inputs': x, 'targets': y}
    s = json.dumps(j)
    f.write(f"{s}\n")


In [None]:
# Import function
from datasets import load_dataset

In [None]:
# Call load_dataset with the "json" loader on the file 'data.json'
data = load_dataset("json", data_files='data.json')

In [None]:
# Inspect the loaded data
data

In [None]:
# 60k samples is too much; shuffle the dataset (seeded) and take the first
# 20k samples
small = data["train"].shuffle(seed=42).select(range(20_000))
small

In [None]:
# Do a train/test split of the subsample (seeded).
# Note: `data` is rebound here from the loaded dataset to the split.
data = small.train_test_split(seed=42)

In [None]:
# Check out the first training sample: the input is a list of words,
# the target is a list of tags
data["train"][0]

In [None]:
# Check out the features attribute of the training split
data["train"].features

In [None]:
# Collect every distinct tag that occurs in the targets, as the basis for
# mapping targets to ints
target_set = set()
for target in targets:
  target_set.update(target)
target_set


In [None]:
# Create id2label and label2id. The tags are sorted first because set
# iteration order for strings can change between interpreter runs, which
# would otherwise assign different ids to the same tag after a restart.
target_list = sorted(target_set)
id2label = dict(enumerate(target_list))
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
# Create the tokenizer for the chosen checkpoint via AutoTokenizer; you can
# try BERT instead if you wish and compare the results
from transformers import AutoTokenizer

# Also try using BERT
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Test the tokenizer on the first training sample. The input is passed as a
# list of words, hence is_split_into_words=True.
idx = 0
t = tokenizer(data["train"][idx]["inputs"], is_split_into_words=True)
t

In [None]:
# Check the type of the tokenizer output (it is not a plain dict)
type(t)

In [None]:
# The BatchEncoding object has a tokens() method
t.tokens()

In [None]:
# word_ids() gives one entry per token: a value of i indicates the token
# belongs to the i'th word in the input sentence (counting from 0)
t.word_ids()

In [None]:
# Take a list of labels and word ids, transforming the labels from string to int
def align_targets(labels, word_ids):
  """Expand word-level string labels into token-level integer label ids.

  Args:
    labels: string labels, indexed by word position.
    word_ids: for each token, the index of its word, or None for tokens
      that belong to no word (e.g. [CLS]).

  Returns:
    A list with one int per entry of word_ids: -100 where the word id is
    None, otherwise label2id of that word's label.
  """
  return [
      -100 if word_id is None else label2id[labels[word_id]]
      for word_id in word_ids
  ]

In [None]:
# Try our function on the sample tokenized earlier (`t`, index `idx`)
labels = data['train'][idx]['targets']
word_ids = t.word_ids()
aligned_targets = align_targets(labels, word_ids)
aligned_targets

In [None]:
# Print out the aligned labels next to the tokenized inputs; negative ids
# (the -100 placeholders) are shown as None
aligned_labels = [id2label[i] if i >= 0 else None for i in aligned_targets]
for x, y in zip(t.tokens(), aligned_labels):
  print(f"{x}\t{y}")

In [None]:
# Tokenize both inputs and targets
def tokenize_fn(batch):
  """Tokenize a batch of pre-split sentences and attach aligned labels.

  Args:
    batch: mapping with 'inputs' (a list of word lists) and 'targets'
      (a list of tag lists), one entry per sentence.

  Returns:
    The tokenizer output for the batch with an added 'labels' entry holding
    one list of aligned label ids per sentence.
  """
  # tokenize the input sequences first
  # this populates input_ids, attention_mask, etc
  tokenized_inputs = tokenizer(
      batch['inputs'], truncation=True, is_split_into_words=True
  )

  # align every sentence's original targets with its tokens
  aligned_labels_batch = [
      align_targets(labels, tokenized_inputs.word_ids(i))
      for i, labels in enumerate(batch['targets'])
  ]

  # recall: the 'target' must be stored in a key called 'labels'.
  # This (and the return) happens once, after the whole batch has been
  # aligned, not inside the per-sentence loop.
  tokenized_inputs['labels'] = aligned_labels_batch
  return tokenized_inputs

In [None]:
# List the original columns: they are neither model inputs nor targets, so
# they get removed from the tokenized dataset
data["train"].column_names

In [None]:
# Apply tokenize_fn to the dataset in batches (batched=True), removing the
# original columns from the result
tokenized_datasets = data.map(
    tokenize_fn,
    batched=True,
    remove_columns=data["train"].column_names,
)