In [None]:
"""CONLLPP Dataset
https://huggingface.co/datasets/conllpp"""

In [None]:
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets

In [None]:
import pandas as pd
from datasets import load_dataset

In [None]:
# Load CoNLL++ from the Hugging Face Hub (see the URL in the header cell).
# The rest of the notebook indexes `data` by split name (`data['train']`),
# reads `.features` and calls `.map(...)`, so it needs the object returned by
# `load_dataset`, not a pandas DataFrame read from a local CSV.
data = load_dataset("conllpp")
data

In [None]:
# Display the feature schema of the training split.
data["train"].features

In [None]:
# Show the `tokens` and `ner_tags` columns of the first training row.
pd.DataFrame(data['train'][:])[['tokens', 'ner_tags']].iloc[0]


In [None]:
# Per-token feature of the `ner_tags` column; its `names` list is enumerated
# below to build lookups in both directions (id -> tag and tag -> id).
tags = data['train'].features['ner_tags'].feature

index2tag = dict(enumerate(tags.names))
tag2index = {name: position for position, name in index2tag.items()}

In [None]:
# Display the id -> tag-name mapping built from `tags.names`.
index2tag

In [None]:
# Example: convert label id 3 through the feature's own `int2str` helper.
tags.int2str(3)

In [None]:
def create_tag_names(batch):
  """Build a `ner_tags_str` column from the ids stored in `batch['ner_tags']`.

  Each id is converted with the module-level `tags.int2str`. The result is
  returned as a one-key dict so that `Dataset.map` can merge it as a new column.
  """
  readable_tags = []
  for tag_id in batch['ner_tags']:
    readable_tags.append(tags.int2str(tag_id))
  return {'ner_tags_str': readable_tags}

In [None]:
# Apply `create_tag_names` through `map` (not batched), which adds the
# `ner_tags_str` column it returns; the result is rebound to `data`.
data = data.map(create_tag_names)

In [None]:
# Show the first training row again, now including the `ner_tags_str` column.
pd.DataFrame(data['train'][:])[['tokens', 'ner_tags', 'ner_tags_str']].iloc[0]

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the model loaded further down.
model_checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Display the tokenizer's `is_fast` flag.
# NOTE(review): the `word_ids()` calls below presumably require a fast tokenizer - confirm.
tokenizer.is_fast


In [None]:
# Encode the first training sentence. It is already split into words, so the
# tokenizer is told not to split it on whitespace again.
first_sentence_tokens = data['train'][0]['tokens']
inputs = tokenizer(first_sentence_tokens, is_split_into_words=True)
print(inputs.tokens())

In [None]:
# Print the words and the string tags of the first training example for comparison
# with the tokenizer output above.
print(data['train'][0]['tokens'])
print(data['train'][0]['ner_tags_str'])

In [None]:
# Display the per-token word ids of the encoding; these drive the label alignment below.
inputs.word_ids()

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: Sequence of integer label ids, one per word.
        word_ids: Sequence with one entry per token: the index of the word the
            token belongs to, or ``None`` for tokens that belong to no word.

    Returns:
        A list with the same length as ``word_ids``. Tokens whose word id is
        ``None`` get ``-100``. The first token of a word gets the word's label.
        Every following token of the same word gets the word's label too,
        except that an odd label id is bumped to the next id.
    """
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # First token of a new word (or the first `None` after a word).
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Consecutive tokens without a word (e.g. leading special tokens).
            new_labels.append(-100)
        else:
            # Continuation token of the word that is already open.
            label = labels[word_id]
            # Assumes the tag list interleaves B-/I- ids (odd = B-XXX, the next
            # even id = I-XXX), so a continuation of a B- word becomes I-.
            # TODO confirm against the dataset's label names.
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [None]:
# Gather the word-level tag ids of the first example and the word ids of its
# encoding, the two inputs of `align_labels_with_tokens`.
labels = data['train'][0]['ner_tags']
word_ids = inputs.word_ids()
print(labels, word_ids)

In [None]:
# Run the alignment on the sample sentence and display the token-level labels.
align_labels_with_tokens(labels, word_ids)

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: Batch mapping that provides ``'tokens'`` (one word list per
            sentence) and ``'ner_tags'`` (one label list per sentence).

    Returns:
        The tokenizer output for the batch, with an extra ``'labels'`` entry
        holding, per sentence, the result of ``align_labels_with_tokens``.
    """
    encoded = tokenizer(
        examples['tokens'],
        is_split_into_words=True,
        truncation=True,
    )

    # `word_ids(row)` yields the token -> word mapping of the row-th sentence.
    encoded['labels'] = [
        align_labels_with_tokens(sentence_tags, encoded.word_ids(row))
        for row, sentence_tags in enumerate(examples['ner_tags'])
    ]

    return encoded

In [None]:
# Tokenize in batches. The original columns are dropped so that only the
# entries returned by `tokenize_and_align_labels` remain.
tokenized_datasets = data.map(tokenize_and_align_labels, batched=True, 
                              remove_columns=data['train'].column_names)


In [None]:
# Display the tokenized dataset summary.
tokenized_datasets

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator built around the same tokenizer; it is passed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Collate the first two tokenized training examples and display the resulting batch.
batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
batch

In [None]:
!pip install seqeval
!pip install evaluate

import evaluate
metric = evaluate.load('seqeval')

In [None]:

# Keep a handle on the `ner_tags` feature of the training split and display it.
ner_feature = data['train'].features['ner_tags']
ner_feature

In [None]:
# List of tag names; indexed by label id throughout the rest of the notebook.
label_names = ner_feature.feature.names
label_names

In [None]:
# Tag names (rather than ids) of the first training example.
labels = [label_names[tag_id] for tag_id in data['train'][0]['ner_tags']]
labels

In [None]:
# Sanity-check the metric: start from a copy of the reference tags, overwrite
# the entry at position 2 with "O", and score the result against the references.
predictions = list(labels)
predictions[2] = "O"

metric.compute(references=[labels], predictions=[predictions])

In [None]:
import numpy as np

def compute_metrics(eval_preds):
    """Turn raw evaluation output into overall precision/recall/F1/accuracy.

    Args:
        eval_preds: Pair ``(logits, labels)``. The arg-max over the last axis
            of ``logits`` is taken as the predicted label id. Positions whose
            entry in ``labels`` equals ``-100`` are excluded from scoring.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``,
        taken from the ``overall_*`` entries returned by ``metric.compute``.
    """
    ignore_index = -100
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # Reference tag names, with ignored positions dropped.
    reference_tags = []
    for gold_row in labels:
        reference_tags.append(
            [label_names[gold] for gold in gold_row if gold != ignore_index]
        )

    # Predicted tag names at exactly the positions kept above.
    predicted_tags = []
    for pred_row, gold_row in zip(predicted_ids, labels):
        kept = [
            label_names[pred]
            for pred, gold in zip(pred_row, gold_row)
            if gold != ignore_index
        ]
        predicted_tags.append(kept)

    scores = metric.compute(predictions=predicted_tags, references=reference_tags)

    summary = {}
    summary["precision"] = scores['overall_precision']
    summary["recall"] = scores['overall_recall']
    summary["f1"] = scores['overall_f1']
    summary["accuracy"] = scores['overall_accuracy']
    return summary

In [None]:
# Label lookups in both directions, handed to the model config below.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Print the id -> label-name mapping that will be stored on the model config.
print(id2label)


In [None]:
from transformers import AutoModelForTokenClassification

# Load the same checkpoint as the tokenizer for token classification and
# pass both label mappings to it.
model = AutoModelForTokenClassification.from_pretrained(
                                                    model_checkpoint,
                                                    id2label=id2label,
                                                    label2id=label2id)

In [None]:
# Display the number of labels recorded on the model config (sanity check).
model.config.num_labels

In [None]:
from transformers import TrainingArguments

# Training configuration. Evaluation and checkpointing both happen once per epoch.
# `eval_strategy` is the current name of the former `evaluation_strategy`
# argument; the notebook installs the latest transformers (`-U`), where the
# old keyword is no longer accepted.
args = TrainingArguments(
    output_dir="distilbert-finetuned-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [None]:
from transformers import Trainer
# Wire model, data, collator and metric function together.
# `processing_class` replaces the deprecated `tokenizer` argument of `Trainer`
# in recent transformers releases; it receives the same tokenizer object.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

trainer.train()

In [None]:
from transformers import pipeline
from transformers.trainer_utils import get_last_checkpoint

# Resolve the newest checkpoint inside the Trainer's output directory instead of
# hard-coding an absolute path with a fixed step count, which only exists on one
# machine and for one particular batch-size/epoch configuration.
checkpoint = get_last_checkpoint(args.output_dir)
if checkpoint is None:
    raise FileNotFoundError(
        f"No checkpoint found in {args.output_dir!r}; run trainer.train() first."
    )

token_classifier = pipeline(
    "token-classification", model=checkpoint, aggregation_strategy="simple"
)

token_classifier("My name is Laxmi Kant Tiwari. I work at KGP Talkie and live in Mumbai")