# Named Entity Recognition (NER)

### XLM-RoBERTa

## Road Map:
* Load XTREME Dataset
* Data Preprocessing
* The XLM-RoBERTa Model
* Evaluation

### Loading the dataset

In [None]:
!pip -q install datasets

In [None]:
from datasets import load_dataset
from collections import defaultdict
from datasets import DatasetDict

langs = ['de', 'fr', 'it', 'en']  # PAN-X language subsets to load
fracs = [0.629, 0.229, 0.084, 0.059]  # fraction of each subset's rows kept (same order as `langs`)

# Maps language code -> DatasetDict holding the down-sampled splits.
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    ds = load_dataset('xtreme', name=f'PAN-X.{lang}')
    for split in ds:
        # Shuffle with a fixed seed, then keep the leading `frac` share of the rows.
        shuffled = ds[split].shuffle(seed=0)
        n_keep = int(frac * ds[split].num_rows)
        panx_ch[lang][split] = shuffled.select(range(n_keep))

In [None]:
import pandas as pd

# One column per language, one row holding the size of its training split.
train_sizes = {lang: [panx_ch[lang]['train'].num_rows] for lang in langs}
pd.DataFrame(train_sizes, index=['number of training examples'])

In [None]:
# Inspect the first German training example, one field per line.
element = panx_ch['de']['train'][0]

for field_name, field_value in element.items():
    print(f'{field_name} : {field_value}')

In [None]:
# Print the schema of the German training split: feature name -> feature definition.
de_train_features = panx_ch['de']['train'].features
for feature_name, feature_def in de_train_features.items():
    print(f'{feature_name} : {feature_def}')

In [None]:
# Element feature of the `ner_tags` column; later cells use its
# `int2str`, `names` and `num_classes` members.
ner_tags_feature = panx_ch['de']['train'].features['ner_tags']
tags = ner_tags_feature.feature
print(tags)

In [None]:
def create_tag_names(batch):
    """Build a `ner_tags_str` column with the string name of every tag id.

    Args:
        batch: mapping with a `'ner_tags'` entry that iterates over integer tag ids.

    Returns:
        A dict `{'ner_tags_str': [...]}` with each id converted through the
        module-level `tags.int2str`.
    """
    tag_names = list(map(tags.int2str, batch['ner_tags']))
    return {'ner_tags_str': tag_names}

# Add the readable tag column to every split of the German corpus.
panx_de = panx_ch['de'].map(create_tag_names)
de_example = panx_de['train'][0]

# Show tokens and their tag names side by side.
pd.DataFrame(
    data=[de_example['tokens'], de_example['ner_tags_str']],
    index=['Tokens', 'Tags'],
)

In [None]:
from collections import Counter

# split name -> Counter of entity types, counted once per entity (at its B- tag).
split2freqs = defaultdict(Counter)

for split, dataset in panx_de.items():
    for row in dataset['ner_tags_str']:
        for tag in row:
            if not tag.startswith('B'):
                continue
            # 'B-PER' -> 'PER'
            split2freqs[split][tag.split('-')[1]] += 1

pd.DataFrame.from_dict(split2freqs, orient='index')

## Data Preprocessing
### We will use multilingual transformers
#### Specifically, the XLM-RoBERTa model

#### Tokenize

Bert Tokenizer vs XLM-R Tokenizer

In [None]:
from transformers import AutoTokenizer

# Load both tokenizers so their outputs can be compared on the same text.
bert_tokenizer, xlmr_tokenizer = (
    AutoTokenizer.from_pretrained(checkpoint)
    for checkpoint in ('bert-base-cased', 'xlm-roberta-base')
)

In [None]:
text = 'Tim Sparrow lives San Diego.'

# Tokenize the same sentence with each tokenizer.
bert_encoding = bert_tokenizer(text)
xlmr_encoding = xlmr_tokenizer(text)
bert_tokens = bert_encoding.tokens()
xlmr_tokens = xlmr_encoding.tokens()

pd.DataFrame(data=[bert_tokens, xlmr_tokens], index=['BERT', 'XLM-R'])

In [None]:
from transformers import XLMRobertaForTokenClassification
import torch

# Name of the pretrained checkpoint to load.
xlmr_model_name = 'xlm-roberta-base'

# Two-way lookup between integer class ids and tag names, built from `tags.names`.
index2tag = dict(enumerate(tags.names))
tag2index = {tag: idx for idx, tag in index2tag.items()}

num_labels = tags.num_classes

# Use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

xlmr_model = XLMRobertaForTokenClassification.from_pretrained(
    xlmr_model_name,
    num_labels=num_labels,
    id2label=index2tag,
    label2id=tag2index,
)
xlmr_model = xlmr_model.to(device)

In [None]:
# Show the sample sentence again; it is encoded with the XLM-R tokenizer next.
print(text)

In [None]:
# Encode the sample sentence as a PyTorch tensor of token ids.
input_ids = xlmr_tokenizer.encode(text, return_tensors='pt')

# Row 0 of the tensor is the sentence itself; pair each id with its token.
first_sequence_ids = input_ids[0].numpy()
pd.DataFrame(data=[xlmr_tokens, first_sequence_ids], index=['Tokens', 'Input Ids'])

In [None]:
# Forward pass on the sample sentence; keep only the raw scores (logits).
outputs = xlmr_model(input_ids.to(device)).logits
print(outputs.shape)

# Highest-scoring class along the last axis, then map class ids to tag names.
predictions = outputs.argmax(dim=-1)
preds = [tags.names[tag_id] for tag_id in predictions[0].cpu().numpy()]
pd.DataFrame(data=[xlmr_tokens, preds], index=['Tokens', 'Tags'])

Let's wrap the steps above in a function:

In [None]:
def tag_text(text, tags, model, tokenizer):
    """Tag every token of `text` with the class predicted by `model`.

    Args:
        text: the raw sentence to tag.
        tags: object exposing `names`, a sequence mapping class id -> tag name.
        model: callable taking a tensor of input ids and returning a sequence
            whose first element holds the logits.
        tokenizer: tokenizer used for BOTH the displayed tokens and the input
            ids, so the two rows of the result always line up.

    Returns:
        A DataFrame with a 'Tokens' row and a 'Tags' row.
    """
    # Tokenize once with the tokenizer that was passed in; the previous version
    # built the ids from the global `xlmr_tokenizer` and ignored this argument.
    encoding = tokenizer(text, return_tensors='pt')
    tokens = encoding.tokens()
    input_ids = encoding['input_ids'].to(device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(input_ids)[0]
    predictions = torch.argmax(outputs, dim=2)
    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=['Tokens', 'Tags'])

In [None]:
# Word-level tokens and integer tag ids of the German example.
words = de_example['tokens']
labels = de_example['ner_tags']
pd.DataFrame(data=[words, labels], index=['words', 'labels'])

In [None]:
# The example is already split into words, so tell the tokenizer not to re-split it.
tokenized_input = xlmr_tokenizer(de_example['tokens'], is_split_into_words=True)

# Turn the resulting ids back into sub-word token strings for display.
subword_ids = tokenized_input['input_ids']
tokens = xlmr_tokenizer.convert_ids_to_tokens(subword_ids)

pd.DataFrame(data=[tokens], index=['Tokens'])

In [None]:
# For every sub-word token, the index of the word it came from.
word_ids = tokenized_input.word_ids()

pd.DataFrame(
    data=[tokens, word_ids],
    index=['Tokens', 'Word Ids'],
)

In [None]:
# Word-level tag ids of the example. They are read from `de_example` rather
# than from `labels`, because `labels` is rebound to tag *names* further down;
# reading it here made the cell fail with a KeyError when run a second time.
word_labels = de_example['ner_tags']

previous_word_idx = None
label_ids = []
for word_idx in word_ids:
    if word_idx is None or word_idx == previous_word_idx:
        # No word id, or the same word as the previous token: mark with -100
        # (shown as 'IGN' in the table below).
        label_ids.append(-100)
    else:
        # First token of a new word carries the word's tag id.
        label_ids.append(word_labels[word_idx])
    previous_word_idx = word_idx

labels = [index2tag[l] if l != -100 else 'IGN' for l in label_ids]

index = ['Tokens', 'Word Ids', 'Label Ids', 'Labels']

pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align tag ids to sub-word tokens.

    Args:
        examples: batch mapping with `'tokens'` (list of word lists) and
            `'ner_tags'` (list of per-word tag-id lists).

    Returns:
        The tokenizer output with an added `'labels'` entry. For each sentence
        the first token of every word carries that word's tag id; positions
        with no word id, and tokens continuing the previous word, get -100.
    """
    tokenized_inputs = xlmr_tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)
    aligned = []
    for row, word_tags in enumerate(examples['ner_tags']):
        row_word_ids = tokenized_inputs.word_ids(batch_index=row)
        # Pair each position with the word id that came just before it
        # (None before the first position).
        preceding = [None] + row_word_ids[:-1]
        aligned.append([
            -100 if current is None or current == previous else word_tags[current]
            for previous, current in zip(preceding, row_word_ids)
        ])
    tokenized_inputs['labels'] = aligned
    return tokenized_inputs

In [None]:
def encode_panx_dataset(corpus):
    """Apply `tokenize_and_align_labels` to `corpus` in batches.

    The raw `langs`, `ner_tags` and `tokens` columns are dropped from the result.

    Args:
        corpus: object with a `map(fn, batched=..., remove_columns=...)` method.

    Returns:
        Whatever `corpus.map` returns.
    """
    raw_columns = ['langs', 'ner_tags', 'tokens']
    return corpus.map(tokenize_and_align_labels, batched=True, remove_columns=raw_columns)

# Encode every split of the German corpus; used below as the train/validation data.
panx_de_encoded = encode_panx_dataset(panx_ch['de'])

### Evaluation Metrics

In [None]:
!pip install -q seqeval

In [None]:
from seqeval.metrics import classification_report

# Toy example for seqeval. The "outside" tag must be the letter 'O'; the digit
# '0' used previously is not a valid IOB tag and distorts the report.
y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']]
# The prediction starts the MISC entity one token too early, so only PER matches exactly.
y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']]

print(classification_report(y_true, y_pred))

In [None]:
import numpy as np

def align_predictions(predictions, label_ids):
    """Convert model outputs and gold label ids into lists of tag names.

    Positions whose gold label id is -100 are dropped from both outputs.

    Args:
        predictions: array indexed as [example, position, class]; the predicted
            class is the argmax over the last axis.
        label_ids: 2-D array of gold label ids indexed as [example, position].

    Returns:
        `(preds_list, labels_list)`: two lists with one list of tag names per
        example, in that order.
    """
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    labels_list, preds_list = [], []
    for batch_idx in range(batch_size):
        example_labels, example_preds = [], []
        for seq_idx in range(seq_len):
            if label_ids[batch_idx, seq_idx] != -100:
                example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
                example_preds.append(index2tag[preds[batch_idx][seq_idx]])

        labels_list.append(example_labels)
        # Append the predictions here; appending `example_labels` again made
        # every metric compare the gold labels with themselves.
        preds_list.append(example_preds)

    return preds_list, labels_list

## Model Training

In [None]:
from transformers import TrainingArguments

In [None]:
# Output directory of the run (passed as `output_dir` below).
model_name = 'multilingual-xlm-roberta-for-ner'

num_epochs, batch_size = 5, 24

# Number of full batches in the training split, i.e. log about once per epoch.
logging_steps = panx_de_encoded['train'].num_rows // batch_size

In [None]:
training_args = TrainingArguments(
    # Where results go and whether they are uploaded.
    output_dir=model_name,
    push_to_hub=True,
    # Optimisation schedule.
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Evaluation, checkpointing and logging cadence.
    evaluation_strategy='epoch',
    save_steps=1e6,  # very large step interval; presumably to avoid intermediate checkpoints
    logging_steps=logging_steps,
    log_level="error",
    report_to='none',
)

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub; the run is configured with push_to_hub=True.
notebook_login()

In [None]:
from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
    """Compute the F1 score for one evaluation pass.

    Args:
        eval_pred: object with `predictions` and `label_ids` attributes, both
            forwarded to `align_predictions`.

    Returns:
        A dict `{'f1': score}` where `score` comes from
        `f1_score(gold_tags, predicted_tags)`.
    """
    predicted_tags, gold_tags = align_predictions(eval_pred.predictions, eval_pred.label_ids)
    score = f1_score(gold_tags, predicted_tags)
    return {'f1': score}

In [None]:
from transformers import DataCollatorForTokenClassification

In [None]:
# Collator built around the XLM-R tokenizer; handed to the Trainer below to assemble batches.
data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

In [None]:
def model_init():
    """Build a new XLM-R token-classification model for the Trainer.

    A fresh model is loaded from the pretrained checkpoint on every call.
    Returning the shared global `xlmr_model` instead meant that a repeated
    `trainer.train()` continued from already fine-tuned weights.

    Returns:
        A new `XLMRobertaForTokenClassification` loaded from `xlmr_model_name`,
        configured with the notebook's label maps and moved to `device`.
    """
    return XLMRobertaForTokenClassification.from_pretrained(
        xlmr_model_name,
        num_labels=num_labels,
        id2label=index2tag,
        label2id=tag2index,
    ).to(device)

In [None]:
from transformers import Trainer

In [None]:
trainer = Trainer(
    # Model factory and run configuration.
    model_init=model_init,
    args=training_args,
    # Data: encoded German train/validation splits, batched by the collator.
    train_dataset=panx_de_encoded['train'],
    eval_dataset=panx_de_encoded['validation'],
    data_collator=data_collator,
    tokenizer=xlmr_tokenizer,
    # Reports F1 at each evaluation.
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune on the German split, then upload the result to the Hub with a commit message.
trainer.train()
trainer.push_to_hub(commit_message='Training completed.')