# Installs

In [1]:
# !pip install -q torch==1.13.1 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116

In [2]:
# !pip install -q -r requirements.txt

# Specs

In [3]:
!nvidia-smi

Fri Sep 15 13:51:38 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    Off  | 00000000:00:05.0 Off |                  Off |
| 30%   31C    P8    22W / 300W |      1MiB / 49140MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Imports

In [4]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

import subprocess
from joblib import Parallel, delayed
import multiprocessing

import cv2
import PIL
from PIL import Image
import matplotlib.pyplot as plt

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, StratifiedGroupKFold, GroupKFold
from sklearn.metrics import f1_score, mean_squared_error, accuracy_score

In [60]:
from collections import Counter
from glob import glob

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR
from torch import LongTensor
from torch import nn, optim
from torch.nn import CrossEntropyLoss
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler

In [7]:
import argparse
import logging

from scipy.sparse import save_npz, load_npz
# from seqeval.metrics import f1_score, precision_score, recall_score
from tqdm import tqdm, trange
import datasets
from datasets import load_dataset, DatasetDict, Dataset as HFDataset
from datasets import concatenate_datasets, interleave_datasets
from datasets import ClassLabel, load_metric

import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_FOR_MASKED_LM_MAPPING,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    set_seed,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    DataCollatorForTokenClassification,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version

%env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


# Envs

In [8]:
def disable_warnings(strict=False):
	"""Silence Python warnings and, optionally, low-severity log records.

	Args:
		strict: When true, additionally calls ``logging.disable`` with
			``logging.WARNING`` so records at that level and below are dropped.
	"""
	warnings.simplefilter(action='ignore')
	if not strict:
		return
	logging.disable(level=logging.WARNING)

def seed_everything(seed=42):
	"""Seed every random number generator used in this notebook.

	Sets the ``PYTHONHASHSEED`` environment variable, seeds Python's
	``random``, NumPy and PyTorch (CPU and CUDA), and switches cuDNN to
	deterministic mode with benchmarking turned off.

	Args:
		seed: Integer seed applied to all generators.
	"""
	os.environ['PYTHONHASHSEED'] = str(seed)
	for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
		seed_fn(seed)
	cudnn = torch.backends.cudnn
	cudnn.benchmark = False
	cudnn.deterministic = True

In [9]:
# Single seed shared by every generator that seed_everything touches.
SEED = 42

# Non-strict mode: Python warnings are ignored, the logging module is left untouched.
disable_warnings()
seed_everything(SEED)

# Data

In [12]:
# Root folder of the raw corpus. load_test_data_ner concatenates it directly with
# '<lang>/<split>.txt', so the trailing slash is required.
path  = '../lacuna_pos_ner/language_corpus/'

In [41]:
def load_test_data_ner(data_path, lang, split='train'):
    """Load one raw text file as a frame of space-tokenised sentences.

    Reads ``{data_path}{lang}/{split}.txt`` (``data_path`` is concatenated
    as-is, so it must end with a path separator), treats every non-empty line
    as one sentence and detaches punctuation from the preceding word so that
    splitting the sentence on whitespace yields one token per word or
    punctuation mark.

    Args:
        data_path: Directory prefix of the corpus.
        lang: Sub-folder to read from; the part before the first ``_`` is used
            as the language code.
        split: File stem (without ``.txt``) inside the language folder.

    Returns:
        A DataFrame with the columns ``Word`` (processed sentence),
        ``Language``, ``family`` and ``region`` (all three hold the language
        code). An empty DataFrame is returned when the file does not exist.
    """
    language = lang.split('_')[0]
    detach_chars = "!#$%&()*+,/:;<=>?@[]^_{|}~"

    sentences = []
    try:
        # Explicit encoding so the result does not depend on the platform locale.
        with open(f'{data_path}{lang}/{split}.txt', 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                # The dot must be escaped: the previous pattern '.$' matched ANY
                # final character and overwrote it with ' .', truncating every
                # sentence that did not end with a period.
                line = re.sub(r'\.$', ' .', line)
                line = line.replace('. ', ' . ')
                for punct in detach_chars:
                    line = line.replace(punct, f' {punct}')
                # The replacements above can create runs of spaces (e.g. when the
                # punctuation was already detached); collapse them so that both
                # split(' ') and split() see the same tokens.
                line = ' '.join(line.split())

                if line:
                    sentences.append(line)
    except FileNotFoundError:
        # Missing corpus files are tolerated: they contribute no rows.
        return pd.DataFrame()

    data = pd.DataFrame({
        'Word': sentences,
        'Language': language,
    })
    data['family'] = language
    data['region'] = language
    return data

In [42]:
# dir(string)
# list("!#$%&\()*+,/:;<=>?@[\\]^_{|}~")

In [43]:
# Assemble the unlabeled inference set from luo/luo.txt, tsn/tsn.txt and tsn/tsn2.txt.
# A file that does not exist contributes an empty frame (see load_test_data_ner),
# and rows that are exact duplicates are dropped before re-indexing.
data = pd.concat([
    load_test_data_ner(path, 'luo', 'luo'),
    load_test_data_ner(path, 'tsn', 'tsn'),
    load_test_data_ner(path, 'tsn', 'tsn2'),
]).drop_duplicates().reset_index(drop=True)

display(data.head())
display(data.tail())

Unnamed: 0,Word,Language,family,region
0,Jalup Ker Ruto nobedo e nyasi achiel kod Ker U...,luo,luo,luo
1,"Jalup Ker , William Ruto mosebedo kabare e ny...",luo,luo,luo
2,"Nyasino timore Mei tarik 27 e Od Piny Owacho ,...",luo,luo,luo
3,Ma en kinde mar adek ma jotelo ariyogo obedo e...,luo,luo,luo
4,"Chieng' Tich Ariyo , Dan Maanzo , ma en jakon...",luo,luo,luo


Unnamed: 0,Word,Language,family,region
12175,Bagaka ba Bundesliga ba ba tshwanang le motshw...,tsn,tsn,tsn
12176,"Re tshwanetse go diragatsa togamaano ya rona ,...",tsn,tsn,tsn
12177,Hungary le Portugal di tshamekile mengwe ya me...,tsn,tsn,tsn
12178,Hungary e ne e itlhomile kwa pele mo setlhophe...,tsn,tsn,tsn
12179,Ditlhopha di le robedi di tlaa tsenela dikgais...,tsn,tsn,tsn


# Inference (pseudo-labelling)

In [44]:
data.head()

Unnamed: 0,Word,Language,family,region
0,Jalup Ker Ruto nobedo e nyasi achiel kod Ker U...,luo,luo,luo
1,"Jalup Ker , William Ruto mosebedo kabare e ny...",luo,luo,luo
2,"Nyasino timore Mei tarik 27 e Od Piny Owacho ,...",luo,luo,luo
3,Ma en kinde mar adek ma jotelo ariyogo obedo e...,luo,luo,luo
4,"Chieng' Tich Ariyo , Dan Maanzo , ma en jakon...",luo,luo,luo


In [51]:
# Column that holds the space-separated sentence.
text_column_name = 'Word'
# Column that holds the space-separated tags; when it is absent from a batch,
# process_dataset treats the batch as unlabeled test data.
label_column_name = 'Pos'
# Size of the classification head; equals the number of entries in id_to_label.
num_labels = 18

In [64]:
# Tag inventory of the token-classification head. 'NAW' (id 17) is the filler
# class that process_dataset assigns to the non-first sub-tokens of a word.
id_to_label = {
    0: 'ADJ',
    1: 'ADP',
    2: 'ADV',
    3: 'AUX',
    4: 'CCONJ',
    5: 'DET',
    6: 'INTJ',
    7: 'NOUN',
    8: 'NUM',
    9: 'PART',
    10: 'PRON',
    11: 'PROPN',
    12: 'PUNCT',
    13: 'SCONJ',
    14: 'SYM',
    15: 'VERB',
    16: 'X',
    17: 'NAW',
}

# Inverse lookup (tag -> id). process_dataset needs it whenever a batch carries
# labels, but the notebook never defined it, so it only worked with leftover
# kernel state.
label_to_id = {label: idx for idx, label in id_to_label.items()}

In [56]:
def load_model(model_path):
    """Instantiate a token-classification model from a saved checkpoint.

    Args:
        model_path: Checkpoint directory or model identifier understood by
            ``AutoModelForTokenClassification.from_pretrained``.

    Returns:
        The loaded model, with its classification head sized by the
        module-level ``num_labels``.
    """
    return AutoModelForTokenClassification.from_pretrained(model_path, num_labels=num_labels)

In [48]:
model_name = 'Davlan/afro-xlmr-large-75L'

# NOTE(review): 1024 may exceed the longest sequence the checkpoint accepts —
# confirm against config.max_position_embeddings before relying on it as a
# truncation length.
max_seq_length = 1024
# No padding at tokenisation time; batches are padded later by the data collator.
padding = False

config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)

tokenizer_name_or_path = model_name
# Both cases load a fast tokenizer; only gpt2/roberta model types additionally
# get add_prefix_space=True.
tokenizer_kwargs = {'use_fast': True}
if config.model_type in {"gpt2", "roberta"}:
    tokenizer_kwargs['add_prefix_space'] = True
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, **tokenizer_kwargs)

data_collator = DataCollatorForTokenClassification(tokenizer, max_length=max_seq_length)

Downloading (…)lve/main/config.json:   0%|          | 0.00/714 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [49]:
def process_dataset(examples):
    """Tokenise a batch of sentences and, when labels are present, align them.

    Written for ``Dataset.map(..., batched=True)``. Relies on the module-level
    ``text_column_name``, ``label_column_name``, ``tokenizer``, ``padding``,
    ``max_seq_length`` and (labelled batches only) ``label_to_id``.

    Args:
        examples: Mapping from column name to a list of strings, one entry per
            sentence. Words, and tags if present, are whitespace-separated.

    Returns:
        The tokenizer output for the batch. For labelled batches it also holds
        a ``"labels"`` entry with one id per sub-token: ``-100`` for positions
        without a word (special tokens), the word's tag id on its first
        sub-token and the ``'NAW'`` id on every further sub-token.
    """
    raw_labels = examples.get(label_column_name)
    is_test = raw_labels is None

    # Build fresh lists instead of writing the split result back into
    # `examples`, so the caller's batch keeps its original string columns.
    # str.split() without an argument ignores leading, trailing and repeated
    # whitespace; splitting on ' ' produced empty-string "words" whenever a
    # sentence contained two consecutive spaces.
    words = [text.split() for text in examples[text_column_name]]

    tokenized_inputs = tokenizer(
        words,
        padding=padding,
        truncation=True,
        # Unlabelled batches pass None, leaving the length limit to the
        # tokenizer's own default.
        max_length=max_seq_length if not is_test else None,
        is_split_into_words=True,
    )

    if is_test:
        return tokenized_inputs

    labels = []
    for i, sentence_tags in enumerate(raw_labels):
        tags = sentence_tags.split()
        previous_word_idx = None
        label_ids = []
        for word_idx in tokenized_inputs.word_ids(batch_index=i):
            if word_idx is None:
                # Position not backed by any input word.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-token of a word carries the word's tag.
                label_ids.append(label_to_id[tags[word_idx]])
            else:
                # Remaining sub-tokens of the same word get the filler class.
                label_ids.append(label_to_id['NAW'])
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [53]:
# Convert the frame to a Hugging Face dataset and keep only the sentence column.
test_dataset = HFDataset.from_pandas(data).remove_columns(column_names=['Language', 'family', 'region'])
# Tokenise in batches. 'Word' is removed afterwards, so only the tokenizer
# outputs (input_ids, attention_mask) remain in the dataset.
test_dataset = test_dataset.map(
    process_dataset,
    batched=True,
    remove_columns=['Word'],
)
test_dataset

Map:   0%|          | 0/12180 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 12180
})

In [73]:
# Longest tokenised sentence (in sub-tokens) in the inference set.
max(len(token_ids) for token_ids in test_dataset['input_ids'])

287

In [58]:
def predict_for_fold(fold):
    """Run inference on ``test_dataset`` with the saved checkpoint of one fold.

    Uses the module-level ``model_name``, ``tokenizer``, ``data_collator`` and
    ``test_dataset``.

    Args:
        fold: Fold identifier; it names the sub-folder of both the Trainer
            output directory and the checkpoint lookup.

    Returns:
        The object returned by ``Trainer.predict`` for ``test_dataset``.

    Raises:
        FileNotFoundError: If no ``checkpoint-*`` entry exists for the fold.
    """
    checkpoint_pattern = f'./pos-tagging-ner/Davlan-afro-xlmr-large-75L-best/{fold}/checkpoint-*'
    # glob returns matches in arbitrary order; sorting makes the pick
    # reproducible when a fold folder holds more than one checkpoint.
    checkpoints = sorted(glob(checkpoint_pattern))
    if not checkpoints:
        # Previously this surfaced as a bare IndexError from `glob(...)[0]`.
        raise FileNotFoundError(f'No checkpoint found for fold {fold}: {checkpoint_pattern}')
    model_path = checkpoints[0]

    # Prediction only: the training flags, the train batch size and the
    # train/eval datasets that were passed before played no part in predict().
    training_args = TrainingArguments(
        output_dir=f'pos-tagging-ner/{model_name.replace("/", "-")}/{fold}',
        per_device_eval_batch_size=32,
        fp16=True,
        report_to='none'
    )

    model = load_model(model_path)

    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    test_pos_ids = trainer.predict(test_dataset)

    return test_pos_ids

In [61]:
# One prediction output per fold checkpoint (folds 1 to 4), kept for ensembling.
all_preds = []

for fold in range(1, 5):
    all_preds.append(predict_for_fold(fold))

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [62]:
# Ensemble by averaging the raw per-fold model outputs, then pick the
# highest-scoring class for every token position.
fold_outputs = [fold_result.predictions for fold_result in all_preds]
test_pos_ids = np.mean(fold_outputs, axis=0)
print(test_pos_ids.shape)
test_pos_ids = np.argmax(test_pos_ids, axis=-1)

(12180, 287, 18)


In [86]:
# Turn the per-position class ids into one space-separated tag string per sentence.
final_pos = []
for pos_ids, sentence in zip(test_pos_ids, data.Word.values):
    # Number of tags to emit = number of whitespace-separated words.
    length = len(sentence.split())
    # Drop every position predicted as 17 ('NAW'), skip the first remaining
    # prediction (presumably the leading special token — TODO confirm) and keep
    # the next `length` tags.
    # NOTE(review): pos_ids is the result of an argmax, so it only contains
    # class indices and the -100 filter can never match; predictions at special
    # and padded positions are therefore not removed by it.
    # NOTE(review): the word/tag alignment relies on the model predicting 'NAW'
    # exactly on continuation sub-tokens; selecting first sub-tokens through the
    # tokenizer's word_ids would not depend on that — consider switching.
    final_pos.append(
        " ".join(list(map(id_to_label.get, [x for x in pos_ids if x not in [-100, 17]]))[1:length+1])
    )
    
    # Only checks the tag count, not that each tag belongs to the right word.
    assert length == len(final_pos[-1].split())

In [87]:
# Attach the predicted tag strings; final_pos was built in the row order of `data`.
data['Pos'] = final_pos

In [88]:
data.head()

Unnamed: 0,Word,Language,family,region,Pos
0,Jalup Ker Ruto nobedo e nyasi achiel kod Ker U...,luo,luo,luo,NOUN PROPN PROPN VERB ADP NOUN NUM ADP PROPN P...
1,"Jalup Ker , William Ruto mosebedo kabare e ny...",luo,luo,luo,NOUN NOUN PUNCT PROPN PROPN VERB ADV ADP NOUN ...
2,"Nyasino timore Mei tarik 27 e Od Piny Owacho ,...",luo,luo,luo,NOUN VERB PROPN NOUN NUM ADP NOUN PROPN PROPN ...
3,Ma en kinde mar adek ma jotelo ariyogo obedo e...,luo,luo,luo,PRON AUX NOUN ADP NUM DET NOUN NUM VERB ADP NO...
4,"Chieng' Tich Ariyo , Dan Maanzo , ma en jakon...",luo,luo,luo,NOUN NOUN NUM PUNCT PROPN PROPN PUNCT PRON AUX...


In [89]:
# Persist the pseudo-labelled frame. DataFrame.to_csv does not create missing
# folders, so make sure the target directory exists first.
output_path = f'pseudos/pos-ner-{model_name.split("/")[-1]}-best.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
data.to_csv(output_path, index=False)