In [2]:
import os 
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, BertConfig, BertForTokenClassification
from utils import get_tokenizer_training_corpus, load_data, ID2LABEL

In [6]:
# Language code handed to the data-loading helpers as their `lang` argument.
LANG = "ewe"
# Checkpoint whose tokenizer pipeline and model configuration are reused below.
BASE_MODEL = "bert-base-uncased"
# Corpus root, resolved relative to the notebook's working directory.
DATA_DIR = os.path.join(os.pardir, "data_source", "masakhane-pos", "data")

In [7]:
# Materialise one Dataset per split by streaming examples out of `load_data`,
# then bundle them into a single DatasetDict keyed by split name.
ewe_dataset = DatasetDict(
    {
        split_name: Dataset.from_generator(
            load_data,
            gen_kwargs={'lang': LANG, 'split': split_name, 'data_dir': DATA_DIR},
        )
        for split_name in ('train', 'test', 'dev')
    }
)
ewe_dataset

Generating train split: 728 examples [00:00, 21918.56 examples/s]
Generating train split: 582 examples [00:00, 44958.84 examples/s]
Generating train split: 145 examples [00:00, 26178.29 examples/s]


DatasetDict({
    train: Dataset({
        features: ['tokens', 'pos_tags'],
        num_rows: 728
    })
    test: Dataset({
        features: ['tokens', 'pos_tags'],
        num_rows: 582
    })
    dev: Dataset({
        features: ['tokens', 'pos_tags'],
        num_rows: 145
    })
})

In [8]:
def build_and_train_tokenizer(lang, data_dir, base_model):
    """Train a fresh tokenizer on the corpus for ``lang``.

    The tokenizer of ``base_model`` is loaded and used as the template for
    ``train_new_from_iterator``; the requested vocabulary size is the
    template tokenizer's own ``vocab_size``.

    Args:
        lang: Language identifier forwarded to ``get_tokenizer_training_corpus``.
        data_dir: Data directory forwarded to ``get_tokenizer_training_corpus``.
        base_model: Name or path passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The newly trained tokenizer.
    """
    corpus = get_tokenizer_training_corpus(lang, data_dir)
    template_tokenizer = AutoTokenizer.from_pretrained(base_model)
    target_vocab_size = template_tokenizer.vocab_size
    return template_tokenizer.train_new_from_iterator(corpus, target_vocab_size)

In [9]:
# Train the tokenizer for the configured language and checkpoint.
tokenizer = build_and_train_tokenizer(
    lang=LANG,
    data_dir=DATA_DIR,
    base_model=BASE_MODEL,
)
# Sanity check: tokenize the second token of the first training example.
tokenizer.tokenize(ewe_dataset['train'][0]['tokens'][1])






[':']

In [10]:
def create_model(vocab_size, num_hidden, num_attention, base_model=None):
    """Build a BERT token-classification model from an overridden base config.

    The configuration of ``base_model`` is loaded and its depth, number of
    attention heads, vocabulary size and label mappings are overridden. The
    model is then constructed from that configuration object alone (not via
    ``from_pretrained``), so only the configuration of the checkpoint is reused.

    Args:
        vocab_size: Vocabulary size for the embedding table; should match the
            tokenizer the model will be used with.
        num_hidden: Number of encoder layers (``num_hidden_layers``).
        num_attention: Number of attention heads per layer
            (``num_attention_heads``).
        base_model: Checkpoint name or path whose configuration is used as the
            starting point. Defaults to the notebook-level ``BASE_MODEL``.

    Returns:
        A ``BertForTokenClassification`` instance built from the configuration.
    """
    if base_model is None:
        # Resolved at call time so a changed BASE_MODEL is picked up on re-run.
        base_model = BASE_MODEL

    # Overriding only id2label would leave the label2id inherited from the base
    # checkpoint in place, so the two mappings would disagree. Pass the inverse
    # mapping explicitly.
    # NOTE(review): assumes ID2LABEL is a dict of integer-like id -> label name.
    label2id = {label: int(idx) for idx, label in ID2LABEL.items()}

    config = BertConfig.from_pretrained(
        base_model,
        num_hidden_layers=num_hidden,
        num_attention_heads=num_attention,
        id2label=ID2LABEL,
        label2id=label2id,
        vocab_size=vocab_size,
    )
    return BertForTokenClassification(config)

In [11]:
# Architecture overrides: two encoder layers with a single attention head each.
num_hidden, num_attention = 2, 1
model = create_model(
    vocab_size=tokenizer.vocab_size,
    num_hidden=num_hidden,
    num_attention=num_attention,
)
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(4672, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, eleme