In [1]:
!pip install -q transformers
!pip install --upgrade accelerate
!pip install -q sentencepiece



In [2]:
import os
import torch
import random
import warnings
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Ignore every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Process-wide environment flags, set before any tokenizer/trainer is created.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "WANDB_DISABLED": "true",
})

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

def seed_everything(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus every CUDA device), and switches cuDNN to deterministic,
    non-autotuned kernels.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects processes started after this point; the hash seed of the
    # already-running interpreter cannot be changed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not just the current device (silently ignored without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning can select different convolution kernels from run to run.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=69)

In [3]:
# Read the training and test CSV files.
train_df = pd.read_csv("/content/train.csv")
test_df = pd.read_csv("/content/Test.csv")

# Keep only the training rows whose `word` cell is not null.
has_word = train_df["word"].notna()
train_df = train_df.loc[has_word]
train_df.head()

Unnamed: 0,word,tag,lang
0,Do,VERB,pcm
1,senator,NOUN,pcm
2,tok,VERB,pcm
3,dis,DET,pcm
4,one,NUM,pcm


In [4]:
# Keep only the rows whose `lang` value is `wol` or `pcm`.
df = train_df.loc[train_df['lang'].isin(['wol', 'pcm'])]

def get_samples(df, full_val_samples=True, max_sentences_per_lang=200):
    """Group the token rows of ``df`` into sentences, one language at a time.

    Rows are read in order within each value of ``df.lang``. A sentence is
    closed whenever a word equals ``.``, ``?`` or ``!`` after stripping
    whitespace. Tokens left over at the end of a language (no terminator) are
    emitted as a final sentence of that language instead of leaking into the
    next language or being dropped.

    Args:
        df: DataFrame with ``word``, ``tag`` and ``lang`` columns; ``word``
            values must be strings.
        full_val_samples: When False, stop reading a language as soon as
            ``max_sentences_per_lang`` terminated sentences were collected.
        max_sentences_per_lang: Per-language cap used when
            ``full_val_samples`` is False.

    Returns:
        A ``(sentences, taggings)`` tuple of parallel lists; each element is
        the list of words (respectively tags) of one sentence.
    """
    sentence_terminators = ('.', '?', '!')
    sentences = []
    taggings = []

    languages = df.lang.unique()
    for lang in tqdm(languages, total=len(languages)):
        lang_rows = df[df.lang == lang]

        # Buffers are per language so an unterminated tail can never be glued
        # onto the first sentence of the following language.
        current_sentence = []
        current_tagging = []
        sentence_count = 0

        for word, tag in zip(lang_rows['word'], lang_rows['tag']):
            if not full_val_samples and sentence_count == max_sentences_per_lang:
                break

            # Strip U+008D. NOTE(review): the original comment called this a
            # soft hyphen, but the soft hyphen is U+00AD - confirm which
            # character the data actually contains.
            word = word.replace('\x8d', '')

            current_sentence.append(word)
            current_tagging.append(tag)

            if word.strip() in sentence_terminators:
                sentence_count += 1
                sentences.append(current_sentence)
                taggings.append(current_tagging)
                current_sentence = []
                current_tagging = []

        # Flush a trailing sentence that never reached a terminator.
        if current_sentence:
            sentences.append(current_sentence)
            taggings.append(current_tagging)

    return sentences, taggings

# Only the word lists are needed for MLM pre-training; the tags are discarded.
train_sent, _ = get_samples(df, full_val_samples=True)

  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
class PretrainingDataset(Dataset):
    """Dataset that tokenizes one raw text per item.

    Args:
        texts: Indexable collection of texts; its length is the dataset size.
        tokenizer: Callable invoked with keyword arguments (``text``,
            ``text_pair``, ``max_length``, ...) that returns the encoding.
        max_length: Maximum sequence length passed to the tokenizer.
    """

    def __init__(self, texts, tokenizer, max_length=128):
        super().__init__()

        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def tokenize(self, text, text_pair=None):
        """Tokenize ``text`` (and the optional ``text_pair``) with truncation.

        ``text_pair`` is forwarded to the tokenizer; it used to be accepted
        but silently ignored.
        """
        return self.tokenizer(
            text=text,
            text_pair=text_pair,
            max_length=self.max_length,
            truncation=True,
            # NOTE(review): with a single sequence this presumably pads
            # nothing; batch padding is left to the data collator - confirm.
            padding=True,
            return_attention_mask=True,
            add_special_tokens=True,
            # Requested so the encoding carries a special-tokens mask.
            return_special_tokens_mask=True,
            return_token_type_ids=False,
            return_offsets_mapping=False,
            return_tensors=None,
        )

    def __getitem__(self, index):
        """Return the tokenizer output for the text at ``index``."""
        return self.tokenize(self.texts[index])

In [6]:
def main(output_dir, model_name_or_path):
    """Run masked-language-model training and save the model and tokenizer.

    Loads ``model_name_or_path`` as a masked LM, trains it for 5 epochs with a
    masking probability of 0.25, then writes the model and the tokenizer to
    ``output_dir``.

    This function reads three notebook-level globals that must be defined
    before it is called: ``device``, ``tokenizer`` and ``dataset``.

    Args:
        output_dir: Directory for checkpoints and the final model/tokenizer.
        model_name_or_path: Checkpoint name or path to start from.
    """
    model = AutoModelForMaskedLM.from_pretrained(model_name_or_path).to(device)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.25,
    )

    # NOTE(review): no `seed` is passed here, so the trainer presumably uses
    # its own default rather than the notebook's seed - confirm if that matters.
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=5,
        per_device_train_batch_size=8,
        save_strategy="steps",
        save_steps=150,
        # Keep a single checkpoint on disk.
        save_total_limit=1,
        # Disable logging integrations explicitly, as recommended by the
        # deprecation warning for the WANDB_DISABLED environment variable.
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()
    trainer.save_model(training_args.output_dir)
    tokenizer.save_pretrained(training_args.output_dir)


In [7]:
import glob

# glob returns matches in arbitrary order; sorting keeps the corpus order
# (and therefore the seeded shuffling done later) stable across runs.
files = sorted(glob.glob("/content/*.txt"))

t_sentences = []
for path in files:
    # Explicit encoding so decoding does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()

    # One sample per line; blank lines would otherwise become empty samples.
    t_sentences.extend(line for line in text.split("\n") if line.strip())

In [8]:
# Join the words of every sentence into one space-separated string.
train_sentences = [" ".join(tokens) for tokens in train_sent]

In [9]:
# Place the joined training sentences ahead of the lines read from the text files.
# NOTE: re-running this cell prepends `train_sentences` a second time.
t_sentences = [*train_sentences, *t_sentences]

In [10]:
# Checkpoint to start from and the directory that receives the trained model.
model_name_or_path = "Davlan/afro-xlmr-large-75L"
output_dir = "/content/mlm_model"

# The tokenizer is loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
dataset = PretrainingDataset(texts=t_sentences, tokenizer=tokenizer)

In [None]:
# Start training; the model and tokenizer are written to `output_dir`.
main(output_dir=output_dir, model_name_or_path=model_name_or_path)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
