In [1]:
import os
from os.path import exists
from typing import Dict, List, Optional
from collections import Counter
import csv
import torch
from torch import nn, Tensor
import torch.optim as optim
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from tqdm import tqdm
import torchmetrics
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import torch.nn.functional as F
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [2]:
def load_raw_data(src_filepath: str, target_filepath: Optional[str] = None,
                  encoding: str = "utf-8") -> Dict[str, List[str]]:
    """Read a one-sentence-per-line corpus into a dict of sentence lists.

    Args:
        src_filepath: Path of the source-language file.
        target_filepath: Optional path of the target-language file. When it is
            omitted (or empty) the result has no ``'target_text'`` key.
        encoding: Text encoding used to decode both files. Defaults to UTF-8 so
            that the result does not depend on the platform's locale encoding.

    Returns:
        ``{'src_text': [...]}`` and, when a target file is given, an additional
        ``'target_text'`` list. Each entry is one line of the file with leading
        and trailing whitespace (including the newline) stripped.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid in ``encoding``.
    """
    def _read_lines(path: str) -> List[str]:
        # Blank lines are kept (as empty strings) so line numbers stay aligned
        # between the source and target files.
        with open(path, encoding=encoding) as f:
            return [line.strip() for line in f]

    data = {'src_text': _read_lines(src_filepath)}

    if target_filepath:
        data['target_text'] = _read_lines(target_filepath)

    return data

def predict(model: nn.Module, dataloader: DataLoader, tokenizer: AutoTokenizer, device: torch.device,
            target_lang: str = "spa_Latn", max_length: int = 512) -> List[List[str]]:
    """Translate every batch yielded by ``dataloader`` with ``model.generate``.

    Args:
        model: Sequence-to-sequence model exposing ``eval()`` and ``generate()``.
        dataloader: Iterable whose items are batches (lists) of raw source sentences.
        tokenizer: Tokenizer used both to encode the sources and to decode the
            generated ids.
        device: Device the encoded inputs are moved to before generation.
        target_lang: Language-code token the decoder is forced to start with.
        max_length: ``max_length`` forwarded to ``model.generate``.

    Returns:
        One list of decoded translations per batch, in dataloader order.
    """
    model.eval()

    # Resolve the target-language token once, outside the loop.
    # `convert_tokens_to_ids` is the form used in the Hugging Face NLLB docs;
    # the `lang_code_to_id` mapping is deprecated in recent transformers releases.
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(target_lang)

    preds = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            # Padding and truncation are arguments of the tokenizer *call*.
            # Without `padding=True`, sentences of different lengths cannot be
            # stacked into one tensor, which limited this loop to batch_size == 1.
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)

            # `generate` returns token ids (not logits).
            generated_ids = model.generate(**inputs, max_length=max_length,
                                           forced_bos_token_id=forced_bos_token_id)

            # skip_special_tokens drops special tokens when decoding.
            preds.append(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))

    return preds

In [3]:
class LanguageDataset:
    """Map-style dataset over raw sentences, optionally paired with targets.

    Temporary dataset class until issue 1 is fixed: the tokenizers are only
    stored on the instance; items are returned as untokenized strings.

    Attributes are plain instance fields:
      src_tokenizer / target_tokenizer -- tokenizers kept for later use
          (``target_tokenizer`` is ``None`` when none was supplied).
      src_text / target_text -- lists of sentences (``target_text`` is empty
          for source-only data).
      with_target -- True when ``raw_data`` contained ``'target_text'``.
    """

    def __init__(self, raw_data: Dict[str, List[str]], src_tokenizer: AutoTokenizer, target_tokenizer: Optional[AutoTokenizer] = None):
        """Store the sentences and tokenizers.

        Args:
            raw_data: Dict with a ``'src_text'`` list and, optionally, a
                ``'target_text'`` list of the same length.
            src_tokenizer: Tokenizer for the source language.
            target_tokenizer: Tokenizer for the target language; required when
                ``raw_data`` has ``'target_text'``.

        Raises:
            ValueError: If targets are present but no target tokenizer is given,
                or if source and target lists differ in length.
            KeyError: If ``raw_data`` has no ``'src_text'`` entry.
        """
        self.src_tokenizer = src_tokenizer
        # Defined unconditionally so reading the attribute never raises
        # AttributeError on a source-only dataset.
        self.target_tokenizer = target_tokenizer
        self.src_text = raw_data['src_text']
        self.target_text = []
        self.with_target = False

        if 'target_text' in raw_data:
            if target_tokenizer is None:
                raise ValueError("Expect tokenizer for target language: Got None")

            target_text = raw_data['target_text']
            # __len__ is driven by the source side, so a shorter target list
            # would fail with IndexError mid-iteration and a longer one would be
            # silently cut off. Reject misaligned data up front instead.
            if len(target_text) != len(self.src_text):
                raise ValueError(
                    "Source and target must have the same number of sentences: "
                    f"got {len(self.src_text)} and {len(target_text)}"
                )

            self.with_target = True
            self.target_text = target_text

    def __len__(self):
        """Return the number of source sentences."""
        return len(self.src_text)

    def __getitem__(self, idx):
        """Return ``(source, target)`` when targets exist, else just ``source``."""
        if self.with_target:
            # for training and validation
            return self.src_text[idx], self.target_text[idx]
        else:
            # for testing
            return self.src_text[idx]

In [5]:
# --- Paths --------------------------------------------------------------------
# The Aymara files live in a sub-folder of the processed-data root.
main_folder = './processed_data/'
aymara_folder = f"{main_folder}aymara/"

# --- Run configuration --------------------------------------------------------
# Hugging Face hub id of the checkpoint loaded below.
model_name = "facebook/nllb-200-distilled-600M"

# Fix torch's global RNG seed.
torch.manual_seed(42)

# Use CUDA when torch reports it as available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# --- Model --------------------------------------------------------------------
print("Model Loading . . . . . . . . . . . . . . . .")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)
print("Model Loaded")

Model Loading . . . . . . . . . . . . . . . .
Model Loaded


In [7]:
# Load the Aymara dev split (source side only: no target file is passed).
aymara_dev_raw = load_raw_data(aymara_folder + 'dev.aym')

# NLLB-200 uses FLORES-200 language codes; Central Aymara is "ayr_Latn"
# ("aym" is the ISO macrolanguage code and is not an NLLB language token).
# Padding/truncation options are arguments of the tokenizer *call*; passing them
# to from_pretrained has no effect on encoding, so they are not given here.
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="ayr_Latn")

aymara_dev_data = LanguageDataset(aymara_dev_raw, tokenizer)
# batch_size=1 keeps every batch a single sentence, so no padding is needed to
# build the input tensor.
aymara_dev_dataloader = DataLoader(aymara_dev_data, batch_size=1)


In [8]:
# ISSUE 1: why the encodings come back unpadded
#
# Reference: https://huggingface.co/docs/transformers/model_doc/nllb
#
# `padding`, `truncation` and `max_length` are arguments of the tokenizer *call*,
# not of `AutoTokenizer.from_pretrained`. Given to `from_pretrained` they have no
# effect on encoding, which is why the encoding shown below carries no padding
# and why a batch of sentences with different lengths cannot be stacked into one
# tensor. Any code that tokenizes batches of more than one sentence has to
# request padding in the call itself, e.g.:
#
#     tokenizer(list_of_sentences, padding=True, truncation=True, return_tensors="pt")
#
# Values accepted by `padding` (bool, str or PaddingStrategy, default False):
#   True or 'longest'      : pad to the longest sequence in the batch (no padding
#                            if only a single sequence is provided).
#   'max_length'           : pad to `max_length`, or to the model's maximum
#                            acceptable input length if `max_length` is not given.
#   False or 'do_not_pad'  : no padding (a batch may then contain sequences of
#                            different lengths).
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="ayr_Latn")

# Encode a single sentence without requesting padding to inspect the raw ids.
tokenizer(aymara_dev_raw['src_text'][0])

{'input_ids': [256018, 32360, 80651, 21616, 2], 'attention_mask': [1, 1, 1, 1, 1]}

In [None]:
# Translate the dev split with the pretrained (not fine-tuned) model.
preds = predict(model, aymara_dev_dataloader, tokenizer, device)

# predict() returns one list of translations per batch. Write one translation per
# line so the output stays line-aligned with the source file for any batch size,
# and fix the encoding so accented characters do not depend on the platform locale.
with open(aymara_folder+"pretrain_result.txt", "w", encoding="utf-8") as f:
    for batch_translations in preds:
        for translation in batch_translations:
            f.write(translation + "\n")