In [2]:
import os
from os.path import exists
from typing import Dict, List, Optional
from collections import Counter
import csv
import torch
from torch import nn, Tensor
import torch.optim as optim
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from tqdm import tqdm
import torchmetrics
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import torch.nn.functional as F
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

from LanguageDataset import Tokenizer, LanguageDataset
from utils import load_raw_data, predict

In [3]:
# Data locations. Both folders keep their trailing slash because later cells
# build file paths from them by plain string concatenation.
main_folder = './processed_data/'
aymara_folder = f"{main_folder}aymara/"

# Fix the torch RNG seed and pick the GPU when one is available.
torch.manual_seed(42)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Hugging Face checkpoint identifier shared by the model and the tokenizers.
model_name = "facebook/nllb-200-distilled-600M"

print("Model Loading . . . . . . . . . . . . . . . .")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)
print("Model Loaded")

Model Loading . . . . . . . . . . . . . . . .
Model Loaded


In [4]:
def predict(model: nn.Module, dataloader: DataLoader, tokenizer: AutoTokenizer, device: torch.device,
            target_lang: str = "spa_Latn", max_length: int = 256) -> List[Tensor]:
    """Run generation over every batch of ``dataloader`` and collect the outputs.

    Temporary, notebook-local implementation: defining it here shadows the
    ``predict`` imported from ``utils`` at the top of the notebook.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        dataloader: Iterable of batches; ``batch[0]`` is used as the input ids
            and ``batch[1]`` as the attention mask.
        tokenizer: Only used to turn ``target_lang`` into the token id that is
            forced as the first generated token.
        device: Device the batch tensors are moved to before generation.
        target_lang: Language-code token forced at the start of generation.
        max_length: ``max_length`` passed to ``model.generate``.

    Returns:
        One entry per batch, each being whatever ``model.generate`` returned
        for that batch (generated token ids, not decoded text). Use
        ``tokenizer.batch_decode`` to obtain strings.
    """
    model.eval()

    # Resolve the forced BOS id once instead of on every batch.
    # ``convert_tokens_to_ids`` replaces the deprecated ``lang_code_to_id``
    # mapping, which newer transformers releases no longer provide.
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(target_lang)

    preds = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                forced_bos_token_id=forced_bos_token_id,
            )

            # TODO: only keep non-padded tokens (not implemented yet).
            preds.append(generated_ids)

    return preds

In [5]:
class LanguageDataset:
    """Map-style dataset that tokenizes source (and optionally target) sentences.

    Defining this class in the notebook shadows the ``LanguageDataset``
    imported from the ``LanguageDataset`` module at the top of the notebook.

    Args:
        raw_data: Must contain ``'src_text'`` (list of source sentences) and
            may contain ``'target_text'`` (list of target sentences).
        src_tokenizer: Object with ``encode(text, max_length=...)`` returning a
            sequence of token ids for a source sentence.
        target_tokenizer: Same contract as ``src_tokenizer`` for target
            sentences; required when ``raw_data`` has ``'target_text'``.

    Raises:
        ValueError: If ``raw_data`` has ``'target_text'`` but no
            ``target_tokenizer`` was supplied.
    """

    def __init__(self, raw_data: Dict[str, List[str]], src_tokenizer: "AutoTokenizer",
                 target_tokenizer: Optional["AutoTokenizer"] = None):
        self.src_tokenizer = src_tokenizer
        self.src_text = raw_data['src_text']
        # Always define the target attributes so they exist in test mode too.
        self.target_tokenizer = None
        self.target_text = []
        self.with_target = False

        if 'target_text' in raw_data:
            if not target_tokenizer:
                raise ValueError("Expect tokenizer for target language: Got None")

            self.with_target = True
            self.target_tokenizer = target_tokenizer
            self.target_text = raw_data['target_text']

    def __len__(self):
        """Number of source sentences."""
        return len(self.src_text)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx``.

        Returns:
            ``(input_ids, attention_mask)`` when no targets were given, or
            ``(input_ids, attention_mask, target_ids)`` otherwise. The ids are
            ``LongTensor``s and the mask is a float tensor.
        """
        input_ids = torch.LongTensor(self.src_tokenizer.encode(self.src_text[idx], max_length=256))

        # Mask out padding when the tokenizer tells us its pad id; otherwise
        # attend to every position (what the previous always-true comparison
        # effectively did).
        pad_id = getattr(self.src_tokenizer, "pad_token_id", None)
        if pad_id is None:
            attention_mask = torch.ones_like(input_ids, dtype=torch.float)
        else:
            attention_mask = (input_ids != pad_id).float()

        if not self.with_target:
            # for testing
            return input_ids, attention_mask

        # for training and validation
        target_ids = torch.LongTensor(self.target_tokenizer.encode(self.target_text[idx], max_length=256))
        return input_ids, attention_mask, target_ids

In [6]:
# Load the Aymara dev split and keep only its first 50 source sentences.
aymara_dev_raw = load_raw_data(f"{aymara_folder}dev.aym")
aymara_dev_raw['src_text'] = aymara_dev_raw['src_text'][:50]

# Project tokenizer wrapper around the NLLB tokenizer, with Aymara as the
# source language.
tokenizer = Tokenizer(
    AutoTokenizer.from_pretrained(model_name, src_lang="ayr_Latn")
)

# Source-only dataset (no target text), served two examples per batch.
aymara_dev_data = LanguageDataset(aymara_dev_raw, tokenizer)
aymara_dev_dataloader = DataLoader(aymara_dev_data, batch_size=2)


In [7]:
# Generate predictions for the dev batches; the tokenizer passed here is a
# freshly loaded Hugging Face tokenizer, not the project wrapper built earlier.
preds = predict(
    model=model,
    dataloader=aymara_dev_dataloader,
    tokenizer=AutoTokenizer.from_pretrained(model_name, src_lang="ayr_Latn"),
    device=device,
)


  0%|          | 0/25 [00:02<?, ?it/s]


RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

In [None]:
# `predict` (defined in this notebook) returns one tensor of generated token
# ids per batch, so the ids have to be decoded to text before being written;
# joining the tensors directly as strings raises a TypeError.
output_tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="ayr_Latn")

# Explicit UTF-8 so non-ASCII characters are written the same way on every
# platform.
with open(aymara_folder + "pretrain_result.txt", "w", encoding="utf-8") as f:
    for batch_ids in preds:
        # One decoded sentence per line, with special tokens removed.
        for sentence in output_tokenizer.batch_decode(batch_ids, skip_special_tokens=True):
            f.write(sentence + "\n")