<a href="https://colab.research.google.com/github/Kanakanajm/nnti/blob/main/nnti/NNTIProject%20/notebooks/task3-full.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install libraries
!pip install datasets torch transformers[torch] # wandb


In [2]:
# !wandb login

In [3]:
# libs
import os
# import wandb
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from torch.cuda import empty_cache as cuda_empty_cache, mem_get_info
from gc import collect as garbage_collect

# consts: checkpoint to fine-tune and the local cache locations
MODEL_NAME = "facebook/xglm-564M"
CACHE_DIR_DATASETS, CACHE_DIR_TOKENIZERS, CACHE_DIR_MODELS = (
    "cache/datasets",
    "cache/tokenizers",
    "cache/models",
)

# env vars: Weights & Biases settings, applied in one go
os.environ.update(
    {
        # wandb project where this run will be logged
        "WANDB_PROJECT": "xglm-full",
        # "false": model checkpoint logging to wandb is switched off
        "WANDB_LOG_MODEL": "false",
        # turn off watch to log faster
        "WANDB_WATCH": "false",
    }
)

In [None]:
# tokenizer init: load the tokenizer for MODEL_NAME, caching its files
# under CACHE_DIR_TOKENIZERS
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    cache_dir=CACHE_DIR_TOKENIZERS,
)

In [None]:
# helper funcs
def compute_metrics(eval_pred):
    """Compute next-token accuracy for a causal language model.

    A causal LM's logit at position ``t`` scores the token at position
    ``t + 1``, so predictions are shifted by one against the labels before
    comparison. Positions whose label is ``-100`` (the masked padding value
    used in this notebook) are excluded from the average.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is indexed as
            ``(..., seq, vocab)`` and ``labels`` as ``(..., seq)``. If
            ``logits`` is a tuple, its first element is used.

    Returns:
        dict: ``{"accuracy": float}``; ``0.0`` when no label is scorable.
    """
    logits, labels = eval_pred
    # Some models hand back extra outputs alongside the logits.
    if isinstance(logits, tuple):
        logits = logits[0]

    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Align "prediction made at t" with "token observed at t + 1".
    shifted_predictions = predictions[..., :-1]
    shifted_labels = labels[..., 1:]

    # Ignore masked (padding) positions instead of counting them as errors.
    valid = shifted_labels != -100
    n_valid = int(valid.sum())
    if n_valid == 0:
        return {"accuracy": 0.0}

    n_correct = int(((shifted_predictions == shifted_labels) & valid).sum())
    return {"accuracy": n_correct / n_valid}

def clean():
    """Run the garbage collector, empty the CUDA cache, and print GPU memory.

    The printed figures are the two values returned by ``mem_get_info()``,
    integer-divided by 1024**2.
    """
    # release memory
    garbage_collect()
    cuda_empty_cache()

    free_bytes, total_bytes = mem_get_info()
    bytes_per_mb = 1024**2
    print(f"Freeing GPU Memory\nFree: %d MB\tTotal: %d MB" % (free_bytes // bytes_per_mb, total_bytes // bytes_per_mb))

# map the tokenizer's padding token id to -100 for use in labels;
# every other id is returned unchanged
def to_label_id(id):
    """Return -100 if ``id`` equals ``tokenizer.pad_token_id``, else ``id``."""
    return -100 if id == tokenizer.pad_token_id else id

# tokenize text into fixed-length sequences of 16 tokens (truncated / padded)
def preprocess(batch):
    """Tokenize ``batch["text"]`` and build labels with padding masked out.

    Works for both a single example (``input_ids`` is a flat list of ids) and
    a batched call from ``Dataset.map(..., batched=True)`` (``input_ids`` is a
    list of id lists). In the batched case each token must be mapped
    individually: applying ``to_label_id`` to a whole sequence compares a list
    with the pad id, never matches, and leaves padding unmasked.

    Args:
        batch: Mapping with a ``"text"`` entry (one string or a list of strings).

    Returns:
        The tokenizer output with an added ``"labels"`` entry that mirrors
        ``"input_ids"`` except that padding token ids are replaced by -100.
    """
    result = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=16
    )

    input_ids = result['input_ids']
    if len(input_ids) > 0 and isinstance(input_ids[0], (list, tuple)):
        # batched: one list of token ids per example
        result['labels'] = [[to_label_id(token_id) for token_id in sequence] for sequence in input_ids]
    else:
        # single example: flat list of token ids
        result['labels'] = [to_label_id(token_id) for token_id in input_ids]

    return result

In [None]:
def postprocess(dataset):
    """Remove the ``text`` column, then request the ``torch`` output format.

    Returns whatever ``with_format('torch')`` returns on the column-stripped
    dataset.
    """
    without_text = dataset.remove_columns('text')
    return without_text.with_format('torch')

def load_task3_datasets(train_size=16384):
    """Load and tokenize the train and test datasets.

    Train data is the ``train`` split of ``Llamacha/monolingual-quechua-iic``.
    Test data is the ``devtest`` split of ``facebook/flores`` (``quy_Latn``),
    with its metadata columns removed and ``sentence`` renamed to ``text`` so
    both datasets share the column that ``preprocess`` reads.

    Args:
        train_size: Number of leading training rows to keep. Clamped to the
            dataset length so a smaller dataset does not raise. ``None`` keeps
            the full training split. Defaults to 16384.

    Returns:
        tuple: ``(tokenized_train_dataset, tokenized_test_dataset)``, each with
        the ``text`` column removed and the torch output format set.
    """
    train_dataset = load_dataset(
        "Llamacha/monolingual-quechua-iic",
        split="train",
        cache_dir=CACHE_DIR_DATASETS,
    )
    test_dataset = (
        load_dataset("facebook/flores", "quy_Latn", split="devtest", cache_dir=CACHE_DIR_DATASETS)
        .remove_columns(['id', 'URL', 'domain', 'topic', 'has_image', 'has_hyperlink'])
        .rename_column("sentence", "text")
    )

    # try a smaller dataset (never ask for more rows than exist)
    if train_size is not None:
        train_dataset = train_dataset.select(range(min(train_size, len(train_dataset))))

    # tokenize
    # no dynamic padding: preprocess pads every example to a fixed length
    tokenized_train_dataset = postprocess(train_dataset.map(preprocess, batched=True))
    tokenized_test_dataset = postprocess(test_dataset.map(preprocess, batched=True))

    return tokenized_train_dataset, tokenized_test_dataset


In [None]:
#from torch.utils.data import DataLoader

#train_dataloader = DataLoader(tokenized_dataset['train'], batch_size=16, shuffle=False)
#test_dataloader = DataLoader(tokenized_dataset['test'], batch_size=16, shuffle=False)


In [None]:
# consts for training
DEFAULT_TRAIN_ARGS = TrainingArguments(
    # output / checkpointing
    output_dir='models',
    save_total_limit=2,
    push_to_hub=False,
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    # evaluation
    evaluation_strategy="epoch",
    per_device_eval_batch_size=8,
    # optional: add report_to="wandb" here to log the run to Weights & Biases
)

# tokenized train / test datasets shared by the trainers built in this notebook
TRAIN_DATASET, TEST_DATASET = load_task3_datasets()

def get_default_trainer(model):
    """Build a ``Trainer`` for ``model`` from the notebook-level defaults.

    Uses ``DEFAULT_TRAIN_ARGS`` as arguments, ``TRAIN_DATASET`` for training
    and ``TEST_DATASET`` for evaluation. No ``compute_metrics`` callback is
    passed.
    """
    trainer_kwargs = dict(
        model=model,
        args=DEFAULT_TRAIN_ARGS,
        train_dataset=TRAIN_DATASET,
        eval_dataset=TEST_DATASET,
    )
    return Trainer(**trainer_kwargs)
def get_default_model():
    """Load the pretrained ``MODEL_NAME`` causal LM and move it to ``"cuda"``.

    Model files are cached under ``CACHE_DIR_MODELS``.
    """
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR_MODELS)
    return model.to("cuda")

# Full fine-tuning

In [None]:
def train_model():
    """Load the default model and train it with the default trainer."""
    # adaptation (full fine tune)
    trainer = get_default_trainer(get_default_model())
    # pass resume_from_checkpoint=True to train() to continue from a checkpoint
    trainer.train()


In [None]:
clean()
train_model()
# `import wandb` is commented out in the imports cell, so an unconditional
# wandb.finish() raises NameError after training completes. Only close the
# wandb run when the module has actually been imported into this notebook.
if "wandb" in globals():
    wandb.finish()