<a href="https://colab.research.google.com/github/Kanakanajm/nnti/blob/main/NNTIProject/notebooks/task3/full.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install libraries
!pip install datasets torch transformers[torch] # wandb




In [None]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
import wandb

In [2]:
# libs
import os

import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from torch.cuda import empty_cache as cuda_empty_cache, mem_get_info
from gc import collect as garbage_collect

# consts
MODEL_NAME = "facebook/xglm-564M"
CACHE_DIR_DATASETS = "cache/datasets"
CACHE_DIR_TOKENIZERS = "cache/tokenizers"
CACHE_DIR_MODELS = "cache/models"

# env vars
# set the wandb project where this run will be logged
# os.environ["WANDB_PROJECT"]="xglm-full"

# save your trained model checkpoint to wandb
# os.environ["WANDB_LOG_MODEL"]="false"

# turn off watch to log faster
# os.environ["WANDB_WATCH"]="false"

In [3]:
# tokenizer init
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR_TOKENIZERS)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# helper funcs
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": np.mean(predictions == labels)}

def clean():
    # release memory
    garbage_collect()
    cuda_empty_cache()

    mem_info = mem_get_info()
    print(f"Freeing GPU Memory\nFree: %d MB\tTotal: %d MB" % (mem_info[0] // 1024**2, mem_info[1] // 1024**2))

# set padding token to -100 in labels
def to_label_id(id):
    if (id == tokenizer.pad_token_id):
        return -100
    return id

# preprocess sentence into length 16 token chunks (w/padding)
def preprocess(batch):
    result = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=16
        # return_overflowing_tokens=True,
    )
    result['labels'] = list(map(to_label_id, result['input_ids']))

    return result

In [7]:
def postprocess(dataset):
    return dataset.remove_columns('text').with_format('torch')

def load_task3_datasets():
    train_dataset = load_dataset("Llamacha/monolingual-quechua-iic", split="train", cache_dir=CACHE_DIR_DATASETS)
    test_dataset = load_dataset("facebook/flores", "quy_Latn", split="devtest", cache_dir=CACHE_DIR_DATASETS).remove_columns(['id', 'URL', 'domain', 'topic', 'has_image', 'has_hyperlink']).rename_column("sentence", "text")

    # shuffle
    train_dataset = train_dataset.shuffle(seed=42)
    # test_dataset = test_dataset.shuffle(seed=42)

    # try a smaller dataset
    train_dataset = train_dataset.select(range(16384))
    # test_dataset = test_dataset.select(range(128))

    # tokenize
    # no dynamic padding
    tokenized_train_dataset = postprocess(train_dataset.map(preprocess, batched=True))
    tokenized_test_dataset = postprocess(test_dataset.map(preprocess, batched=True))

    del train_dataset, test_dataset
    return tokenized_train_dataset, tokenized_test_dataset


In [None]:
#from torch.utils.data import DataLoader

#train_dataloader = DataLoader(tokenized_dataset['train'], batch_size=16, shuffle=False)
#test_dataloader = DataLoader(tokenized_dataset['test'], batch_size=16, shuffle=False)


In [None]:
# consts for training
DEFAULT_TRAIN_ARGS = TrainingArguments(
    output_dir='models',
    # report_to="wandb",
    evaluation_strategy="steps",
    # do_eval=False
    push_to_hub=False,
    # logging_steps=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    # eval_accumulation_steps = 20,
    # evaluation_strategy="steps",
    # eval_steps=20,
    # max_steps = 100,
    # save_steps = 100,
    save_total_limit = 2,
    load_best_model_at_end=True
)

TRAIN_DATASET, TEST_DATASET = load_task3_datasets()

def get_default_trainer(model):
    return Trainer(
        model=model,
        args=DEFAULT_TRAIN_ARGS,
        train_dataset=TRAIN_DATASET,
        eval_dataset=TEST_DATASET,
        # compute_metrics=compute_metrics,
    )
def get_default_model():
    return AutoModelForCausalLM.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR_MODELS).to("cuda")

Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Map:   0%|          | 0/16384 [00:00<?, ? examples/s]

# Full fine tune

In [None]:
def train_model():
    # adaptation (full fine tune)
    model = get_default_model()
    get_default_trainer(model).train(
        # resume_from_checkpoint = True
        )


In [None]:
clean()
wandb.init()
train_model()
wandb.finish()

Freeing GPU Memory
Free: 12771 MB	Total: 15102 MB


Step,Training Loss,Validation Loss
500,5.7887,6.180258
1000,3.0522,5.758701
1500,2.8505,5.698051
2000,2.7701,5.620964
2500,2.5955,5.65639
3000,2.4484,5.591767
3500,2.4696,5.583153
4000,2.4181,5.560064
4500,2.3794,5.537163


In [37]:
# manual clean
clean()

Freeing GPU Memory
Free: 14999 MB	Total: 15102 MB


# Evaluate model on task1 datasets

In [9]:
from torch.utils.data import DataLoader
from torch import inference_mode

LANGS = ["eng_Latn", "spa_Latn", "ita_Latn", "deu_Latn", "arb_Arab", "tel_Telu", "tam_Taml", "quy_Latn", "zho_Hans"]

def load_task1_dataset(lang):
    dataset = load_dataset("facebook/flores", lang, split="devtest", cache_dir=CACHE_DIR_DATASETS).remove_columns(['id', 'URL', 'domain', 'topic', 'has_image', 'has_hyperlink']).rename_column("sentence", "text")

    # shuffle
    # dataset = dataset.shuffle(seed=42)

    # try a smaller dataset
    # dataset = dataset.select(range(128))

    # tokenize
    # no dynamic padding
    tokenized_dataset = postprocess(dataset.map(preprocess, batched=True))

    del dataset
    return tokenized_dataset

def rerun_task1():
    model = AutoModelForCausalLM.from_pretrained('/content/drive/MyDrive/full-tuned-xglm-models/models/checkpoint-5500', cache_dir=CACHE_DIR_MODELS)
    model = model.to("cuda").eval()
    loss =  {l: [] for l in LANGS}
    for lang in LANGS:
        print("Evaluating", lang)
        ds = load_task1_dataset(lang)
        dl = DataLoader(ds, batch_size=8, shuffle=False)
        for _, ds in enumerate(dl):
            with inference_mode():
                outputs = model(
                    ds["input_ids"].to("cuda"),
                    labels=ds["labels"].to("cuda"),
                    attention_mask=ds["attention_mask"].to("cuda"),
                )
                loss[lang].append(outputs.loss.item())
                del outputs
        del ds, dl

    del model
    return loss





In [10]:
task1_loss = rerun_task1()

Evaluating eng_Latn


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Evaluating spa_Latn


Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Evaluating ita_Latn


Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Evaluating deu_Latn


Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Evaluating arb_Arab


Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Evaluating tel_Telu


Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Evaluating tam_Taml


Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Evaluating quy_Latn


Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Evaluating zho_Hans


Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

In [22]:
import json
j = {k: {"devtest": v} for k, v in task1_loss.items()}

with open('losses_full-fine-tuned-xglm.json', 'w') as f:
    json.dump(j, f)

Rerun Task2

In [18]:
clean()

Freeing GPU Memory
Free: 14953 MB	Total: 15102 MB


In [20]:
!pip install h5py matplotlib scikit-learn openTSNE


Collecting openTSNE
  Downloading openTSNE-1.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openTSNE
Successfully installed openTSNE-1.0.1
