<a href="https://colab.research.google.com/github/Kanakanajm/nnti/blob/main/NNTIProject%20/notebooks/task3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install libraries
!pip install datasets torch transformers[torch]



In [2]:
# libs
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from torch.cuda import empty_cache as cuda_empty_cache, mem_get_info
from gc import collect as garbage_collect
# consts
MODEL_NAME = "facebook/xglm-564M"
CACHE_DIR_DATASETS = "cache/datasets"
CACHE_DIR_TOKENIZERS = "cache/tokenizers"
CACHE_DIR_MODELS = "cache/models"

In [3]:
# tokenizer init
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR_TOKENIZERS)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# helper funcs
def clean():
    # release memory
    garbage_collect()
    cuda_empty_cache()

    mem_info = mem_get_info()
    print(f"Freeing GPU Memory\nFree: %d MB\tTotal: %d MB" % (mem_info[0] // 1024**2, mem_info[1] // 1024**2))

# set padding token to -100 in labels
def to_label_id(id):
    if (id == tokenizer.pad_token_id):
        return -100
    return id

# preprocess sentence into length 16 token chunks (w/padding)
def preprocess(batch):
    result = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=16
        # return_overflowing_tokens=True,
    )
    result['labels'] = list(map(to_label_id, result['input_ids']))

    return result

In [5]:
def postprocess(dataset):
    return dataset.remove_columns('text').with_format('torch')

def load_task3_datasets():
    train_dataset = load_dataset("Llamacha/monolingual-quechua-iic", split="train", cache_dir=CACHE_DIR_DATASETS)
    test_dataset = load_dataset("facebook/flores", "quy_Latn", split="devtest", cache_dir=CACHE_DIR_DATASETS).remove_columns(['id', 'URL', 'domain', 'topic', 'has_image', 'has_hyperlink']).rename_column("sentence", "text")

    # try a smaller dataset
    train_dataset = train_dataset.select(range(8192))
    # test_dataset = test_dataset.select(range(128))

    # tokenize
    # no dynamic padding
    tokenized_train_dataset = postprocess(train_dataset.map(preprocess, batched=True))
    tokenized_test_dataset = postprocess(test_dataset.map(preprocess, batched=True))
    return tokenized_train_dataset, tokenized_test_dataset


In [6]:
#from torch.utils.data import DataLoader

#train_dataloader = DataLoader(tokenized_dataset['train'], batch_size=16, shuffle=False)
#test_dataloader = DataLoader(tokenized_dataset['test'], batch_size=16, shuffle=False)


In [7]:
# consts for training
DEFAULT_TRAIN_ARGS = TrainingArguments(
    output_dir="fine-tuned-xglm-564M",
    evaluation_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
)

TRAIN_DATASET, TEST_DATASET = load_task3_datasets()

def get_default_trainer(model):
    return Trainer(
        model=model,
        args=DEFAULT_TRAIN_ARGS,
        train_dataset=TRAIN_DATASET,
        eval_dataset=TEST_DATASET,
    )
def get_default_model():
    return AutoModelForCausalLM.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR_MODELS).to("cuda")

Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Map:   0%|          | 0/8192 [00:00<?, ? examples/s]

# Adaptation

In [8]:
def train_adp_model():
    # adaptation (full fine tune)
    model = get_default_model()
    get_default_trainer(model).train()


In [9]:
clean()
train_adp_model()

Freeing GPU Memory
Free: 40090 MB	Total: 40513 MB


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,3.0749,5.70961
2,2.5992,5.705432
3,2.4787,5.699472


# BitFit

In [10]:
def train_bitfit_model():
    model = get_default_model()
    # bias term fine tuning
    for name, param in model.named_parameters():
        if 'bias' not in name:
            param.requires_grad = False

    get_default_trainer(model).train()


In [11]:
clean()
train_bitfit_model()

Freeing GPU Memory
Free: 39850 MB	Total: 40513 MB


Epoch,Training Loss,Validation Loss
1,9.8344,7.564484
2,8.8226,7.3314
3,8.5878,7.283455


# LoRA

In [12]:
from functools import partial
from lora import LinearWithLoRA

In [13]:
assign_lora = partial(LinearWithLoRA, r=8, a=16)
def train_lora_model():
    model = get_default_model()
    # low-rank adaptation
    for param in model.parameters():
        param.requires_grad = False

    for layer in model.model.layers:
        # query
        layer.self_attn.q_proj = assign_lora(layer.self_attn.q_proj)
        # value
        layer.self_attn.v_proj = assign_lora(layer.self_attn.v_proj)

    get_default_trainer(model).train()


In [14]:
clean()
train_lora_model()

Freeing GPU Memory
Free: 39850 MB	Total: 40513 MB


Epoch,Training Loss,Validation Loss
1,7.6718,6.753075
2,7.3438,6.660978
3,7.2464,6.569739
