<a href="https://colab.research.google.com/github/Kanakanajm/nnti/blob/main/NNTIProject/notebooks/task3/ia3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install libraries
!pip install datasets torch transformers[torch] wandb

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting wandb
  Downloading wandb-0.16.4-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [2]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [3]:
# libs
import wandb
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from torch.cuda import empty_cache as cuda_empty_cache, mem_get_info
from gc import collect as garbage_collect
# consts
MODEL_NAME = "facebook/xglm-564M"
CACHE_DIR_DATASETS = "cache/datasets"
CACHE_DIR_TOKENIZERS = "cache/tokenizers"
CACHE_DIR_MODELS = "cache/models"

# env vars
# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"]="xglm-full"

# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="false"

# turn off watch to log faster
os.environ["WANDB_WATCH"]="false"

In [4]:
# tokenizer init
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR_TOKENIZERS)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.92M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.03M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/276 [00:00<?, ?B/s]

In [5]:
# helper funcs
def clean():
    # release memory
    garbage_collect()
    cuda_empty_cache()

    mem_info = mem_get_info()
    print(f"Freeing GPU Memory\nFree: %d MB\tTotal: %d MB" % (mem_info[0] // 1024**2, mem_info[1] // 1024**2))

# set padding token to -100 in labels
def to_label_id(id):
    if (id == tokenizer.pad_token_id):
        return -100
    return id

# preprocess sentence into length 16 token chunks (w/padding)
def preprocess(batch):
    result = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=16
        # return_overflowing_tokens=True,
    )
    result['labels'] = list(map(to_label_id, result['input_ids']))

    return result

In [105]:
def postprocess(dataset):
    return dataset.remove_columns('text').with_format('torch')

def load_task3_datasets():
    train_dataset = load_dataset("Llamacha/monolingual-quechua-iic", split="train", cache_dir=CACHE_DIR_DATASETS)
    test_dataset = load_dataset("facebook/flores", "quy_Latn", split="devtest", cache_dir=CACHE_DIR_DATASETS).remove_columns(['id', 'URL', 'domain', 'topic', 'has_image', 'has_hyperlink']).rename_column("sentence", "text")

    # try a smaller dataset
    train_dataset = train_dataset.select(range(8192))
    # test_dataset = test_dataset.select(range(128))

    # tokenize
    # no dynamic padding
    tokenized_train_dataset = postprocess(train_dataset.map(preprocess, batched=True))
    tokenized_test_dataset = postprocess(test_dataset.map(preprocess, batched=True))
    return tokenized_train_dataset, tokenized_test_dataset


In [106]:
# consts for training
DEFAULT_TRAIN_ARGS = TrainingArguments(
    output_dir="fine-tuned-xglm-564M",
    evaluation_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
)
DEFAULT_TRAIN_ARGS = TrainingArguments(
    output_dir='models',
    report_to="wandb",
    evaluation_strategy="epoch",
    # do_eval=False
    push_to_hub=False,
    # logging_steps=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    # eval_accumulation_steps = 20,
    # evaluation_strategy="steps",
    # eval_steps=20,
    # max_steps = 100,
    # save_steps = 100,
    # save_total_limit = 2,
    save_strategy="no",
    # load_best_model_at_end=True
)

TRAIN_DATASET, TEST_DATASET = load_task3_datasets()

def get_default_trainer(model):
    return Trainer(
        model=model,
        args=DEFAULT_TRAIN_ARGS,
        train_dataset=TRAIN_DATASET,
        eval_dataset=TEST_DATASET,
    )
def get_default_model():
    return AutoModelForCausalLM.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR_MODELS).to("cuda")

Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


# IA3

In [107]:
import torch

class IA3Layer(torch.nn.Module):
    def __init__(self, size):
        super().__init__()
        self.size = size
        self.v = torch.nn.Parameter(torch.randn((1, size)))
    def forward(self, x):
        # print(self.v.expand(self.batch_size, self.sequence_size, self.size).shape)
        # print(x.shape)
        x = self.v.expand(x.shape[0], x.shape[1], self.size) * x
        return x

class IA3AfterLinear(torch.nn.Module):
    def __init__(self, linear):
        super().__init__()
        self.linear = linear
        self.ia3 = IA3Layer(
            linear.out_features
        )

    def forward(self, x):
        return self.ia3(self.linear(x))

class IA3BeforeLinear(torch.nn.Module):
    def __init__(self, linear):
        super().__init__()
        self.linear = linear
        self.ia3 = IA3Layer(
            linear.in_features
        )

    def forward(self, x):
        return self.linear(self.ia3(x))

class ActivationWithIA3(torch.nn.Module):
    def __init__(self, activation, size):
        super().__init__()
        self.activation = activation
        self.ia3 = IA3Layer(size)

    def forward(self, x):
        # print(x.shape)
        return self.ia3(self.activation(x))

In [108]:
def train_ia3_model():
    model = get_default_model()
    # freeze all
    for param in model.parameters():
        param.requires_grad = False
    # adapt
    for layer in model.model.layers:
        layer.self_attn.k_proj = IA3AfterLinear(layer.self_attn.k_proj)
        layer.self_attn.v_proj = IA3AfterLinear(layer.self_attn.v_proj)
        # layer.fc2 = IA3BeforeLinear(layer.fc2)
        # active_fn_in = layer.self_attn.out_proj.out_features
        # layer.activation_fn = ActivationWithIA3(layer.activation_fn, 4096)

    get_default_trainer(model).train()


In [110]:
print(model)

XGLMForCausalLM(
  (model): XGLMModel(
    (embed_tokens): Embedding(256008, 1024, padding_idx=1)
    (embed_positions): XGLMSinusoidalPositionalEmbedding()
    (layers): ModuleList(
      (0-23): 24 x XGLMDecoderLayer(
        (self_attn): XGLMAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (activation_fn): GELUActivation()
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
    (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine

In [109]:
clean()
wandb.init()
train_ia3_model()
wandb.finish()

Freeing GPU Memory
Free: 10523 MB	Total: 15102 MB


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,▆▆▆▆▁▁▁███
eval/runtime,▂▁▁▅▁▂▃▂▃█
eval/samples_per_second,▆██▃█▇▆▇▆▁
eval/steps_per_second,▆██▃█▇▆▇▆▁
train/epoch,▁▂▂▂▅██▂▅██▂▅██
train/global_step,▄██▁▁▂▂▁▁▂▂▁▁▂▂
train/grad_norm,█▁
train/learning_rate,█▁
train/loss,█▁
train/total_flos,█▁█

0,1
eval/loss,38.23228
eval/runtime,7.9942
eval/samples_per_second,126.591
eval/steps_per_second,15.886
train/epoch,3.0
train/global_step,192.0
train/grad_norm,69.39159
train/learning_rate,1e-05
train/loss,37.2187
train/total_flos,44599376609280.0


Epoch,Training Loss,Validation Loss
1,20.1039,14.921561
2,19.4437,14.446134
3,19.2283,14.330435


VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▂▁
eval/runtime,▁▆█
eval/samples_per_second,█▃▁
eval/steps_per_second,█▃▁
train/epoch,▁▂▂▄▅▅▆███
train/global_step,▁▂▂▄▅▅▆███
train/grad_norm,█▂▂▄▁▁
train/learning_rate,█▇▅▄▂▁
train/loss,█▆▄▂▂▁
train/total_flos,▁

0,1
eval/loss,14.33043
eval/runtime,6.6798
eval/samples_per_second,151.501
eval/steps_per_second,19.012
train/epoch,3.0
train/global_step,3072.0
train/grad_norm,3.15255
train/learning_rate,0.0
train/loss,19.2283
train/total_flos,713358097514496.0
