In [1]:
import os
import torch
import deepspeed
import jsonlines

from typing import List, Optional, Tuple, Union

import sys
sys.path.append("/lid/home/saydalie/multimodal_cot")
sys.path.append("/lid/home/saydalie/multimodal_cot/anole")

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torch.nn import CrossEntropyLoss

from transformers import ChameleonForCausalLM, ChameleonProcessor, Trainer, TrainingArguments
from transformers.modeling_outputs import CausalLMOutputWithPast

from transformers.models.auto.modeling_auto import (
    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
)

from training.constants_training import (
    ANOLE_PATH_HF,
    ANOLE_PATH_HF_TRAINED,
    DATASET_TOKENIZED_DIR
)

[2025-03-04 10:26:03,699] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  from .autonotebook import tqdm as notebook_tqdm
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




In [2]:
# Define the dataset class
class TokenizedDataset(Dataset):
    """In-memory dataset of pre-tokenized sequences read from a JSON Lines file.

    Every selected record must provide an ``input_text_ids`` entry (a list of
    token ids); it is stored as one 1-D ``torch.long`` tensor.

    Args:
        filepath: Path of the JSON Lines file to read.
        start_id: Zero-based index of the first record to keep.
        max_data_size: Maximum number of records to keep, counted from
            ``start_id``. Pass ``None`` to keep every record from
            ``start_id`` to the end of the file.
    """

    def __init__(self, filepath, start_id=0, max_data_size=1000):
        # Exclusive upper bound on the record index; None means "no cap".
        stop_id = None if max_data_size is None else start_id + max_data_size

        self.tokenized_data = []
        with jsonlines.open(filepath) as reader:
            for idx, obj in enumerate(reader):
                if idx < start_id:
                    continue
                if stop_id is not None and idx >= stop_id:
                    # Everything wanted has been collected; skip the rest of the file.
                    break

                self.tokenized_data.append(torch.tensor(obj['input_text_ids'], dtype=torch.long))

    def __len__(self):
        """Return the number of loaded sequences."""
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        """Return the token-id tensor stored at position ``idx``."""
        return self.tokenized_data[idx]

# Define custom collate function for DataLoader
def collate_fn(batch, pad_token_id=0):
    """Left-pad a list of 1-D token-id tensors into one training batch.

    Args:
        batch: List of 1-D ``torch.long`` tensors of token ids.
        pad_token_id: Id written into the padded positions of ``input_ids``.
            Those positions carry ``attention_mask == 0`` and ``labels == -100``,
            so any valid (non-negative, in-vocabulary) id can be used.

    Returns:
        dict with three tensors of shape ``(len(batch), longest_sequence)``:
        ``input_ids`` (padded with ``pad_token_id``), ``attention_mask``
        (1 for real tokens, 0 for padding) and ``labels`` (padded with -100).
    """
    ignore_index = -100

    # pad_sequence pads on the right only: reverse each sequence, pad, then
    # reverse the padded batch back so the padding ends up on the left.
    batch_flipped = [item.flip(0) for item in batch]
    labels = pad_sequence(batch_flipped, batch_first=True, padding_value=ignore_index).flip(1)

    is_real_token = labels != ignore_index
    attention_masks = is_real_token.long()

    # -100 is only meaningful as a label value; as an input id it would be an
    # out-of-range embedding index, so inputs get a real token id instead.
    input_ids = labels.masked_fill(~is_real_token, pad_token_id)

    return {'input_ids': input_ids, 'attention_mask': attention_masks, 'labels': labels}

In [3]:
# Build the training dataset from the tokenized JSONL file of the chosen pattern
pattern_name = 'color_grid'
dataset_tokenized_path = os.path.join(DATASET_TOKENIZED_DIR, pattern_name + ".jsonl")
dataset = TokenizedDataset(filepath=dataset_tokenized_path)

In [4]:
# # Prepare eval dataset
# eval_dataset = TokenizedDataset(dataset_tokenized_path, start_id=1100, max_data_size=3)

# processor = ChameleonProcessor.from_pretrained(ANOLE_PATH_HF)
# pattern_ids = torch.tensor(processor.tokenizer.encode('# Pattern\n')[1:])

# eval_input_ids = []
# for ds in eval_dataset:
#     # Drop all ids after `pattern_ids`
#     for i in range(len(ds) - len(pattern_ids) + 1):
#         if torch.equal(ds[i:i+len(pattern_ids)], pattern_ids):
#             eval_input_ids.append({'input_ids': ds[:i + len(pattern_ids)]})
#             break

In [5]:
# Sanity check: print the shape of the first example only, then stop iterating.
# `inputs` stays bound to that first example after the loop ends.
for inputs in dataset:
    print(inputs.shape)
    break

torch.Size([5620])


In [6]:
# !pip install peft==0.12.0

In [7]:
class ChameleonForCausalLMCustom(ChameleonForCausalLM):
    """``ChameleonForCausalLM`` extended with a standalone next-token loss helper."""

    # NOTE(review): nothing visible in this notebook calls `compute_loss`
    # directly; confirm that the parent class's forward pass invokes it,
    # otherwise this method has no effect on training.
    def compute_loss(
            self,
            logits: torch.Tensor,
            labels: torch.LongTensor
        ) -> torch.Tensor:
        """Compute the shifted next-token cross-entropy loss.

        Args:
            logits: Floating-point prediction scores whose last dimension has
                size ``self.config.vocab_size`` and whose second-to-last
                dimension is the sequence position.
            labels: Integer target token ids, one per sequence position.
                Positions equal to -100 (the default ``ignore_index`` of
                ``CrossEntropyLoss``) do not contribute to the loss.

        Returns:
            Scalar tensor holding the mean loss over the non-ignored targets.
        """
        # # Disallow image tokens (other than the special begin-image and end-image tokens): useful when training on text-only data
        # # Why use this: https://chatgpt.com/share/67c6af02-6884-8012-b688-2a85e09b5488
        # image_tokens = self.model.vocabulary_mapping.image_tokens
        # logits[:, :, image_tokens] = torch.finfo(logits.dtype).min

        # Shift so that tokens < n predict n
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        # Flatten the tokens
        loss_fct = CrossEntropyLoss()
        shift_logits = shift_logits.view(-1, self.config.vocab_size)
        shift_labels = shift_labels.view(-1)
        # Enable model parallelism: targets must live on the same device as the logits
        shift_labels = shift_labels.to(shift_logits.device)
        loss = loss_fct(shift_logits, shift_labels)

        return loss

In [8]:
from peft import LoraConfig, get_peft_model

# Load the base checkpoint with bfloat16 weights
model = ChameleonForCausalLMCustom.from_pretrained(ANOLE_PATH_HF, torch_dtype=torch.bfloat16)

# model.config.max_position_embeddings = 6144

# LoRA adapters (rank 8, alpha 16, dropout 0.1) on the modules named q_proj / v_proj
config = LoraConfig(
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
)

# Wrap the base model with the adapters and report how many parameters will train
model = get_peft_model(model, config)
model.print_trainable_parameters()

Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:08<00:00,  2.90s/it]

trainable params: 4,194,304 || all params: 7,046,776,832 || trainable%: 0.0595





In [8]:
# # play with the loss
# model.to('cuda')

# input_ids = collate_fn(inputs.unsqueeze(0))
# input_ids['input_ids'] = input_ids['input_ids'].to(model.device)
# input_ids['attention_mask'] = input_ids['attention_mask'].to(model.device)
# input_ids['labels'] = input_ids['labels'].to(model.device)

# with torch.no_grad():
#     output = model(**input_ids)
# print(output)

In [9]:
# # `Anole` trains only the lm_head
# model.lm_head.parameters()

In [7]:
# DeepSpeed settings as a plain dict.
# NOTE(review): the "auto" entries are presumably placeholders resolved from the
# TrainingArguments by the Trainer's DeepSpeed integration -- confirm.
ds_config = {
    # Batch geometry
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",

    # Optimizer
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "weight_decay": "auto",
            "torch_adam": True,
            "adam_w_mode": True,
        },
    },

    # ZeRO stage 2
    "zero_optimization": {
        "stage": 2,
        "contiguous_gradients": True,
        "overlap_comm": True,
        "reduce_scatter": True,
        "reduce_bucket_size": "auto",
        "allgather_partitions": True,
        "allgather_bucket_size": 2e8,
    },

    # Reporting
    "steps_per_print": 2000,
    "wall_clock_breakdown": False,
}

In [8]:
# Define training arguments
training_args = TrainingArguments(
    # Output and reporting
    output_dir=ANOLE_PATH_HF_TRAINED,
    report_to='none',
    # Optimisation schedule
    num_train_epochs=1,
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # Mixed precision: bf16 on, fp16 off
    bf16=True,
    fp16=False,
    # Logging; NOTE(review): a value below 1 is presumably read as a fraction
    # of the total number of training steps -- confirm.
    logging_strategy="steps",
    logging_steps=0.001,
    # torch_empty_cache_steps=1,
    # save_steps=3000,
)
# NOTE(review): `deepspeed` is assigned after the TrainingArguments object has
# been constructed, so any DeepSpeed setup performed during construction has
# not seen this config. Confirm DeepSpeed is really active; if it is not,
# consider passing `deepspeed=ds_config` to the constructor instead.
training_args.deepspeed = ds_config

In [9]:
# Initialize the Trainer with custom collate_fn
# Wire the LoRA-wrapped model, the arguments, the dataset and the custom collator together
trainer = Trainer(
    data_collator=collate_fn,
    train_dataset=dataset,
    args=training_args,
    model=model,
)

In [13]:
# Train the model. The call is the last expression of the cell, so its return
# value is displayed as the cell output when training finishes.
trainer.train()

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Step,Training Loss
3,1.9185
6,1.8115
9,1.6541
12,1.4418
15,1.244
18,1.0177
21,0.7477
24,0.5794
27,0.5107
30,0.4687


KeyboardInterrupt: 