In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

In [1]:
import torch
import bitsandbytes
import peft
import accelerate
import transformers
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, TrainerCallback

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


Download Model - Bits & Bytes

In [5]:

model_id = "bert-large-uncased"
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     load_in_8bit= False,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
#     device_map = "auto"
# )

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, 
#quantization_config=bnb_config, 
#device_map={"":0}
)

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


Prepare Model Training

In [6]:
# Prepare the model for training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [7]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print_trainable_parameters(model)

trainable params: 0 || all params: 335174458 || trainable%: 0.0


Lora - PEFT

In [8]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, # Attention head
    lora_alpha=32, # alpha scaling
    #target_modules=["q", "k", "v", "o"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 786432 || all params: 335960890 || trainable%: 0.23408439000146714


PREPARE THE DATA

In [10]:
# DATA PROCESSING
# the prefix that we'll be using
prefix = 'summarize: '

# tokenize the dataset
def encode_batch(examples):
    # the name of the input column
    text_column = 'document'
    # the name of the target column
    summary_column = 'summary'
    # used to format the tokens
    padding = "max_length"

    # convert to lists of strings
    inputs, targets = [], []
    for i in range(len(examples[text_column])):
        if examples[text_column][i] and examples[summary_column][i]:
            inputs.append(examples[text_column][i])
            targets.append(examples[summary_column][i])

    # add prefix to inputs
    inputs = [prefix + inp for inp in inputs]

    # finally we can tokenize the inputs and targets
    model_inputs = tokenizer(inputs, max_length=512, padding=padding, truncation=True)
    labels = tokenizer(targets, max_length=512, padding=padding, truncation=True)

    # rename to labels for training
    model_inputs["labels"] = labels["input_ids"]
    

    return model_inputs

In [11]:
# load the dataset
def load_split(split_name, max_items):
    # load the split
    dataset = load_dataset("xsum")[split_name]
    # only use the first max_items items
    dataset = dataset.filter(lambda _, idx: idx < max_items, with_indices=True)
    # tokenize the dataset
    dataset = dataset.map(
        encode_batch,
        #batched=True,
        remove_columns=dataset.column_names,
        desc="Running tokenizer on " + split_name + " dataset",
    )
    # set the format to torch
    dataset.set_format(type="torch", columns=["input_ids", "labels"])

    return dataset

Training Enviroment - With Processed Data

In [12]:
# small batch size to fit in memory
batch_size = 1

training_args = TrainingArguments(
    learning_rate=3e-4,
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    warmup_steps=100, 
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    remove_unused_columns=False
)

# create the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    # load the dataset
    train_dataset=load_split("train", 1000),
    eval_dataset=load_split("test", 100),
)

# Train
trainer.train()

Filter:   0%|          | 0/204045 [00:00<?, ? examples/s]

Running tokenizer on train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11334 [00:00<?, ? examples/s]

Running tokenizer on test dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

  0%|          | 0/250 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 

Training Enviroment2 - Data Colletor (Padding on the go)

In [28]:
trainer = Trainer(
    model = model, 
    tokenizer=tokenizer,
    train_dataset= load_split("train", 1000),
        
    args=TrainingArguments(
        per_device_train_batch_size=1, 
        gradient_accumulation_steps=4,
        warmup_steps=1, 
       # max_steps=200, 
        learning_rate=3e-4, 
        #fp16=True,
        logging_steps=2, 
        output_dir='outputs',
        remove_unused_columns=False
    ),
    
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Using the latest cached version of the module from C:\Users\jmanu\.cache\huggingface\modules\datasets_modules\datasets\xsum\082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71 (last modified on Fri Dec  8 08:16:48 2023) since it couldn't be found locally at xsum., or remotely on the Hugging Face Hub.


  0%|          | 0/750 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
trainer.evaluate()

Save The Training

In [None]:
# login to upload the model
# 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
from huggingface_hub import login
login()

from huggingface_hub import HfApi
import torch
api = HfApi()

torch.save(model.state_dict(), 'pytorch_model.bin')

api.upload_file(
    path_or_fileobj="pytorch_model.bin",
    path_in_repo="pytorch_model.bin",
    # replace with your own username in order to upload
    repo_id="ManuelAlv/test",
    repo_type="model",
)

Load Full Model

In [None]:
# Load the model
peft_model = "ManuelAlv/test"

config = PeftConfig.from_pretrained(peft_model)

model_original = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, 
                                              quantization_config=bnb_config, 
                                              device_map={"":0})

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

model_new_full2 = PeftModel.from_pretrained(model_original, peft_model)

Test The Model 

In [None]:
num_validation = 10

validation_dataset = load_split('validation', num_validation)

for i in range(num_validation):
    # load the input and label
    input_ids = validation_dataset[i]['input_ids'].unsqueeze(0).to(0)
    label_ids = validation_dataset[i]['labels'].unsqueeze(0).to(0)
    # use the model to generate the output
    output = model.generate(input_ids, max_length=1024)
    # convert the tokens to text
    input_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    label_text = tokenizer.decode(label_ids[0], skip_special_tokens=True)

    print('Input:', input_text)
    print('Output:', output_text)
    print('Label:', label_text)
    print('---')