In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    BloomForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model

In [None]:
import logging

# Hosted notebook kernels (e.g. Colab) may attach a handler to the root logger
# before any user code runs. In that case a plain `basicConfig(...)` call is a
# silent no-op and every `logger.info(...)` below is dropped. `force=True`
# (Python 3.8+) replaces any pre-existing root handlers so INFO records show up.
logging.basicConfig(level=logging.INFO, force=True)
logger = logging.getLogger(__name__)

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
logger.info(f"Using device: {device}")

### 3. Load Dataset

We will use the [wikitext-2-raw-v1](https://huggingface.co/datasets/wikitext) dataset for language modeling.

In [None]:
# Hub identifier of the corpus and the specific configuration to load.
dataset_name, dataset_config_name = "wikitext", "wikitext-2-raw-v1"


In [None]:
# Load the splits of the configured dataset and log a summary of them.
raw_datasets = load_dataset(path=dataset_name, name=dataset_config_name)
logger.info(raw_datasets)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [None]:
# Keep the train and validation splits as separate datasets: the first is used
# for fine-tuning, the second only for evaluation.

train_dataset = raw_datasets["train"]
valid_dataset = raw_datasets["validation"]


### 4. Prepare the Tokenizer and Model

We pick a smaller BLOOM model from Hugging Face (bloom-560m) for demonstration.



In [None]:
# Checkpoint shared by the tokenizer and the base model.
model_name = "bigscience/bloom-560m"

# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [None]:
# Only fall back to EOS when the tokenizer ships without a padding token.
# Overwriting unconditionally would discard a dedicated pad token if the
# checkpoint provides one (NOTE(review): BLOOM's tokenizer appears to define
# its own pad token - confirm), and it would make the data collator treat EOS
# as padding when it builds labels.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base (pretrained) language model and move it to the selected device.
# Assigning the result of `.to(device)` (instead of leaving it as the cell's
# last expression) keeps the full module repr out of the cell output.
base_model = BloomForCausalLM.from_pretrained(model_name).to(device)

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

BloomForCausalLM(
  (transformer): BloomModel(
    (word_embeddings): Embedding(250880, 1024)
    (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (h): ModuleList(
      (0-23): 24 x BloomBlock(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (self_attention): BloomAttention(
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): BloomMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (gelu_impl): BloomGelu()
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (

### 5. Define the LoRA Configuration and Wrap the Model with PEFT

`LoraConfig` specifies the scaling, rank, and target modules to fine-tune.

In [None]:
# LoRA hyperparameters for the causal-LM adapter.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",               # kind of task the adapter is built for
    target_modules=["query_key_value"],  # modules that receive LoRA weights
    r=8,                                 # LoRA attention dimension (rank)
    lora_alpha=32,                       # LoRA scaling
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# Wrap the base model with the LoRA adapter, then report how many parameters
# remain trainable.
peft_model = get_peft_model(model=base_model, peft_config=lora_config)
peft_model.print_trainable_parameters()

trainable params: 786,432 || all params: 560,001,024 || trainable%: 0.1404


In [None]:
def tokenize_function(examples):
    """Tokenize a batch of dataset rows.

    Args:
        examples: Batch mapping passed by `Dataset.map(..., batched=True)`;
            its "text" entry is handed to the tokenizer.

    Returns:
        The tokenizer output for the batch, with the attention mask
        explicitly requested.
    """
    return tokenizer(examples["text"], return_attention_mask=True)


# Tokenize both train and validation sets; the raw "text" column is dropped so
# that only the tokenizer outputs remain.
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_valid = valid_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# NOTE: `block_size` is defined once, in the next cell, right next to
# `group_texts`, which is the only code that reads it.


Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [None]:
block_size = 128


def group_texts(examples):
    """Pack a batch of tokenized rows into fixed-length blocks.

    All "input_ids" (and, in parallel, all "attention_mask") sequences of the
    batch are joined end to end, the tail that does not fill a whole block of
    `block_size` tokens is dropped, and the remainder is cut into consecutive
    blocks.

    Args:
        examples: Batch mapping with "input_ids" and "attention_mask" entries,
            each a list of per-row lists.

    Returns:
        Dict with "input_ids" and "attention_mask", each a list of blocks.
    """
    # Pair rows up front so both flattened streams cover exactly the same rows.
    rows = list(zip(examples["input_ids"], examples["attention_mask"]))
    flat_ids = [token for ids, _ in rows for token in ids]
    flat_mask = [flag for _, mask in rows for flag in mask]

    # Largest multiple of block_size that fits into the flattened id stream.
    usable = len(flat_ids) - len(flat_ids) % block_size
    starts = range(0, usable, block_size)

    return {
        "input_ids": [flat_ids[s : s + block_size] for s in starts],
        "attention_mask": [flat_mask[s : s + block_size] for s in starts],
    }


In [None]:
# Re-chunk both tokenized splits into fixed-size language-modeling blocks.
lm_train_dataset, lm_valid_dataset = (
    split.map(group_texts, batched=True)
    for split in (tokenized_train, tokenized_valid)
)

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [None]:
# Causal language modeling, so masked-LM behaviour is switched off (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [None]:
# We use the `Trainer` from Hugging Face Transformers. Adjust hyperparameters as needed.

training_args = TrainingArguments(
    output_dir="./lora-bloom-checkpoints",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy`; confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    learning_rate=1e-4,
    # Mixed precision only makes sense when running on a GPU.
    fp16=(device == "cuda"),
    # The PEFT wrapper hides the base model's `labels` argument, so `Trainer`
    # cannot discover the label column on its own and falls back to an empty
    # list (see the warning it prints). Naming it explicitly keeps the
    # evaluation loss available to `trainer.evaluate()`.
    label_names=["labels"],
    report_to="none",  # or "tensorboard" if you want logging
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=lm_train_dataset,
    eval_dataset=lm_valid_dataset,
    data_collator=data_collator,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Fine-tune the LoRA adapter before measuring anything. Without this call the
# evaluation and the sample generation below run on the adapter exactly as it
# was initialized, i.e. nothing has been trained yet.
trainer.train()

eval_results = trainer.evaluate()
perplexity = torch.exp(torch.tensor(eval_results["eval_loss"])).item()
logger.info(f"Perplexity on validation set: {perplexity}")

# Let's do a quick test generation
peft_model.eval()
prompt = "In this study, we explore"
encoded_prompt = tokenizer(prompt, return_tensors="pt").to(device)
# `inputs` stays a plain tensor of token ids because later cells reuse it.
inputs = encoded_prompt["input_ids"]

with torch.no_grad():
    outputs = peft_model.generate(
        input_ids=inputs,
        # Pass the mask explicitly: generate() cannot infer it reliably when
        # the pad and EOS token ids coincide.
        attention_mask=encoded_prompt["attention_mask"],
        max_length=50,
        num_return_sequences=1,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
logger.info(f"Generated text: {generated_text}")

In [None]:
# Longer, more constrained sample from the already-encoded prompt.
outputs = peft_model.generate(
    input_ids=inputs,
    # `inputs` is a single prompt encoded without padding, so every position is
    # a real token and an all-ones mask is the correct attention mask. Passing
    # it avoids the "attention mask is not set" warning from generate().
    attention_mask=torch.ones_like(inputs),
    max_length=128,          # Increase output length
    min_length=30,           # Force the model to generate at least 30 tokens
    do_sample=True,          # Enable sampling
    top_k=50,                # Sampling parameter
    top_p=0.9,               # Sampling parameter
    temperature=0.8,         # Adjust "creativity"
    no_repeat_ngram_size=2,  # Helps avoid immediate repetition
    pad_token_id=tokenizer.eos_token_id,  # So it doesn't produce an error with missing pad token
    eos_token_id=tokenizer.eos_token_id,
)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [None]:
prompt = (
    "In a faraway kingdom, a young explorer discovered a hidden library. "
    "Inside, they found a mysterious book that spoke of "
)
# Re-encode right away: the generation cells read `inputs`, not `prompt`, so
# changing only the string would leave them running on the previous prompt.
inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)


In [None]:
# Decode the first returned sequence, keeping special tokens in the text so
# that they stay visible in the printout.
generated_text = tokenizer.decode(
    outputs[0],
    skip_special_tokens=False,
)
print("Raw decoding:", generated_text)

Raw decoding: In this study, we explore the possibility of a relationship between the use of different types of drugs (e.g. antipsychotics, anti-epileptic drugs) and the occurrence of schizophrenia. We report a case of an individual with schizophreniform personality disorder (SPPD) who had previously been treated with both antihistamines and antiepileptics. The individual was receiving a combination of these drugs, and one of the drugs was being used as a mood stabilizer (clozapine). It was unclear whether or not the antithrombotic treatment was causing the clinical symptoms observed


In [None]:
# Resolve EOS first: the pad fallback below copies it, so running the two
# checks in the opposite order could leave `pad_token` set to None.
if tokenizer.eos_token is None:
    # Inside this branch `eos_token` is known to be None, so the fallback is
    # simply the unknown token.
    tokenizer.eos_token = tokenizer.unk_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Show the raw token ids of the first generated sequence.
print("Output token IDs:", outputs[0])

Output token IDs: tensor([  1411,   1119,  12589,     15,   1701,  72265,    368,  47665,    461,
           267,  23556,   5299,    368,   2971,    461,   5955,  15610,    461,
         51667,    375,     72,     17,     74,     17,   2130,  13553,  16230,
        217756,     15,  27748,   1026,   1128,     83,    979,  51667,     12,
           530,    368,  92904,    461, 185735, 221780,     17,   5361,   6210,
           267,   4462,    461,    660,  11559,   1002, 185735,   3278,  19544,
        124757,  76234,    375,     54,  18427,     39,     12,   5268,   3866,
         36372,   3784,  42566,   1002,   7378,   2130,   1267,    617, 191964,
           530,   2130,    641,     83,   1128,   1309,   3958,     17,   1387,
         11559,   1620,  54451,    267,  38836,    461,   4657,  51667,     15,
           530,   2592,    461,    368,  51667,   1620,   6610,   4853,    661,
           267,  90908,  32472,  13502,    375,    948,  15449,    483,    989,
          1216,   3162

In [None]:
# Unconstrained sampling: temperature 1.0 with top_k=0 and top_p=1.0.
outputs = peft_model.generate(
    input_ids=inputs,
    # `inputs` is a single unpadded prompt, so an all-ones mask is correct.
    attention_mask=torch.ones_like(inputs),
    max_length=100,
    do_sample=True,
    temperature=1.0,
    top_k=0,  # or a large value to reduce constraints
    top_p=1.0,
    # Same explicit pad id as the other generation cells.
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
# Encode the current prompt to a plain list of token ids and print it.
prompt_ids = tokenizer.encode(prompt)
print("Prompt token IDs:", prompt_ids)

Prompt token IDs: [1411, 267, 64723, 5872, 114432, 15, 267, 20500, 135235, 54419, 267, 40977, 19750, 17, 157007, 15, 3291, 6222, 267, 195661, 12484, 861, 89175, 461, 210]


In [None]:
prompt = "Once upon a time in a hidden forest, an ancient tree whispered secrets of "
encoded_prompt = tokenizer(prompt, return_tensors="pt").to(device)
# Keep `inputs` as the plain tensor of token ids, as in the earlier cells.
inputs = encoded_prompt["input_ids"]

with torch.no_grad():
    outputs = peft_model.generate(
        input_ids=inputs,
        # Explicit mask: generate() cannot infer it reliably when the pad and
        # EOS token ids coincide, which is the case for the ids passed below.
        attention_mask=encoded_prompt["attention_mask"],
        max_length=100,
        min_length=20,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        temperature=0.8,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

token_ids = outputs[0].tolist()
print("Generated token IDs:", token_ids)

# Decode twice: once with special tokens kept, once with them stripped.
generated_text = tokenizer.decode(token_ids, skip_special_tokens=False)
print("Raw generated text:", generated_text)

generated_text_clean = tokenizer.decode(token_ids, skip_special_tokens=True)
print("Clean generated text:", generated_text_clean)

Generated token IDs: [64393, 14591, 267, 3509, 361, 267, 40977, 24140, 15, 660, 59962, 20893, 193512, 376, 80943, 461, 210, 9292, 26676, 36684, 15, 530, 15, 45747, 3776, 111257, 37073, 115528, 15, 718, 8348, 44593, 80943, 1485, 3776, 9016, 6199, 17, 5070, 368, 111257, 861, 1542, 24935, 15, 368, 20893, 1809, 3784, 267, 135513, 17, 7702, 1320, 1152, 1400, 5801, 267, 9999, 361, 718, 15, 1152, 4984, 722, 368, 2592, 427, 14565, 368, 8876, 17, 2]
Raw generated text: Once upon a time in a hidden forest, an ancient tree whispered secrets of erstwhile lives, and, despite its centuries-old reputation, it still holds secrets from its own people. For the centuries that have passed, the tree has been a mystery. But if you can find a secret in it, you could be the one to save the world.</s>
Clean generated text: Once upon a time in a hidden forest, an ancient tree whispered secrets of erstwhile lives, and, despite its centuries-old reputation, it still holds secrets from its own people. For the cent