In [14]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/peft.git git+https://github.com/huggingface/transformers.git

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
import torch
torch.cuda.is_available()

True

In [16]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-560m", 
    torch_dtype=torch.float16,
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained("bigscience/tokenizer")

In [17]:
print(model)

BloomForCausalLM(
  (transformer): BloomModel(
    (word_embeddings): Embedding(250880, 1024)
    (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (h): ModuleList(
      (0-23): 24 x BloomBlock(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (self_attention): BloomAttention(
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): BloomMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (gelu_impl): BloomGelu()
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (

In [18]:
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

## Paramerter counting

In [19]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

## Obtain LoRA Model

In [20]:
from peft import LoraConfig, get_peft_model 

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 786432 || all params: 560001024 || trainable%: 0.14043402892063284


## Load the datasets

In [21]:
from datasets import load_dataset
qa_dataset = load_dataset("squad_v2")

# **Dataset Mapping for this model**
<hr>

In [22]:
def create_prompt(context, question, answer):
  if len(answer["text"]) < 1:
    answer = "Cannot Find Answer"
  else:
    answer = answer["text"][0]
  prompt_template = f"### CONTEXT\n{context}\n\n### QUESTION\n{question}\n\n### ANSWER\n{answer}</s>"
  return prompt_template

mapped_qa_dataset = qa_dataset.map(lambda samples: tokenizer(create_prompt(samples['context'], samples['question'], samples['answers'])))

# Trian Lora config model

In [23]:
import transformers

trainer = transformers.Trainer(
    model=model, 
    train_dataset=mapped_qa_dataset["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4, 
        gradient_accumulation_steps=4,
        warmup_steps=100,
        max_steps=100,
        learning_rate=1e-3, 
        fp16=True,
        logging_steps=1, 
        output_dir='outputs',
        num_train_epochs = 1.0
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Step,Training Loss
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0
10,0.0


TrainOutput(global_step=100, training_loss=0.0, metrics={'train_runtime': 84.0996, 'train_samples_per_second': 19.025, 'train_steps_per_second': 1.189, 'total_flos': 770736984686592.0, 'train_loss': 0.0, 'epoch': 0.012277470841006752})

In [24]:
HUGGING_FACE_USER_NAME = "alam1n"

In [31]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [32]:
model_name = "squad-bloom-588m"

model.push_to_hub(f"{HUGGING_FACE_USER_NAME}/{model_name}", use_auth_token=True)



adapter_model.safetensors:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/alam1n/squad-bloom-588m/commit/f46dbeab1aea57e00dbe10879af75195c9af35be', commit_message='Upload model', commit_description='', oid='f46dbeab1aea57e00dbe10879af75195c9af35be', pr_url=None, repo_url=RepoUrl('https://huggingface.co/alam1n/squad-bloom-588m', endpoint='https://huggingface.co', repo_type='model', repo_id='alam1n/squad-bloom-588m'), pr_revision=None, pr_num=None)

In [39]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = f"{HUGGING_FACE_USER_NAME}/{model_name}"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=False, device_map='cuda:0')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
qa_model = PeftModel.from_pretrained(model, peft_model_id)

In [42]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the Hugging Face user name and model name
peft_model_id = f"{HUGGING_FACE_USER_NAME}/{model_name}"

# Load the PEFT configuration
config = PeftConfig.from_pretrained(peft_model_id)

# Load the base model and set it to use CUDA
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    load_in_8bit=False,  # Ensure model is not quantized for training or further modifications
    device_map="cuda"    # Use CUDA devices for the model
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the LoRA model on top of the base model
qa_model = PeftModel.from_pretrained(model, peft_model_id)

# Ensure the LoRA model is also set to use CUDA
qa_model = qa_model.to("cuda")

# Optional: Verify device placement



In [43]:
print(f"Model is loaded on device: {next(qa_model.parameters()).device}")

In [46]:
from IPython.display import display, Markdown

def make_inference(context, question):
    # Tokenize input and move to the same device as the model
    batch = tokenizer(
        f"### CONTEXT\n{context}\n\n### QUESTION\n{question}\n\n### ANSWER\n", 
        return_tensors='pt'
    ).to(qa_model.device)  # Move input tensors to the same device as the model

    with torch.cuda.amp.autocast():
        output_tokens = qa_model.generate(**batch, max_new_tokens=200)

    # Decode the output tokens and display the result
    display(Markdown(tokenizer.decode(output_tokens[0], skip_special_tokens=True)))

context = "Cheese is the best food."
question = "What is the best food?"

make_inference(context, question)


### CONTEXT
Cheese is the best food.

### QUESTION
What is the best food?

### ANSWER
Cheese is the best food. It is a good source of protein, vitamins, minerals, and fiber. It is also a good source of antioxidants, which are important for the health of the body. It is also a good source of fiber, which helps in the digestion of food. It is also a good source of vitamins, which are important for the health of the body. It is also a good source of antioxidants, which are important for the health of the body. It is also a good source of fiber, which helps in the digestion of food. It is also a good source of vitamins, which are important for the health of the body. It is also a good source of antioxidants, which are important for the health of the body. It is also a good source of fiber, which helps in the digestion of food. It is also a good source of vitamins, which are important for the health of the body

In [47]:
context = "Cheese is the best food."
question = "How far away is the Moon from the Earth?"

make_inference(context, question)

### CONTEXT
Cheese is the best food.

### QUESTION
How far away is the Moon from the Earth?

### ANSWER
The Moon is a planet in the constellation of Sagittarius, which is the
same as the Earth. The Moon is a planet in the constellation of Capricorn,
which is the same as the Earth. The Moon is a planet in the constellation of
Libra, which is the same as the Earth. The Moon is a planet in the constellation
of Aquarius, which is the same as the Earth. The Moon is a planet in the
constellation of Aquarius, which is the same as the Earth. The Moon is a planet
in the constellation of Aquarius, which is the same as the Earth. The Moon is
a planet in the constellation of Aquarius, which is the same as the Earth.
The Moon is a planet in the constellation of Aquarius, which is the same as
the Earth. The Moon is a planet in the constellation of Aquarius, which is
the same as the Earth. The Moon

In [48]:
context = "NSU is North south university"
question = "How far away is the Moon from the Earth?"

make_inference(context, question)

### CONTEXT
NSU is North south university

### QUESTION
How far away is the Moon from the Earth?

### ANSWER
The Moon is a planet in the constellation of Sagittarius, which is the same as the Sun. The Sun is the planet in the constellation of Capricorn, which is the same as the Moon. The Sun is the planet in the constellation of Aquarius, which is the same as the Moon. The Sun is the planet in the constellation of Capricorn, which is the same as the Moon. The Sun is the planet in the constellation of Aquarius, which is the same as the Moon. The Sun is the planet in the constellation of Aquarius, which is the same as the Moon. The Sun is the planet in the constellation of Aquarius, which is the same as the Moon. The Sun is the planet in the constellation of Aquarius, which is the same as the Moon. The Sun is the planet in the constellation of Aquarius, which is the same as the Moon. The Sun is the planet in the constellation of