In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
! pip install -q wandb peft datasets bitsandbytes accelerate
#You likely installed accelerate as a dependency of transformers or peft. It's used under the hood, especially when you:
# Use Trainer
# Enable fp16=True in TrainingArguments
# Train on multiple GPUs

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m109.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m91.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [95]:
import torch
import wandb
import pandas as pd
from peft import LoraConfig,get_peft_model,TaskType,prepare_model_for_kbit_training
from datasets import load_dataset,Dataset
from transformers import Trainer,TrainingArguments,BitsAndBytesConfig,AutoTokenizer,AutoTokenizer, AutoModelForSeq2SeqLM

In [62]:
import os
import wandb

# Get the WandB API key from Colab's secret storage
api_key = os.environ.get('WB_API')

# Log into WandB using the API key
wandb.login(key=api_key)

# Initialize the WandB run
run = wandb.init(
    project="BART Fine-Tuning",
    job_type="training",
    anonymous="allow"
)




In [96]:
# 4bit Quantization

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",  #non uniform quantization
    bnb_4bit_compute_dtype="float16"
)

# 8-Bit Quantization
# bnb_config = BitsAndBytesConfig(
#     load_in_8bit=True,             # Enable 8-bit quantization
#     llm_int8_threshold=6.0,        # Threshold for outlier detection (default = 6.0)
#     llm_int8_skip_modules=None,    # Modules to skip from quantization (optional)
#     llm_int8_enable_fp32_cpu_offload=True  # Offload non-quantized weights to CPU
# )

In [97]:

model_name = "google-t5/t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name,
                                             quantization_config=bnb_config,
                                             #load_in_4bit=True,  #another methood for applying  Quantization
                                            ).to("cuda")

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [98]:
#prepare model for kbit training
model = prepare_model_for_kbit_training(model)

In [103]:
lora_config = LoraConfig(
    r=8,  # Rank of the low-rank matrices (controls adapter capacity; higher = more expressive)
    lora_alpha=32,  # Scaling factor for the LoRA weights (controls how much influence adapters have)

    # q_proj → Query projection (part of self-attention)
    # k_proj → Key projection
    # v_proj → Value projection
    # o_proj → Output projection (after self-attention)
    # gate_proj, up_proj, down_proj → Layers in the Feed-Forward Network (FFN) part
    # For small datasets or fast prototyping, just using ["q_proj", "v_proj"] is often enough.
    # For serious fine-tuning, using all of modules gives the model much more flexibility to adapt
    target_modules=["q", "k", "v", "o"],  # T5 uses these names inside its attention blocks   # Inject LoRA into Query and Value ... projection layers of attention
    lora_dropout=0.05,    # Dropout applied to LoRA layers during training (regularization)
    bias="none",   # # No bias term is trained or modified (keeps adapters lightweight)
    task_type = TaskType.SEQ_2_SEQ_LM ## Specifies the task type (e.g., autoregressive language modeling)
)

In [104]:
model = get_peft_model(model,lora_config)   # Adds LoRA adapters into the specified layers of the model
print(model.print_trainable_parameters())

trainable params: 1,769,472 || all params: 224,673,024 || trainable%: 0.7876
None


In [105]:
# loading dataset

dataset = load_dataset("json",data_files="/content/kashmir_niche_dataset.json")



In [106]:
dataset = dataset["train"].to_pandas()

In [107]:
dataset.head()

Unnamed: 0,prompt,response
0,"What is a niche topic about Kashmir, Pakistan?",Natural beauty of Neelum Valley
1,"What is a niche topic about Kashmir, Pakistan?",Cultural traditions of Kashmiri people
2,"What is a niche topic about Kashmir, Pakistan?",Kashmiri embroidery and handicrafts
3,"What is a niche topic about Kashmir, Pakistan?",Role of Azad Kashmir in Pakistan's tourism
4,"What is a niche topic about Kashmir, Pakistan?",Historical significance of Muzaffarabad


In [108]:
dataset.shape

(107, 2)

In [82]:
prompt_max_length = max(len(tokenizer.encode(word)) for word in dataset["prompt"])
response_max_length = max(len(tokenizer.encode(word)) for word in dataset["response"])

In [109]:
def tokenization(example):
  input = tokenizer(example["prompt"],
                    truncation=True,
                    padding="max_length",
                    max_length=512)
  output = tokenizer(example["response"],
                     truncation=True,
                     padding="max_length",
                     max_length=512)
  input["labels"]=output["input_ids"]
  return input

dataset = Dataset.from_pandas(dataset)
dataset = dataset.map(tokenization,batched=True)

dataset.set_format(type="pt",columns=["input_ids","attention_mask","labels"])

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

In [110]:
split = dataset.train_test_split(test_size=0.2)
train_dataset = split["train"]
val_dataset = split["test"]

In [111]:


trainer = Trainer(
    model=model,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    args = TrainingArguments(
    output_dir ="content/fin-tuned/checkpints/",
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="content/fin-tuned/logs/",
    logging_steps=400,
    per_device_train_batch_size= 6,
    per_device_eval_batch_size = 6,
    num_train_epochs=4,
    eval_strategy="epoch",
    bf16=False,
    fp16=False,
    gradient_accumulation_steps=3,
    report_to="wandb",
    load_best_model_at_end=True,
    optim = "paged_adamw_8bit",   #Essential: memory-efficient optimizer used in QLoRA.
    learning_rate=5e-5    #Controls how much the model's weights are updated.
    )
    )
trainer.train()

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
1,No log,28.346193
2,No log,27.504494
3,No log,26.911821
4,No log,26.664558


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


TrainOutput(global_step=20, training_loss=22.854965209960938, metrics={'train_runtime': 128.839, 'train_samples_per_second': 2.639, 'train_steps_per_second': 0.155, 'total_flos': 208893860904960.0, 'train_loss': 22.854965209960938, 'epoch': 4.0})

In [None]:
model = model.save_pretrained('content/fin-tuned/model/')
tokenizer=  tokenizer.save_pretrained("content/fin-tuned/tokenizer/")

In [None]:
model = 'content/fin-tuned/model/'
tokenizer = "content/fin-tuned/tokenizer/"

In [None]:
from transformers import pipeline

def inference(text):
  pipe = pipeline("text-generation",model=model,tokenizer=tokenizer)
  response = pipe(text,num_retun_sequences=1)
  return response[0]["generated_text"]