<a href="https://colab.research.google.com/github/Gauri-Tripathi/Conversation-Helper/blob/main/src/notebooks/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**This model was trained on Google Colab using an A100 GPU, on a dataset of about 37k rows. The results can also be reproduced with a smaller dataset.**

In [None]:
!pip install --no-deps bitsandbytes accelerate  peft trl transformers
!pip install sentencepiece  datasets huggingface_hub
!pip install flash-attn --no-build-isolation

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting xformers==0.0.29
  Downloading xformers-0.0.29-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Downloading xformers-0.0.29-cp311-cp311-manylinux_2_28_x86_64.whl (15.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m86.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.15.2-py3-none-any.whl (318 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers, trl, bitsandbytes
Successfully installed bitsandbytes-0.45.3 trl-0.15.2 xformers-

In [None]:
# Notebook-level knobs.
# NOTE(review): none of these three names is read by the cells visible in this
# notebook -- preprocessing truncates to its own length and the quantization
# config sets 4-bit loading itself. Confirm before relying on them.
load_in_4bit = True      # intended switch for 4-bit quantized loading
dtype = None             # None presumably means "let the loader decide" -- TODO confirm
max_seq_length = 1024    # intended upper bound on tokens per example

In [None]:
import gc
import logging
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from peft import AdaLoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

# Set up logging.
# force=True replaces any handlers already attached to the root logger. Without
# it basicConfig() is a silent no-op when the host (e.g. a notebook runtime) has
# configured logging first, and the logger.info(...) progress messages below
# never show up.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)

# Set seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)


In [None]:
logger.info("Loading dataset...")
# Replace with your actual dataframe loading code
df = pd.read_csv("/content/Data.csv")  # Should have 'input', 'output', 'instruction' columns

# Fail early, with a clear message, if the CSV does not have the columns the
# prompt template reads.
required_columns = ["instruction", "input", "output"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"Dataset is missing required column(s): {missing_columns}")

# Empty CSV cells are read as NaN and would otherwise be formatted into the
# prompt as literal text. A row without an instruction or a target response is
# unusable; a missing input is treated as "no extra context".
df_clean = (
    df.dropna(subset=["instruction", "output"])
      .fillna({"input": ""})
      .reset_index(drop=True)
)
dropped_rows = len(df) - len(df_clean)
if dropped_rows:
    logger.warning(f"Dropped {dropped_rows} row(s) with a missing instruction or output")
if df_clean.empty:
    raise ValueError("Dataset has no usable rows after removing incomplete examples")

# Convert DataFrame to dataset, holding out 10% for validation (fixed seed)
dataset = Dataset.from_pandas(df_clean, preserve_index=False)
train_test_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_val_dataset = DatasetDict({
    'train': train_test_dataset['train'],
    'validation': train_test_dataset['test']
})

# -----  Setup quantization and load model -----
logger.info("Setting up model with quantization...")
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,  # Double quantization for memory savings
    bnb_4bit_quant_type="nf4",       # NF4 data type for better quality
)


In [None]:
model_id = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse EOS as the padding token and pad on the right, so real tokens always
# start at position 0 (the label masking in preprocessing relies on that).
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Free memory before loading the model: collect unreachable Python objects
# first so the CUDA allocator can actually release the blocks they held.
gc.collect()
torch.cuda.empty_cache()

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quant_config,
    device_map="auto",
    torch_dtype=torch.float16,
    # Replaces the deprecated `use_flash_attention_2=True` flag, which the
    # library warns about when this cell runs.
    attn_implementation="flash_attention_2",
    use_cache=False,  # generation cache is not used during training
)


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [None]:
# ----- Configure AdaLoRA -----

# AdaLoRA's rank schedule is expressed in optimizer steps: `tinit` warm-up
# steps, then budget reallocation, then `tfinal` closing steps, all of which
# must fit inside `total_step`. The fixed 200 / 1000 values are longer than a
# one-epoch run over ~33k examples at an effective batch of 256 (~129 steps),
# so the schedule is derived from the planned run length instead.
# These three values must mirror the TrainingArguments cell further down.
adalora_train_batch_size = 32
adalora_grad_accum_steps = 8
adalora_num_epochs = 1

adalora_samples_per_step = adalora_train_batch_size * adalora_grad_accum_steps
adalora_steps_per_epoch = max(
    1,
    (len(train_val_dataset["train"]) + adalora_samples_per_step - 1) // adalora_samples_per_step,
)
adalora_total_steps = adalora_steps_per_epoch * adalora_num_epochs

adalora_config = AdaLoraConfig(
    init_r=16,
    target_r=8,
    beta1=0.85,
    beta2=0.85,
    # Keep the original 200 / 1000 for long runs, shrink them for short ones so
    # that tinit + tfinal always stays below total_step.
    tinit=min(200, adalora_total_steps // 10),
    tfinal=min(1000, adalora_total_steps // 5),
    total_step=adalora_total_steps,

    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"],
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# NOTE(review): per the PEFT AdaLoRA docs the rank budget is only applied when
# `model.base_model.update_and_allocate(step)` is called every optimizer step.
# The stock Trainer used later in this notebook does not do that -- TODO
# confirm, and add a callback if pruning down to `target_r` is actually wanted.

# The base model is loaded in 4-bit and trained with gradient checkpointing;
# prepare it for k-bit training before adapters are attached (PEFT's documented
# step for quantized models).
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, adalora_config)
model.print_trainable_parameters()

trainable params: 41,946,624 || all params: 8,072,208,096 || trainable%: 0.5196


In [None]:
def preprocess_function(examples, max_length=512, tok=None):
    """Turn a batch of rows into padded, tokenized Alpaca prompts with labels.

    Args:
        examples: Batch mapping with parallel lists under the keys
            "instruction", "input" and "output".
        max_length: Every sequence is truncated and padded to exactly this
            many tokens. Defaults to 512, the value previously hard-coded.
        tok: Tokenizer to use. Defaults to the notebook-level ``tokenizer``.
            It must be callable on a list of strings and return "input_ids"
            and "attention_mask".

    Returns:
        The tokenizer output for the full prompts with an extra "labels"
        entry. A label equals the input id on response tokens and -100 on
        prompt tokens and on padding.
    """
    if tok is None:
        tok = tokenizer  # notebook-level tokenizer

    # Build the prompt prefix (everything up to and including the response
    # marker) separately from the full text. This makes the masked span
    # unambiguous even when an instruction or input itself contains the text
    # "### Response:", which a substring search would latch onto.
    prefixes = []
    full_texts = []
    for instruction, conversation, response in zip(
        examples["instruction"], examples["input"], examples["output"]
    ):
        prefix = f"""Below is an instruction that describes a task, and an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{conversation}

### Response:"""
        prefixes.append(prefix)
        full_texts.append(f"{prefix}\n{response}")

    model_inputs = tok(
        full_texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors=None
    )

    # One batched call gives the token count of every prefix. The count is an
    # approximation of the boundary inside the full text: a tokenizer may merge
    # characters across the marker/response boundary.
    prefix_lengths = [len(ids) for ids in tok(prefixes)["input_ids"]]

    labels = []
    for ids, mask, n_prefix in zip(
        model_inputs["input_ids"], model_inputs["attention_mask"], prefix_lengths
    ):
        # Supervise only real (attention_mask == 1) tokens after the prefix.
        # Positions are counted from index 0, i.e. right-side padding is assumed.
        labels.append([
            token_id if (position >= n_prefix and is_real) else -100
            for position, (token_id, is_real) in enumerate(zip(ids, mask))
        ])

    model_inputs["labels"] = labels
    return model_inputs


In [None]:

# Use up to 4 worker processes, but never more than the machine has cores;
# asking for more only adds process start-up and contention on small runtimes.
map_num_proc = max(1, min(4, os.cpu_count() or 1))

processed_dataset = train_val_dataset.map(
    preprocess_function,
    batched=True,
    # Drop the raw text columns so only the tokenized fields remain.
    remove_columns=train_val_dataset["train"].column_names,
    num_proc=map_num_proc,
    desc="Processing dataset with Alpaca format",
)


Processing dataset with Alpaca format (num_proc=4):   0%|          | 0/32814 [00:00<?, ? examples/s]

Processing dataset with Alpaca format (num_proc=4):   0%|          | 0/3647 [00:00<?, ? examples/s]

In [None]:
# ----- Configure training arguments -----

output_dir = "./adalora_model"
os.makedirs(output_dir, exist_ok=True)

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,       # effective train batch = 32 * 8 = 256 sequences
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    num_train_epochs=1,
    weight_decay=0.01,
    fp16=True,                           # Mixed precision training
    logging_steps=50,
    logging_dir=f"{output_dir}/logs",
    # Evaluate and checkpoint every 25% of the run. A fixed interval of 500
    # steps is never reached by a one-epoch run at this batch size, which left
    # `load_best_model_at_end` with no checkpoint to choose from.
    save_steps=0.25,
    eval_steps=0.25,
    eval_strategy="steps",               # current name of `evaluation_strategy`
    save_total_limit=3,                  # Keep only last 3 checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    max_grad_norm=0.3,
    optim="adamw_torch_fused",
    gradient_checkpointing=True,
    report_to="none",
    ddp_find_unused_parameters=False,
    remove_unused_columns=False
)

# ----- Create data collator -----
def collate_with_precomputed_labels(features):
    """Batch pre-tokenized examples while keeping their `labels` column.

    DataCollatorForLanguageModeling(mlm=False) rebuilds labels from input_ids,
    which throws away the prompt masking computed during preprocessing. This
    collator stacks the stored fields as they are and additionally masks
    padding positions (attention_mask == 0) so they never contribute to the
    loss.
    """
    batch = default_data_collator(features)
    batch["labels"] = batch["labels"].masked_fill(batch["attention_mask"] == 0, -100)
    return batch


data_collator = collate_with_precomputed_labels



In [None]:
# ----- Initialize trainer with error handling -----

try:
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=processed_dataset["train"],
        eval_dataset=processed_dataset["validation"],
        data_collator=data_collator,
    )

    # -----  Train model -----
    logger.info("Starting training...")
    trainer.train()

    # -----  Save the trained model adapter -----
    logger.info("Training complete, saving model...")
    model.save_pretrained(f"{output_dir}/final")
    tokenizer.save_pretrained(f"{output_dir}/final")
    logger.info(f"Model saved to {output_dir}/final")

except Exception as e:
    # logger.exception records the full traceback alongside the message.
    logger.exception(f"Error during training: {e}")
    # Best-effort emergency save of whatever adapter state exists.
    try:
        logger.info("Attempting emergency save...")
        model.save_pretrained(f"{output_dir}/emergency_save")
        logger.info(f"Emergency save successful: {output_dir}/emergency_save")
    except Exception as save_error:
        logger.error(f"Emergency save failed: {save_error}")
    # Re-raise so the notebook stops here instead of continuing to later cells
    # (which would save an untrained or partially trained adapter as if
    # training had succeeded).
    raise




Step,Training Loss,Validation Loss


In [None]:
# Write a second, local copy of the adapter weights and the tokenizer files
# into one directory; the tokenizer call comes last so the cell displays the
# list of files it wrote.
local_save_dir = "lora_model"
model.save_pretrained(local_save_dir)
tokenizer.save_pretrained(local_save_dir)

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')