In [1]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_model_quantized(model_name="google/gemma-3-1b-it", quant=4):
    """
    Load a causal language model at the requested precision on the current CUDA device.

    :param model_name: model id or path handed to ``AutoModelForCausalLM.from_pretrained``
    :param quant: 4 or 8 to load through a ``BitsAndBytesConfig`` (4-bit / 8-bit),
        16 or 32 to load with ``torch.float16`` / ``torch.float32`` weights
    :return: the loaded model
    :raises ValueError: if ``quant`` is not 4, 8, 16 or 32
    """
    # Validate before anything else: a bad value must fail with the intended
    # ValueError rather than with whatever the CUDA query below would raise
    # on a machine without a usable GPU.
    if quant not in (4, 8, 16, 32):
        raise ValueError("Number has to be 4, 8, 16 or 32")

    # Explicit device map: the "" key maps all modules to the current CUDA device.
    device_map = {"": torch.cuda.current_device()}

    if quant == 4:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            llm_int8_enable_fp32_cpu_offload=True,
        )
        # Only the 4-bit path passes an offload folder and forces eager attention.
        return AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map=device_map,
            offload_folder="offload",
            attn_implementation="eager",
        )

    if quant == 8:
        return AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
            device_map=device_map,
        )

    # 16 / 32: no quantization config, only the weight dtype differs.
    torch_dtype = torch.float16 if quant == 16 else torch.float32
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch_dtype,
        device_map=device_map,
    )

In [3]:
def setup_model_With_lora(model_name="google/gemma-3-1b-it",quant=4,rank=16):
    """
    Load the base model and its tokenizer, then wrap the model with a LoRA adapter.

    :param model_name: model id or path used for both the model and the tokenizer
    :param quant: 4, 8, 16 or 32 - forwarded to load_model_quantized
    :param rank: LoRA rank (dimensionality of the matrices in the adaptation
        process); lora_alpha is set to twice this value
    :return: (model, tokenizer) tuple
    """
    model = load_model_quantized(model_name, quant)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Fall back to EOS only when the tokenizer has no pad token of its own.
    # Overwriting an existing pad token makes padding indistinguishable from
    # a real end-of-sequence token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # 4-bit / 8-bit models go through the k-bit training preparation step.
    if quant <= 8:
        model = prepare_model_for_kbit_training(model)

    lora_config = LoraConfig(
        r=rank,
        lora_alpha=rank * 2,  # scaling -> apparently works best most times when double the rank
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",  # or QUESTION_ANS -> to look more into it . Gemma is CAUSAL_LM
    )

    model = get_peft_model(model, lora_config)

    # for debugging print params
    model.print_trainable_parameters()
    return model, tokenizer
    

In [4]:
from TrainingDataset import TrainingDataset
from transformers import Trainer
from sklearn.model_selection import train_test_split

def prepare_historical_datasets(data_examples, tokenizer, test_size=0.1, val_size=0.1):
    """
    Split the examples into train / validation / test and wrap each split
    in a TrainingDataset.

    The test split is taken first. ``val_size`` is then rescaled by
    ``1 - test_size`` before being used on what remains. Both splits use
    random_state=42.

    :param data_examples: examples to split
    :param tokenizer: passed to every TrainingDataset
    :param test_size: forwarded to the first train_test_split call
    :param val_size: validation size before rescaling
    :return: dict with the keys "train", "validation" and "test"
    """
    print(f"Preparing datasets with {len(data_examples)} examples")

    # Carve the test set out of the full data first.
    remainder, test_split = train_test_split(
        data_examples, test_size=test_size, random_state=42
    )

    # Rescale so the validation share is taken relative to the remainder.
    rescaled_val = val_size / (1 - test_size)
    train_split, val_split = train_test_split(
        remainder, test_size=rescaled_val, random_state=42
    )

    print(f"Data split: {len(train_split)} training, {len(val_split)} validation, {len(test_split)} test examples")

    named_splits = (
        ("train", train_split),
        ("validation", val_split),
        ("test", test_split),
    )
    return {label: TrainingDataset(rows, tokenizer) for label, rows in named_splits}

In [5]:
from transformers import TrainerCallback


class GPUMonitorCallback(TrainerCallback):
    """Trainer callback that prints the allocated CUDA memory on every 10th global step."""

    def on_step_end(self, args, state, control, **kwargs):
        """Print allocated CUDA memory (in units of 1e9 bytes) when global_step is a multiple of 10."""
        # Nothing to report without a CUDA device.
        if not torch.cuda.is_available():
            return
        # Only report on steps that are a multiple of 10.
        if state.global_step % 10 != 0:
            return
        print(f"Step {state.global_step}: GPU Memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")


In [6]:
def data_collator(features):
    """
    Stack the per-example tensors of a list of features into one batch dict.

    :param features: sequence of mappings, each providing 'input_ids',
        'attention_mask' and 'labels' tensors (torch.stack needs them to
        have matching shapes across examples)
    :return: dict with the same three keys, each holding the stacked tensor
    """
    batch_keys = ('input_ids', 'attention_mask', 'labels')
    return {
        key: torch.stack([example[key] for example in features])
        for key in batch_keys
    }


In [7]:
def train_model(
            data,
            model_name="google/gemma-3-1b-it",
            rank=16,
            quant=4,
            output_dir="./models",
            num_epochs=5,
            learning_rate=2e-4,
            batch_size=4
):
    """
    Train a LoRA-wrapped model on ``data`` and evaluate it on the held-out test split.

    :param data: examples passed to prepare_historical_datasets
    :param model_name: model id or path passed to setup_model_With_lora
    :param rank: LoRA rank passed to setup_model_With_lora
    :param quant: 4, 8, 16 or 32, passed to setup_model_With_lora
    :param output_dir: prefix of the run directory; TrainingArguments writes to
        ``{output_dir}_r{rank}_{quant}`` and the model is saved after training
        to the same path with a ``_final`` suffix
    :param num_epochs: number of training epochs
    :param learning_rate: learning rate
    :param batch_size: per-device batch size for both training and evaluation
    :return: (model, tokenizer, test_results), where test_results is what
        ``trainer.evaluate`` returns for the test split
    """
    # A failed load raises inside setup_model_With_lora, so reaching this line
    # already means the model is loaded; one status line is enough.
    model, tokenizer = setup_model_With_lora(model_name, quant, rank=rank)
    print(f"Model loaded with LoRA rank={rank}, quantization={quant}")

    # set up dataset
    datasets = prepare_historical_datasets(data, tokenizer)

    # Shared by the checkpoint directory and the final save path.
    run_dir = f"{output_dir}_r{rank}_{quant}"

    training_args = TrainingArguments(
        output_dir=run_dir,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=4,
        learning_rate=learning_rate,
        num_train_epochs=num_epochs,
        logging_steps=10,
        save_steps=100,
        eval_steps=100,
        eval_strategy="steps",
        save_total_limit=2,
        fp16=True,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        report_to="tensorboard",
        remove_unused_columns=False,
        dataloader_pin_memory=False,  # CRITICAL: Disable pin memory
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=datasets["train"],
        eval_dataset=datasets["validation"],
        data_collator=data_collator,
        callbacks=[GPUMonitorCallback],
    )

    print(f"Starting training with LoRA rank={rank}, quantization={quant}")
    trainer.train()

    test_results = trainer.evaluate(datasets["test"])
    print(f"Test set evaluation: {test_results}")

    model.save_pretrained(f"{run_dir}_final")
    return model, tokenizer, test_results
    

In [8]:
# Complete GPU memory cleanup
import torch
import gc

# Drop the notebook-level model reference first: collecting and emptying the
# CUDA cache before the reference is gone cannot release what it still holds.
model = None

gc.collect()

# The CUDA-only calls are skipped when no GPU is available so this cell
# still runs on a CPU-only kernel.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    
def clear_gpu_memory():
    """
    Run garbage collection, empty the CUDA cache, and report tensors that are still alive.

    The scan over ``gc.get_objects()`` is diagnostic only. A tensor that is
    still referenced somewhere cannot be freed from inside this function
    (deleting the loop variable merely unbinds a local name), so every tensor
    found is reported with its size instead of being announced as cleared.

    :return: None
    """
    gc.collect()
    torch.cuda.empty_cache()
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
                print(f"Tensor still referenced, size: {obj.size()}")
        except Exception:
            # Probing arbitrary objects can raise almost anything; skip those.
            # KeyboardInterrupt / SystemExit are deliberately not caught so the
            # scan can still be interrupted.
            continue
    gc.collect()
    torch.cuda.empty_cache()

In [2]:
import json
import pandas as pd
# Read the examples from the JSON file (path is relative to the working directory).
DATA_FILE = 'data/extensive.json'
with open(DATA_FILE, mode='r', encoding='utf-8') as f:
    data_examples = json.load(f)

print(f"Loaded {len(data_examples)} examples from JSON file")

# Training is left disabled here; uncomment the call below to start a run.
# model, tokenizer, test_results = train_model(
#     data=data_examples,
#     model_name="google/gemma-3-4b-it",
#     rank=4,
#     quant=4,
#     num_epochs=2,
#     batch_size=2
# )

Loaded 36 examples from JSON file
