In [1]:
import os
import torch
import logging
import json
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    EarlyStoppingCallback,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import psutil
import gc
from tqdm import tqdm
from typing import Dict, Any, Optional
from torch.utils.tensorboard import SummaryWriter

In [2]:
class TrainingConfig:
    def __init__(self):
        self.model_name = "mistralai/Mistral-7B-Instruct-v0.3"
        self.qa_file = "qa_data_final_clean.jsonl"
        self.output_dir = "mistral_qlora_finetuned"
        self.max_length = 1024
        self.num_train_epochs = 7
        
        # Dynamic batch size with safety margin
        self.per_device_train_batch_size = self._find_optimal_batch_size()
        self.gradient_accumulation_steps = self._calculate_gradient_accumulation()
        
        # Conservative learning parameters
        self.learning_rate = 1e-4  # Reduced from 2e-4
        self.warmup_ratio = 0.05   # Increased from 0.03
        self.save_steps = 500
        self.logging_steps = 25
        self.eval_steps = 250
        
        # Create directories
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(os.path.join(self.output_dir, "checkpoints"), exist_ok=True)
        
    def _find_optimal_batch_size(self) -> int:
        if not torch.cuda.is_available():
            return 1
            
        total_gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        # More conservative batch sizes
        if total_gpu_memory >= 24:
            return 1  # Reduced from 2
        return 1

    def _calculate_gradient_accumulation(self) -> int:
        # Adjust gradient accumulation based on batch size
        return 32 if self.per_device_train_batch_size == 1 else 16

    @property
    def lora_config(self) -> LoraConfig:
        return LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj", 
                "gate_proj", "up_proj", "down_proj", 
                "ml_proj", "ml_ln"
            ],
            lora_dropout=0.1,  # Increased from 0.05
            bias="none",
            task_type="CAUSAL_LM",
            fan_in_fan_out=False,
            inference_mode=False,
            init_lora_weights="gaussian",
        )

    @property
    def quantization_config(self) -> BitsAndBytesConfig:
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            llm_int8_skip_modules=["lm_head"],
            llm_int8_threshold=6.0,
            bnb_4bit_quant_storage=torch.float16
        )

In [3]:
class EnhancedMemoryMonitor:
    def __init__(self, log_dir: str, memory_threshold: float = 0.95):
        self.writer = SummaryWriter(log_dir=log_dir)
        self.step = 0
        self.memory_threshold = memory_threshold
        self.warning_emitted = False
    
    def check_memory(self) -> bool:
        if torch.cuda.is_available():
            gpu_memory_used = torch.cuda.memory_allocated() / torch.cuda.get_device_properties(0).total_memory
            if gpu_memory_used > self.memory_threshold:
                if not self.warning_emitted:
                    logging.warning(f"GPU memory usage critical: {gpu_memory_used:.2%}")
                    self.warning_emitted = True
                return False
        
        cpu_percent = psutil.virtual_memory().percent / 100
        if cpu_percent > self.memory_threshold:
            if not self.warning_emitted:
                logging.warning(f"CPU memory usage critical: {cpu_percent:.2%}")
                self.warning_emitted = True
            return False
            
        self.warning_emitted = False
        return True

    def log_memory(self, prefix: str = "") -> Dict[str, float]:
        memory_stats = {}
        
        if torch.cuda.is_available():
            gpu_memory_allocated = torch.cuda.memory_allocated() / 1024**2
            gpu_memory_reserved = torch.cuda.memory_reserved() / 1024**2
            memory_stats.update({
                'gpu_allocated': gpu_memory_allocated,
                'gpu_reserved': gpu_memory_reserved
            })
            self.writer.add_scalar(f'{prefix}GPU Memory/Allocated (MB)', gpu_memory_allocated, self.step)
            self.writer.add_scalar(f'{prefix}GPU Memory/Reserved (MB)', gpu_memory_reserved, self.step)
            
            total_gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**2
            gpu_usage_percent = (gpu_memory_allocated / total_gpu_memory) * 100
            self.writer.add_scalar(f'{prefix}GPU Memory/Usage (%)', gpu_usage_percent, self.step)
        
        cpu_memory = psutil.Process().memory_info().rss / 1024**2
        memory_stats['cpu'] = cpu_memory
        self.writer.add_scalar(f'{prefix}CPU Memory (MB)', cpu_memory, self.step)
        
        cpu_percent = psutil.virtual_memory().percent
        self.writer.add_scalar(f'{prefix}CPU Memory/Usage (%)', cpu_percent, self.step)
        
        self.step += 1
        return memory_stats

In [4]:
class DatasetProcessor:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        
    def format_qa(self, example: Dict[str, Any]) -> Dict[str, str]:
        try:
            instruction = example.get("instruction", "").strip()
            response = example.get("response", "").strip()
            
            if not instruction or not response:
                return None
                
            instruction = instruction.replace("\n\n", "\n").strip()
            response = response.replace("\n\n", "\n").strip()
            
            text = (
                f"### System: You are a helpful AI assistant that answers user queries accurately and helpfully.\n"
                f"### Instruction:\n{instruction}\n\n"
                f"### Response:\n{response}</s>"
            )
            return {"text": text}
        except Exception as e:
            logging.error(f"Error processing example: {str(e)}")
            return None

    def tokenize_function(self, examples: Dict[str, Any]) -> Dict[str, torch.Tensor]:
        return self.tokenizer(
            examples["text"],
            truncation=True,
            padding=False,
            max_length=config.max_length,
            return_tensors=None,
        )

    def validate_dataset(self, dataset) -> Dict[str, Any]:
        """Validate dataset quality and report statistics."""
        logging.info("Validating dataset...")
        stats = {
            "total_examples": len(dataset),
            "empty_instructions": 0,
            "empty_responses": 0,
            "instruction_lengths": [],
            "response_lengths": []
        }
        
        for example in tqdm(dataset, desc="Validating dataset"):
            instruction = example.get("instruction", "").strip()
            response = example.get("response", "").strip()
            
            if not instruction:
                stats["empty_instructions"] += 1
            if not response:
                stats["empty_responses"] += 1
                
            stats["instruction_lengths"].append(len(instruction.split()))
            stats["response_lengths"].append(len(response.split()))
        
        stats["avg_instruction_length"] = float(np.mean(stats["instruction_lengths"]))
        stats["avg_response_length"] = float(np.mean(stats["response_lengths"]))
        
        logging.info(f"Dataset Statistics:\n{json.dumps(stats, indent=2)}")
        return stats

    def process_dataset(self, qa_file: str):
        dataset = load_dataset("json", data_files=qa_file)["train"]
        self.validate_dataset(dataset)
        
        # Process and filter dataset
        dataset = dataset.map(
            self.format_qa,
            remove_columns=dataset.column_names,
            num_proc=4,
        ).filter(lambda x: x["text"] is not None)
        
        # Split dataset
        dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
        
        # Tokenize datasets
        tokenized_dataset = {}
        for split in ["train", "test"]:
            tokenized_dataset[split] = dataset[split].map(
                self.tokenize_function,
                remove_columns=["text"],
                num_proc=4,
                load_from_cache_file=True,
                desc=f"Tokenizing {split} split",
            )
            tokenized_dataset[split].set_format("torch")
        
        return tokenized_dataset

In [5]:
class SafeTrainer(Trainer):
    def __init__(self, *args, memory_monitor: Optional[EnhancedMemoryMonitor] = None, **kwargs):
        super().__init__(*args, **kwargs)
        self.memory_monitor = memory_monitor or EnhancedMemoryMonitor(
            log_dir=self.args.logging_dir,
            memory_threshold=0.95
        )

    def training_step(self, model, inputs, num_items_in_batch=None):  # Added num_items_in_batch parameter
        """Perform a training step with memory monitoring."""
        if not self.memory_monitor.check_memory():
            logging.warning("Memory usage too high, triggering garbage collection")
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            gc.collect()
            
            if not self.memory_monitor.check_memory():
                self.args.per_device_train_batch_size = max(1, self.args.per_device_train_batch_size - 1)
                logging.warning(f"Reduced batch size to {self.args.per_device_train_batch_size}")
        
        self.memory_monitor.log_memory("training/")
        return super().training_step(model, inputs, num_items_in_batch)

    def evaluation_step(self, model, inputs):
        """Perform an evaluation step with memory monitoring."""
        if not self.memory_monitor.check_memory():
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            gc.collect()
        
        self.memory_monitor.log_memory("eval/")
        return super().evaluation_step(model, inputs)

In [6]:
config = TrainingConfig()

In [7]:
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s - %(funcName)s:%(lineno)d",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(f"{config.output_dir}/training.log")
    ]
)

In [8]:
memory_monitor = EnhancedMemoryMonitor(
    log_dir=os.path.join(config.output_dir, "tensorboard"),
    memory_threshold=0.90
)

In [9]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    config.model_name,
    use_fast=True,
    padding_side="left"
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Process dataset
dataset_processor = DatasetProcessor(tokenizer)
tokenized_datasets = dataset_processor.process_dataset(config.qa_file)

Generating train split: 0 examples [00:00, ? examples/s]

2024-12-10 01:57:17 - INFO - root - Validating dataset... - validate_dataset:37
Validating dataset: 100%|██████████████████| 540/540 [00:00<00:00, 82217.37it/s]
2024-12-10 01:57:17 - INFO - root - Dataset Statistics:
{
  "total_examples": 540,
  "empty_instructions": 0,
  "empty_responses": 0,
  "instruction_lengths": [
    4,
    7,
    9,
    7,
    7,
    8,
    4,
    6,
    8,
    8,
    10,
    7,
    6,
    6,
    6,
    6,
    3,
    8,
    8,
    6,
    5,
    7,
    7,
    9,
    11,
    9,
    12,
    12,
    12,
    7,
    6,
    7,
    8,
    7,
    9,
    7,
    8,
    13,
    12,
    11,
    11,
    15,
    7,
    11,
    11,
    10,
    7,
    11,
    10,
    7,
    11,
    12,
    11,
    11,
    14,
    11,
    11,
    12,
    11,
    13,
    8,
    11,
    14,
    9,
    13,
    13,
    15,
    11,
    9,
    8,
    10,
    5,
    10,
    10,
    10,
    6,
    9,
    9,
    10,
    8,
    8,
    9,
    8,
    6,
    6,
    4,
    7,
    8,
    9,
    7,
    7,
    7

Map (num_proc=4):   0%|          | 0/540 [00:00<?, ? examples/s]

Filter:   0%|          | 0/540 [00:00<?, ? examples/s]

Tokenizing train split (num_proc=4):   0%|          | 0/486 [00:00<?, ? examples/s]

Tokenizing test split (num_proc=4):   0%|          | 0/54 [00:00<?, ? examples/s]

In [10]:
# Load model
model = AutoModelForCausalLM.from_pretrained(
    config.model_name,
    quantization_config=config.quantization_config,
    device_map="auto",
    torch_dtype=torch.float16,
    use_cache=False,
    attn_implementation="sdpa"
)

model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
model = get_peft_model(model, config.lora_config)

2024-12-10 01:57:19 - INFO - accelerate.utils.modeling - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk). - get_balanced_memory:1014


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
training_args = TrainingArguments(
    output_dir=os.path.join(config.output_dir, "checkpoints"),
    per_device_train_batch_size=config.per_device_train_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    num_train_epochs=config.num_train_epochs,
    learning_rate=config.learning_rate,
    warmup_ratio=config.warmup_ratio,
    logging_steps=config.logging_steps,
    # Match evaluation_strategy with save_strategy
    evaluation_strategy="steps",  # Added this line
    save_strategy="steps",
    eval_steps=config.eval_steps,
    save_steps=config.save_steps,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=True,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_first_step=True,
    group_by_length=True,
    remove_unused_columns=True,
    dataloader_pin_memory=True,
    dataloader_num_workers=2,
    report_to="tensorboard",
    save_safetensors=True,
)

trainer = SafeTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
        pad_to_multiple_of=8
    ),
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=3,
            early_stopping_threshold=0.01
        )
    ],
    memory_monitor=memory_monitor
)



In [12]:
# Clean up memory before training
if torch.cuda.is_available():
    torch.cuda.empty_cache()
gc.collect()

try:
    train_result = trainer.train()
    
    trainer.save_model()
    tokenizer.save_pretrained(config.output_dir)
    model.save_pretrained(config.output_dir)
    
except Exception as e:
    logging.error(f"Training failed: {str(e)}")
    # Save emergency checkpoint
    try:
        emergency_dir = os.path.join(config.output_dir, "emergency_checkpoint")
        os.makedirs(emergency_dir, exist_ok=True)
        trainer.save_model(emergency_dir)
        logging.info(f"Saved emergency checkpoint to {emergency_dir}")
    except Exception as save_error:
        logging.error(f"Failed to save emergency checkpoint: {str(save_error)}")
    raise
finally:
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()
    memory_monitor.writer.close()

Step,Training Loss,Validation Loss


In [None]:
#!/usr/bin/env python3
import os
import torch
import logging
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import re
import gc
import signal
import sys

class InferenceWrapper:
    def __init__(self, 
                 base_model_name="mistralai/Mistral-7B-Instruct-v0.3", 
                 adapter_path="mistral_qlora_finetuned",
                 temperature=0.6,
                 max_new_tokens=512):
        self.temperature = temperature
        self.max_new_tokens = max_new_tokens
        
        # Configure logging
        logging.basicConfig(
            format='%(asctime)s - %(levelname)s - %(message)s',
            level=logging.INFO
        )
        
        try:
            # Initialize tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            
            # Quantization configuration
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4"
            )
            
            logging.info("Loading base model...")
            # Load base model with quantization
            base_model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                quantization_config=quantization_config,
                device_map="auto",
                torch_dtype=torch.float16
            )
            
            logging.info("Loading fine-tuned adapter...")
            self.model = PeftModel.from_pretrained(base_model, adapter_path)
            self.model.eval()  # Set to evaluation mode
            logging.info("Model initialization complete!")
            
        except Exception as e:
            logging.error(f"Error during model initialization: {str(e)}")
            raise

    def remove_emojis_and_clean(self, text):
        """Remove emojis and clean up text."""
        # Remove emojis and special characters
        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+", flags=re.UNICODE)
        text = emoji_pattern.sub('', text)
        
        # Remove repeated characters
        text = re.sub(r'(.)\1{2,}', r'\1\1', text)
        
        # Clean up extra whitespace
        text = ' '.join(text.split())
        return text.strip()

    def generate_response(self, instruction):
        """Generate a response for the given instruction."""
        try:
            # Format the prompt
            prompt = (
                f"### System: You are a helpful AI assistant that answers user queries "
                f"accurately and professionally. Avoid using emojis or special characters.\n"
                f"### Instruction:\n{instruction}\n\n"
                f"### Response:\n"
            )
            
            # Tokenize input
            inputs = self.tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
            
            # Generate response
            with torch.no_grad():
                generation_output = self.model.generate(
                    **inputs,
                    max_new_tokens=self.max_new_tokens,
                    temperature=self.temperature,
                    top_p=0.9,
                    top_k=50,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    do_sample=True
                )
            
            # Decode and clean response
            response = self.tokenizer.decode(generation_output[0], skip_special_tokens=True)
            response = response.split("### Response:\n")[-1].strip()
            response = self.remove_emojis_and_clean(response)
            
            return response
            
        except Exception as e:
            logging.error(f"Error generating response: {str(e)}")
            return f"Error generating response: {str(e)}"
    
    def cleanup(self):
        """Clean up resources."""
        try:
            del self.model
            del self.tokenizer
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            gc.collect()
            logging.info("Cleanup complete")
        except Exception as e:
            logging.error(f"Error during cleanup: {str(e)}")

def signal_handler(sig, frame):
    """Handle interruption signals."""
    print("\nInterrupted! Cleaning up...")
    if 'inference' in globals():
        inference.cleanup()
    sys.exit(0)

def main():
    # Register signal handler for clean exit
    signal.signal(signal.SIGINT, signal_handler)
    
    try:
        # Initialize inference wrapper
        print("Initializing model... This may take a few moments.")
        global inference
        inference = InferenceWrapper()
        print("\nModel is ready! Type 'quit' to exit.")
        
        # Interactive loop
        while True:
            try:
                user_input = input("\nEnter your question: ").strip()
                
                if user_input.lower() in ['quit', 'exit']:
                    print("Cleaning up and exiting...")
                    break
                
                if not user_input:
                    print("Please enter a question.")
                    continue
                
                print("\nGenerating response...")
                response = inference.generate_response(user_input)
                print("\nResponse:", response)
                
            except KeyboardInterrupt:
                print("\nInterrupted by user. Cleaning up...")
                break
            except Exception as e:
                print(f"Error: {str(e)}")
                print("Please try again.")
    
    finally:
        # Clean up resources
        if 'inference' in globals():
            inference.cleanup()
        print("\nGoodbye!")

if __name__ == "__main__":
    main()

Initializing model... This may take a few moments.


2024-12-10 02:34:22 - INFO - root - Loading base model... - __init__:41
2024-12-10 02:34:22 - INFO - accelerate.utils.modeling - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk). - get_balanced_memory:1014


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

2024-12-10 02:34:26 - INFO - root - Loading fine-tuned adapter... - __init__:50
2024-12-10 02:34:26 - INFO - root - Model initialization complete! - __init__:53



Model is ready! Type 'quit' to exit.
