In [None]:
"""
SIMPLE FINE-TUNING SCRIPT - All-in-One
Just update the paths below and run!
"""

import json
import logging
from pathlib import Path
import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
)

# ---------------------------------------------------------------------------
# User configuration: edit these values before running the script.
# ---------------------------------------------------------------------------

# Path to the JSONL training dataset (update this for your machine).
DATASET_PATH = r"C:\Users\ncc333\Desktop\My_Task\fine tuning\New folder\fine_tune_dataset.jsonl"

# Directory that receives checkpoints and the final trained model.
OUTPUT_DIR = "trained_model"

# Hugging Face identifier of the base model to fine-tune.
MODEL_NAME = "microsoft/DialoGPT-medium"

# Training hyperparameters (tune these to the memory of your GPU).
EPOCHS = 3
BATCH_SIZE = 2
LEARNING_RATE = 2e-4

# ---------------------------------------------------------------------------
# Logging: timestamped INFO-level messages through a module-level logger.
# ---------------------------------------------------------------------------

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# MAIN SCRIPT

def main():
    """Fine-tune ``MODEL_NAME`` on ``DATASET_PATH`` with 4-bit LoRA and save it.

    Pipeline: validate the dataset path, load the tokenizer and the 4-bit
    quantized base model, attach LoRA adapters, tokenize the JSONL dataset,
    train with the Hugging Face ``Trainer``, then write the trained model and
    tokenizer to ``OUTPUT_DIR/final``.

    Returns:
        None. When the dataset file does not exist, an error is logged and
        the function returns without loading the model or training.
    """
    # Local import: the file-level ``from peft import ...`` line does not
    # bring this helper into scope.
    from peft import prepare_model_for_kbit_training

    logger.info("=" * 80)
    logger.info("SIMPLE FINE-TUNING SCRIPT")
    logger.info("=" * 80)

    # Step 1: Check dataset
    logger.info(f" Checking dataset: {DATASET_PATH}")
    if not Path(DATASET_PATH).exists():
        logger.error(f" Dataset not found: {DATASET_PATH}")
        logger.error("Please update DATASET_PATH in the script!")
        return

    with open(DATASET_PATH, 'r', encoding='utf-8') as f:
        # Stream the file instead of loading it whole, and skip blank lines so
        # the count reflects actual JSONL records.
        num_samples = sum(1 for line in f if line.strip())
    logger.info(f"✓ Found {num_samples} samples")

    # Step 2: Load model and tokenizer
    logger.info(f" Loading model: {MODEL_NAME}")

    # Use 4-bit quantization to save memory
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4"
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    if tokenizer.pad_token is None:
        # Some tokenizers ship without a pad token; reuse EOS so batches can
        # be padded.
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.float16
    )
    logger.info("✓ Model loaded")

    # PEFT's documented preparation step for training a k-bit quantized model.
    model = prepare_model_for_kbit_training(model)

    # Step 3: Setup LoRA
    logger.info(" Setting up LoRA adapters")

    # Pick adapter targets that actually exist in the loaded architecture.
    # Separate q_proj/v_proj projections are not present in GPT-2 style models
    # (the default DialoGPT checkpoint), which use a fused ``c_attn`` instead;
    # requesting missing modules makes get_peft_model raise.
    module_suffixes = {
        name.rsplit(".", 1)[-1] for name, _ in model.named_modules() if name
    }
    if {"q_proj", "v_proj"} <= module_suffixes:
        target_modules = ["q_proj", "v_proj"]
    elif "c_attn" in module_suffixes:
        target_modules = ["c_attn"]
    else:
        # Let PEFT fall back to its built-in per-architecture defaults.
        target_modules = None

    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=target_modules,
        bias="none"
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # Step 4: Prepare dataset
    logger.info(" Preparing dataset")

    def format_and_tokenize(example):
        """Build one training string from a dataset row and tokenize it."""
        # Handle different JSON formats
        if "text" in example:
            text = example["text"]
        elif "instruction" in example and "output" in example:
            text = f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['output']}"
        elif "prompt" in example and "completion" in example:
            text = f"{example['prompt']}\n{example['completion']}"
        else:
            # Fallback: join all string values
            text = " ".join([str(v) for v in example.values() if isinstance(v, str)])

        # No "labels" column is added here: the language-modeling collator
        # derives labels from the padded input_ids. A pre-made, unpadded
        # "labels" list is not padded by the tokenizer and breaks batching of
        # examples with different lengths.
        return tokenizer(
            text,
            truncation=True,
            max_length=512,
            padding=False
        )

    # Load and process dataset
    dataset = Dataset.from_json(DATASET_PATH)
    dataset = dataset.map(
        format_and_tokenize,
        remove_columns=dataset.column_names,
        desc="Tokenizing"
    )
    logger.info(f"✓ Dataset prepared: {len(dataset)} samples")

    # Step 5: Setup training
    logger.info(" Starting training")

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=8,
        learning_rate=LEARNING_RATE,
        fp16=True,
        logging_steps=10,
        save_steps=500,
        save_total_limit=2,
        report_to="none",  # No external logging
    )

    # mlm=False selects causal-LM collation: pads each batch and creates the
    # labels from input_ids.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
    )

    # Step 6: Train!
    logger.info("=" * 80)
    logger.info("TRAINING STARTED")
    logger.info("=" * 80)

    trainer.train()

    # Step 7: Save model
    logger.info(" Saving model")
    final_path = Path(OUTPUT_DIR) / "final"
    trainer.save_model(str(final_path))
    tokenizer.save_pretrained(str(final_path))

    logger.info("=" * 80)
    logger.info(" TRAINING COMPLETE!")
    logger.info("=" * 80)
    logger.info(f"Model saved to: {final_path}")
    logger.info("\nTo use your model:")
    logger.info("1. Load it with: AutoModelForCausalLM.from_pretrained('trained_model/final')")
    logger.info("2. Or continue to the inference script below")


def test_model():
    """Generate one sample reply from the saved model as a quick smoke test.

    Loads the tokenizer and model from ``OUTPUT_DIR/final``, samples up to
    100 new tokens for a fixed prompt, and logs the decoded text. Logs an
    error and returns early when that directory does not exist.
    """
    separator = "=" * 80
    logger.info("\n" + separator)
    logger.info("TESTING TRAINED MODEL")
    logger.info(separator)

    checkpoint_dir = Path(OUTPUT_DIR) / "final"
    if not checkpoint_dir.exists():
        logger.error(" Model not found. Train first!")
        return

    # Load tokenizer and model from the same saved directory.
    logger.info("Loading model...")
    checkpoint = str(checkpoint_dir)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    model.eval()

    # Fixed prompt used for the smoke test.
    prompt = "Hello! How can I help you today?"
    logger.info(f"\nPrompt: {prompt}")

    # Tokenize onto the model's device, then sample without tracking gradients.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    generation_kwargs = {
        "max_new_tokens": 100,
        "temperature": 0.7,
        "do_sample": True,
        "pad_token_id": tokenizer.pad_token_id,
    }
    with torch.no_grad():
        generated = model.generate(**encoded, **generation_kwargs)

    response = tokenizer.decode(generated[0], skip_special_tokens=True)
    logger.info(f"Response: {response}\n")


if __name__ == "__main__":
    # Script entry point: run the full fine-tuning pipeline.
    main()

    # Optional smoke test: uncomment to generate a sample response from the
    # model saved under OUTPUT_DIR/final once training has finished.
    # test_model()


ERROR:__main__:❌ Dataset not found: C:\Users\ncc333\Desktop\My_Task\fine tuning\New folder\fine_tune_dataset.jsonl
ERROR:__main__:Please update DATASET_PATH in the script!
