# GPT-2 Sanatana Dharma Training
## Fine-tune GPT-2 on Bhagavad-Gita Dataset

This notebook fine-tunes the pretrained GPT-2 model on the Bhagavad-Gita dataset to create a specialized AI assistant for Hindu scriptures.


## 1. Setup Environment


In [None]:
# Install required packages
# Covers the third-party libraries imported below (transformers, datasets, torch,
# numpy, tqdm) plus accelerate and scikit-learn. The leading "!" makes the
# notebook run the line as a shell command.
!pip install transformers datasets accelerate torch scikit-learn numpy tqdm


In [None]:
import json
import torch
from transformers import (
    GPT2Tokenizer, 
    GPT2LMHeadModel, 
    TrainingArguments, 
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
import numpy as np
from tqdm import tqdm
import os
from google.colab import files

print("✅ All imports successful!")


## 2. Upload Training Dataset


In [None]:
# Prompt for the enhanced Bhagavad-Gita dataset through the Colab upload widget
print("📁 Please upload your enhanced_llama_training_dataset.jsonl file")
uploaded = files.upload()

# Pick the first uploaded file whose name ends in .jsonl (None when there is none)
dataset_file = next(
    (name for name in uploaded if name.endswith('.jsonl')),
    None,
)

# Report the outcome of the upload
if not dataset_file:
    print("❌ No JSONL file found. Please upload the dataset file.")
else:
    print(f"✅ Dataset uploaded: {dataset_file}")


## 3. Load and Prepare Dataset


In [None]:
def load_gita_dataset(jsonl_path):
    """Load the Bhagavad-Gita JSONL dataset as GPT-2 training strings.

    Each non-blank line of the file is parsed as one JSON record. Two record
    schemas are recognised:

    * ``instruction`` / ``output`` -> ``"Human: ... Assistant: ...<|endoftext|>"``
    * ``prompt`` / ``response``    -> ``"Question: ... Answer: ...<|endoftext|>"``

    Records matching neither schema are skipped and counted in a warning.

    Args:
        jsonl_path: Path to a UTF-8 encoded JSON Lines file.

    Returns:
        A list of formatted training strings, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    texts = []
    skipped = 0

    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Loading dataset"):
            line = line.strip()
            if not line:
                # Blank lines (e.g. an empty line at the end of the file) are
                # not records; json.loads("") would raise.
                continue

            data = json.loads(line)

            # Format as conversation for GPT-2
            if 'instruction' in data and 'output' in data:
                texts.append(
                    f"Human: {data['instruction']} Assistant: {data['output']}<|endoftext|>"
                )
            elif 'prompt' in data and 'response' in data:
                texts.append(
                    f"Question: {data['prompt']} Answer: {data['response']}<|endoftext|>"
                )
            else:
                # Keep track of dropped records so data loss is visible.
                skipped += 1

    print(f"✅ Loaded {len(texts)} training examples")
    if skipped:
        print(f"⚠️ Skipped {skipped} records without a recognised schema")
    return texts

# Load the dataset
if dataset_file:
    training_texts = load_gita_dataset(dataset_file)

    if training_texts:
        # Show sample
        print("\n📝 Sample training text:")
        print(training_texts[0][:200] + "...")
    else:
        # The loader returned an empty list; indexing [0] would raise
        # IndexError, so report the problem instead.
        print("❌ Dataset file contained no usable training examples")
else:
    print("❌ No dataset file available")


## 4. Prepare GPT-2 Model and Tokenizer


In [None]:
# --- Tokenizer -------------------------------------------------------------
print("🔄 Loading GPT-2 tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Register the pad/eos/bos/sep tokens used by this notebook
special_tokens = dict(
    pad_token="<|pad|>",
    eos_token="<|endoftext|>",
    bos_token="<|startoftext|>",
    sep_token="<|sep|>",
)
tokenizer.add_special_tokens(special_tokens)
print("✅ Tokenizer loaded and special tokens added")

# --- Model -----------------------------------------------------------------
print("🔄 Loading GPT-2 model...")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# The embedding matrix must match the tokenizer size after adding tokens
model.resize_token_embeddings(len(tokenizer))
print("✅ Model loaded and token embeddings resized")

# --- Device ----------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🖥️ Using device: {device}")

# Only move the model when a GPU was selected above
if device.type == "cuda":
    model = model.to(device)
    print(f"🚀 Model moved to GPU: {torch.cuda.get_device_name()}")


## 5. Tokenize Dataset


In [None]:
def tokenize_function(examples):
    """Tokenize a batch of training texts for causal-LM fine-tuning.

    Sequences are truncated to 512 tokens but deliberately left unpadded.
    Padding here would stretch every row to the longest row of the whole
    ``map`` batch; the ``DataCollatorForLanguageModeling`` configured for the
    Trainer pads each training batch to its own longest sequence instead.
    Tensors are not requested either, because ``Dataset.map`` stores plain
    lists.

    Args:
        examples: A batch from ``Dataset.map(batched=True)`` with a ``"text"``
            column holding the formatted training strings.

    Returns:
        The tokenizer output for the batch (token ids and attention mask as
        lists of variable length).
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
    )

# Wrap the formatted strings in a single-column ("text") Dataset
print("🔄 Creating dataset...")
dataset = Dataset.from_dict(dict(text=training_texts))

# Tokenize in batches and drop the raw text column from the result
print("🔄 Tokenizing dataset...")
tokenized_dataset = dataset.map(
    tokenize_function, batched=True, remove_columns=dataset.column_names
)

print("✅ Dataset tokenized successfully")
print(f"📊 Dataset size: {len(tokenized_dataset)} examples")


## 6. Configure Training


In [None]:
# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # GPT-2 is not a masked language model
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2_sanatana_dharma",  # checkpoints and the final model go here
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,  # keep only the two most recent checkpoints
    prediction_loss_only=True,
    remove_unused_columns=False,
    fp16=torch.cuda.is_available(),  # Use fp16 if GPU available
    dataloader_num_workers=2,
    # Disable wandb and every other reporting integration. This must be the
    # string "none": passing None falls back to the library default, which in
    # transformers 4.x enables all installed integrations (wandb included).
    report_to="none",
)

print("✅ Training configuration set up")


## 7. Train the Model


In [None]:
import inspect

# Initialize trainer
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Pick whichever name the installed Trainer accepts.
if "processing_class" in inspect.signature(Trainer.__init__).parameters:
    trainer_kwargs["processing_class"] = tokenizer
else:
    trainer_kwargs["tokenizer"] = tokenizer

trainer = Trainer(**trainer_kwargs)

print("🚀 Starting training...")
print(f"📊 Training on {len(tokenized_dataset)} examples")
print(f"⚙️ Epochs: {training_args.num_train_epochs}")
print(f"📦 Batch size: {training_args.per_device_train_batch_size}")

# Start training
trainer.train()

print("✅ Training completed!")


## 8. Save the Trained Model


In [None]:
# Save the trained model
print("💾 Saving trained model...")
# save_model() is called without a path; the tokenizer is written explicitly to
# ./gpt2_sanatana_dharma, the same directory used as output_dir for training.
trainer.save_model()
tokenizer.save_pretrained("./gpt2_sanatana_dharma")

print("✅ Model saved successfully!")
print("📁 Model saved to: ./gpt2_sanatana_dharma")


## 9. Test the Trained Model


In [None]:
def test_gpt2_model(prompt, max_length=150):
    """Generate an answer to *prompt* with the fine-tuned GPT-2 model.

    The question is wrapped as ``"Question: {prompt} Answer:"``, a completion
    is sampled from the global ``model``, and only the newly generated tokens
    are decoded and returned.

    Args:
        prompt: The question text to ask the model.
        max_length: Maximum total sequence length passed to ``generate``
            (prompt tokens plus generated tokens).

    Returns:
        The generated answer text with surrounding whitespace stripped.
    """
    # Format prompt
    # NOTE(review): load_gita_dataset formats instruction/output records as
    # "Human: ... Assistant: ..."; this prompt only matches the
    # "Question: ... Answer: ..." format. Confirm which one the dataset uses.
    formatted_prompt = f"Question: {prompt} Answer:"

    # Tokenize through __call__ so an attention mask is produced as well, and
    # place both tensors on whatever device the model currently lives on.
    encoded = tokenizer(formatted_prompt, return_tensors="pt")
    input_ids = encoded["input_ids"].to(model.device)
    attention_mask = encoded["attention_mask"].to(model.device)

    # The model may still be in training mode after trainer.train(); switch to
    # eval mode so dropout does not perturb generation.
    model.eval()

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            num_return_sequences=1
        )

    # Decode only the tokens produced after the prompt. Splitting the full
    # text on "Answer:" would cut off answers that contain that string.
    generated_ids = outputs[0][input_ids.shape[-1]:]
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return answer

# Sample questions used to spot-check the fine-tuned model
test_questions = [
    "What is dharma?",
    "Explain karma yoga",
    "What does the Bhagavad-Gita teach about detachment?",
    "How should one perform their duties?",
]

print("🧪 Testing trained model...")
print("=" * 60)

# Print each question, then the generated answer, then a divider
for question in test_questions:
    print(f"\n❓ Question: {question}")
    print(f"🤖 Answer: {test_gpt2_model(question)}")
    print("-" * 60)


## 10. Download the Trained Model


In [None]:
# Create a zip file of the trained model
import shutil

print("📦 Creating model package...")
# Zips the whole ./gpt2_sanatana_dharma directory into gpt2_sanatana_dharma_model.zip.
# NOTE(review): that directory is also the training output_dir (save_steps=500),
# so the archive presumably includes intermediate checkpoints — confirm this is intended.
shutil.make_archive("gpt2_sanatana_dharma_model", "zip", "./gpt2_sanatana_dharma")

print("📥 Downloading trained model...")
# Hand the archive to the Colab file helper imported at the top of the notebook.
files.download("gpt2_sanatana_dharma_model.zip")

print("✅ Model download initiated!")
print("\n📋 Next steps:")
print("1. Download the zip file")
print("2. Extract it to your server/models/ directory")
print("3. Update your model loader to use the trained GPT-2 model")
