## Package Installation

In [None]:
# Check if GPU is available by running NVIDIA's status tool in a shell.
# NOTE(review): the command is expected to fail on a runtime without an NVIDIA GPU — confirm the runtime type first.
!nvidia-smi


In [None]:
!pip uninstall -y huggingface-hub fsspec transformers


In [None]:
# Install backoff library for handling rate limits
!pip install -q backoff

# Now import backoff after installation
import backoff  # We'll use this for exponential backoff

In [None]:
# Install packages for image generation
!pip install diffusers accelerate
!pip install Pillow

In [None]:
!pip install huggingface-hub==0.27.0 fsspec==2025.3.2 transformers==4.41.0


In [None]:
!pip install --upgrade diffusers transformers


In [None]:
# Image generation imports
from diffusers import StableDiffusionPipeline
import PIL.Image
import re
from typing import List, Dict

## Setting up Hugging Face credentials


In [None]:
import os
from getpass import getpass

from google.colab import userdata

# Try to load token from Colab secrets first (more secure)
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
    print("Token loaded from Colab secrets")
except Exception:
    # Any failure reading the secret falls back to a manual prompt.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate,
    # and getpass keeps the typed token out of the saved cell output.
    HF_TOKEN = getpass("Enter your Hugging Face token: ")

# Set the token as an environment variable
os.environ["HF_TOKEN"] = HF_TOKEN

# Test the token by logging in
from huggingface_hub import login
login(HF_TOKEN)

In [None]:
import os

from google.colab import drive
drive.mount('/content/drive')

# Create a project directory (no error if it already exists).
# os.makedirs is used instead of a shell `mkdir` so the path is never
# re-parsed by a shell (spaces or special characters stay intact).
project_dir = '/content/drive/MyDrive/genai_synthesizer'
os.makedirs(project_dir, exist_ok=True)

## Dataset Creation

In [None]:
import json
import time
import random
from tqdm.notebook import tqdm
import backoff
from huggingface_hub import InferenceClient

# Initialize the Hugging Face client used by the lesson-generation functions below.
# HF_TOKEN is the token obtained in the credentials cell.
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)
print(f"Initialized client for model: {MODEL_ID}")

In [None]:
# Topic catalogue: maps each domain name to the lesson topics belonging to it
domains = {
    "Science": [
        "Photosynthesis", "Newton's Laws of Motion", "Cellular Respiration",
        "Atomic Structure", "DNA Replication", "Plate Tectonics",
        "Electromagnetic Spectrum", "Periodic Table Trends", "Ecological Succession",
        "Evolution by Natural Selection", "Quantum Mechanics Basics", "Carbon Cycle",
        "Human Circulatory System", "Stem Cells", "Climate Change"
    ],

    "Mathematics": [
        "Pythagorean Theorem", "Quadratic Equations", "Probability Fundamentals",
        "Calculus Derivatives", "Statistical Sampling Methods", "Geometric Sequences",
        "Linear Algebra Basics", "Set Theory Introduction", "Trigonometric Functions",
        "Logarithmic Properties", "Number Theory Fundamentals", "Differential Equations"
    ],

    "Computer Science": [
        "Binary Number System", "Data Structures Overview", "Basic Algorithms",
        "Object-Oriented Programming", "HTTP Protocol", "Regular Expressions",
        "Big O Notation", "Database Normalization", "Neural Networks Basics",
        "Version Control with Git", "Encryption Fundamentals", "Web API Design"
    ],

    "Humanities": [
        "Renaissance Art Period", "World War II Causes", "Ancient Greek Philosophy",
        "Comparative World Religions", "Literary Devices in Poetry", "French Revolution",
        "Industrial Revolution Impact", "Civil Rights Movement", "Existentialism",
        "Feminist Literary Theory", "Ethics of Technology", "Postcolonial Literature"
    ]
}

# One {"domain", "topic"} record per topic, in catalogue order
all_topics = [
    {"domain": domain_name, "topic": topic_name}
    for domain_name, topic_names in domains.items()
    for topic_name in topic_names
]

print(f"Prepared {len(all_topics)} topics across {len(domains)} domains")

# Fixed seed so the shuffled order (and therefore the selection) is reproducible
random.seed(42)
random.shuffle(all_topics)

# Take up to the first 60 shuffled topics. The catalogue above currently holds
# 51 topics, so this slice selects all of them; raise the bound only after
# adding more topics to `domains`.
selected_topics = all_topics[:60]
print(f"Selected {len(selected_topics)} topics for dataset creation")

In [None]:
def create_prompt(topic, domain):
    """Create a detailed prompt for generating an educational lesson using Llama-3.1 format.

    The prompt is a raw string in the Llama 3.1 Instruct chat layout: a system
    turn and a user turn, each introduced by
    ``<|start_header_id|>ROLE<|end_header_id|>`` and closed with ``<|eot_id|>``,
    followed by an open assistant header so the model continues with the lesson.

    Args:
        topic: Lesson subject; inserted verbatim (in double quotes) into the user turn.
        domain: Subject area the topic belongs to; inserted verbatim into the user turn.

    Returns:
        The complete prompt string, ending with the open assistant header.
    """
    # NOTE(review): <|begin_of_text|> is intentionally not included, on the assumption
    # that the serving endpoint's tokenizer prepends the BOS token itself — confirm.
    system_message = (
        "You are an expert educational content creator specializing in creating "
        "comprehensive, well-structured lessons."
    )

    user_message = f"""I need you to create a detailed, educational lesson about "{topic}" (from the {domain} domain).

Follow these specific requirements:
1. Format the entire lesson using Markdown with proper headings, lists, and emphasis
2. Start with a level-1 heading (# Title) that is descriptive and engaging
3. Include an introduction that explains the importance and relevance of the topic
4. Provide 3-5 key concepts or principles with clear explanations
5. Include real-world applications or examples where appropriate
6. Add practical examples, formulas, or code snippets if relevant to the topic
7. End with a concise summary that reinforces the main points
8. Keep the content factually accurate and at an appropriate educational level
9. Make the content engaging and interesting for learners
10. Aim for approximately 500-800 words in total

Make sure the content has enough depth to be educational while remaining accessible to someone new to the topic."""

    return (
        "<|start_header_id|>system<|end_header_id|>\n\n"
        f"{system_message}<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_message}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )

@backoff.on_exception(
    backoff.expo,
    Exception,
    max_tries=8,   # give up after this many attempts
    max_time=300,  # ...or after this many seconds in total
)
def generate_lesson_with_backoff(topic, domain):
    """Request one lesson from the inference client, retrying with exponential backoff.

    Any exception raised inside this function (including the "too short"
    ValueError below) triggers another attempt, up to the decorator's limits.

    Args:
        topic: Lesson subject passed to ``create_prompt``.
        domain: Subject area passed to ``create_prompt``.

    Returns:
        The stripped lesson text.

    Raises:
        ValueError: If the generated lesson has fewer than 50 whitespace-separated words.
    """
    raw_output = client.text_generation(
        create_prompt(topic, domain),
        max_new_tokens=1024,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1
    )

    lesson_text = raw_output.strip()

    # If the response contains the assistant tag, keep what follows it
    assistant_tag = "<|assistant|>"
    if assistant_tag in lesson_text:
        lesson_text = lesson_text.split(assistant_tag)[1].strip()

    # Basic validation: reject lessons that are too short
    word_count = len(lesson_text.split())
    if word_count < 50:
        raise ValueError("Generated lesson is too short")

    return lesson_text

def _write_json(path, payload):
    """Write ``payload`` to ``path`` as indented JSON, overwriting any existing file."""
    with open(path, "w") as f:
        json.dump(payload, f, indent=2)


def generate_dataset_in_chunks(topics, chunk_size=20, save_dir=project_dir):
    """Generate dataset in smaller chunks with frequent saving.

    Resumes from ``{save_dir}/training_data_progress.json`` when that file
    exists: topics whose (domain, topic) pair is already present there are
    skipped. Progress is written after every chunk and after every failed
    topic; every third chunk an additional backup file is written, and the
    final result is written to ``{save_dir}/lesson_dataset_complete.json``.

    Args:
        topics: List of dicts with "domain" and "topic" keys.
        chunk_size: Number of topics per chunk; must be at least 1.
        save_dir: Directory receiving the progress, backup and final JSON files.

    Returns:
        List of dicts with "domain", "topic" and "lesson" keys (previously
        saved entries followed by the newly generated ones).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Create or load existing progress
    progress_file = f"{save_dir}/training_data_progress.json"

    if os.path.exists(progress_file):
        print("Found existing progress, loading...")
        with open(progress_file, "r") as f:
            training_data = json.load(f)

        # Identify which topics have already been processed
        processed_topics = {(item["domain"], item["topic"]) for item in training_data}
        topics_to_process = [t for t in topics if (t["domain"], t["topic"]) not in processed_topics]
        print(f"Loaded {len(training_data)} existing entries, {len(topics_to_process)} remaining to process")
    else:
        training_data = []
        topics_to_process = topics
        print(f"Starting fresh with {len(topics_to_process)} topics to process")

    # Process in chunks
    chunks = [topics_to_process[i:i + chunk_size] for i in range(0, len(topics_to_process), chunk_size)]
    print(f"Split remaining work into {len(chunks)} chunks of up to {chunk_size} topics each")

    # Topics whose generation failed even after retries; reported at the end
    failed_topics = []

    for chunk_idx, chunk in enumerate(chunks):
        print(f"\nProcessing chunk {chunk_idx+1}/{len(chunks)} ({len(chunk)} topics)")

        # Create progress bar for this chunk
        chunk_progress = tqdm(chunk, desc=f"Chunk {chunk_idx+1}")

        for item in chunk_progress:
            topic = item["topic"]
            domain = item["domain"]

            # Update progress bar description
            chunk_progress.set_description(f"Generating: {topic}")

            try:
                # Generate the lesson with backoff for API failures
                lesson = generate_lesson_with_backoff(topic, domain)
            except Exception as e:
                print(f"\nError generating lesson for {topic}: {e}")
                failed_topics.append(topic)
                # Save what we have so far even when errors occur
                _write_json(progress_file, training_data)
                print(f"Progress saved after error. Current count: {len(training_data)}")
                continue

            training_data.append({
                "domain": domain,
                "topic": topic,
                "lesson": lesson
            })

            # Brief pause to avoid rate limiting
            time.sleep(random.uniform(0.5, 1.5))

        # Save progress after each chunk
        _write_json(progress_file, training_data)

        print(f"Chunk {chunk_idx+1} complete. Progress saved. Current dataset size: {len(training_data)}")

        # Every 3 chunks, also save a backup
        if (chunk_idx + 1) % 3 == 0 and len(training_data) > 0:
            backup_file = f"{save_dir}/training_data_backup_{len(training_data)}.json"
            _write_json(backup_file, training_data)
            print(f"Created backup at {backup_file}")

    # Save final dataset
    if len(training_data) > 0:
        final_file = f"{save_dir}/lesson_dataset_complete.json"
        _write_json(final_file, training_data)

        print(f"\nDataset generation complete! Created {len(training_data)} lesson examples.")
        print(f"Final dataset saved to {final_file}")

    if failed_topics:
        print(f"\n{len(failed_topics)} topic(s) failed and were skipped: {', '.join(failed_topics)}")
        print("Run this function again to retry them; completed topics are loaded from the progress file.")

    return training_data

In [None]:
# Generate the dataset in chunks - this will take some time.
# Running this cell again resumes from the saved progress file: topics already
# present there are skipped by generate_dataset_in_chunks.
training_data = generate_dataset_in_chunks(selected_topics, chunk_size=20)

In [None]:
# Dataset quality check and conversion to Hugging Face format
from datasets import Dataset
from IPython.display import Markdown

def validate_dataset(dataset_path):
    """Perform basic validation and statistics on the dataset.

    Loads the JSON list stored at ``dataset_path`` and prints the entry count,
    the distribution by domain, lesson word-count statistics and how many
    lessons contain Markdown header / list markers. An empty dataset is
    reported and returned without computing statistics.

    Args:
        dataset_path: Path to a JSON file holding a list of dicts with
            "domain" and "lesson" keys.

    Returns:
        The loaded list, unchanged.
    """
    from collections import Counter  # local import keeps this cell self-contained

    with open(dataset_path, "r") as f:
        data = json.load(f)

    total_entries = len(data)
    print(f"Total entries: {total_entries}")

    # Every statistic below divides by the entry count or takes min/max,
    # so there is nothing meaningful to compute for an empty dataset.
    if total_entries == 0:
        print("Dataset is empty; skipping statistics.")
        return data

    # Check distribution by domain (Counter keeps first-seen order)
    domain_counts = Counter(entry["domain"] for entry in data)

    print("\nDistribution by domain:")
    for domain, count in domain_counts.items():
        print(f"  {domain}: {count} entries ({count/total_entries*100:.1f}%)")

    # Check lengths (whitespace-separated words per lesson)
    lesson_lengths = [len(entry["lesson"].split()) for entry in data]
    avg_length = sum(lesson_lengths) / len(lesson_lengths)
    min_length = min(lesson_lengths)
    max_length = max(lesson_lengths)

    print(f"\nLesson word count statistics:")
    print(f"  Average: {avg_length:.1f} words")
    print(f"  Minimum: {min_length} words")
    print(f"  Maximum: {max_length} words")

    # Check for markdown formatting (simple substring heuristics)
    markdown_header_count = sum(1 for entry in data if "#" in entry["lesson"])
    markdown_list_count = sum(1 for entry in data if "- " in entry["lesson"] or "* " in entry["lesson"])

    print(f"\nMarkdown formatting:")
    print(f"  Headers: {markdown_header_count} lessons ({markdown_header_count/total_entries*100:.1f}%)")
    print(f"  Lists: {markdown_list_count} lessons ({markdown_list_count/total_entries*100:.1f}%)")

    # Return the validated data
    return data

def prepare_hf_dataset(data, save_dir=project_dir):
    """Turn the lesson records into train/validation/test ``Dataset`` splits and save them.

    The data is split twice with the same seed: first 15% is held out as the
    test split, then 15% of the remainder becomes the validation split. Each
    split is saved in its own sub-directory of ``{save_dir}/lesson_dataset_hf``.

    Args:
        data: List of record dicts to convert.
        save_dir: Directory under which the splits are written.

    Returns:
        Plain dict mapping "train", "validation" and "test" to their datasets.
    """
    full_dataset = Dataset.from_list(data)

    # Outer split separates the test set; inner split separates validation
    outer_split = full_dataset.train_test_split(test_size=0.15, seed=42)
    inner_split = outer_split["train"].train_test_split(test_size=0.15, seed=42)

    dataset_dict = {
        "train": inner_split["train"],
        "validation": inner_split["test"],
        "test": outer_split["test"],
    }

    # Report split sizes
    print("Dataset splits:")
    for split_name, split_ds in dataset_dict.items():
        print(f"  {split_name}: {len(split_ds)} examples")

    # Persist every split to its own folder
    dataset_path = f"{save_dir}/lesson_dataset_hf"
    for split_name, split_ds in dataset_dict.items():
        split_ds.save_to_disk(f"{dataset_path}/{split_name}")

    print(f"Saved dataset to {dataset_path}/")

    return dataset_dict

# Pick the dataset file to validate: the finished dataset when it exists,
# otherwise the in-progress file written during generation
dataset_file = (
    f"{project_dir}/lesson_dataset_complete.json"
    if os.path.exists(f"{project_dir}/lesson_dataset_complete.json")
    else f"{project_dir}/training_data_progress.json"
)

# Validate first, then convert the validated records into saved splits
validated_data = validate_dataset(dataset_file)
dataset_dict = prepare_hf_dataset(validated_data)

# Display a sample
def display_sample(dataset_dict, split="train"):
    """Print a preview of one randomly chosen record and save its full lesson.

    Args:
        dataset_dict: Mapping from split name to an indexable collection of
            records with "domain", "topic" and "lesson" keys.
        split: Name of the split to draw from.

    Returns:
        The chosen record.
    """
    records = dataset_dict[split]
    sample = records[random.randint(0, len(records) - 1)]

    print(f"DOMAIN: {sample['domain']}")
    print(f"TOPIC: {sample['topic']}")
    print("\nLESSON CONTENT PREVIEW:\n")

    # Preview is capped at 300 characters, with an ellipsis when truncated
    lesson_text = sample['lesson']
    if len(lesson_text) > 300:
        preview = lesson_text[:300] + "..."
    else:
        preview = lesson_text
    print(preview)

    # The full lesson goes to a Markdown file named after the topic
    topic_slug = sample['topic'].replace(' ', '_')
    sample_file = f"{project_dir}/sample_{topic_slug}.md"
    with open(sample_file, "w") as out:
        out.write(f"# Sample: {sample['topic']}\n\n")
        out.write(sample['lesson'])

    print(f"\nFull sample saved to: {sample_file}")

    return sample

# View a sample from the default ("train") split; display_sample also writes
# the full lesson to a Markdown file in project_dir
random_sample = display_sample(dataset_dict)

## Fine-tuning

In [None]:
# Install packages in the exact same way as your successful project
!pip install transformers accelerate datasets peft bitsandbytes trl
!pip install git+https://github.com/huggingface/transformers.git
!pip install torch transformers datasets accelerate bitsandbytes peft
!pip install -U trl


In [None]:
import os
import json
from google.colab import drive

# Mount Google Drive for saving results (skipped when the mount point already exists)
if not os.path.exists('/content/drive/MyDrive'):
    print("Mounting Google Drive...")
    drive.mount('/content/drive')
else:
    print("Google Drive is already mounted.")

# Define paths
# Runtime directory for uploaded files (the current working directory)
runtime_dir = os.getcwd()
# Drive directory for saving results
drive_dir = '/content/drive/MyDrive/genai_synthesizer'

print(f"Runtime directory for input files: {runtime_dir}")
print(f"Google Drive directory for saving results: {drive_dir}")

# Ensure the results directory exists in Drive
os.makedirs(os.path.join(drive_dir, 'results'), exist_ok=True)
os.makedirs(os.path.join(drive_dir, 'logs'), exist_ok=True)

# Look specifically for the lesson_dataset_complete.json file.
# NOTE(review): the file is searched for in runtime_dir (the working directory),
# not in drive_dir, so it must have been uploaded or copied there — confirm.
main_dataset_file = 'lesson_dataset_complete.json'
main_dataset_path = os.path.join(runtime_dir, main_dataset_file)

if os.path.exists(main_dataset_path):
    print(f"\nFound main dataset file: {main_dataset_path}")
    file_size_kb = os.path.getsize(main_dataset_path) / 1024
    print(f"File size: {file_size_kb:.2f} KB")

    # Try to read the dataset; `data` stays a notebook-level variable that the
    # generation cell later checks for when choosing sample topics
    try:
        with open(main_dataset_path, 'r') as f:
            data = json.load(f)
        print(f"Successfully read dataset with {len(data)} records")

        # Show some stats
        if len(data) > 0:
            # NOTE(review): this rebinds the notebook-level name `domains`
            # (the topic catalogue defined in the dataset-creation section)
            # to a per-domain example count.
            domains = {}
            for item in data:
                domain = item.get("domain", "Unknown")
                domains[domain] = domains.get(domain, 0) + 1

            print("\nDataset statistics:")
            print(f"Total examples: {len(data)}")
            print("\nDistribution by domain:")
            for domain, count in domains.items():
                print(f"  {domain}: {count} examples ({count/len(data)*100:.1f}%)")

            # Show a sample: the first record, with the lesson cut to 200 characters
            sample = data[0]
            print("\nSample record:")
            print(f"  Domain: {sample.get('domain', 'Not specified')}")
            print(f"  Topic: {sample.get('topic', 'Not specified')}")
            lesson_preview = sample.get('lesson', '')[:200]
            if len(sample.get('lesson', '')) > 200:
                lesson_preview += "..."
            print(f"  Lesson preview: {lesson_preview}")
    except Exception as e:
        # Reading or summarising failed: report it with a traceback and carry on
        print(f"Error reading dataset: {e}")
        import traceback
        traceback.print_exc()
else:
    print(f"\nMain dataset file '{main_dataset_file}' not found!")
    print("Please upload the file and run this cell again.")

    # List all files in the directory to help troubleshooting
    print("\nFiles in the current directory:")
    for file in os.listdir(runtime_dir):
        if os.path.isfile(os.path.join(runtime_dir, file)):
            size_kb = os.path.getsize(os.path.join(runtime_dir, file)) / 1024
            print(f"  - {file} ({size_kb:.2f} KB)")

In [None]:
# Import required libraries
import os
import json
import torch
import numpy as np
from datasets import Dataset

# Import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)

# Import PEFT
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType
)


# Set Hugging Face token.
# getpass is used instead of input() so the typed token is not echoed into
# the (saved) cell output.
from getpass import getpass

HF_TOKEN = getpass("Enter your Hugging Face token: ")
os.environ["HF_TOKEN"] = HF_TOKEN

# Use TinyLlama as our base model
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
print(f"Using base model: {BASE_MODEL}")

# Check GPU availability; `device` is reused later when generating text
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
if device == "cuda":
    print(f"GPU Model: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

In [None]:
# Import necessary libraries
import torch
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Dataset preparation function focusing on the main dataset file
def prepare_dataset_for_training(tokenizer, max_length=512):
    """Load the lesson dataset, split it, and tokenize it for causal-LM training.

    Reads ``lesson_dataset_complete.json`` from ``runtime_dir``. When the file
    is missing or cannot be loaded, three synthetic placeholder records are
    used instead. The records are split 80/20 into train and validation sets,
    formatted as a ``<human>: ... <assistant>: ...`` exchange, and tokenized
    with truncation to ``max_length`` and no padding.

    Args:
        tokenizer: Callable tokenizer applied to a list of formatted strings.
        max_length: Maximum sequence length used for truncation.

    Returns:
        Dict mapping "train" and "validation" to tokenized datasets in which
        "labels" is a copy of "input_ids".
    """
    dataset_path = os.path.join(runtime_dir, 'lesson_dataset_complete.json')
    print(f"Attempting to load dataset from: {dataset_path}")

    # `data` stays None whenever the file is absent or unreadable
    data = None
    if not os.path.exists(dataset_path):
        print(f"Dataset file not found at: {dataset_path}")
    else:
        try:
            with open(dataset_path, 'r') as f:
                data = json.load(f)
            print(f"Successfully loaded {len(data)} examples from dataset")
        except Exception as e:
            print(f"Error loading dataset: {e}")
            data = None

    # Fall back to a tiny synthetic dataset so the pipeline can still run
    if data is None:
        print("Creating synthetic dataset instead")
        data = []
        for i in range(3):
            data.append({
                "domain": "Science",
                "topic": f"Topic {i}",
                "lesson": f"# Lesson {i}\n\nContent for lesson {i}. This is a synthetic example for fine-tuning."
            })

    # Split into train/validation
    train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

    print(f"\nSplit into {len(train_data)} training and {len(val_data)} validation examples")

    def records_to_columns(records):
        """Pivot a list of record dicts into a dict of columns keyed like the first record."""
        return {key: [record[key] for record in records] for key in records[0]}

    dataset_dict = {
        "train": Dataset.from_dict(records_to_columns(train_data)),
        "validation": Dataset.from_dict(records_to_columns(val_data)),
    }

    def preprocess_function(examples):
        """Format and tokenize one batch of examples."""
        # One "<human>: ... / <assistant>: ..." string per (topic, lesson) pair
        formatted_prompts = [
            f"<human>: Create a comprehensive educational lesson about the topic: {topic}\n<assistant>: {lesson}"
            for topic, lesson in zip(examples["topic"], examples["lesson"])
        ]

        tokenized_inputs = tokenizer(
            formatted_prompts,
            padding=False,
            truncation=True,
            max_length=max_length,
            return_tensors=None
        )

        # Causal-LM objective: the labels mirror the inputs
        tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()

        return tokenized_inputs

    # Tokenize every split, dropping the original text columns
    tokenized_datasets = {}
    for split_name, split_dataset in dataset_dict.items():
        print(f"\nProcessing {split_name} split...")
        tokenized_datasets[split_name] = split_dataset.map(
            preprocess_function,
            batched=True,
            remove_columns=split_dataset.column_names,
        )
        print(f"Processed {split_name} split: {len(tokenized_datasets[split_name])} examples")

    return tokenized_datasets

In [None]:
# Load the tokenizer that matches the base model
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Ensure padding token is set: reuse the EOS token when the tokenizer defines none
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("Set padding token to EOS token")

# Clear GPU cache before the model is loaded in the next cell
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Set up 8-bit configuration; consumed by the model-loading cell via quantization_config
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False
)

In [None]:
# Load model with 8-bit quantization
print("Loading model with 8-bit quantization...")
try:
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        torch_dtype=torch.float16,
        device_map="auto"
    )

    # Prepare model for 8-bit training
    print("Preparing model for 8-bit training...")
    model = prepare_model_for_kbit_training(model)
    print("Model prepared for 8-bit training successfully!")

except Exception as e:
    # NOTE(review): this handler covers both the quantized load and
    # prepare_model_for_kbit_training, so a failure in either step replaces
    # `model` with the unquantized float16 model below.
    print(f"Error loading or preparing model: {e}")
    # If 8-bit fails, fallback to float16
    print("Falling back to float16 model...")
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float16,
        device_map="auto"
    )

In [None]:
# Define LoRA configuration
print("Setting up LoRA configuration...")
lora_config = LoraConfig(
    r=8,                        # Rank of the LoRA update matrices
    lora_alpha=32,              # Alpha (scaling) parameter
    target_modules=["q_proj", "v_proj"],  # Target q and v projections in attention
    lora_dropout=0.05,          # Dropout applied within the LoRA layers
    bias="none",                # Bias handling mode passed to LoraConfig
    task_type=TaskType.CAUSAL_LM
)

# Apply LoRA adapters; `model` is rebound to the PEFT-wrapped model
print("Applying LoRA adapters to model...")
model = get_peft_model(model, lora_config)

# Print trainable parameters info
model.print_trainable_parameters()

In [None]:
import dataclasses
import inspect
import os

import torch
import transformers
from transformers import TrainingArguments

# Prepare dataset from the uploaded file
print("Preparing dataset...")
tokenized_datasets = prepare_dataset_for_training(tokenizer)

# Create data collator (mlm=False selects the causal language-modeling mode)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

print(f"Transformers version: {transformers.__version__}")

# Important: Disable wandb before creating training arguments
os.environ["WANDB_DISABLED"] = "true"
print("Wandb disabled via environment variable")

# Define paths for results and logs
results_path = os.path.join(drive_dir, "results")
logs_path = os.path.join(drive_dir, "logs")
print(f"Results will be saved to: {results_path}")
print(f"Logs will be saved to: {logs_path}")

# Every setting is passed to the TrainingArguments constructor instead of being
# assigned afterwards: values assigned after construction skip the validation
# and derived state that TrainingArguments computes while it is initialised.
desired_training_kwargs = {
    "output_dir": results_path,
    "overwrite_output_dir": True,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "learning_rate": 2e-4,
    "num_train_epochs": 1,
    "logging_dir": logs_path,
    "logging_steps": 10,
    # Mixed-precision fp16 training is only requested when a CUDA device exists
    "fp16": torch.cuda.is_available(),
    "report_to": "none",
}

# Stay tolerant of differing transformers versions: only pass the settings
# that this installed version actually declares as constructor fields.
supported_fields = {
    field.name for field in dataclasses.fields(TrainingArguments) if field.init
}
training_kwargs = {
    name: value for name, value in desired_training_kwargs.items()
    if name in supported_fields
}
skipped_settings = sorted(set(desired_training_kwargs) - supported_fields)
if skipped_settings:
    print(f"Settings not supported by this transformers version (skipped): {skipped_settings}")

print("\nCreating TrainingArguments...")
try:
    training_args = TrainingArguments(**training_kwargs)
    print("TrainingArguments created successfully")
    for name in training_kwargs:
        print(f"Set {name}")
except Exception as e:
    print(f"Error setting up training arguments: {e}")
    # Absolute fallback
    training_args = TrainingArguments(
        output_dir=results_path,
        per_device_train_batch_size=1
    )
    print("Created fallback TrainingArguments with minimal parameters")

# Initialize trainer. Newer transformers releases take the tokenizer through the
# `processing_class` parameter; older ones only know `tokenizer`, so the keyword
# is chosen from the installed Trainer's signature.
print("\nInitializing trainer...")
trainer_parameters = inspect.signature(Trainer.__init__).parameters
tokenizer_keyword = "processing_class" if "processing_class" in trainer_parameters else "tokenizer"
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    **{tokenizer_keyword: tokenizer},
)

In [None]:
import datetime # Make sure datetime is imported

# Destination of the fine-tuned adapter and tokenizer. Bound before the try
# block so that later cells (generate_text checks this path) can reference it
# even when training fails part-way.
model_save_path = os.path.join(drive_dir, "tinyllama_lora_finetuned")

# Start training and explicitly save to Google Drive
print("Starting training...")
try:
    # Clear GPU cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Train
    trainer.train()
    print("Training completed successfully!")

    # Save model to Google Drive
    print(f"Saving model to Google Drive at: {drive_dir}/tinyllama_lora_finetuned")
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

    print(f"Model and tokenizer saved to {model_save_path}")

    # Convert training_args to a dictionary
    training_args_dict = training_args.to_dict()

    # Also save a copy of the model config for reference
    config_info = {
        "base_model": BASE_MODEL,
        "training_date": str(datetime.datetime.now()),
        "training_parameters": training_args_dict,
        "dataset_size": len(tokenized_datasets["train"]),
    }

    with open(os.path.join(drive_dir, "model_training_info.json"), 'w') as f:
        json.dump(config_info, f, indent=2)

    print(f"Training info saved to {drive_dir}/model_training_info.json")

except Exception as e:
    # Report the failure with a traceback; the notebook keeps running
    print(f"Error during training: {e}")
    import traceback
    traceback.print_exc()

In [None]:
# Test generation with the trained model and save results to Google Drive
import datetime

# Loaded (tokenizer, model) pairs keyed by adapter path, so repeated
# generate_text calls do not reload the base model and adapter every time
_GENERATION_MODELS = {}


def _load_finetuned_model(adapter_path, reload=False):
    """Return the (tokenizer, model) pair for ``adapter_path``, loading it on first use."""
    if reload or adapter_path not in _GENERATION_MODELS:
        from peft import PeftModel

        loaded_tokenizer = AutoTokenizer.from_pretrained(adapter_path)

        # Load base model
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            torch_dtype=torch.float16,
            device_map="auto"
        )

        # Load LoRA model on top of the base model
        lora_model = PeftModel.from_pretrained(base_model, adapter_path)
        _GENERATION_MODELS[adapter_path] = (loaded_tokenizer, lora_model)
    return _GENERATION_MODELS[adapter_path]


def generate_text(prompt, max_length=500, reload=False):
    """Generate text using the trained model.

    The tokenizer, base model and LoRA adapter saved at ``model_save_path``
    are loaded on the first call and reused afterwards.

    Args:
        prompt: Topic inserted into the ``<human>: ... <assistant>:`` template.
        max_length: Value passed to ``generate`` as ``max_length``.
        reload: When True, load the model again from disk even if it is
            already cached (use after saving a newly trained adapter to the
            same path).

    Returns:
        The text following ``<assistant>:`` in the decoded output, or a
        "Model not found" message when ``model_save_path`` does not exist.
    """
    # Check if model exists
    if not os.path.exists(model_save_path):
        return f"Model not found at {model_save_path}. Please train the model first."

    tokenizer, model = _load_finetuned_model(model_save_path, reload=reload)

    # Format prompt
    formatted_prompt = f"<human>: Create a comprehensive educational lesson about the topic: {prompt}\n<assistant>:"

    # Tokenize and move the tensors to wherever the model's weights live
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    # Generation needs no gradients; the attention mask is passed explicitly
    # alongside the input ids
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            do_sample=True
        )

    # Decode and extract response
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if "<assistant>:" in generated_text:
        generated_text = generated_text.split("<assistant>:", 1)[1].strip()

    return generated_text

# Try generation with topics from the dataset
try:
    # Check if we have access to the loaded data. `data` is the notebook-level
    # variable set by the dataset-inspection cell; at cell top level locals()
    # is the notebook namespace.
    if 'data' in locals() and data and len(data) > 0:
        # Use topics from our dataset (at most three, chosen at random)
        import random
        sample_topics = random.sample([item['topic'] for item in data], min(3, len(data)))
    else:
        # Fallback topics
        sample_topics = ["Quantum Computing", "Neural Networks", "Climate Change"]

    print(f"\nGenerating lessons for {len(sample_topics)} topics...")

    # Create a directory for generated lessons in Google Drive
    generated_dir = os.path.join(drive_dir, "generated_lessons")
    os.makedirs(generated_dir, exist_ok=True)

    # Generate and save for each topic
    for i, topic in enumerate(sample_topics):
        print(f"\nGenerating lesson for topic {i+1}/{len(sample_topics)}: {topic}")
        start_time = datetime.datetime.now()

        generated_text = generate_text(topic)

        # Calculate generation time
        end_time = datetime.datetime.now()
        gen_time = (end_time - start_time).total_seconds()

        # Save to file named lesson_<topic with underscores>_<YYYYMMDD>_<HHMMSS>.md
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"lesson_{topic.replace(' ', '_')}_{timestamp}.md"
        filepath = os.path.join(generated_dir, filename)

        with open(filepath, 'w') as f:
            f.write(f"# Lesson: {topic}\n\n")
            f.write(generated_text)

        print(f"Lesson generated in {gen_time:.2f} seconds")
        print(f"Saved to {filepath}")
        print(f"Preview of generated text:")
        print("-" * 50)
        # Preview is limited to the first 200 characters
        print(generated_text[:200] + "..." if len(generated_text) > 200 else generated_text)
        print("-" * 50)

    print(f"\nAll lessons saved to {generated_dir}")

except Exception as e:
    # Report the failure with a traceback; the notebook keeps running
    print(f"Error during generation: {e}")
    import traceback
    traceback.print_exc()

In [None]:
# Create a summary HTML file with all generated lessons for easy viewing
import html
import json
import re


def lesson_topic_from_filename(filename):
    """Recover the topic from a ``lesson_<topic>_<YYYYMMDD>_<HHMMSS>.md`` file name.

    Underscores inside the topic part are turned back into spaces. Names that
    do not end in the timestamp pattern keep everything after ``lesson_``.
    """
    stem = filename[:-3] if filename.endswith('.md') else filename
    match = re.fullmatch(r"lesson_(.+)_\d{8}_\d{6}", stem)
    if match:
        topic_slug = match.group(1)
    elif stem.startswith("lesson_"):
        topic_slug = stem[len("lesson_"):]
    else:
        topic_slug = stem
    return topic_slug.replace('_', ' ')


def render_lesson_html(index, filename, content):
    """Return the HTML block that shows one lesson.

    Args:
        index: Zero-based position of the lesson; used for numbering and element ids.
        filename: Name of the lesson's Markdown file.
        content: Markdown text of the lesson.

    Returns:
        HTML string with a heading, the file name, and a script that renders
        ``content`` through ``marked.parse``.
    """
    topic = html.escape(lesson_topic_from_filename(filename))
    # json.dumps produces a valid JavaScript string literal (quotes, backslashes
    # and newlines escaped). "<" is additionally written as \u003c so Markdown
    # containing "</script>" or "<!--" cannot terminate the script element.
    content_literal = json.dumps(content).replace("<", "\\u003c")
    return f"""
    <div class="lesson">
        <h2>Lesson {index+1}: {topic}</h2>
        <div class="metadata">File: {html.escape(filename)}</div>
        <div class="markdown-content" id="content-{index}"></div>
        <script>
        document.getElementById('content-{index}').innerHTML = marked.parse({content_literal});
</script>
    </div>
"""


try:
    # Path to generated lessons
    generated_dir = os.path.join(drive_dir, "generated_lessons")

    if os.path.exists(generated_dir):
        # Get markdown files (sorted so the page order is stable between runs)
        markdown_files = sorted(f for f in os.listdir(generated_dir) if f.endswith('.md'))

        if markdown_files:
            # Create HTML file
            html_path = os.path.join(drive_dir, "generated_lessons_summary.html")

            with open(html_path, 'w', encoding='utf-8') as f:
                html_content = """<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Generated Educational Lessons</title>
    <style>
        body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }
        h1 { color: #2c3e50; }
        h2 { color: #3498db; margin-top: 30px; }
        pre { background: #f8f9fa; padding: 15px; border-radius: 5px; overflow-x: auto; }
        .lesson { margin-bottom: 40px; border-bottom: 1px solid #ddd; padding-bottom: 20px; }
        .metadata { color: #7f8c8d; font-size: 0.9em; margin-bottom: 15px; }
    </style>
    <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
</head>
<body>
    <h1>Generated Educational Lessons</h1>
    <p>Fine-tuned TinyLlama model trained on the educational lesson dataset.</p>
"""
                f.write(html_content)

                # Add each lesson
                for i, filename in enumerate(markdown_files):
                    filepath = os.path.join(generated_dir, filename)

                    with open(filepath, 'r') as lesson_file:
                        content = lesson_file.read()

                    f.write(render_lesson_html(i, filename, content))

                # Close HTML
                f.write("""
    <script>
        // Additional JavaScript if needed
    </script>
</body>
</html>""")

            print(f"Created HTML summary of {len(markdown_files)} generated lessons")
            print(f"Saved to {html_path}")
        else:
            print("No generated lesson files found")
    else:
        print(f"Generated lessons directory not found: {generated_dir}")
except Exception as e:
    print(f"Error creating HTML summary: {e}")

In [None]:
import datetime
import getpass  # Hidden prompt for the manual-token fallback below
from transformers import Trainer # Ensure Trainer is imported
from google.colab import userdata # Import userdata to get the token

# This cell trains the model, saves the adapters to Google Drive, pushes them to the
# Hugging Face Hub and writes a small JSON record of the run.
# It expects these names to already exist in the kernel (none are defined here):
# torch, os, json, model, tokenizer, trainer, training_args, tokenized_datasets,
# BASE_MODEL and drive_dir.

# Define the name for the Hugging Face repository for the SECOND account
# Replace "other_username" with the username or organization name of the second account
# Replace "tinyllama-lesson-synthesizer" with a descriptive name for your model
hf_model_name_second_account = "Manoghn/tinyllama-lesson-synthesizer"
print(f"Model will be pushed to Hugging Face Hub as: {hf_model_name_second_account}")

# Get the token for the second Hugging Face account from Colab secrets
try:
    # Replace 'HF_TOKEN_SECOND_ACCOUNT' with the actual name of your secret in Colab
    HF_TOKEN_SECOND_ACCOUNT = userdata.get('HF_TOKEN_SECOND_ACCOUNT')
    print("Token for second account loaded from Colab secrets")
except Exception as secret_err:
    # `except Exception` (not a bare except) so Ctrl-C / kernel interrupts still propagate.
    print(f"Could not read 'HF_TOKEN_SECOND_ACCOUNT' from Colab secrets: {secret_err}")
    # Fallback to manual entry. getpass keeps the token out of the saved cell output,
    # which a plain input() prompt would echo.
    HF_TOKEN_SECOND_ACCOUNT = getpass.getpass("Enter the Hugging Face token for the second account: ")


# Start training and save to Google Drive and Hugging Face (using the second token)
print("Starting training...")
try:
    # Clear GPU cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Train
    trainer.train()
    print("Training completed successfully!")

    # Save model to Google Drive (optional, but good practice)
    model_save_path_drive = os.path.join(drive_dir, "tinyllama_lora_finetuned")
    print(f"Saving model to Google Drive at: {drive_dir}/tinyllama_lora_finetuned")
    model.save_pretrained(model_save_path_drive)
    tokenizer.save_pretrained(model_save_path_drive)
    print(f"Model and tokenizer saved to Google Drive at {model_save_path_drive}")

    # Push model and tokenizer to Hugging Face Hub using the SECOND account's token
    print(f"\nPushing model and tokenizer to Hugging Face Hub: {hf_model_name_second_account}")
    model.push_to_hub(hf_model_name_second_account, token=HF_TOKEN_SECOND_ACCOUNT)
    tokenizer.push_to_hub(hf_model_name_second_account, token=HF_TOKEN_SECOND_ACCOUNT)
    print("Model and tokenizer successfully pushed to Hugging Face Hub for the second account!")

    # Convert training_args to a dictionary and save info (optional)
    training_args_dict = training_args.to_dict()
    config_info = {
        "base_model": BASE_MODEL,
        "training_date": str(datetime.datetime.now()),
        "training_parameters": training_args_dict,
        "dataset_size": len(tokenized_datasets["train"]),
        "huggingface_repo": hf_model_name_second_account # Update repo name in info
    }
    with open(os.path.join(drive_dir, "model_training_info.json"), 'w') as f:
        json.dump(config_info, f, indent=2)
    print(f"Training info saved to {drive_dir}/model_training_info.json")


except Exception as e:
    # Report and keep the kernel alive; the traceback shows which stage failed
    # (training, Drive save, Hub push or the info dump).
    print(f"Error during training or pushing to Hub: {e}")
    import traceback
    traceback.print_exc()

In [None]:
# Test generation with the trained model and save results to Google Drive
import datetime

def generate_text(prompt, max_length=500):
    """Generate a lesson for `prompt` with the fine-tuned LoRA model.

    The tokenizer and LoRA adapters are read from the notebook-level
    `model_save_path`, the base weights from the notebook-level `BASE_MODEL`,
    and inputs are moved to the notebook-level `device`. Tokenizer and model
    are reloaded on every call.

    Args:
        prompt: Topic to write a lesson about; it is inserted into the
            `<human>: ... <assistant>:` prompt template.
        max_length: Forwarded to `generate` as `max_length`.

    Returns:
        str: The decoded text after the first `<assistant>:` marker (the whole
        decoded output if the marker is absent), or an explanatory message
        when `model_save_path` does not exist.
    """
    # Check if model exists
    if not os.path.exists(model_save_path):
        return f"Model not found at {model_save_path}. Please train the model first."

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_save_path)

    # Load base model
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float16,
        device_map="auto"
    )

    # Load LoRA model
    from peft import PeftModel
    model = PeftModel.from_pretrained(base_model, model_save_path)

    # Format prompt
    formatted_prompt = f"<human>: Create a comprehensive educational lesson about the topic: {prompt}\n<assistant>:"

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)

    # generate() needs a pad id; fall back to EOS when the tokenizer defines none.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Generate text. no_grad avoids building an autograd graph during inference;
    # keyword arguments are used because some PEFT releases only accept keywords here.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            do_sample=True,
            pad_token_id=pad_token_id
        )

    # Decode and extract response (the decoded text still contains the prompt)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if "<assistant>:" in generated_text:
        generated_text = generated_text.split("<assistant>:", 1)[1].strip()

    return generated_text

# Smoke test: generate one lesson with generate_text() and store it as Markdown.
# Relies on `drive_dir`, `os` and `generate_text` being defined by earlier cells.

# Define a new topic not in the training data
new_topic = "Cryptocurrency"  # Example topic
print(f"\nGenerating a lesson for a new topic: {new_topic}")

# Create a directory for generated lessons in Google Drive if it doesn't exist
generated_dir = os.path.join(drive_dir, "generated_lessons")
os.makedirs(generated_dir, exist_ok=True)

try:
    start_time = datetime.datetime.now()

    # Generate the lesson for the new topic
    # (generate_text reloads the tokenizer and model, so that time is included below)
    generated_lesson = generate_text(new_topic)

    # Calculate generation time
    end_time = datetime.datetime.now()
    gen_time = (end_time - start_time).total_seconds()

    # Save the generated lesson to a file
    # File name pattern: lesson_<topic with underscores>_<YYYYmmdd_HHMMSS>_test.md
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"lesson_{new_topic.replace(' ', '_')}_{timestamp}_test.md"
    filepath = os.path.join(generated_dir, filename)

    with open(filepath, 'w') as f:
        f.write(f"# Lesson: {new_topic}\n\n")
        f.write(generated_lesson)

    print(f"Lesson generated for '{new_topic}' in {gen_time:.2f} seconds")
    print(f"Saved to {filepath}")
    print(f"Preview of generated text:\n")
    print("-" * 50)
    # The conditional applies to the whole expression: the first 500 characters plus
    # "..." for long lessons, otherwise the full text unchanged.
    print(generated_lesson[:500] + "..." if len(generated_lesson) > 500 else generated_lesson)
    print("-" * 50)

except Exception as e:
    # Report the failure with a traceback instead of stopping the notebook.
    print(f"Error during generation for '{new_topic}': {e}")
    import traceback
    traceback.print_exc()

## Main Function

In [None]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from google.colab import drive
import datetime

def generate_lesson_from_drive_model(model_drive_path, base_model_id, user_topic, max_length=500):
    """
    Loads a fine-tuned LoRA model from Google Drive, generates a lesson
    for a user-provided topic, and prints the result.

    Nothing is returned: progress, the lesson and any error are printed.
    Failures while loading or generating are caught and reported with a
    traceback rather than raised.

    Args:
        model_drive_path (str): The path in Google Drive where the fine-tuned
                                model and tokenizer are saved.
                                (e.g., '/content/drive/MyDrive/genai_synthesizer/tinyllama_lora_finetuned')
        base_model_id (str): The identifier for the base model used for finetuning.
                             (e.g., "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
        user_topic (str): The topic provided by the user to generate a lesson for.
        max_length (int): Forwarded to `generate` as `max_length`.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Check if Google Drive is mounted
    if not os.path.exists('/content/drive/MyDrive'):
        print("Google Drive is not mounted. Attempting to mount...")
        try:
            drive.mount('/content/drive')
            print("Google Drive mounted successfully.")
        except Exception as e:
            print(f"Failed to mount Google Drive: {e}")
            return

    # Check if the model path exists
    if not os.path.exists(model_drive_path):
        print(f"Error: Model not found at '{model_drive_path}'.")
        print("Please ensure you have trained and saved the model to this path.")
        return

    try:
        # Load tokenizer
        print(f"Loading tokenizer from {model_drive_path}...")
        tokenizer = AutoTokenizer.from_pretrained(model_drive_path)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token # Ensure padding token is set
            print("Set padding token for tokenizer.")

        # Load base model
        print(f"Loading base model '{base_model_id}'...")
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_id,
            torch_dtype=torch.float16,
            device_map="auto"
        )

        # Load LoRA model
        print(f"Loading LoRA adapters from {model_drive_path}...")
        model = PeftModel.from_pretrained(base_model, model_drive_path)

        # device_map="auto" has already placed the weights, so the model is not moved
        # again; the inputs are sent to wherever its first parameters live instead.
        model.eval() # Set model to evaluation mode
        input_device = next(model.parameters()).device
        print("Model loaded successfully!")

        # Format prompt for the model
        formatted_prompt = f"<human>: Create a comprehensive educational lesson about the topic: {user_topic}\n<assistant>:"

        # Generate text
        print(f"\nGenerating lesson for topic: '{user_topic}'")
        start_time = datetime.datetime.now()

        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(input_device)

        with torch.no_grad(): # Disable gradient calculation for inference
            outputs = model.generate(
                input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask, # explicit mask: pad and EOS may share an id
                max_length=max_length,
                temperature=0.7, # Adjust temperature and other parameters as needed
                top_p=0.9,
                repetition_penalty=1.1,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id # Id used for padding during generation
            )

        # Decode and extract response (the decoded text still contains the prompt)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if "<assistant>:" in generated_text:
            generated_text = generated_text.split("<assistant>:", 1)[1].strip()

        end_time = datetime.datetime.now()
        gen_time = (end_time - start_time).total_seconds()

        print(f"Lesson generated in {gen_time:.2f} seconds\n")
        print("-" * 50)
        print(f"Generated Lesson for '{user_topic}':\n")
        print(generated_text)
        print("-" * 50)

    except Exception as e:
        print(f"An error occurred during model loading or generation: {e}")
        import traceback
        traceback.print_exc()

# --- Main execution block ---
# Prompts for a topic and prints a lesson generated by the Drive-hosted LoRA model.
# In a notebook kernel __name__ is "__main__", so this runs whenever the cell is executed.
if __name__ == "__main__":
    # Define the path to your fine-tuned model in Google Drive
    # MAKE SURE THIS PATH MATCHES WHERE YOUR MODEL WAS SAVED
    # NOTE: this assignment creates/overwrites the notebook-level `model_save_path`.
    model_save_path = '/content/drive/MyDrive/genai_synthesizer/tinyllama_lora_finetuned'

    # Define the base model ID used for finetuning
    base_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # MAKE SURE THIS MATCHES YOUR BASE MODEL

    # Get user input for the topic
    user_input_topic = input("Enter the topic you want a lesson on: ")

    # Run the generation function (prints the lesson; nothing is returned)
    generate_lesson_from_drive_model(model_save_path, base_model_id, user_input_topic)

## Lesson With Image Generation



In [None]:
import torch
import os
import re
import datetime
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from diffusers import StableDiffusionPipeline
from IPython.display import display, Markdown, Image as IPImage
import matplotlib.pyplot as plt
from typing import List, Dict

# Define device: "cuda" when a GPU is visible to torch, otherwise "cpu".
# The functions in the following cells read this notebook-level name.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


In [None]:
# Initialize image generation pipeline - UPDATED VERSION
def initialize_image_pipeline():
    """Initialize Stable Diffusion pipeline for educational image generation.

    Loads "stabilityai/stable-diffusion-2-1", swaps in the DPM-Solver multistep
    scheduler, moves the pipeline to the notebook-level `device` and enables
    attention slicing, VAE slicing and (when available) xFormers attention.

    Half-precision weights are requested only when `device` is a CUDA device;
    on CPU the pipeline is loaded in float32.

    Returns:
        The configured `StableDiffusionPipeline`.
    """
    print("Initializing image generation pipeline...")

    # Use better model for higher quality
    model_id = "stabilityai/stable-diffusion-2-1"  # Changed from 2-base

    # Import scheduler for better quality
    from diffusers import DPMSolverMultistepScheduler

    # fp16 weights are only requested for a GPU; on CPU load float32 instead.
    use_half = str(device).startswith("cuda")
    load_kwargs = {
        "torch_dtype": torch.float16 if use_half else torch.float32,
        "safety_checker": None,
        "requires_safety_checker": False,
    }
    if use_half:
        load_kwargs["variant"] = "fp16"  # Added for better performance

    pipe = StableDiffusionPipeline.from_pretrained(model_id, **load_kwargs)

    # Use better scheduler for higher quality
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

    pipe = pipe.to(device)
    pipe.enable_attention_slicing()

    # Enable additional optimizations
    pipe.enable_vae_slicing()

    # Try to enable xFormers for better performance (optional; not always installed)
    try:
        pipe.enable_xformers_memory_efficient_attention()
        print("xFormers enabled for better performance")
    except Exception as xformers_err:
        # Best effort: keep the default attention, but say why instead of failing silently.
        print(f"xFormers not enabled ({xformers_err}); using default attention")

    print("Image generation pipeline ready!")
    return pipe


In [None]:
# DYNAMIC AI-based image suggestion - UPDATED VERSION
def _select_image_style(main_topic: str) -> str:
    """Pick the image-prompt style suffix for a topic by whole-word keyword match."""
    style_rules = (
        (('history', 'ancient', 'war', 'empire', 'king', 'emperor'),
         "historical painting, epic composition, museum quality, dramatic lighting, highly detailed, 8k resolution, photorealistic"),
        (('science', 'biology', 'chemistry', 'physics'),
         "scientific illustration, educational diagram, clean design, accurate representation, highly detailed, professional quality, bright colors"),
        (('technology', 'computer', 'ai', 'digital'),
         "modern tech visualization, futuristic design, clean interface, professional infographic, high resolution, sharp focus"),
    )
    # Compare whole words (plus a plain plural "s"), not substrings: a substring test
    # makes "Software" match 'war' and "Rainforest" match 'ai'.
    topic_words = set(re.findall(r"[a-z0-9]+", main_topic.lower()))
    for keywords, style in style_rules:
        if any(kw in topic_words or f"{kw}s" in topic_words for kw in keywords):
            return style
    return "professional educational illustration, clear visualization, high quality, detailed, bright lighting, 8k resolution"


def get_dynamic_image_suggestions(lesson_text: str, main_topic: str, tokenizer, model, max_concepts: int = 3) -> List[Dict[str, str]]:
    """Let AI dynamically analyze the lesson and suggest the most relevant images.

    The language model is asked for numbered "Title / Visual" pairs, the reply is
    parsed (regex patterns first, then a line-by-line pass), and each pair becomes
    an image-generation prompt with a topic-dependent style suffix. If generation
    or parsing yields nothing, concepts are built from bold and capitalized terms
    found in the lesson itself.

    Args:
        lesson_text: Lesson body; its first 600 characters go into the prompt.
        main_topic: Topic of the lesson; also drives the style choice.
        tokenizer: Tokenizer matching `model`.
        model: Text-generation model exposing `generate`.
        max_concepts: Maximum number of image concepts to return.

    Returns:
        A list of at most `max_concepts` dicts with keys 'title' and 'prompt'.
        Errors during generation are printed and trigger the fallback rather
        than being raised.
    """

    # Create a more structured prompt for the AI
    analysis_prompt = f"""<human>: You are an educational content expert. Analyze this lesson about {main_topic} and suggest {max_concepts} images that would best help students understand the topic.

Lesson excerpt:
{lesson_text[:600]}...

Think about what visual representations would be most helpful for students. Consider:
- Key visual concepts mentioned in the lesson
- What students would benefit from seeing
- Different types of visuals (diagrams, real photos, historical scenes, processes, etc.)

For each image, provide ONLY:
- A short descriptive title
- A detailed visual description (no text, focus on what should be shown)

Format exactly like this:
1. Title: [descriptive title]
   Visual: [detailed description of what to show]

2. Title: [descriptive title]
   Visual: [detailed description of what to show]

3. Title: [descriptive title]
   Visual: [detailed description of what to show]
<assistant>:"""

    try:
        # Generate suggestions with higher temperature for creativity
        inputs = tokenizer(analysis_prompt, return_tensors="pt", max_length=1024, truncation=True).to(device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_length=1500,
                temperature=0.9,
                top_p=0.95,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id
            )

        # The decoded text still contains the prompt; keep only the model's answer.
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if "<assistant>:" in response:
            response = response.split("<assistant>:", 1)[1].strip()

        print("\n📋 AI Analysis:")
        print(response[:300] + "...")

        # Parse the response more flexibly
        concepts = []

        # Try multiple parsing patterns, strictest first
        patterns = [
            r'(?:^|\n)\s*\d+\.\s*Title:\s*(.+?)\s*\n\s*Visual:\s*(.+?)(?=\n\s*\d+\.|$)',
            r'Title:\s*(.+?)\s*\nVisual:\s*(.+?)(?=\nTitle:|$)',
            r'(?:^|\n)(.+?):\s*(.+?)(?=\n|$)'
        ]

        for pattern in patterns:
            matches = re.findall(pattern, response, re.DOTALL | re.MULTILINE)
            if matches:
                for title, visual in matches[:max_concepts]:
                    if len(title) < 100 and len(visual) > 20:  # Basic validation
                        concepts.append({
                            'title': title.strip(),
                            'visual_description': visual.strip()
                        })
                # Only the first pattern that matches at all is used.
                break

        # If parsing failed, try a simpler approach
        if not concepts:
            print("\n🔄 Using simplified parsing...")
            lines = response.split('\n')
            current_title = None
            current_visual = []

            for line in lines:
                line = line.strip()
                if 'Title:' in line or (line and line[0].isdigit() and '.' in line):
                    # A new item starts: flush the previous one if it is complete.
                    if current_title and current_visual:
                        concepts.append({
                            'title': current_title,
                            'visual_description': ' '.join(current_visual)
                        })
                    current_title = line.split(':', 1)[-1].strip()
                    current_visual = []
                elif 'Visual:' in line:
                    current_visual.append(line.split(':', 1)[-1].strip())
                elif current_title and line:
                    current_visual.append(line)

            # Add the last one
            if current_title and current_visual:
                concepts.append({
                    'title': current_title,
                    'visual_description': ' '.join(current_visual)
                })

    except Exception as e:
        print(f"\n⚠️ Error in AI analysis: {e}")
        concepts = []

    # Convert to image generation prompts with ENHANCED QUALITY MODIFIERS.
    # The style depends only on the topic, so it is chosen once, outside the loop.
    style = _select_image_style(main_topic)
    final_concepts = []
    for concept in concepts[:max_concepts]:
        # Add quality enhancers to the prompt
        enhanced_prompt = f"{concept['visual_description']}, {style}, masterpiece, best quality, ultra-detailed"

        final_concepts.append({
            'title': concept['title'],
            'prompt': enhanced_prompt
        })

    # Fallback if AI didn't provide good suggestions
    if not final_concepts:
        print("\n🔧 Using context-aware fallback...")
        # Analyze the lesson for key terms
        key_terms = []

        # Extract bold terms
        bold_terms = re.findall(r'\*\*([^*]+)\*\*', lesson_text)
        key_terms.extend([term for term in bold_terms if len(term) > 3])

        # Extract capitalized phrases
        cap_phrases = re.findall(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)', lesson_text)
        key_terms.extend([phrase for phrase in cap_phrases if len(phrase) > 5])

        # A bold term is usually also a capitalized phrase, and terms repeat in the text:
        # drop duplicates (keeping first-seen order) so each image covers a different term.
        key_terms = list(dict.fromkeys(key_terms))

        # Create fallback concepts based on extracted terms
        for term in key_terms[:max_concepts]:
            final_concepts.append({
                'title': term,
                'prompt': f"Educational visualization of {term} in context of {main_topic}, detailed illustration, professional quality, high resolution, 8k, masterpiece"
            })

    return final_concepts


In [None]:
# Generate and display lesson with images - UPDATED VERSION
def _safe_image_stem(title: str) -> str:
    """Turn an AI-written image title into a file-name stem without path separators or punctuation."""
    stem = re.sub(r"[^\w\-]+", "_", title).strip("_")
    return stem[:80] or "image"


def generate_and_display_lesson_with_images(prompt, tokenizer, model, pipe, max_text_length=500, drive_dir=None):
    """Generate a lesson with dynamically suggested images.

    Generates the lesson text, renders it as Markdown, asks
    `get_dynamic_image_suggestions` for image concepts and draws one generated
    image per concept in a single matplotlib figure. Uses the notebook-level
    `device`.

    Args:
        prompt: Lesson topic.
        tokenizer: Tokenizer matching `model`.
        model: Text-generation model exposing `generate`.
        pipe: Image pipeline; called with prompt/negative_prompt/size arguments
            and expected to return an object with an `images` list.
        max_text_length: Forwarded to `generate` as `max_length` for the lesson.
        drive_dir: If given, every generated image is also written to
            `<drive_dir>/generated_images` as a PNG.

    Returns:
        tuple: `(generated_text, concepts)` where `concepts` is the list returned
        by `get_dynamic_image_suggestions`.
    """
    print(f"\n{'='*60}")
    print(f"Generating enhanced lesson for: {prompt}")
    print('='*60)

    # Generate text lesson
    formatted_prompt = f"<human>: Create a comprehensive educational lesson about the topic: {prompt}\n<assistant>:"
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_text_length,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if "<assistant>:" in generated_text:
        generated_text = generated_text.split("<assistant>:", 1)[1].strip()

    # Display the generated text
    print("\n📝 GENERATED LESSON:")
    print("-" * 60)
    display(Markdown(f"# Lesson: {prompt}\n\n{generated_text}"))

    # Get dynamic AI suggestions for images
    print("\n🤖 AI is analyzing the lesson to suggest relevant images...")
    concepts = get_dynamic_image_suggestions(generated_text, prompt, tokenizer, model)

    print(f"\n🎨 AI suggested {len(concepts)} images:")
    for i, concept in enumerate(concepts):
        print(f"\n{i+1}. {concept['title']}")
        print(f"   Description: {concept['prompt'][:100]}...")

    if concepts:
        # Create figure with larger size for better quality
        fig, axes = plt.subplots(1, len(concepts), figsize=(8*len(concepts), 8))
        if len(concepts) == 1:
            axes = [axes]  # subplots() returns a bare Axes for a single column

        # ENHANCED NEGATIVE PROMPT for better quality (identical for every image)
        enhanced_negative_prompt = (
            "text overlay, watermark, low quality, blurry, distorted, ugly, "
            "words, letters, numbers, deformed, duplicate, morbid, mutilated, "
            "poorly drawn, worst quality, low resolution, bad anatomy, "
            "bad proportions, gross proportions, malformed limbs, "
            "missing arms, missing legs, extra arms, extra legs, "
            "mutated hands, fused fingers, too many fingers"
        )

        for i, concept in enumerate(concepts):
            print(f"\n⏳ Generating image {i+1}/{len(concepts)}: {concept['title']}")

            # Truncate prompt if too long (word count as a rough token estimate).
            # Done before the try block so the retry below always has a prompt.
            prompt_text = concept['prompt']
            words = prompt_text.split()
            if len(words) > 60:
                prompt_text = ' '.join(words[:55]) + ", high quality, masterpiece"

            image = None
            low_res = False
            try:
                # Generate with IMPROVED PARAMETERS
                image = pipe(
                    prompt=prompt_text,
                    negative_prompt=enhanced_negative_prompt,
                    num_inference_steps=50,      # Increased from 35
                    guidance_scale=8.0,          # Slightly increased from 7.5
                    height=768,                  # Increased from 512
                    width=768,                   # Increased from 512
                    generator=torch.Generator(device=device).manual_seed(42 + i)  # For consistency
                ).images[0]

                # Optional: Apply post-processing for better clarity
                from PIL import ImageEnhance

                image = ImageEnhance.Sharpness(image).enhance(1.2)  # Enhance sharpness
                image = ImageEnhance.Contrast(image).enhance(1.1)   # Enhance contrast slightly

            except Exception as e:
                print(f"❌ Error: {e}")
                # Try with lower resolution as fallback
                try:
                    print(f"⏳ Retrying with lower resolution...")
                    image = pipe(
                        prompt=prompt_text,
                        negative_prompt=enhanced_negative_prompt,
                        num_inference_steps=40,
                        guidance_scale=7.5,
                        height=512,
                        width=512
                    ).images[0]
                    low_res = True
                except Exception as retry_err:
                    print(f"❌ Retry failed: {retry_err}")
                    image = None

            if image is None:
                # Both attempts failed: leave a placeholder panel and move on.
                axes[i].text(0.5, 0.5, f"Generation failed\n{concept['title']}",
                           ha='center', va='center', transform=axes[i].transAxes)
                axes[i].axis('off')
                continue

            axes[i].imshow(image)
            axes[i].set_title(concept['title'], fontsize=14, pad=10, weight='bold')
            axes[i].axis('off')

            # Save the image if drive_dir is provided. A failed save is reported but
            # does not discard (or regenerate) an image that was produced successfully.
            if drive_dir:
                try:
                    save_dir = f"{drive_dir}/generated_images"
                    os.makedirs(save_dir, exist_ok=True)
                    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                    # Titles come from the language model, so strip characters that
                    # are unsafe in a file name (e.g. "/" or ":").
                    image.save(f"{save_dir}/{_safe_image_stem(concept['title'])}_{timestamp}.png", "PNG")
                except Exception as save_err:
                    print(f"⚠️ Could not save image: {save_err}")

            if low_res:
                print(f"✅ Successfully generated at lower resolution: {concept['title']}")
            else:
                print(f"✅ Successfully generated: {concept['title']}")

        plt.tight_layout()
        plt.show()

    print(f"\n{'='*60}")
    return generated_text, concepts

In [None]:
# MAIN EXECUTION - UPDATED VERSION
def main():
    """Load the fine-tuned model and image pipeline, then serve topics interactively.

    Reads the tokenizer and LoRA adapters from the Drive project folder, builds
    the Stable Diffusion pipeline via `initialize_image_pipeline`, and loops on
    user input until 'quit' is entered. Each topic is passed to
    `generate_and_display_lesson_with_images`, with images saved under the
    project folder.
    """
    # Paths
    drive_dir = '/content/drive/MyDrive/genai_synthesizer'
    model_save_path = os.path.join(drive_dir, 'tinyllama_lora_finetuned')
    base_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_save_path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model
    print("Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        torch_dtype=torch.float16,
        device_map="auto"
    )

    print("Loading LoRA adapters...")
    model = PeftModel.from_pretrained(base_model, model_save_path)
    model.eval()
    print("Model loaded successfully!")

    # Initialize image pipeline with improvements
    image_pipe = initialize_image_pipeline()

    # Interactive loop
    while True:
        # Strip so that " quit " still exits and surrounding spaces do not leak into prompts.
        user_topic = input("\nEnter a topic (or 'quit' to exit): ").strip()
        if user_topic.lower() == 'quit':
            break
        if not user_topic:
            # An empty topic would spend a full text + image generation on nothing.
            print("Please enter a topic.")
            continue

        # Generate and display
        generate_and_display_lesson_with_images(
            user_topic,
            tokenizer,
            model,
            image_pipe,
            drive_dir=drive_dir  # Pass drive_dir as parameter
        )

# Run the main function
if __name__ == "__main__":
    main()

## Quiz, Audio and Image With Lesson

In [None]:
# Extra packages for this section: langchain / chromadb / sentence-transformers / faiss-cpu,
# plus gtts and pygame for audio.
# NOTE(review): versions are not pinned, so a fresh runtime may resolve different releases.
!pip install langchain chromadb sentence-transformers faiss-cpu
!pip install gtts pygame  # For audio generation

In [None]:
# Run these commands in order in your Colab notebook
# NOTE(review): `-U transformers` upgrades past the transformers==4.41.0 pin made in the
# installation section at the top of the notebook.
!pip install -U bitsandbytes
!pip install -U accelerate
!pip install -U transformers

# Check if bitsandbytes is properly installed
# (the import itself is the check; the version is printed for reference)
import bitsandbytes as bnb
print(f"bitsandbytes version: {bnb.__version__}")

# For Colab, sometimes you need to restart runtime after installing
# If you get an error, try: Runtime -> Restart runtime

In [None]:
# Presumably required by the langchain embeddings / vector store imports in the next cell — TODO confirm.
!pip install -U langchain-community

In [None]:
import os
import json
import re
import datetime
import torch
from typing import List, Dict, Tuple
import numpy as np
from IPython.display import display, Markdown, Image as IPImage
try:
    from IPython.display import Audio
except:
    Audio = None
import matplotlib.pyplot as plt

# Model imports
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Image generation imports
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
from PIL import Image, ImageEnhance

# RAG imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig

# Audio imports (optional)
# gTTS is treated as an optional dependency: when it cannot be imported the notebook
# keeps running and AUDIO_AVAILABLE tells later cells to skip audio generation.
try:
    from gtts import gTTS
    AUDIO_AVAILABLE = True
except Exception:
    # `except Exception` instead of a bare except: still best-effort for any import
    # failure, but kernel interrupts (KeyboardInterrupt/SystemExit) are not swallowed.
    AUDIO_AVAILABLE = False
    print("gTTS not available. Audio generation will be skipped.")

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


# Global variables to store models (initialized once)
# All start as None; initialize_models() assigns the first five via `global`.
text_model = None
tokenizer = None
image_pipe = None
embeddings = None
text_splitter = None
# NOTE(review): presumably assigned by initialize_quiz_model(), which is defined elsewhere — confirm.
quiz_model = None
quiz_tokenizer = None


In [None]:
import os
import getpass  # Hidden prompt for the manual-token fallback below
from google.colab import userdata

# Try to load token from Colab secrets first (more secure)
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
    print("Token loaded from Colab secrets")
except Exception as secret_err:
    # `except Exception` (not a bare except) so kernel interrupts still propagate.
    print(f"Could not read 'HF_TOKEN' from Colab secrets: {secret_err}")
    # If not available, ask for manual input. getpass hides what is typed, so the
    # token does not end up in the saved notebook output as it would with input().
    HF_TOKEN = getpass.getpass("Enter your Hugging Face token: ")

# Set the token as an environment variable
os.environ["HF_TOKEN"] = HF_TOKEN

# Test the token by logging in
from huggingface_hub import login
login(HF_TOKEN)

In [None]:
def initialize_models(hf_model_name="Manoghn/tinyllama-lesson-synthesizer",
                     base_model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
    """Initialize all models including quiz model

    Populates the notebook-level `text_model`, `tokenizer`, `image_pipe`,
    `embeddings` and `text_splitter`, and calls `initialize_quiz_model()`
    (defined elsewhere in the notebook) for the quiz model.

    Half precision is used only when the notebook-level `device` is a CUDA
    device; on CPU both the language model and Stable Diffusion load in float32.

    Args:
        hf_model_name: Hub repository holding the tokenizer and LoRA adapters.
        base_model_id: Hub id of the base model the adapters are applied to.

    Returns:
        bool: True once every component has been loaded (failures propagate as exceptions).
    """
    global text_model, tokenizer, image_pipe, embeddings, text_splitter

    print("🚀 Initializing models...")

    # fp16 is only requested for a GPU; on CPU load float32 instead.
    on_gpu = str(device).startswith("cuda")
    dtype = torch.float16 if on_gpu else torch.float32

    # 1. Load text generation model (your fine-tuned model)
    print(f"\n📚 Loading lesson generation model from: {hf_model_name}")
    tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        torch_dtype=dtype,
        device_map="auto"
    )

    text_model = PeftModel.from_pretrained(base_model, hf_model_name)
    text_model.eval()
    print("✅ Lesson generation model loaded!")

    # 2. Initialize quiz generation model
    print("\n🎯 Loading quiz generation model...")
    initialize_quiz_model()

    # 3. Initialize image generation pipeline (same as before)
    print("\n🎨 Loading image generation model...")
    model_id = "stabilityai/stable-diffusion-2-1"
    sd_kwargs = {
        "torch_dtype": dtype,
        "safety_checker": None,
        "requires_safety_checker": False,
    }
    if on_gpu:
        sd_kwargs["variant"] = "fp16"  # fp16 weight files are only requested for GPU runs
    image_pipe = StableDiffusionPipeline.from_pretrained(model_id, **sd_kwargs)

    image_pipe.scheduler = DPMSolverMultistepScheduler.from_config(image_pipe.scheduler.config)
    image_pipe = image_pipe.to(device)
    image_pipe.enable_attention_slicing()
    image_pipe.enable_vae_slicing()
    print("✅ Image model loaded!")

    # 4. Initialize RAG components
    print("\n🔍 Loading RAG components...")
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': device}
    )

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50
    )
    print("✅ RAG components loaded!")

    print("\n✅ All models initialized successfully!")
    return True

In [None]:
def generate_lesson(topic, max_length=800):
    """Generate educational lesson text

    Uses the notebook-level `text_model`, `tokenizer` and `device` that
    `initialize_models()` sets up.

    Args:
        topic: Subject of the lesson; inserted into the `<human>: ... <assistant>:` prompt.
        max_length: Forwarded to `generate` as `max_length`.

    Returns:
        The decoded text after the first `<assistant>:` marker (the whole decoded
        output if the marker is absent), or None when the models have not been
        initialized.
    """
    if text_model is None or tokenizer is None:
        print("❌ Models not initialized! Run initialize_models() first.")
        return None

    print(f"\n📝 Generating lesson for: {topic}")

    prompt = f"<human>: Create a comprehensive educational lesson about the topic: {topic}\n<assistant>:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        # Keyword arguments plus an explicit attention mask: the pad token may be the
        # EOS token here, so the mask cannot be inferred reliably from the ids alone.
        outputs = text_model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
        )

    # The decoded text still contains the prompt; keep only the assistant part.
    lesson_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if "<assistant>:" in lesson_text:
        lesson_text = lesson_text.split("<assistant>:", 1)[1].strip()

    return lesson_text


In [None]:
def _topic_mentions(topic_lower, keywords):
    """Return True when the lower-cased topic mentions any of the keywords.

    Short keywords are matched strictly because they occur inside unrelated
    words ("war" in "software", "ai" in "brain"): keywords of one or two
    characters must be a whole word, keywords of three or four characters must
    start a word. Longer keywords keep plain substring matching.
    """
    for keyword in keywords:
        if len(keyword) <= 2:
            pattern = rf"\b{re.escape(keyword)}\b"
        elif len(keyword) <= 4:
            pattern = rf"\b{re.escape(keyword)}"
        else:
            pattern = re.escape(keyword)
        if re.search(pattern, topic_lower):
            return True
    return False


def get_image_suggestions(lesson_text, topic, max_images=3):
    """Suggest images for a lesson and turn them into image-generation prompts.

    The text model is asked for titled visual descriptions. Its reply is parsed
    with a list of regex patterns and, if none yields a valid entry, with a
    line-by-line scan. Each description is combined with a style chosen from
    keywords in ``topic``. When no suggestion is obtained (including when
    generation raises), key terms extracted from ``lesson_text`` are used.

    Args:
        lesson_text: Lesson body; the first 600 characters go into the prompt
            and the whole text feeds the key-term fallback.
        topic: Lesson topic; used in the prompt and to pick the image style.
        max_images: Maximum number of suggestions to return.

    Returns:
        A list of dicts with keys ``'title'`` and ``'prompt'``. The list may be
        shorter than ``max_images`` and is empty when neither the model nor
        the fallback produces anything.
    """
    print("\n🤖 Analyzing lesson for image suggestions...")

    # Create a structured prompt for the AI
    analysis_prompt = f"""<human>: You are an educational content expert. Analyze this lesson about {topic} and suggest {max_images} images that would best help students understand the topic.

Lesson excerpt:
{lesson_text[:600]}...

Think about what visual representations would be most helpful for students. Consider:
- Key visual concepts mentioned in the lesson
- What students would benefit from seeing
- Different types of visuals (diagrams, real photos, historical scenes, processes, etc.)

For each image, provide ONLY:
- A short descriptive title
- A detailed visual description (no text, focus on what should be shown)

Format exactly like this:
1. Title: [descriptive title]
   Visual: [detailed description of what to show]

2. Title: [descriptive title]
   Visual: [detailed description of what to show]

3. Title: [descriptive title]
   Visual: [detailed description of what to show]
<assistant>:"""

    concepts = []
    try:
        # Generate suggestions with higher temperature for creativity
        inputs = tokenizer(analysis_prompt, return_tensors="pt", max_length=1024, truncation=True).to(device)

        with torch.no_grad():
            outputs = text_model.generate(
                inputs.input_ids,
                attention_mask=inputs.get("attention_mask"),
                max_length=1500,
                temperature=0.9,
                top_p=0.95,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id
            )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if "<assistant>:" in response:
            response = response.split("<assistant>:", 1)[1].strip()

        print("\n📋 AI Analysis:")
        print(response[:300] + "...")

        # Try multiple parsing patterns, strictest first
        patterns = [
            r'(?:^|\n)\s*\d+\.\s*Title:\s*(.+?)\s*\n\s*Visual:\s*(.+?)(?=\n\s*\d+\.|$)',
            r'Title:\s*(.+?)\s*\nVisual:\s*(.+?)(?=\nTitle:|$)',
            r'(?:^|\n)(.+?):\s*(.+?)(?=\n|$)'
        ]

        for pattern in patterns:
            matches = re.findall(pattern, response, re.DOTALL | re.MULTILINE)
            parsed = [
                {'title': title.strip(), 'visual_description': visual.strip()}
                for title, visual in matches[:max_images]
                if len(title) < 100 and len(visual) > 20  # Basic validation
            ]
            # Only stop once a pattern yields a valid entry; a pattern whose
            # matches all fail validation must not block the remaining ones.
            if parsed:
                concepts = parsed
                break

        # If parsing failed, try a simpler line-by-line approach
        if not concepts:
            print("\n🔄 Using simplified parsing...")
            current_title = None
            current_visual = []

            for line in response.split('\n'):
                line = line.strip()
                if 'Title:' in line or (line and line[0].isdigit() and '.' in line):
                    # A new entry starts: flush the one collected so far.
                    if current_title and current_visual:
                        concepts.append({
                            'title': current_title,
                            'visual_description': ' '.join(current_visual)
                        })
                    current_title = line.split(':', 1)[-1].strip()
                    current_visual = []
                elif 'Visual:' in line:
                    current_visual.append(line.split(':', 1)[-1].strip())
                elif current_title and line:
                    current_visual.append(line)

            # Add the last one
            if current_title and current_visual:
                concepts.append({
                    'title': current_title,
                    'visual_description': ' '.join(current_visual)
                })

    except Exception as e:
        print(f"\n⚠️ Error in AI analysis: {e}")
        concepts = []

    # The artistic style depends only on the topic, so pick it once.
    topic_lower = topic.lower()
    if _topic_mentions(topic_lower, ['history', 'ancient', 'war', 'empire', 'king', 'emperor']):
        style = "historical painting, epic composition, museum quality, dramatic lighting, highly detailed, 8k resolution, photorealistic"
    elif _topic_mentions(topic_lower, ['science', 'biology', 'chemistry', 'physics', 'photosynth']):
        style = "scientific illustration, educational diagram, clean design, accurate representation, highly detailed, professional quality, bright colors"
    elif _topic_mentions(topic_lower, ['technology', 'computer', 'ai', 'digital']):
        style = "modern tech visualization, futuristic design, clean interface, professional infographic, high resolution, sharp focus"
    else:
        style = "professional educational illustration, clear visualization, high quality, detailed, bright lighting, 8k resolution"

    # Convert to image generation prompts (with quality enhancers appended)
    final_concepts = [
        {
            'title': concept['title'],
            'prompt': f"{concept['visual_description']}, {style}, masterpiece, best quality, ultra-detailed"
        }
        for concept in concepts[:max_images]
    ]

    # Fallback if AI didn't provide good suggestions
    if not final_concepts:
        print("\n🔧 Using context-aware fallback...")
        key_terms = []

        # Extract bold terms
        bold_terms = re.findall(r'\*\*([^*]+)\*\*', lesson_text)
        key_terms.extend([term for term in bold_terms if len(term) > 3])

        # Extract capitalized phrases
        cap_phrases = re.findall(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)', lesson_text)
        key_terms.extend([phrase for phrase in cap_phrases if len(phrase) > 5])

        # A bold term is usually also a capitalized phrase; drop repeats
        # (keeping first-seen order) so the same image is not requested twice.
        key_terms = list(dict.fromkeys(key_terms))

        for term in key_terms[:max_images]:
            final_concepts.append({
                'title': term,
                'prompt': f"Educational visualization of {term} in context of {topic}, detailed illustration, professional quality, high resolution, 8k, masterpiece"
            })

    return final_concepts

In [None]:
def generate_images(suggestions):
    """Generate one image per suggestion with the global Stable Diffusion pipeline.

    Each suggestion is rendered at 768x768 with a per-index seed and then
    sharpened and contrast-boosted. If that attempt raises, a single retry at
    512x512 is made; if the retry also fails the suggestion is skipped.

    Args:
        suggestions: Iterable of dicts with ``'title'`` and ``'prompt'`` keys.

    Returns:
        A list of ``(title, image)`` tuples for the suggestions that could be
        rendered; empty when the pipeline has not been initialized.
    """
    if image_pipe is None:
        print("❌ Image model not initialized! Run initialize_models() first.")
        return []

    print(f"\n🎨 Generating {len(suggestions)} images...")

    # Defined once, outside the try block, so the low-resolution retry can
    # always rely on it no matter where the first attempt failed.
    negative_prompt = (
        "text overlay, watermark, low quality, blurry, distorted, ugly, "
        "words, letters, numbers, deformed, duplicate, morbid, mutilated, "
        "poorly drawn, worst quality, low resolution, bad anatomy"
    )

    generated_images = []

    for i, suggestion in enumerate(suggestions):
        try:
            print(f"⏳ Generating image {i+1}/{len(suggestions)}: {suggestion['title']}")

            image = image_pipe(
                prompt=suggestion['prompt'],
                negative_prompt=negative_prompt,
                num_inference_steps=50,
                guidance_scale=8.0,
                height=768,
                width=768,
                generator=torch.Generator(device=device).manual_seed(42 + i)
            ).images[0]

            # Post-process
            enhancer = ImageEnhance.Sharpness(image)
            image = enhancer.enhance(1.2)
            enhancer = ImageEnhance.Contrast(image)
            image = enhancer.enhance(1.1)

            generated_images.append((suggestion['title'], image))
            print(f"✅ Generated: {suggestion['title']}")

        except Exception as e:
            print(f"❌ Error: {e}")
            # Try lower resolution
            try:
                image = image_pipe(
                    prompt=suggestion['prompt'],
                    negative_prompt=negative_prompt,
                    num_inference_steps=40,
                    guidance_scale=7.5,
                    height=512,
                    width=512
                ).images[0]
                generated_images.append((suggestion['title'], image))
            except Exception as retry_error:
                # Best effort: report and move on instead of hiding the failure
                # (and without swallowing KeyboardInterrupt).
                print(f"❌ Skipping image {i+1}: {retry_error}")

    return generated_images

In [None]:
def create_vector_store(lesson_text, topic):
    """Chunk the lesson text and index the chunks in a FAISS vector store.

    Every chunk produced by the global ``text_splitter`` becomes a ``Document``
    tagged with the topic and its position, embedded with the global
    ``embeddings`` model.

    Returns:
        The FAISS vector store built from those documents.
    """
    print("\n📚 Creating vector store for Q&A generation...")

    documents = [
        Document(page_content=piece, metadata={"topic": topic, "chunk_id": position})
        for position, piece in enumerate(text_splitter.split_text(lesson_text))
    ]

    store = FAISS.from_documents(documents, embeddings)
    print(f"Vector store created with {len(documents)} chunks")

    return store

In [None]:
def initialize_quiz_model():
    """Load the first available quiz-generation model into the module globals.

    Candidate checkpoints are tried in order of preference. On success the
    globals ``quiz_model`` and ``quiz_tokenizer`` are set together and the
    function returns. If every candidate fails, both globals are set to
    ``None`` so callers use the non-model fallback generators.
    """
    global quiz_model, quiz_tokenizer

    print("🎯 Loading specialized quiz generation model...")

    # 8-bit quantization config. BitsAndBytesConfig only has a compute-dtype
    # option for 4-bit loading (bnb_4bit_compute_dtype), so none is passed here.
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)

    # Try models in order of preference
    models_to_try = [
        "mistralai/Mistral-7B-Instruct-v0.1",
        "meta-llama/Llama-2-7b-chat-hf",
        "HuggingFaceH4/zephyr-7b-beta",
        "TheBloke/Llama-2-7B-Chat-GPTQ",  # Quantized version
        "google/flan-t5-xl",  # Good for Q&A
        "google/flan-t5-large"  # Smaller but still good
    ]

    for model_name in models_to_try:
        try:
            print(f"Trying to load: {model_name}")

            # Different loading strategies based on model
            if "GPTQ" in model_name:
                # GPTQ models don't need BitsAndBytes
                loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
                loaded_model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    torch_dtype=torch.float16,
                    device_map="auto"
                )
            elif "flan-t5" in model_name:
                # T5 models work differently
                from transformers import T5ForConditionalGeneration, T5Tokenizer
                loaded_tokenizer = T5Tokenizer.from_pretrained(model_name)
                loaded_model = T5ForConditionalGeneration.from_pretrained(
                    model_name,
                    torch_dtype=torch.float16,
                    device_map="auto"
                )
            else:
                # Regular models with quantization
                loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
                loaded_model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    quantization_config=bnb_config,
                    device_map="auto"
                )

        except Exception as e:
            print(f"❌ Failed to load {model_name}: {str(e)[:100]}")
            continue

        # Publish tokenizer and model together so the globals never pair a
        # tokenizer with a model from a different (failed) candidate.
        quiz_tokenizer = loaded_tokenizer
        quiz_model = loaded_model
        print(f"✅ Successfully loaded: {model_name}")
        return

    # If all models fail, don't use phi-2, use better fallback
    print("⚠️ All models failed. Using enhanced fallback generation instead.")
    quiz_model = None  # Will trigger fallback
    quiz_tokenizer = None

In [None]:
def generate_quiz_questions(lesson_text, topic, num_questions=15):
    """Generate a shuffled quiz mixing four question types.

    The requested total is split 30% multiple choice, 25% true/false, 25%
    fill-in-the-blank and 20% short answer; whatever is lost to rounding down
    is added to multiple choice. Each type is produced by the quiz model when
    one is loaded, otherwise by the rule-based fallback.

    Args:
        lesson_text: Lesson body the questions are based on.
        topic: Lesson topic.
        num_questions: Total number of questions wanted.

    Returns:
        A list of at most ``num_questions`` question dicts in random order.
    """
    print(f"\n❓ Generating {num_questions} quiz questions with multiple formats...")

    # Initialize quiz model if not already loaded
    if quiz_model is None:
        initialize_quiz_model()

    # Determine question type distribution
    question_distribution = {
        "multiple_choice": int(num_questions * 0.3),  # 30%
        "true_false": int(num_questions * 0.25),      # 25%
        "fill_blank": int(num_questions * 0.25),      # 25%
        "short_answer": int(num_questions * 0.2)      # 20%
    }

    # Adjust for rounding
    total = sum(question_distribution.values())
    if total < num_questions:
        question_distribution["multiple_choice"] += num_questions - total

    print(f"📊 Question distribution: {question_distribution}")

    all_questions = []

    # Generate each type of question
    for q_type, count in question_distribution.items():
        if count <= 0:
            continue

        print(f"\n🎲 Generating {count} {q_type} questions...")

        if quiz_model is not None:
            # The model may return more or fewer questions than asked for:
            # cap the surplus so one type cannot crowd out the others ...
            questions = list(generate_questions_by_type(lesson_text, topic, q_type, count))[:count]

            # ... and top up a shortfall with rule-based questions, skipping
            # any whose text is already present.
            if len(questions) < count:
                seen = {q["question"] for q in questions}
                for extra in generate_fallback_by_type(lesson_text, topic, q_type, count):
                    if len(questions) >= count:
                        break
                    if extra["question"] not in seen:
                        seen.add(extra["question"])
                        questions.append(extra)
        else:
            questions = generate_fallback_by_type(lesson_text, topic, q_type, count)

        all_questions.extend(questions)
        print(f"✅ Generated {len(questions)} {q_type} questions")

    # Shuffle for variety
    import random
    random.shuffle(all_questions)

    return all_questions[:num_questions]

In [None]:
def _build_typed_quiz_prompt(lesson_text, topic, question_type, count):
    """Build the instruction prompt for one question type.

    Unknown question types use the short-answer format. The prompt shows the
    expected layout twice (as "Question 1" and "Question 2").
    """
    formats = {
        "multiple_choice": (
            "multiple choice",
            "Question {n}: [question text]\nA) [option]\nB) [option]\nC) [option]\nD) [option]\nCorrect: [A/B/C/D]",
        ),
        "true_false": (
            "true/false",
            "Question {n}: [statement]\nAnswer: True/False",
        ),
        "fill_blank": (
            "fill-in-the-blank",
            "Question {n}: [sentence with _____ for the blank]\nAnswer: [missing word/phrase]",
        ),
        "short_answer": (
            "short answer",
            "Question {n}: [question requiring 1-2 sentence answer]\nAnswer: [brief answer]",
        ),
    }
    label, item_template = formats.get(question_type, formats["short_answer"])
    example = "\n\n".join(item_template.format(n=n) for n in (1, 2))

    return (
        f"Generate {count} {label} questions about {topic}.\n\n"
        f"Based on this lesson:\n{lesson_text[:1000]}\n\n"
        f"Format each question exactly like this:\n{example}"
    )


def generate_questions_by_type(lesson_text, topic, question_type, count):
    """Generate questions of one type with the global quiz model.

    Note: a function with this name is defined again in a later cell; when the
    notebook is run top to bottom the later definition is the one in effect.

    Args:
        lesson_text: Lesson body; its first 1000 characters go into the prompt.
        topic: Lesson topic.
        question_type: "multiple_choice", "true_false", "fill_blank" or
            "short_answer" (any other value is treated as short answer).
        count: Number of questions to request; also caps the parsed result.

    Returns:
        A list of question dicts parsed from the model output, or the result
        of ``generate_fallback_by_type`` if generation raises.
    """
    prompt = _build_typed_quiz_prompt(lesson_text, topic, question_type, count)

    try:
        # Format prompt based on model
        model_name = quiz_model.config._name_or_path

        if "flan-t5" in model_name:
            formatted_prompt = prompt
        elif "mistral" in model_name.lower():
            formatted_prompt = f"<s>[INST] {prompt} [/INST]"
        elif "llama" in model_name.lower():
            formatted_prompt = f"<s>[INST] <<SYS>>\nYou are a helpful teacher creating quiz questions.\n<</SYS>>\n\n{prompt} [/INST]"
        else:
            formatted_prompt = prompt

        # Send inputs to wherever the quiz model actually lives; it is loaded
        # with device_map="auto", which need not match the global `device`.
        inputs = quiz_tokenizer(
            formatted_prompt, return_tensors="pt", max_length=1500, truncation=True
        ).to(quiz_model.device)

        with torch.no_grad():
            outputs = quiz_model.generate(
                inputs.input_ids,
                attention_mask=inputs.get("attention_mask"),
                max_length=2500,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=quiz_tokenizer.eos_token_id
            )

        generated = outputs[0]
        # Decoder-only models return prompt + continuation. Drop the prompt
        # tokens so the format template in the prompt is never parsed as if it
        # were a generated question. Encoder-decoder models (T5) return only
        # the continuation.
        if not getattr(quiz_model.config, "is_encoder_decoder", False):
            generated = generated[inputs.input_ids.shape[-1]:]

        response = quiz_tokenizer.decode(generated, skip_special_tokens=True)

        # Extract generated content
        if "[/INST]" in response:
            response = response.split("[/INST]", 1)[1].strip()

        # Parse based on question type
        return parse_questions_by_type(response, topic, question_type)[:count]

    except Exception as e:
        print(f"❌ Model generation failed: {e}")
        return generate_fallback_by_type(lesson_text, topic, question_type, count)


In [None]:
def _build_typed_quiz_prompt(lesson_text, topic, question_type, count):
    """Build the instruction prompt for one question type.

    Unknown question types use the short-answer format. The prompt shows the
    expected layout twice (as "Question 1" and "Question 2").
    """
    formats = {
        "multiple_choice": (
            "multiple choice",
            "Question {n}: [question text]\nA) [option]\nB) [option]\nC) [option]\nD) [option]\nCorrect: [A/B/C/D]",
        ),
        "true_false": (
            "true/false",
            "Question {n}: [statement]\nAnswer: True/False",
        ),
        "fill_blank": (
            "fill-in-the-blank",
            "Question {n}: [sentence with _____ for the blank]\nAnswer: [missing word/phrase]",
        ),
        "short_answer": (
            "short answer",
            "Question {n}: [question requiring 1-2 sentence answer]\nAnswer: [brief answer]",
        ),
    }
    label, item_template = formats.get(question_type, formats["short_answer"])
    example = "\n\n".join(item_template.format(n=n) for n in (1, 2))

    return (
        f"Generate {count} {label} questions about {topic}.\n\n"
        f"Based on this lesson:\n{lesson_text[:1000]}\n\n"
        f"Format each question exactly like this:\n{example}"
    )


def generate_questions_by_type(lesson_text, topic, question_type, count):
    """Generate questions of one type with the global quiz model.

    Note: an earlier cell defines a function with the same name; this later
    definition replaces it when the notebook is run top to bottom.

    Args:
        lesson_text: Lesson body; its first 1000 characters go into the prompt.
        topic: Lesson topic.
        question_type: "multiple_choice", "true_false", "fill_blank" or
            "short_answer" (any other value is treated as short answer).
        count: Number of questions to request; also caps the parsed result.

    Returns:
        A list of question dicts parsed from the model output, or the result
        of ``generate_fallback_by_type`` if generation raises.
    """
    prompt = _build_typed_quiz_prompt(lesson_text, topic, question_type, count)

    try:
        # Format prompt based on model
        model_name = quiz_model.config._name_or_path

        if "flan-t5" in model_name:
            formatted_prompt = prompt
        elif "mistral" in model_name.lower():
            formatted_prompt = f"<s>[INST] {prompt} [/INST]"
        elif "llama" in model_name.lower():
            formatted_prompt = f"<s>[INST] <<SYS>>\nYou are a helpful teacher creating quiz questions.\n<</SYS>>\n\n{prompt} [/INST]"
        else:
            formatted_prompt = prompt

        # Send inputs to wherever the quiz model actually lives; it is loaded
        # with device_map="auto", which need not match the global `device`.
        inputs = quiz_tokenizer(
            formatted_prompt, return_tensors="pt", max_length=1500, truncation=True
        ).to(quiz_model.device)

        with torch.no_grad():
            outputs = quiz_model.generate(
                inputs.input_ids,
                attention_mask=inputs.get("attention_mask"),
                max_length=2500,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=quiz_tokenizer.eos_token_id
            )

        generated = outputs[0]
        # Decoder-only models return prompt + continuation. Drop the prompt
        # tokens so the format template in the prompt is never parsed as if it
        # were a generated question. Encoder-decoder models (T5) return only
        # the continuation.
        if not getattr(quiz_model.config, "is_encoder_decoder", False):
            generated = generated[inputs.input_ids.shape[-1]:]

        response = quiz_tokenizer.decode(generated, skip_special_tokens=True)

        # Extract generated content
        if "[/INST]" in response:
            response = response.split("[/INST]", 1)[1].strip()

        # Parse based on question type
        return parse_questions_by_type(response, topic, question_type)[:count]

    except Exception as e:
        print(f"❌ Model generation failed: {e}")
        return generate_fallback_by_type(lesson_text, topic, question_type, count)

In [None]:
def _is_template_placeholder(text):
    """Return True for a bracketed format placeholder such as "[statement]"."""
    stripped = text.strip()
    return stripped.startswith("[") and stripped.endswith("]")


def parse_questions_by_type(response, topic, question_type):
    """Parse questions of one type out of a model response.

    Question texts that are only a bracketed placeholder (an echo of the
    format template, e.g. "[statement]") are ignored.

    Args:
        response: Raw text produced by the quiz model.
        topic: Lesson topic; used to build the ``concept`` label.
        question_type: "multiple_choice", "true_false", "fill_blank" or
            anything else for short answer.

    Returns:
        A list of question dicts. Every dict has ``type``, ``question``,
        ``answer`` and ``concept``; multiple-choice dicts also carry
        ``options`` (keys A-D) and ``correct``. Empty when nothing matches.
    """
    questions = []

    if question_type == "multiple_choice":
        # Parse MCQ format. The answer letter may be lower-case and must be a
        # standalone letter, so "Correct: Answer is B" is not read as "A".
        pattern = r'Question\s*\d*:?\s*(.+?)\n\s*A\)\s*(.+?)\n\s*B\)\s*(.+?)\n\s*C\)\s*(.+?)\n\s*D\)\s*(.+?)\n\s*Correct:\s*([A-Da-d])\b'
        matches = re.findall(pattern, response, re.MULTILINE | re.DOTALL)

        for q_text, opt_a, opt_b, opt_c, opt_d, correct in matches:
            if _is_template_placeholder(q_text):
                continue
            questions.append({
                "type": "multiple_choice",
                "question": q_text.strip(),
                "options": {
                    "A": opt_a.strip(),
                    "B": opt_b.strip(),
                    "C": opt_c.strip(),
                    "D": opt_d.strip()
                },
                "correct": correct.upper(),
                "answer": f"The correct answer is {correct.upper()}",
                "concept": f"{topic} MCQ"
            })

    elif question_type == "true_false":
        # Parse T/F format. The lookahead rejects the literal "True/False"
        # placeholder from the format template.
        pattern = r'Question\s*\d*:?\s*(.+?)\n\s*Answer:\s*(True|False)\b(?!\s*/)'
        matches = re.findall(pattern, response, re.IGNORECASE)

        for statement, answer in matches:
            if _is_template_placeholder(statement):
                continue
            questions.append({
                "type": "true_false",
                "question": f"True or False: {statement.strip()}",
                "answer": answer.capitalize(),
                "concept": f"{topic} T/F"
            })

    elif question_type == "fill_blank":
        # Parse fill-in-the-blank format (the question must contain underscores)
        pattern = r'Question\s*\d*:?\s*(.+?_+.+?)\n\s*Answer:\s*(.+?)(?:\n|$)'
        matches = re.findall(pattern, response, re.MULTILINE)

        for blank_q, answer in matches:
            if _is_template_placeholder(blank_q):
                continue
            questions.append({
                "type": "fill_blank",
                "question": blank_q.strip(),
                "answer": answer.strip(),
                "concept": f"{topic} Fill"
            })

    else:  # short_answer
        # Parse short answer format; an answer runs until the next "Question"
        pattern = r'Question\s*\d*:?\s*(.+?)\n\s*Answer:\s*(.+?)(?=Question|\Z)'
        matches = re.findall(pattern, response, re.DOTALL)

        for q_text, answer in matches:
            if _is_template_placeholder(q_text):
                continue
            questions.append({
                "type": "short_answer",
                "question": q_text.strip(),
                "answer": answer.strip()[:200],  # Limit answer length
                "concept": f"{topic} Short"
            })

    return questions

In [None]:
def _negate_statement(sentence):
    """Flip the first " is "/" are " in a sentence, or return None.

    Only the first verb is changed: negating every occurrence can turn a
    sentence into a double negative that is true again. An existing
    " is not "/" are not " is turned back into the positive form.
    """
    for verb in (" is ", " are "):
        negative = verb.rstrip() + " not "
        if negative in sentence:
            return sentence.replace(negative, verb, 1)
        if verb in sentence:
            return sentence.replace(verb, negative, 1)
    return None


def generate_fallback_by_type(lesson_text, topic, question_type, count):
    """Build questions of one type from the lesson text without any model.

    The lesson is split on "." and fragments longer than 30 characters are
    used as source sentences.

    Args:
        lesson_text: Lesson body.
        topic: Lesson topic; used in distractors, templates and labels.
        question_type: "multiple_choice", "true_false", "fill_blank" or
            anything else for short answer.
        count: Maximum number of questions to return.

    Returns:
        A list of at most ``count`` question dicts; fewer (possibly none) when
        the lesson has too little usable content.
    """
    questions = []
    sentences = [s.strip() for s in lesson_text.split('.') if len(s.strip()) > 30]

    if question_type == "multiple_choice":
        # Scan all sentences (not just the first `count`) for "X is Y" facts.
        for sentence in sentences:
            if len(questions) >= count:
                break
            if " is " not in sentence:
                continue

            parts = sentence.split(" is ", 1)
            subject = parts[0].strip()
            predicate = parts[1].strip()

            questions.append({
                "type": "multiple_choice",
                "question": f"What is {subject}?",
                "options": {
                    "A": predicate,
                    "B": f"Not related to {topic}",
                    "C": f"A type of {topic} equipment",
                    "D": f"The opposite of {subject}"
                },
                "correct": "A",
                "answer": "The correct answer is A",
                "concept": f"{topic} MCQ"
            })

    elif question_type == "true_false":
        # Alternate true statements with negated (false) ones.
        for i, sentence in enumerate(sentences[:max(count, 0)]):
            statement, answer = sentence, "True"

            if i % 2 == 1:
                negated = _negate_statement(sentence)
                # A sentence that cannot be negated is still true, so it must
                # not be labelled "False".
                if negated is not None:
                    statement, answer = negated, "False"

            questions.append({
                "type": "true_false",
                "question": f"True or False: {statement}",
                "answer": answer,
                "concept": f"{topic} T/F"
            })

    elif question_type == "fill_blank":
        # Bold (**term**) phrases are preferred as the blanked-out answer.
        key_terms = [t for t in re.findall(r'\*\*([^*]+)\*\*', lesson_text) if t.strip()]

        for sentence in sentences:
            if len(questions) >= count:
                break

            words = sentence.split()
            if len(words) <= 5:
                continue

            # Use any key term that occurs in this sentence ...
            term = next((t for t in key_terms if t in sentence), None)
            if term is not None:
                blank_sentence = sentence.replace(term, "_____")
                answer = term
            else:
                # ... otherwise blank out the middle word.
                blank_pos = len(words) // 2
                answer = words[blank_pos]
                words[blank_pos] = "_____"
                blank_sentence = " ".join(words)

            questions.append({
                "type": "fill_blank",
                "question": blank_sentence,
                "answer": answer,
                "concept": f"{topic} Fill"
            })

    else:  # short_answer
        # Generic templates; answers are taken from the lesson sentences.
        question_templates = [
            f"What is the main purpose of {topic}?",
            f"How does {topic} work?",
            f"Why is {topic} important?",
            f"Describe the process of {topic}.",
            f"What are the key components of {topic}?"
        ]

        for i in range(min(count, len(question_templates))):
            questions.append({
                "type": "short_answer",
                "question": question_templates[i],
                "answer": sentences[i % len(sentences)][:150] if sentences else f"{topic} is an important concept.",
                "concept": f"{topic} Short"
            })

    return questions[:count]

In [None]:
def generate_with_quiz_model(lesson_text, topic, num_questions):
    """Generate simple question/answer pairs with the loaded quiz model.

    Args:
        lesson_text: Lesson body; an excerpt is placed in the prompt and the
            full text is handed to the parser for its fallback.
        topic: Lesson topic.
        num_questions: Number of questions to request.

    Returns:
        The result of ``parse_generated_questions`` on the model output, or of
        ``generate_enhanced_fallback_questions`` when no quiz model is loaded.
    """
    if quiz_model is None:
        return generate_enhanced_fallback_questions(lesson_text, topic, num_questions)

    model_name = quiz_model.config._name_or_path

    if "flan-t5" in model_name:
        # T5 models use different prompting
        formatted_prompt = f"""Generate {num_questions} quiz questions about {topic}.

Context: {lesson_text[:1000]}

Create questions in this format:
Q1: [question]
A1: [answer]
Q2: [question]
A2: [answer]
..."""
        max_input_tokens = 512
        generation_kwargs = {
            "max_length": 1000,
            "temperature": 0.7,
            "do_sample": True,
        }

    else:
        # Regular (decoder-only) model prompting
        prompt = f"""Create {num_questions} quiz questions based on this lesson about {topic}.

Lesson: {lesson_text[:1500]}

Generate questions in a simple format:
1. Question: [text]
   Answer: [text]

2. Question: [text]
   Answer: [text]"""

        # Mistral and Llama chat checkpoints share the [INST] wrapper here.
        lowered_name = model_name.lower()
        if "mistral" in lowered_name or "llama" in lowered_name:
            formatted_prompt = f"<s>[INST] {prompt} [/INST]"
        else:
            formatted_prompt = prompt

        max_input_tokens = 2000
        generation_kwargs = {
            "max_length": 3000,
            "temperature": 0.7,
            "top_p": 0.9,
            "do_sample": True,
            "pad_token_id": quiz_tokenizer.eos_token_id,
        }

    # Send inputs to wherever the quiz model actually lives; it is loaded with
    # device_map="auto", which need not match the global `device`.
    inputs = quiz_tokenizer(
        formatted_prompt, return_tensors="pt", max_length=max_input_tokens, truncation=True
    ).to(quiz_model.device)

    with torch.no_grad():
        outputs = quiz_model.generate(
            inputs.input_ids,
            attention_mask=inputs.get("attention_mask"),
            **generation_kwargs
        )

    generated = outputs[0]
    # Decoder-only models return prompt + continuation; drop the prompt tokens
    # so the lesson excerpt and the format template are not parsed as
    # questions. Encoder-decoder models (T5) return only the continuation.
    if not getattr(quiz_model.config, "is_encoder_decoder", False):
        generated = generated[inputs.input_ids.shape[-1]:]

    response = quiz_tokenizer.decode(generated, skip_special_tokens=True)

    # Parse response into questions
    return parse_generated_questions(response, topic, lesson_text, num_questions)


In [None]:
def generate_enhanced_fallback_questions(lesson_text, topic, num_questions):
    """Generate quiz questions from the lesson text without any model.

    Questions are produced in this order until ``num_questions`` is reached:
    true/false pairs from the first sentences, fill-in-the-blank from key
    terms, "What is X?" definitions, generic short-answer questions about the
    topic, and finally a generic review question repeated as padding.

    Args:
        lesson_text: Lesson body.
        topic: Lesson topic.
        num_questions: Number of questions to return.

    Returns:
        A list of ``num_questions`` dicts with keys ``concept``, ``context``,
        ``type``, ``question`` and ``answer`` (empty when ``num_questions``
        is not positive).
    """
    questions = []

    # Extract meaningful content from lesson
    sentences = [s.strip() for s in lesson_text.split('.') if len(s.strip()) > 40]

    # Extract key terms: bold phrases first, then capitalized phrases
    key_terms = re.findall(r'\*\*([^*]+)\*\*', lesson_text)
    cap_phrases = re.findall(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)', lesson_text)
    key_terms.extend([p for p in cap_phrases if len(p) > 5 and p != topic])
    # The same term is often found by both patterns; keep first-seen order but
    # drop repeats so identical fill-in-the-blank questions are not produced.
    key_terms = list(dict.fromkeys(key_terms))

    # (term, definition) pairs from "<term> is/are <definition>" phrases
    definitions = re.findall(r'(\w+(?:\s+\w+){0,2})\s+(?:is|are)\s+(.+?)(?:\.|,)', lesson_text)

    # Ensure we have enough content
    if len(sentences) < 5:
        sentences.extend(["This topic involves complex processes"] * 5)

    negations = {"is": "is not", "are": "are not", "can": "cannot"}

    # Generate True/False questions (easiest to create)
    for i in range(min(5, num_questions, len(sentences))):
        if len(questions) >= num_questions:
            break

        fact = sentences[i]
        questions.append({
            "concept": f"{topic} fact {i+1}",
            "context": fact[:100] + "...",
            "type": "true_false",
            "question": f"True or False: {fact}",
            "answer": "True"
        })

        # Also create false version
        if len(questions) < num_questions and i < len(sentences) - 1:
            # Negate the first standalone is/are/can only. Word boundaries keep
            # words such as "this" or "significant" intact, and a single
            # negation avoids double negatives that would be true again.
            false_fact = re.sub(
                r'\b(is|are|can)\b',
                lambda match: negations[match.group(1)],
                fact,
                count=1
            )
            if false_fact != fact:  # Only if we made a change
                questions.append({
                    "concept": f"{topic} fact {i+1}b",
                    "context": fact[:100] + "...",
                    "type": "true_false",
                    "question": f"True or False: {false_fact}",
                    "answer": "False"
                })

    # Generate Fill in the Blank questions
    for term in key_terms[:5]:
        if len(questions) >= num_questions:
            break

        # Use the first sentence containing this term
        for sent in sentences:
            if term in sent:
                questions.append({
                    "concept": f"{topic} term",
                    "context": sent[:100] + "...",
                    "type": "fill_blank",
                    "question": sent.replace(term, "_____"),
                    "answer": term
                })
                break

    # Generate definition questions
    for term, definition in definitions[:5]:
        if len(questions) >= num_questions:
            break

        questions.append({
            "concept": f"{topic} definition",
            "context": f"{term} is {definition[:100]}...",
            "type": "simple_qa",
            "question": f"What is {term}?",
            "answer": definition[:200]
        })

    # Generate short answer questions about the topic
    topic_questions = [
        f"What is the main purpose of {topic}?",
        f"How does {topic} work?",
        f"Why is {topic} important?",
        f"What are the key components of {topic}?",
        f"Describe the process of {topic}.",
        f"What are the applications of {topic}?",
        f"What role does {topic} play in nature?",
        f"How does {topic} benefit us?"
    ]

    for i, q in enumerate(topic_questions):
        if len(questions) >= num_questions:
            break

        # Use a lesson sentence as the answer
        answer = sentences[i % len(sentences)] if sentences else f"{topic} is an important process."

        questions.append({
            "concept": f"{topic} understanding",
            "context": "From the lesson",
            "type": "short_answer",
            "question": q,
            "answer": answer[:200]
        })

    # Ensure we have at least the requested number
    while len(questions) < num_questions:
        questions.append({
            "concept": f"{topic} review",
            "context": "General understanding",
            "type": "simple_qa",
            "question": f"What did you learn about {topic} from this lesson?",
            "answer": f"The lesson explains that {topic} is an important concept involving various processes and applications."
        })

    return questions[:num_questions]


In [None]:
def parse_generated_questions(response, topic, lesson_text, num_questions):
    """Parse question/answer pairs from a model response, with a fallback.

    Args:
        response: Raw text returned by the model. Each pair is expected as a
            numbered question (``1.``, ``Q1:`` or ``Question 1:``) followed by
            an answer that may be prefixed with ``A:``, ``A1:`` or ``Answer:``.
        topic: Lesson topic; used to label the ``concept`` of each question.
        lesson_text: Lesson body, forwarded to the fallback generator.
        num_questions: Number of questions wanted.

    Returns:
        A list of at most ``num_questions`` dicts with the keys ``concept``,
        ``context``, ``type``, ``question`` and ``answer``. When fewer pairs
        can be parsed, the remainder is requested from
        ``generate_enhanced_fallback_questions``.
    """
    questions = []

    # Try to parse numbered questions
    pattern = r'(?:Q?\d+[:.]|Question \d+:)\s*(.+?)(?:\n|$)(?:A\d*[:.]|Answer:)?\s*(.+?)(?=Q?\d+[:.:]|Question \d+:|$)'
    matches = re.findall(pattern, response, re.MULTILINE | re.DOTALL)

    for q_text, a_text in matches:
        # Strip before testing: the lazy groups can capture whitespace only
        # (e.g. a trailing "2. " marker with no question text), which would
        # otherwise be stored as an empty question or answer.
        question = q_text.strip()
        answer = a_text.strip()
        if not question or not answer:
            continue

        questions.append({
            "concept": f"{topic} Q{len(questions)+1}",
            "context": "From the lesson",
            "type": "simple_qa",
            "question": question[:200],
            "answer": answer[:200]
        })

    # If parsing failed or not enough questions, use fallback
    if len(questions) < num_questions:
        print(f"⚠️ Only parsed {len(questions)} questions, adding fallback questions...")
        fallback = generate_enhanced_fallback_questions(lesson_text, topic, num_questions - len(questions))
        questions.extend(fallback)

    return questions[:num_questions]

In [None]:
def generate_audio_lesson(lesson_text, topic, output_dir):
    """Generate an MP3 narration of the lesson with gTTS.

    Args:
        lesson_text: Lesson body. Markdown markers (``#``, ``*``, ``[``,
            ``]``) are removed and runs of newlines become sentence breaks
            before the text is synthesised.
        topic: Lesson topic; spoken as an introduction and used in the
            output file name.
        output_dir: Directory the MP3 file is written into.

    Returns:
        The path of the saved MP3, or None when audio support is unavailable
        or generation fails (the error is printed, not raised).
    """
    if not AUDIO_AVAILABLE:
        print("⚠️ Audio generation not available (gTTS not installed)")
        return None

    print("\n🎤 Generating audio narration...")

    try:
        # Clean text for better speech
        clean_text = re.sub(r'[#*\[\]]', '', lesson_text)
        clean_text = re.sub(r'\n+', '. ', clean_text)

        tts = gTTS(text=f"Lesson on {topic}. {clean_text}", lang='en', slow=False)

        # Path separators in the topic (e.g. "TCP/IP") would point the file
        # into a non-existent sub-directory, so fold them into underscores
        # together with the spaces.
        safe_topic = topic.replace(' ', '_').replace('/', '_').replace('\\', '_')
        audio_path = os.path.join(output_dir, f"lesson_{safe_topic}_audio.mp3")
        tts.save(audio_path)

        print("✅ Audio saved!")
        return audio_path
    except Exception as e:
        # Best effort: narration is optional, so report and carry on.
        print(f"❌ Error generating audio: {e}")
        return None

In [None]:
def create_study_guide(topic, lesson_text, quiz_questions):
    """Create a comprehensive markdown study guide with multiple question types.

    Args:
        topic: Lesson topic; used in headings and the generic objectives.
        lesson_text: Lesson body. Its first 500 characters form the summary
            and ``**bold**`` terms found in its first 1000 characters are
            candidates for learning objectives.
        quiz_questions: List of question dicts. Each may carry ``type``
            (defaults to ``simple_qa``), ``question`` and ``answer``;
            ``multiple_choice`` questions use ``options`` (a dict keyed
            ``A``-``D``) and ``correct`` instead of ``answer``.

    Returns:
        The complete study guide as one markdown string.
    """
    study_guide = f"""# 📚 Study Guide: {topic}

## 📝 Lesson Summary

{_truncate_text(lesson_text, 500)}

---

## ❓ Practice Questions ({len(quiz_questions)} questions)

"""

    # Group questions by type (dict keeps the order types first appear in)
    questions_by_type = {}
    for q in quiz_questions:
        questions_by_type.setdefault(q.get('type', 'simple_qa'), []).append(q)

    type_names = {
        "multiple_choice": "🔤 Multiple Choice Questions",
        "true_false": "✅ True or False",
        "fill_blank": "📝 Fill in the Blanks",
        "short_answer": "💭 Short Answer Questions",
        "simple_qa": "❔ Questions & Answers"
    }

    # Questions are numbered continuously across all type sections
    question_num = 1
    for q_type, type_questions in questions_by_type.items():
        study_guide += f"\n### {type_names.get(q_type, q_type)} ({len(type_questions)} questions)\n\n"

        for q in type_questions:
            study_guide += _format_study_question(question_num, q_type, q)
            question_num += 1

        study_guide += "---\n"

    # Add statistics section
    study_guide += f"""## 📊 Question Statistics

- **Total Questions**: {len(quiz_questions)}
- **Question Types**: {len(questions_by_type)}
"""

    # Calculate percentages for each type (loop is empty when there are no
    # questions, so the division never sees a zero total)
    for q_type, type_questions in questions_by_type.items():
        percentage = (len(type_questions) / len(quiz_questions)) * 100
        study_guide += f"- **{type_names.get(q_type, q_type)}**: {len(type_questions)} ({percentage:.0f}%)\n"

    # Add study tips
    study_guide += """
---

## 💡 Study Tips

### By Question Type:

**🔤 Multiple Choice Tips:**
- Read all options before selecting
- Eliminate obviously wrong answers first
- Look for keywords in the question
- Watch for "all of the above" or "none of the above"

**✅ True/False Tips:**
- Watch for absolute words (always, never, all, none)
- Look for qualifiers that might change meaning
- If unsure, consider if the statement is generally true

**📝 Fill in the Blanks Tips:**
- Read the entire sentence first
- Consider the context around the blank
- Think about the lesson's key terms
- Check if your answer makes grammatical sense

**💭 Short Answer Tips:**
- Answer in complete sentences
- Include key terms from the lesson
- Be concise but thorough
- Support your answer with facts from the lesson

### General Study Strategies:

1. **Active Recall** - Try to answer questions without looking at notes first
2. **Spaced Repetition** - Review questions multiple times over several days
3. **Teach Someone** - Explain the concepts to someone else
4. **Make Connections** - Link new information to what you already know
5. **Practice Tests** - Time yourself answering all questions

---

## 🎯 Learning Objectives

After completing this study guide, you should be able to:
"""

    # Extract key learning objectives. A dict is used as an insertion-ordered
    # set so the objectives are the same on every run (plain set order varies
    # between interpreter sessions).
    ignored_words = {'what', 'which', 'where', 'when', 'how', 'true', 'false', 'blank'}
    topic_lower = topic.lower()
    key_concepts = {}

    # From questions
    for q in quiz_questions[:10]:  # Analyze first 10 questions
        for word in q.get('question', '').split():
            term = word.strip('.,!?:;')
            cleaned = term.lower()
            if len(cleaned) > 4 and cleaned not in ignored_words and cleaned != topic_lower:
                key_concepts.setdefault(term, None)

    # From lesson text - look for bold terms
    for term in re.findall(r'\*\*([^*]+)\*\*', lesson_text[:1000]):
        key_concepts.setdefault(term, None)

    # Add learning objectives: the first 6 concepts in the order they were found
    for concept in list(key_concepts)[:6]:
        study_guide += f"- Understand and explain **{concept}**\n"

    generated_on = datetime.datetime.now().strftime("%B %d, %Y at %I:%M %p")

    # Add general objectives
    study_guide += f"""- Apply {topic} concepts to solve problems
- Identify the key components and processes of {topic}
- Analyze the importance and applications of {topic}
- Evaluate different aspects of {topic} in various contexts

---

## 📚 Additional Resources

To further your understanding of {topic}:

1. **Review the lesson** multiple times, focusing on different aspects each time
2. **Create your own examples** related to the concepts
3. **Draw diagrams or mind maps** to visualize relationships
4. **Research real-world applications** of {topic}
5. **Discuss with peers** to gain different perspectives

---

## ✍️ Notes Section

Use this space to write your own notes, questions, or insights about {topic}:

_[Your notes here]_

---

## 🏆 Self-Assessment

Rate your understanding of each topic (1-5 scale):

- [ ] Overall understanding of {topic}
- [ ] Key concepts and definitions
- [ ] Practical applications
- [ ] Problem-solving ability
- [ ] Ability to explain to others

**Target Score**: Aim for 4 or 5 in each area before moving on!

---

*Generated on: {generated_on}*
"""

    return study_guide


def _truncate_text(text, limit):
    """Return text cut to `limit` characters, appending '...' only if it was cut."""
    if len(text) > limit:
        return text[:limit] + "..."
    return text


def _format_study_question(number, q_type, q):
    """Render one quiz question and its collapsible answer as markdown."""
    question_text = _truncate_text(q.get('question', '').strip(), 300)

    if q_type == "multiple_choice":
        # A malformed question (missing or non-dict 'options') falls back to
        # placeholder option text instead of raising.
        options = q.get('options')
        if not isinstance(options, dict):
            options = {}
        correct = q.get('correct', 'A')
        return f"""**{number}.** {question_text}

A) {options.get('A', 'Option A')}
B) {options.get('B', 'Option B')}
C) {options.get('C', 'Option C')}
D) {options.get('D', 'Option D')}

<details>
<summary>Show Answer</summary>

✅ **{correct}** - {options.get(correct, 'Correct answer')}

</details>

"""

    if q_type == "true_false":
        answer_line = f"✅ **{q.get('answer', 'True')}**"
    elif q_type == "fill_blank":
        answer_line = f"✅ **{q.get('answer', 'Answer')}**"
    else:  # short_answer or simple_qa
        answer_line = _truncate_text(q.get('answer', 'Answer not provided').strip(), 300)

    return f"""**{number}.** {question_text}

<details>
<summary>Show Answer</summary>

{answer_line}

</details>

"""

In [None]:
import random

def generate_complete_lesson(topic, save_to_drive=False):
    """Generate a complete lesson package with all components.

    Runs the whole pipeline for one topic - lesson text, educational images,
    quiz questions, study guide and audio narration - displaying each part in
    the notebook and saving the files into a timestamped output directory.

    Args:
        topic: Subject of the lesson.
        save_to_drive: When True and /content/drive/MyDrive exists, save
            under Drive; otherwise save under /content/complete_lessons.

    Returns:
        Path of the directory the lesson files were written to.
    """

    # Check if models are initialized
    if text_model is None:
        print("❌ Models not initialized! Running initialize_models()...")
        initialize_models()

    print(f"\n{'='*60}")
    print(f"🚀 Creating Complete Lesson Package for: {topic}")
    print('='*60)

    # Create output directory
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    safe_topic = _safe_filename(topic)

    if save_to_drive and os.path.exists("/content/drive/MyDrive"):
        output_dir = f"/content/drive/MyDrive/complete_lessons/{safe_topic}_{timestamp}"
    else:
        output_dir = f"/content/complete_lessons/{safe_topic}_{timestamp}"

    os.makedirs(output_dir, exist_ok=True)

    # 1. Generate lesson
    lesson_text = generate_lesson(topic)

    print("\n" + "="*60)
    print("📝 LESSON CONTENT")
    print("="*60)
    display(Markdown(f"# {topic}\n\n{lesson_text}"))

    # Save lesson (explicit UTF-8: the content may hold emoji and other
    # non-ASCII text that a locale-default encoding cannot represent)
    with open(f"{output_dir}/lesson.md", 'w', encoding='utf-8') as f:
        f.write(f"# Lesson: {topic}\n\n{lesson_text}")

    # 2. Generate images with improved suggestions
    print("\n" + "="*60)
    print("🎨 GENERATING EDUCATIONAL IMAGES")
    print("="*60)

    image_suggestions = get_image_suggestions(lesson_text, topic)
    images = []

    if image_suggestions:
        print(f"\n🎨 AI suggested {len(image_suggestions)} images:")
        for i, concept in enumerate(image_suggestions):
            print(f"\n{i+1}. {concept['title']}")
            print(f"   Description: {concept['prompt'][:100]}...")

        images = generate_images(image_suggestions)

        if images:
            fig, axes = plt.subplots(1, len(images), figsize=(8*len(images), 8))
            if len(images) == 1:
                # plt.subplots returns a bare Axes for a single panel
                axes = [axes]

            for i, (title, img) in enumerate(images):
                axes[i].imshow(img)
                axes[i].set_title(title, fontsize=16, pad=15, weight='bold', color='#2c3e50')
                axes[i].axis('off')

                # Save image; titles are model-generated, so path separators
                # are neutralised before they reach the file name
                img.save(f"{output_dir}/image_{i+1}_{_safe_filename(title)}.png", "PNG", quality=95)

            plt.suptitle(f"Educational Visuals for {topic}", fontsize=20, y=0.98, weight='bold')
            plt.tight_layout()
            plt.savefig(f"{output_dir}/all_images.png", dpi=150, bbox_inches='tight')
            plt.show()
        else:
            print("⚠️ No images were generated")
    else:
        print("⚠️ No image suggestions found")

    # 3. Generate quiz with multiple formats
    print("\n" + "="*60)
    print("📋 GENERATING QUIZ QUESTIONS")
    print("="*60)

    # Generate 15 questions by default; normalise a None result to an empty
    # list so the later len() calls cannot fail
    quiz_questions = generate_quiz_questions(lesson_text, topic, num_questions=15) or []

    # Display quiz questions
    if quiz_questions:
        _display_quiz_questions(quiz_questions)
    else:
        print("⚠️ No quiz questions generated")

    # 4. Create study guide
    print("\n" + "="*60)
    print("📚 COMPLETE STUDY GUIDE")
    print("="*60)

    study_guide = create_study_guide(topic, lesson_text, quiz_questions)
    display(Markdown(study_guide))

    # Save study guide (UTF-8 for the same reason as the lesson file)
    with open(f"{output_dir}/study_guide.md", 'w', encoding='utf-8') as f:
        f.write(study_guide)

    # 5. Generate audio
    audio_file = generate_audio_lesson(lesson_text, topic, output_dir)
    if audio_file:
        print("\n" + "="*60)
        print("🎤 AUDIO NARRATION")
        print("="*60)
        try:
            from IPython.display import Audio
            display(Audio(audio_file))
        except Exception:
            # The inline player is optional; the file itself is already saved
            print(f"Audio file saved at: {audio_file}")

    # 6. Summary
    print("\n" + "="*60)
    print("✅ LESSON PACKAGE COMPLETE!")
    print("="*60)
    print(f"\n📁 All files saved to: {output_dir}")
    print(f"\n📊 Summary:")
    print(f"   • Lesson length: {len(lesson_text.split())} words")
    print(f"   • Images generated: {len(images)}")
    print(f"   • Quiz questions: {len(quiz_questions)}")
    print(f"   • Audio narration: {'Yes' if audio_file else 'No'}")

    return output_dir


def _safe_filename(name):
    """Turn free text into one path component: spaces and path separators become underscores."""
    return name.replace(' ', '_').replace('/', '_').replace('\\', '_')


def _display_quiz_questions(quiz_questions):
    """Display quiz questions in the notebook, grouped by type, with collapsible answers."""
    # Group by type for display
    questions_by_type = {}
    for q in quiz_questions:
        questions_by_type.setdefault(q.get('type', 'simple_qa'), []).append(q)

    type_emojis = {
        "simple_qa": "❔",
        "fill_blank": "📝",
        "true_false": "✅",
        "short_answer": "💭",
        "complete_sentence": "✏️",
        "multiple_choice": "🔤"
    }

    for q_type, questions in questions_by_type.items():
        print(f"\n{type_emojis.get(q_type, '❓')} {q_type.replace('_', ' ').title()} Questions ({len(questions)})")
        print("─" * 50)

        for i, q in enumerate(questions, 1):
            question = q.get('question', '')
            # Multiple-choice questions store the solution under 'correct'
            # rather than 'answer'; fall back to it before giving up.
            answer = q.get('answer', q.get('correct', 'Answer not provided'))

            # Only the question label and the answer line differ between types
            label = f"**Q{i}:**" if q_type == "simple_qa" else f"**{i}.**"
            if q_type == "simple_qa":
                answer_line = f"**A:** {answer}"
            elif q_type == "fill_blank":
                answer_line = f"**Answer:** {answer}"
            elif q_type == "true_false":
                answer_line = f"**{answer}**"
            else:
                answer_line = f"{answer}"

            display(Markdown(f"""{label} {question}

<details>
<summary><b>Show Answer</b></summary>

{answer_line}

</details>
"""))

# Quick start example
if __name__ == "__main__":
    # Initialize models once
    initialize_models()

    # Generate a complete lesson. Surrounding whitespace is dropped so it does
    # not leak into headings and file names, and an empty entry is rejected
    # instead of running the whole pipeline for a blank topic.
    topic = input("Enter the topic:").strip()
    if topic:
        output_path = generate_complete_lesson(topic, save_to_drive=True)
        print(f"\nLesson package created at: {output_path}")
    else:
        print("⚠️ No topic entered; skipping lesson generation.")