In [None]:
# Cell 1: Install required packages (huggingface-hub is installed, then upgraded to the latest release)
!pip install -q datasets tqdm huggingface-hub groq python-dotenv
!pip install -q --upgrade huggingface-hub

In [None]:
import os
import json
import time
from tqdm import tqdm
from datasets import load_dataset, Dataset
from dotenv import load_dotenv
from groq import Groq

# Initialize multiple Groq clients with API key rotation
from kaggle_secrets import UserSecretsClient

# Names of the Kaggle secrets that hold Groq API keys. Add or remove entries
# so the list matches the secrets attached to this notebook.
API_KEY_NAMES = [
    "GROQ_API_KEY_8", "GROQ_API_KEY_2"
]

# Build one Groq client per secret that can be resolved. A secret that is
# missing or empty is skipped (best effort), but the reason is reported so a
# misconfigured key name does not go unnoticed.
clients = []
user_secrets = UserSecretsClient()
for key_name in API_KEY_NAMES:
    try:
        api_key = user_secrets.get_secret(key_name)
        if api_key:
            clients.append(Groq(api_key=api_key))
        else:
            print(f"Skipping {key_name}: secret is empty")
    except Exception as exc:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        # Only the exception type is printed to avoid echoing secret material.
        print(f"Skipping {key_name}: {type(exc).__name__}")

if not clients:
    raise ValueError("No valid Groq API keys found!")

# Index of the client that the next get_client() call will hand out.
current_client_index = 0

def get_client():
    """Return the next Groq client, cycling round-robin through `clients`.

    Reads and advances the module-level `current_client_index`, wrapping back
    to 0 after the last client has been handed out.
    """
    global current_client_index
    slot = current_client_index
    selected = clients[slot]
    # Advance the cursor for the following call, wrapping at the end of the list.
    current_client_index = (slot + 1) % len(clients)
    return selected

# Runtime flags for the Kaggle environment: the HF datasets file-locking
# switch and the tokenizers parallelism switch.
os.environ.update({
    "HF_DATASETS_DISABLE_FILE_LOCKING": "1",
    "TOKENIZERS_PARALLELISM": "false",
})

In [None]:
# Cell 3: Your exact summarizer function
def create_educational_summary(content, model="llama-3.2-90b-vision-preview", max_attempts=5):
    """Summarize educational text with a Groq chat model, retrying on failure.

    Args:
        content: Source text to summarize. It is sent as the user message
            without truncation.
        model: Groq model identifier passed to the chat-completions call.
        max_attempts: Total number of API attempts before giving up (>= 1).

    Returns:
        The model's summary text on success. If every attempt fails, a string
        of the form "Error after <max_attempts> attempts: <last error>" is
        returned instead of raising, so callers must check for that marker.

    Raises:
        ValueError: If `max_attempts` is smaller than 1.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    system_message = """You are an expert educational content summarizer with expertise across ALL academic disciplines. Your task is to create a comprehensive yet concise summary that captures ALL important information while removing redundancy.

Follow these guidelines:
1. Capture EVERY new term, concept, or definition introduced
2. Preserve ALL explanations of new concepts or ideas
3. Maintain ALL key relationships between concepts
4. Include ALL practical applications and examples
5. Remove ONLY redundant explanations and repeated information
6. Use clear, direct language without unnecessary elaboration
7. Adapt your summary format to the specific field of study

For ANY scientific content:
- ALWAYS include ALL mathematical equations, formulas, and rules with their explanations
- ALWAYS preserve the exact notation and symbols used in the original content
- ALWAYS explain what each variable and symbol represents
- ALWAYS include any important constants or parameters
- ALWAYS explain the context and application of each equation

For ANY humanities content:
- Include literary analysis, themes, and symbolism
- Capture historical context, timelines, and cause-and-effect relationships
- Preserve philosophical arguments and ethical considerations
- Note cultural significance and societal impact

For ANY social sciences:
- Include theoretical frameworks and methodologies
- Capture statistical data and research findings
- Highlight policy implications and real-world applications
- Note cultural and societal contexts

For ANY arts and languages:
- Include techniques, styles, and movements
- Capture grammar rules, vocabulary, and cultural context
- Highlight artistic elements and creative processes
- Note historical development and influence

Format your response with these sections (include only those relevant to the content):
- New Terms and Definitions (list ALL new terms with their explanations)
- Core Concepts (include ALL key concepts with their explanations)
- [Field-Specific Content] (e.g., Mathematical Content, Literary Analysis, Historical Context)
- Relationships and Connections (how concepts relate to each other)
- Applications and Examples (ALL practical uses and examples)

Important: Do not omit any new concept, definition, or explanation. Only remove truly redundant content. Adapt your format to best represent the specific field of study. For ANY scientific content, ALWAYS include ALL equations and formulas with their explanations."""

    for attempt in range(max_attempts):
        try:
            client = get_client()  # Each attempt uses the next client in the rotation
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": content}  # No truncation
                ],
                temperature=0.2,
                timeout=60
            )
            return response.choices[0].message.content
        except Exception as e:
            if attempt < max_attempts - 1:
                # Exponential backoff: 8s, 16s, 32s, 64s, ...
                wait_time = 2 ** (attempt + 3)
                print(f"Attempt {attempt+1} failed ({type(e).__name__}). Waiting {wait_time}s...")
                time.sleep(wait_time)
                continue
            # Last attempt: report the failure as a marker string, not an exception.
            return f"Error after {max_attempts} attempts: {str(e)}"

In [None]:
# Cell 4: Load exactly 1000 samples (no truncation)
def load_samples(dataset_ranges=None):
    """
    Load text samples from the three source datasets.

    Args:
        dataset_ranges: Dictionary mapping a dataset key to a (start, end)
                        index pair, with `end` exclusive.
                        Format: {'lectures': (start, end),
                                'education': (start, end),
                                'feynman': (start, end)}
                        If None (or empty), the default ranges are used, which
                        request 1000 samples in total (333 + 333 + 334).
                        A key that is absent is skipped without downloading
                        that dataset.

    Returns:
        A list of {"text": ..., "source": ...} dicts in the order lectures,
        education, feynman. A dataset that fails to load is reported and
        skipped, so the list may hold fewer samples than requested.
    """
    # (range key / source label, Hub repository id, column holding the text)
    dataset_specs = (
        ("lectures", "AlexanderBenady/generated_lectures", "Lecture"),
        ("education", "ajibawa-2023/Education-High-School-Students", "text"),
        ("feynman", "enesxgrahovac/the-feynman-lectures-on-physics", "section_text"),
    )

    # Default ranges that request exactly 1000 samples (333 + 333 + 334)
    default_ranges = {
        'lectures': (0, 333),
        'education': (0, 333),
        'feynman': (0, 334)
    }

    ranges = dataset_ranges if dataset_ranges else default_ranges

    samples = []
    for source, repo_id, text_column in dataset_specs:
        if source not in ranges:
            print(f"Skipping {source}: no range specified")
            continue
        try:
            # Unpack first so a malformed range fails before any download.
            start, end = ranges[source]
            ds = load_dataset(repo_id, split="train")

            # Clamp to the rows that exist; an out-of-range index would
            # otherwise make select() fail and drop the whole dataset.
            end = min(end, len(ds))
            start = max(0, min(start, end))

            if start < end:
                batch = [{"text": row[text_column], "source": source}
                         for row in ds.select(range(start, end))]
            else:
                batch = []
            samples.extend(batch)
            print(f"Loaded {len(batch)} samples from {source} dataset (indices {start}-{end})")
        except Exception as e:
            print(f"Error loading {source}: {str(e)}")

    return samples

In [None]:
def _summary_failed(summary):
    """Return True when `summary` is not usable model output.

    create_educational_summary reports exhaustion of its retries by returning
    a string that starts with "Error after "; a non-string result is treated
    as a failure as well.
    """
    return not isinstance(summary, str) or summary.startswith("Error after ")


def _write_checkpoint(results, path):
    """Dump the summaries collected so far to `path` as indented JSON."""
    with open(path, 'w', encoding="utf-8") as f:
        json.dump(results, f, indent=2)


def process_and_create_hf_dataset():
    """Summarize every loaded sample and save the result as a Hugging Face dataset.

    Side effects:
        - writes 'progress_checkpoint.json' every 25 samples and once more
          after the loop, so the checkpoint always reflects the final state;
        - saves the dataset to 'education_summaries_dataset' and writes a
          README.md dataset card into that directory.

    Samples whose summary could not be generated are reported and left out of
    the dataset instead of being stored with the error text as their summary.

    Returns:
        The `Dataset` with columns original_text, summary, source, sample_id.

    Raises:
        ValueError: If no samples could be loaded, or no summary succeeded.
    """
    checkpoint_path = 'progress_checkpoint.json'
    output_dir = "education_summaries_dataset"
    checkpoint_every = 25
    # Must match the model create_educational_summary calls by default.
    model_name = "llama-3.2-90b-vision-preview"

    # 1. Load samples
    samples = load_samples()
    if not samples:
        raise ValueError("No samples loaded - check dataset errors")

    # 2. Process with summarizer
    results = []
    failed_ids = []
    last_index = len(samples) - 1
    for i, sample in enumerate(tqdm(samples, desc="Processing")):
        try:
            # Call the summarizer (no truncation)
            summary = create_educational_summary(sample["text"])

            failed = _summary_failed(summary)
            if failed:
                failed_ids.append(i)
                print(f"\nSample {i} failed: {summary}")
            else:
                results.append({
                    "original_text": sample["text"],
                    "summary": summary,
                    "source": sample["source"],
                    "sample_id": i
                })

            # Save progress every `checkpoint_every` samples
            if (i + 1) % checkpoint_every == 0:
                _write_checkpoint(results, checkpoint_path)
                print(f"\nCheckpoint saved: {len(results)} processed")

            # Back off longer after a failure; no need to wait after the last sample
            if i < last_index:
                time.sleep(10 if failed else 5)

        except Exception as e:
            print(f"\nCritical error on sample {i}: {str(e)}")
            continue

    # Final checkpoint so the file is current even when the sample count is
    # not a multiple of `checkpoint_every`.
    _write_checkpoint(results, checkpoint_path)

    if failed_ids:
        print(f"\n{len(failed_ids)} samples failed and were excluded: {failed_ids}")
    if not results:
        raise ValueError("No summaries were generated - every sample failed")

    # 3. Create Hugging Face Dataset
    hf_dataset = Dataset.from_dict({
        "original_text": [x["original_text"] for x in results],
        "summary": [x["summary"] for x in results],
        "source": [x["source"] for x in results],
        "sample_id": [x["sample_id"] for x in results]
    })

    # 4. Save dataset
    hf_dataset.save_to_disk(output_dir)

    # 5. Create dataset card (sources are derived from the data actually saved)
    sources = sorted({x["source"] for x in results})
    with open(f"{output_dir}/README.md", "w", encoding="utf-8") as f:
        f.write("# Education Summaries Dataset\n\n")
        f.write("## Dataset Summary\n")
        f.write(f"- Total samples: {len(results)}\n")
        f.write(f"- Sources: {', '.join(sources)}\n")
        f.write(f"- Generated using Groq's {model_name} model\n\n")
        f.write("## Fields\n")
        f.write("- original_text: Full original educational content\n")
        f.write("- summary: AI-generated comprehensive summary\n")
        f.write("- source: Dataset origin\n")
        f.write("- sample_id: Unique identifier\n")

    print("\n" + "="*50)
    print(f"Completed processing {len(results)} samples")
    print(f"Hugging Face dataset saved to: {output_dir}")
    print("="*50)

    return hf_dataset

In [None]:
if __name__ == "__main__":
    # Execute the full load -> summarize -> save pipeline.
    dataset = process_and_create_hf_dataset()

    # Preview the first row of the resulting dataset.
    print("\nSample from final dataset:")
    sample = dataset[0]
    print("Source: {}".format(sample['source']))
    original_chars = len(sample['original_text'])
    print("Original length: {} chars".format(original_chars))
    summary_chars = len(sample['summary'])
    print("Summary length: {} chars".format(summary_chars))
    print("\nSummary preview:")
    preview = sample['summary'][:200]
    print(preview + "...")