In [24]:
# Cell 1: Setup and Configuration

print("A: importing")
import google.generativeai as genai
import json
import time
from pathlib import Path
from typing import Dict, List

print("B: loaded env")
from dotenv import load_dotenv
load_dotenv('../.env.local')

print("C: have key")
import os
key = os.getenv('GEMINI_API_KEY')
assert key, "Missing GEMINI_API_KEY"
print(f"Key ends with: {key[-4:]}")

print("D: configure")
genai.configure(api_key=key)

print("E: construct model")
model = genai.GenerativeModel('gemini-2.5-flash')  # Updated model name

print("F: test call")
resp = model.generate_content("ping")
print("G: done", resp.text[:20])

print("‚úÖ Gemini configured successfully")


A: importing
B: loaded env
C: have key
Key ends with: 8Y-c
D: configure
E: construct model
F: test call
G: done pong
‚úÖ Gemini configured successfully


In [25]:
# Cell 2: Safe classifier (drop right after Cell 1)
import time
import google.generativeai as genai

# Reuse the 'model' from Cell 1
GENCFG = genai.types.GenerationConfig(
    temperature=0.2,
    top_p=0.9,
    max_output_tokens=512,
    response_mime_type="text/plain",
)

SYSTEM = (
    "You are a strict classifier. Output ONLY one label from:\n"
    "['Probability','Combinatorics','Number Theory','Algebra','Geometry','Other']\n"
    "No explanations; under 10 tokens."
)

def classify(prompt_text: str, retries: int = 2) -> str:
    contents = [
        {"role": "user", "parts": [{"text": SYSTEM + "\n\nQuestion:\n" + prompt_text}]}
    ]
    last_err = None
    for _ in range(retries + 1):
        try:
            resp = model.generate_content(
                contents,
                generation_config=GENCFG,
                tools=None,
            )
            pf = getattr(resp, "prompt_feedback", None)
            if pf and pf.block_reason:
                raise RuntimeError(f"Blocked: {pf.block_reason}")

            texts = []
            for cand in (resp.candidates or []):
                fr = getattr(cand, "finish_reason", "")
                parts = getattr(getattr(cand, "content", None), "parts", []) or []
                piece = "".join(getattr(p, "text", "") for p in parts if hasattr(p, "text")).strip()
                if piece:
                    texts.append((fr, piece))
            if texts:
                for fr, t in texts:
                    if fr == "STOP":
                        return t
                return texts[0][1]

            # If we hit MAX_TOKENS once, bump and retry
            if GENCFG.max_output_tokens < 2048:
                GENCFG.max_output_tokens = min(2048, GENCFG.max_output_tokens * 2)
            last_err = RuntimeError("No usable candidates returned.")
        except Exception as e:
            last_err = e
            time.sleep(0.3)
    raise last_err or RuntimeError("Classification failed.")


In [26]:
# Cell 2: Load Questions Data
questions_file = Path('../all-questions-content.json')

if not questions_file.exists():
    print("‚ùå all-questions-content.json not found. Run scraping first!")
else:
    with open(questions_file, 'r', encoding='utf-8') as f:
        questions = json.load(f)
    
    print(f"‚úÖ Loaded {len(questions)} questions")
    print(f"\nSample question:")
    sample = questions[0]
    print(f"Name: {sample['name']}")
    print(f"Difficulty: {sample['difficulty']}")
    print(f"Tags: {sample['tags']}")
    print(f"Text preview: {sample['questionText'][:200]}...")


‚úÖ Loaded 1210 questions

Sample question:
Name: Place Or Take
Difficulty: Hard
Tags: ['Probability']
Text preview: You are playing a one-player game with two opaque boxes. At each turn, you can choose to either "place" or "take". "Place" places $\$$1 from a third party into one box randomly. "Take" empties out one...


In [27]:
# Cell 3: Define Fixed Categories

# Fixed categories - curated list
FIXED_CATEGORIES = [
    "Linear Algebra",
    "Uniform Random Variables",
    "Normal Random Variables",
    "Exponential Random Variables",
    "Hypergeometric Random Variables",
    "Binomial Random Variables",
    "Poisson Random Variables",
    "Continuous Random Variables",
    "Discrete Random Variables",
    "Coins",
    "Dice",
    "Cards",
    "Grids",
    "Martingales",
    "Markov Chains",
    "Stochastic Processes",
    "Random Walks",
    "Game Theory",
    "Calculus",
    "Geometry",
    "Algebraic Manipulation",
    "Combinatorics"
]

print(f"‚úÖ Using {len(FIXED_CATEGORIES)} fixed categories:")
for i, cat in enumerate(FIXED_CATEGORIES, 1):
    print(f"  {i:2d}. {cat}")

# Categorization prompt
CATEGORIZATION_PROMPT = """You are categorizing quantitative finance interview questions. You MUST actually understand the problem and solution method before categorizing.

## CRITICAL: Understand the Problem First!
1. **Read the question carefully** - What is the problem actually asking?
2. **Think about the solution method** - What mathematical techniques are needed?
3. **Check if content is complete** - Scraping may have cut off text, be cautious
4. **Only assign categories if you're confident** - Don't guess based on keywords alone

## Available Categories:
{categories}

## Category Definitions (Use ONLY if the problem actually uses these):

**Random Variable Types** (assign ONLY if the problem explicitly involves these distributions):
- **Uniform Random Variables**: Problem involves uniform distribution U(a,b) or discrete uniform
- **Normal Random Variables**: Problem involves normal/Gaussian distribution N(Œº,œÉ¬≤)
- **Exponential Random Variables**: Problem involves exponential distribution Exp(Œª)
- **Hypergeometric Random Variables**: Problem involves sampling without replacement from finite population
- **Binomial Random Variables**: Problem involves binomial distribution Bin(n,p) - repeated independent trials
- **Poisson Random Variables**: Problem involves Poisson distribution Pois(Œª) - rare events
- **Continuous Random Variables**: Problem uses continuous distributions (normal, exponential, uniform continuous)
- **Discrete Random Variables**: Problem uses discrete distributions (binomial, Poisson, discrete uniform, hypergeometric)

**Object Types**:
- **Coins**: Problem involves coin flips/tosses
- **Dice**: Problem involves dice rolls
- **Cards**: Problem involves cards/deck of cards
- **Grids**: Problem involves grid paths/lattice paths (like Catalan numbers)

**Stochastic Processes** (ONLY if the problem involves time-dependent random processes):
- **Martingales**: Problem uses martingale property (E[X_{n+1}|X_n] = X_n)
- **Markov Chains**: Problem involves Markov chain (future depends only on current state)
- **Stochastic Processes**: General stochastic process (Brownian motion, etc.)
- **Random Walks**: Problem involves random walk (sum of random steps)

**Mathematical Domains**:
- **Linear Algebra**: Problem uses matrices, eigenvalues, eigenvectors, linear transformations
- **Calculus**: Problem uses derivatives, integrals, optimization via calculus
- **Geometry**: Problem involves geometric shapes, areas, volumes, distances
- **Algebraic Manipulation**: Problem requires algebraic simplification/manipulation
- **Combinatorics**: Problem involves counting, permutations, combinations
- **Game Theory**: Problem involves strategic decision-making, Nash equilibrium

## Rules:
1. **Think through the solution method** - What would you actually do to solve this?
2. **Be precise** - Don't assign "Discrete Random Variables" just because something is discrete
3. **Don't assign both Discrete AND Uniform** - Choose the most specific one
4. **Cards problems are usually Combinatorics + Cards** - Not necessarily "Discrete Random Variables"
5. **If uncertain, return []** - Better to miss a category than assign wrong one
6. **Try multiple times** - If unsure, think again before responding

## Examples:
- **Coin flip question**: Uses binomial distribution ‚Üí ["Coins", "Binomial Random Variables"]
- **Poker hands**: Counting combinations ‚Üí ["Cards", "Combinatorics"] (NOT "Discrete Random Variables")
- **Free sundae**: If it's a counting problem ‚Üí ["Combinatorics"], if it doesn't fit ‚Üí []
- **Matrix eigenvalue problem**: ["Linear Algebra"]
- **Derivative optimization**: ["Calculus"]
- **Grid path counting**: ["Grids", "Combinatorics"]
- **Brainteaser with no clear math**: []

## Output Format:
Return ONLY a JSON array of category strings. No explanation.
"""

print("\n‚úÖ Categorization prompt defined")


‚úÖ Using 22 fixed categories:
   1. Linear Algebra
   2. Uniform Random Variables
   3. Normal Random Variables
   4. Exponential Random Variables
   5. Hypergeometric Random Variables
   6. Binomial Random Variables
   7. Poisson Random Variables
   8. Continuous Random Variables
   9. Discrete Random Variables
  10. Coins
  11. Dice
  12. Cards
  13. Grids
  14. Martingales
  15. Markov Chains
  16. Stochastic Processes
  17. Random Walks
  18. Game Theory
  19. Calculus
  20. Geometry
  21. Algebraic Manipulation
  22. Combinatorics

‚úÖ Categorization prompt defined


In [None]:
# Cell 4: Categorization Function

def categorize_question(question: Dict, rate_limit_delay: float = 4.0, max_retries: int = 3) -> List[str]:
    """
    Categorize a single question using fixed categories with retry logic.
    
    Args:
        question: Question dict with name, tags, questionText
        rate_limit_delay: Seconds to wait between API calls (Gemini free tier: 15 RPM = 4s)
        max_retries: Maximum number of retry attempts
    
    Returns:
        List of category strings from fixed list (can be empty)
    """
    categories_str = "\n".join([f"- {cat}" for cat in FIXED_CATEGORIES])
    prompt_text = CATEGORIZATION_PROMPT.format(categories=categories_str)
    
    user_prompt = f"""Question Name: {question['name']}
Existing Tags: {', '.join(question['tags'])}
Difficulty: {question['difficulty']}

Question Text:
{question['questionText'][:1000]}

**INSTRUCTIONS:**
1. Read the problem carefully and understand what it's asking
2. Think about HOW you would solve it - what methods/techniques?
3. Assign categories ONLY if you're confident the problem actually uses those concepts
4. If the content seems incomplete or unclear, be conservative
5. Return [] if no categories fit confidently

Assign this question to appropriate categories (or return [] if none fit)."""
    
    full_prompt = f"{prompt_text}\n\n{user_prompt}"
    
    for attempt in range(max_retries):
        try:
            response = model.generate_content(
                full_prompt,
                generation_config=genai.types.GenerationConfig(
                    temperature=0.3,
                    max_output_tokens=1000,
                )
            )
            
            response_text = response.text.strip()
            
            # Remove markdown code blocks if present
            if response_text.startswith('```'):
                response_text = response_text.split('```')[1]
                if response_text.startswith('json'):
                    response_text = response_text[4:]
            
            categories = json.loads(response_text.strip())
            
            # Validate categories are from fixed list
            valid_categories = [cat for cat in categories if cat in FIXED_CATEGORIES]
            
            # Check for conflicting categories
            random_var_types = [
                "Uniform Random Variables", "Normal Random Variables", "Exponential Random Variables",
                "Hypergeometric Random Variables", "Binomial Random Variables", "Poisson Random Variables",
                "Continuous Random Variables", "Discrete Random Variables"
            ]
            
            assigned_rv_types = [cat for cat in valid_categories if cat in random_var_types]
            
            # If multiple random variable types assigned, keep only the most specific ones
            if len(assigned_rv_types) > 1:
                # Remove the generic ones if specific ones exist
                if any(cat in ["Uniform Random Variables", "Normal Random Variables", "Exponential Random Variables",
                               "Hypergeometric Random Variables", "Binomial Random Variables", "Poisson Random Variables"]
                       for cat in assigned_rv_types):
                    valid_categories = [cat for cat in valid_categories 
                                      if cat not in ["Continuous Random Variables", "Discrete Random Variables"]]
            
            # Rate limiting (Gemini free tier: 15 RPM = 4 seconds per request)
            time.sleep(rate_limit_delay)
            
            return valid_categories
        
        except json.JSONDecodeError as e:
            if attempt < max_retries - 1:
                time.sleep(rate_limit_delay)  # Wait before retry
                continue  # Retry on JSON parse error
            print(f"‚ùå JSON decode error after {max_retries} attempts for '{question['name']}': {e}")
            return []
        except Exception as e:
            if attempt < max_retries - 1:
                time.sleep(rate_limit_delay)  # Wait before retry
                continue  # Retry on other errors
            print(f"‚ùå Error categorizing '{question['name']}' after {max_retries} attempts: {e}")
            return []
    
    return []  # Fallback if all retries fail

print("‚úÖ Categorization function defined")


In [28]:
# Cell 5: Test Categorization on Sample Questions

# Test on 5 sample questions
if 'questions' in locals():
    print("üß™ Testing categorization on 5 sample questions...\n")
    
    test_questions = questions[:5]
    
    for i, q in enumerate(test_questions, 1):
        print(f"\n{'='*80}")
        print(f"Question {i}: {q['name']}")
        print(f"Difficulty: {q['difficulty']} | Original Tags: {', '.join(q['tags'])}")
        print(f"Text: {q['questionText'][:150]}...")
        print(f"\nü§ñ AI Categories:")
        
        categories = categorize_question(q, rate_limit_delay=4.0, max_retries=3)
        if categories:
            for cat in categories:
                print(f"   ‚úì {cat}")
        else:
            print(f"   (no matching categories)")
    
    print(f"\n{'='*80}")
    print("‚úÖ Test complete!")
else:
    print("‚ö†Ô∏è  Run Cell 2 first to load questions")


üß™ Testing categorization on 5 sample questions...


Question 1: Place Or Take
Difficulty: Hard | Original Tags: Probability
Text: You are playing a one-player game with two opaque boxes. At each turn, you can choose to either "place" or "take". "Place" places $\$$1 from a third p...

ü§ñ AI Categories:
Error categorizing 'Place Or Take' after 1 attempts: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 2.
   (no matching categories)

Question 2: Collecting Toys II
Difficulty: Hard | Original Tags: Probability
Text: Every box of cereal contains one toy from a group of 555 distinct toys, each of which is mutually independent from the others and is equally likely to...

ü§ñ AI Categories:
Error categorizing 'Collecting Toys II' after 1 attempts: Invalid operation: The `response.text` quick accessor requires the respo

In [None]:
# Cell 6: Batch Categorization Function

def categorize_all_questions(
    questions: List[Dict],
    rate_limit_delay: float = 4.0,
    save_every: int = 10
) -> List[Dict]:
    """
    Categorize all questions with progress tracking and periodic saving.
    
    Args:
        questions: List of all questions
        rate_limit_delay: Seconds between API calls (Gemini free tier: 15 RPM = 4s)
        save_every: Save progress every N questions
    
    Returns:
        List of questions with added 'aiCategories' field
    """
    from datetime import datetime
    
    categorized = []
    total = len(questions)
    start_time = datetime.now()
    
    print(f"üöÄ Starting categorization of {total} questions...")
    print(f"   Rate limit: {rate_limit_delay}s per request (15 RPM)")
    print(f"   Estimated time: {total * rate_limit_delay / 60:.1f} minutes\n")
    
    for i, question in enumerate(questions, 1):
        try:
            categories = categorize_question(question, rate_limit_delay=rate_limit_delay, max_retries=3)
            
            categorized_q = question.copy()
            categorized_q['aiCategories'] = categories
            categorized.append(categorized_q)
            
            # Progress update
            if i % 10 == 0 or i == total:
                elapsed = (datetime.now() - start_time).total_seconds()
                rate = i / elapsed if elapsed > 0 else 0
                eta = (total - i) / rate if rate > 0 else 0
                
                print(f"Progress: {i}/{total} ({i/total*100:.1f}%) | "
                      f"Rate: {rate*60:.1f} q/min | "
                      f"ETA: {eta/60:.1f} min")
            
            # Periodic save
            if i % save_every == 0:
                with open('categorized_questions_progress.json', 'w', encoding='utf-8') as f:
                    json.dump(categorized, f, indent=2, ensure_ascii=False)
                print(f"   üíæ Progress saved ({i} questions)")
        
        except Exception as e:
            print(f"‚ùå Error on question {i} ('{question['name']}'): {e}")
            # Add with empty categories
            categorized_q = question.copy()
            categorized_q['aiCategories'] = []
            categorized.append(categorized_q)
    
    elapsed = (datetime.now() - start_time).total_seconds()
    print(f"\n‚úÖ Categorization complete!")
    print(f"   Total time: {elapsed/60:.1f} minutes")
    print(f"   Average: {elapsed/total:.2f}s per question")
    
    return categorized

print("‚úÖ Batch categorization function defined")


‚úÖ Batch categorization function defined


In [None]:
# Cell 7: RUN FULL CATEGORIZATION

# ‚ö†Ô∏è UNCOMMENT TO RUN - This will take ~80 minutes for 1200 questions
# Gemini free tier: 15 requests per minute = 4 seconds per request
# if 'questions' in locals():
#     categorized_questions = categorize_all_questions(
#         questions,
#         rate_limit_delay=4.0,  # 15 requests per minute
#         save_every=10
#     )
#     
#     # Save final results
#     with open('categorized_questions_final.json', 'w', encoding='utf-8') as f:
#         json.dump(categorized_questions, f, indent=2, ensure_ascii=False)
#     
#     print("\nüíæ Final results saved to categorized_questions_final.json")
# else:
#     print("‚ö†Ô∏è  Run Cell 2 first")

print("‚ö†Ô∏è  Cell ready - uncomment to run full categorization")


‚ö†Ô∏è  Cell ready - uncomment to run full categorization


In [None]:
# Cell 8: Analyze Category Distribution

def analyze_categories(categorized_questions: List[Dict]) -> Dict:
    """
    Analyze the distribution of categories across all questions.
    """
    from collections import Counter
    
    all_categories = []
    categories_per_question = []
    no_category_count = 0
    
    for q in categorized_questions:
        cats = q.get('aiCategories', [])
        all_categories.extend(cats)
        categories_per_question.append(len(cats))
        if len(cats) == 0:
            no_category_count += 1
    
    category_counts = Counter(all_categories)
    
    print(f"üìä Category Analysis")
    print(f"{'='*80}\n")
    
    print(f"Total questions: {len(categorized_questions)}")
    print(f"Questions with NO categories: {no_category_count} ({no_category_count/len(categorized_questions)*100:.1f}%)")
    print(f"Questions with categories: {len(categorized_questions) - no_category_count}")
    if categories_per_question:
        print(f"Average categories per question: {sum(categories_per_question)/len(categories_per_question):.2f}")
        print(f"Max categories on one question: {max(categories_per_question)}\n")
    
    print(f"Category Usage:")
    print(f"{'-'*80}")
    for cat in FIXED_CATEGORIES:
        count = category_counts.get(cat, 0)
        pct = count / len(categorized_questions) * 100 if count > 0 else 0
        status = "‚úì" if count > 0 else "‚úó"
        print(f"{status} {cat:35s} {count:4d} questions ({pct:5.1f}%)")
    
    return {
        'total_questions': len(categorized_questions),
        'no_category_count': no_category_count,
        'unique_categories': len(category_counts),
        'avg_categories_per_question': sum(categories_per_question)/len(categories_per_question) if categories_per_question else 0,
        'category_counts': dict(category_counts)
    }

# Load and analyze if file exists
if Path('categorized_questions_final.json').exists():
    with open('categorized_questions_final.json', 'r', encoding='utf-8') as f:
        categorized = json.load(f)
    analyze_categories(categorized)
else:
    print("‚úÖ Analysis function defined")
    print("‚ö†Ô∏è  Run categorization first (Cell 7)")


‚úÖ Analysis function defined
‚ö†Ô∏è  Run categorization first (Cell 7)


In [None]:
# Cell 9: Export for Database Upload

def export_for_database(categorized_questions: List[Dict], output_file: str = 'categorized_for_db.json'):
    """
    Export categorized questions in format ready for Supabase upload.
    """
    db_ready = []
    
    for q in categorized_questions:
        db_ready.append({
            'url_ending': q['link'].replace('/questions/', ''),
            'name': q['name'],
            'aiCategories': q.get('aiCategories', [])
        })
    
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(db_ready, f, indent=2, ensure_ascii=False)
    
    print(f"‚úÖ Exported {len(db_ready)} questions to {output_file}")
    print(f"   Ready for database upload!")
    
    return db_ready

# Export if file exists
if Path('categorized_questions_final.json').exists():
    with open('categorized_questions_final.json', 'r', encoding='utf-8') as f:
        categorized = json.load(f)
    export_for_database(categorized)
else:
    print("‚úÖ Export function defined")
    print("‚ö†Ô∏è  Run categorization first (Cell 7)")


‚úÖ Export function defined
‚ö†Ô∏è  Run categorization first (Cell 7)
