## Setup: Imports and Environment Configuration

In [21]:
import os
import sys
from pathlib import Path
from dotenv import load_dotenv
import json
from tqdm import tqdm

# Resolve the project root: when the kernel is started from the 'notebooks'
# folder the root is one level up, otherwise the working directory is the root.
# The root is put first on sys.path so the `src.*` imports below resolve.
working_dir = Path.cwd()
if working_dir.name == 'notebooks':
    project_root = working_dir.parent
else:
    project_root = working_dir
sys.path.insert(0, str(project_root))

# Load environment variables from .env file
#load_dotenv(project_root / '.env')

# Set OpenAI API key from environment
#os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', os.environ.get('OPENAI_API_KEY', ''))
#if not os.environ.get('OPENAI_API_KEY'):
    #raise ValueError("OPENAI_API_KEY not found in .env file or environment variables")

# Import project modules
from src.utils.config_loader import load_config
from src.ingestion.pdf_loader import load_pdf, clean_text
from src.ingestion.chunker import chunk_text
from src.ingestion.qa_generator import generate_qa_pairs
from src.ingestion.dataset_writer import save_to_jsonl, split_dataset, filter_valid_pairs

print("✓ Imports and environment setup complete")
#print(f"✓ Project root: {project_root}")
#print(f"✓ OpenAI API key loaded: {'Yes' if os.environ.get('OPENAI_API_KEY') else 'No'}")

✓ Imports and environment setup complete


In [22]:
import os

# Resolve the OpenAI API key.
# In Colab the key is read from the notebook's secrets; anywhere else the
# `google.colab` package is not importable, so fall back to whatever is already
# present in the process environment instead of failing on the import.
try:
    from google.colab import userdata
except ImportError:
    openai_api_key = os.environ.get('OPENAI_API_KEY')
else:
    # Retrieve API key from Colab secrets
    openai_api_key = userdata.get('OPENAI_API_KEY')

# Fail early with a clear message: assigning None to os.environ raises an
# opaque TypeError, and an empty key would only fail later at the first API call.
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY not found in Colab secrets or environment variables")

# Set environment variable for OpenAI client
os.environ['OPENAI_API_KEY'] = openai_api_key

print("✓ OpenAI API key loaded and set as environment variable")

✓ OpenAI API key loaded and set as environment variable


## Step 1: Load Configuration

In [23]:
# Load configuration
config_path = project_root / 'config' / 'config.yaml'
config = load_config(config_path)

# Extract data factory configuration (one sub-section per pipeline stage)
df_config = config['data_factory']
ingestion_config = df_config['ingestion']
chunking_config = df_config['chunking']
generation_config = df_config['generation']
dataset_config = df_config['dataset']

# Get paths (both are configured relative to the project root)
raw_data_path = project_root / config['environment']['paths']['raw_data']
output_path = project_root / config['environment']['paths']['data_dir'] / 'output'
output_path.mkdir(parents=True, exist_ok=True)

# Verify raw_data_path exists
if not raw_data_path.exists():
    raise FileNotFoundError(f"Raw data directory not found: {raw_data_path}")

# Get PDF filename from config, with fallback to actual file
pdf_filename = config['project']['document']
pdf_path = raw_data_path / pdf_filename

# If configured PDF doesn't exist, try to find any PDF in the directory
if not pdf_path.exists():
    # sorted() makes the fallback deterministic: glob() yields entries in
    # arbitrary directory order, so the "first" PDF could differ between runs.
    pdf_files = sorted(raw_data_path.glob('*.pdf'))
    if not pdf_files:
        raise FileNotFoundError(f"PDF file not found in {raw_data_path}. Expected: {pdf_path}")
    pdf_path = pdf_files[0]
    pdf_filename = pdf_path.name
    print(f"⚠ Config PDF '{config['project']['document']}' not found, using: {pdf_filename}")

# Verify the resolved path is a regular file (a directory would pass exists())
if not pdf_path.is_file():
    raise FileNotFoundError(f"PDF file not found: {pdf_path}")

print("✓ Configuration loaded")
print(f"  Raw data directory: {raw_data_path} (exists: {raw_data_path.exists()})")
print(f"  PDF path: {pdf_path} (exists: {pdf_path.exists()})")
print(f"  Output path: {output_path}")
print(f"  Chunk size: {chunking_config['chunk_size']} characters")
print(f"  Questions per chunk: {generation_config['questions_per_chunk']}")
print(f"  Categories: {', '.join(generation_config['categories'])}")

✓ Configuration loaded
  Raw data directory: /content/operation-ledger-mind/data/raw (exists: True)
  PDF path: /content/operation-ledger-mind/data/raw/2024-Annual-Report.pdf (exists: True)
  Output path: /content/operation-ledger-mind/data/output
  Chunk size: 1500 characters
  Questions per chunk: 10
  Categories: hard_facts, strategic_summary, stylistic_creative


## Step 2: PDF Ingestion & Cleaning

In [24]:
# Load PDF
print(f"Loading PDF from: {pdf_path}")

try:
    raw_text = load_pdf(pdf_path)
    print(f"✓ PDF loaded: {len(raw_text):,} characters")
except ValueError as e:
    print(f"Error loading PDF: {e}. Attempting to install pdfplumber...")
    %pip install pdfplumber  # Install pdfplumber if it's missing
    raw_text = load_pdf(pdf_path) # Retry loading after installation
    print(f"✓ PDF loaded after installing pdfplumber: {len(raw_text):,} characters")

# Clean text
print("\nCleaning text...")
cleaned_text = clean_text(
    raw_text,
    remove_headers=ingestion_config['remove_headers'],
    remove_footers=ingestion_config['remove_footers'],
    normalize_whitespace=ingestion_config['normalize_whitespace']
)
print(f"✓ Text cleaned: {len(cleaned_text):,} characters")
print(f"  Reduction: {len(raw_text) - len(cleaned_text):,} characters ({100*(len(raw_text)-len(cleaned_text))/len(raw_text):.1f}%")

Loading PDF from: /content/operation-ledger-mind/data/raw/2024-Annual-Report.pdf
✓ PDF loaded: 620,266 characters

Cleaning text...
✓ Text cleaned: 619,806 characters
  Reduction: 460 characters (0.1%


## Step 3: Chunking Strategy

In [25]:
# Chunk the document
print("Chunking document...")
chunks = chunk_text(
    cleaned_text,
    chunk_size=chunking_config['chunk_size'],
    overlap=chunking_config['overlap']
)

# Compute the character total once and guard the average so an empty chunk
# list reports 0 instead of raising ZeroDivisionError.
total_chunk_chars = sum(len(c['text']) for c in chunks)
average_chunk_chars = total_chunk_chars / len(chunks) if chunks else 0

print(f"✓ Document chunked into {len(chunks)} chunks")
print(f"  Average chunk size: {average_chunk_chars:.0f} characters")
print(f"  Total characters: {total_chunk_chars:,}")

# Display first chunk preview
if chunks:
    print(f"\nFirst chunk preview (chunk_id={chunks[0]['chunk_id']}):")
    print(f"  {chunks[0]['text'][:200]}...")

Chunking document...
✓ Document chunked into 505 chunks
  Average chunk size: 1722 characters
  Total characters: 869,643

First chunk preview (chunk_id=0):
  On Our Way
Uber’s Mission
We reimagine the way the world moves for the better
We are Uber.
The go-getters.
The kind of people who are relentless about our
mission to help people go anywhere and get an...


## Step 4: Q/A Generation Loop

In [26]:
# Prepare LLM configurations.
# Each config combines the per-role generation settings with the timeout and
# retry settings read from the OpenAI provider section of the config.
question_llm_config = {
    **{
        key: generation_config['question_llm'][key]
        for key in ('provider', 'model', 'temperature', 'max_tokens')
    },
    **{
        key: config['providers']['openai'][key]
        for key in ('timeout_seconds', 'max_retries')
    },
}

answer_llm_config = {
    **{
        key: generation_config['answer_llm'][key]
        for key in ('provider', 'model', 'temperature', 'max_tokens')
    },
    **{
        key: config['providers']['openai'][key]
        for key in ('timeout_seconds', 'max_retries')
    },
}

categories = generation_config['categories']

# Echo the effective settings so the run is self-documenting
print("LLM Configuration:")
for role_label, llm_settings in (("Question", question_llm_config), ("Answer", answer_llm_config)):
    print(f"  {role_label} LLM: {llm_settings['model']} (temp={llm_settings['temperature']})")
print(f"  Categories: {', '.join(categories)}")

LLM Configuration:
  Question LLM: gpt-4o-mini (temp=0.3)
  Answer LLM: gpt-4.1-nano (temp=0.2)
  Categories: hard_facts, strategic_summary, stylistic_creative


In [27]:
# Generate Q/A pairs for each chunk.
# Failures are best-effort: a chunk whose generation raises is recorded in
# `failed_chunks` and skipped so one bad response does not abort the whole run.
all_qa_pairs = []
failed_chunks = []

print(f"\nGenerating Q/A pairs for {len(chunks)} chunks...")
print("=" * 60)

# Count processed chunks with enumerate rather than deriving the count from
# chunk_id, so progress reporting does not depend on ids being 0-based integers.
for processed_count, chunk in enumerate(tqdm(chunks, desc="Processing chunks"), start=1):
    chunk_id = chunk['chunk_id']

    try:
        # Generate Q/A pairs for this chunk
        qa_pairs = generate_qa_pairs(
            chunk=chunk,
            question_llm_config=question_llm_config,
            answer_llm_config=answer_llm_config,
            categories=categories
        )

        # Add chunk text reference to each pair (optional, for debugging)
        for pair in qa_pairs:
            pair['chunk_text'] = chunk['text'][:500]  # Store first 500 chars for reference

        all_qa_pairs.extend(qa_pairs)

    except Exception as e:
        # tqdm.write prints without corrupting the active progress bar
        tqdm.write(f"\n⚠ Error processing chunk {chunk_id}: {str(e)}")
        failed_chunks.append(chunk_id)

    # Progress update, kept outside the try block so a reporting problem can
    # never be misrecorded as a failed chunk after its pairs were already stored.
    if processed_count % 5 == 0:
        tqdm.write(f"  Processed {processed_count}/{len(chunks)} chunks - {len(all_qa_pairs)} Q/A pairs generated")

print("=" * 60)
print("\n✓ Q/A generation complete!")
print(f"  Total Q/A pairs: {len(all_qa_pairs)}")
print(f"  Expected: {len(chunks) * generation_config['questions_per_chunk']}")
print(f"  Failed chunks: {len(failed_chunks)}")
if failed_chunks:
    print(f"  Failed chunk IDs: {failed_chunks}")


Generating Q/A pairs for 505 chunks...


Processing chunks:   1%|          | 5/505 [00:25<44:38,  5.36s/it]

  Processed 5/505 chunks - 50 Q/A pairs generated


Processing chunks:   2%|▏         | 10/505 [00:55<49:36,  6.01s/it]

  Processed 10/505 chunks - 100 Q/A pairs generated


Processing chunks:   3%|▎         | 15/505 [01:22<45:40,  5.59s/it]

  Processed 15/505 chunks - 150 Q/A pairs generated


Processing chunks:   4%|▍         | 20/505 [01:49<44:44,  5.54s/it]

  Processed 20/505 chunks - 200 Q/A pairs generated


Processing chunks:   5%|▍         | 25/505 [02:16<44:31,  5.57s/it]

  Processed 25/505 chunks - 250 Q/A pairs generated


Processing chunks:   6%|▌         | 30/505 [02:46<49:51,  6.30s/it]

  Processed 30/505 chunks - 300 Q/A pairs generated


Processing chunks:   7%|▋         | 35/505 [03:12<41:01,  5.24s/it]

  Processed 35/505 chunks - 350 Q/A pairs generated


Processing chunks:   8%|▊         | 40/505 [03:39<42:42,  5.51s/it]

  Processed 40/505 chunks - 400 Q/A pairs generated


Processing chunks:   9%|▉         | 45/505 [04:07<40:25,  5.27s/it]

  Processed 45/505 chunks - 450 Q/A pairs generated


Processing chunks:  10%|▉         | 50/505 [04:39<47:36,  6.28s/it]

  Processed 50/505 chunks - 500 Q/A pairs generated


Processing chunks:  11%|█         | 55/505 [05:10<43:56,  5.86s/it]

  Processed 55/505 chunks - 550 Q/A pairs generated


Processing chunks:  12%|█▏        | 60/505 [05:40<45:15,  6.10s/it]

  Processed 60/505 chunks - 600 Q/A pairs generated


Processing chunks:  13%|█▎        | 65/505 [06:08<40:26,  5.51s/it]

  Processed 65/505 chunks - 650 Q/A pairs generated


Processing chunks:  14%|█▍        | 70/505 [06:34<38:31,  5.31s/it]

  Processed 70/505 chunks - 700 Q/A pairs generated


Processing chunks:  15%|█▍        | 75/505 [07:01<38:51,  5.42s/it]

  Processed 75/505 chunks - 750 Q/A pairs generated


Processing chunks:  16%|█▌        | 80/505 [07:32<40:00,  5.65s/it]

  Processed 80/505 chunks - 800 Q/A pairs generated


Processing chunks:  17%|█▋        | 85/505 [08:00<40:04,  5.73s/it]

  Processed 85/505 chunks - 850 Q/A pairs generated


Processing chunks:  18%|█▊        | 90/505 [08:29<40:34,  5.87s/it]

  Processed 90/505 chunks - 900 Q/A pairs generated


Processing chunks:  19%|█▉        | 95/505 [08:56<37:58,  5.56s/it]

  Processed 95/505 chunks - 950 Q/A pairs generated


Processing chunks:  20%|█▉        | 100/505 [09:24<37:51,  5.61s/it]

  Processed 100/505 chunks - 1000 Q/A pairs generated


Processing chunks:  21%|██        | 105/505 [09:51<34:21,  5.15s/it]

  Processed 105/505 chunks - 1050 Q/A pairs generated


Processing chunks:  22%|██▏       | 110/505 [10:17<34:38,  5.26s/it]

  Processed 110/505 chunks - 1100 Q/A pairs generated


Processing chunks:  23%|██▎       | 115/505 [10:43<32:46,  5.04s/it]

  Processed 115/505 chunks - 1150 Q/A pairs generated


Processing chunks:  24%|██▍       | 120/505 [11:12<36:41,  5.72s/it]

  Processed 120/505 chunks - 1200 Q/A pairs generated


Processing chunks:  25%|██▍       | 125/505 [11:37<33:15,  5.25s/it]

  Processed 125/505 chunks - 1250 Q/A pairs generated


Processing chunks:  26%|██▌       | 130/505 [12:02<31:46,  5.08s/it]

  Processed 130/505 chunks - 1300 Q/A pairs generated


Processing chunks:  27%|██▋       | 135/505 [12:29<34:22,  5.57s/it]

  Processed 135/505 chunks - 1350 Q/A pairs generated


Processing chunks:  28%|██▊       | 140/505 [12:56<32:58,  5.42s/it]

  Processed 140/505 chunks - 1400 Q/A pairs generated


Processing chunks:  29%|██▊       | 145/505 [13:24<32:48,  5.47s/it]

  Processed 145/505 chunks - 1450 Q/A pairs generated


Processing chunks:  30%|██▉       | 150/505 [13:47<27:48,  4.70s/it]

  Processed 150/505 chunks - 1500 Q/A pairs generated


Processing chunks:  31%|███       | 155/505 [14:16<31:13,  5.35s/it]

  Processed 155/505 chunks - 1550 Q/A pairs generated


Processing chunks:  32%|███▏      | 160/505 [14:46<33:36,  5.84s/it]

  Processed 160/505 chunks - 1600 Q/A pairs generated


Processing chunks:  33%|███▎      | 165/505 [15:12<30:56,  5.46s/it]

  Processed 165/505 chunks - 1650 Q/A pairs generated


Processing chunks:  34%|███▎      | 170/505 [15:38<30:42,  5.50s/it]

  Processed 170/505 chunks - 1700 Q/A pairs generated


Processing chunks:  35%|███▍      | 175/505 [16:08<30:17,  5.51s/it]

  Processed 175/505 chunks - 1750 Q/A pairs generated


Processing chunks:  36%|███▌      | 180/505 [16:33<27:11,  5.02s/it]

  Processed 180/505 chunks - 1800 Q/A pairs generated


Processing chunks:  37%|███▋      | 185/505 [17:02<31:07,  5.84s/it]

  Processed 185/505 chunks - 1850 Q/A pairs generated


Processing chunks:  38%|███▊      | 190/505 [17:32<32:25,  6.18s/it]

  Processed 190/505 chunks - 1900 Q/A pairs generated


Processing chunks:  39%|███▊      | 195/505 [18:02<30:45,  5.95s/it]

  Processed 195/505 chunks - 1950 Q/A pairs generated


Processing chunks:  40%|███▉      | 200/505 [18:30<28:53,  5.68s/it]

  Processed 200/505 chunks - 2000 Q/A pairs generated


Processing chunks:  41%|████      | 205/505 [18:58<29:11,  5.84s/it]

  Processed 205/505 chunks - 2050 Q/A pairs generated


Processing chunks:  42%|████▏     | 210/505 [19:29<31:50,  6.48s/it]

  Processed 210/505 chunks - 2100 Q/A pairs generated


Processing chunks:  43%|████▎     | 215/505 [19:58<28:55,  5.98s/it]

  Processed 215/505 chunks - 2150 Q/A pairs generated


Processing chunks:  44%|████▎     | 220/505 [20:39<40:35,  8.54s/it]

  Processed 220/505 chunks - 2200 Q/A pairs generated


Processing chunks:  45%|████▍     | 225/505 [21:06<27:29,  5.89s/it]

  Processed 225/505 chunks - 2250 Q/A pairs generated


Processing chunks:  46%|████▌     | 230/505 [21:36<30:27,  6.64s/it]

  Processed 230/505 chunks - 2300 Q/A pairs generated


Processing chunks:  47%|████▋     | 235/505 [22:04<25:44,  5.72s/it]

  Processed 235/505 chunks - 2350 Q/A pairs generated


Processing chunks:  48%|████▊     | 240/505 [22:32<25:58,  5.88s/it]

  Processed 240/505 chunks - 2400 Q/A pairs generated


Processing chunks:  49%|████▊     | 245/505 [22:56<21:53,  5.05s/it]

  Processed 245/505 chunks - 2450 Q/A pairs generated


Processing chunks:  50%|████▉     | 250/505 [23:28<26:41,  6.28s/it]

  Processed 250/505 chunks - 2500 Q/A pairs generated


Processing chunks:  50%|█████     | 255/505 [24:00<27:21,  6.56s/it]

  Processed 255/505 chunks - 2550 Q/A pairs generated


Processing chunks:  51%|█████▏    | 260/505 [24:30<24:59,  6.12s/it]

  Processed 260/505 chunks - 2600 Q/A pairs generated


Processing chunks:  52%|█████▏    | 265/505 [25:00<23:55,  5.98s/it]

  Processed 265/505 chunks - 2650 Q/A pairs generated


Processing chunks:  53%|█████▎    | 270/505 [25:31<23:14,  5.93s/it]

  Processed 270/505 chunks - 2700 Q/A pairs generated


Processing chunks:  54%|█████▍    | 275/505 [26:06<26:21,  6.88s/it]

  Processed 275/505 chunks - 2750 Q/A pairs generated


Processing chunks:  55%|█████▍    | 276/505 [26:12<24:25,  6.40s/it]


⚠ Error processing chunk 275: Failed to generate answers for chunk 275: Failed to parse answers from response: [
  "The net income attributable to Uber Technologies, Inc. for the year ended December 31, 2023, was $1,887 million.",
  "Adjusted EBITDA excludes certain restructuring and related charges, part of w


Processing chunks:  55%|█████▌    | 280/505 [26:37<24:49,  6.62s/it]

  Processed 280/505 chunks - 2790 Q/A pairs generated


Processing chunks:  56%|█████▋    | 285/505 [27:06<22:34,  6.16s/it]

  Processed 285/505 chunks - 2840 Q/A pairs generated


Processing chunks:  57%|█████▋    | 290/505 [27:34<20:59,  5.86s/it]

  Processed 290/505 chunks - 2890 Q/A pairs generated


Processing chunks:  58%|█████▊    | 295/505 [28:00<18:13,  5.21s/it]

  Processed 295/505 chunks - 2940 Q/A pairs generated


Processing chunks:  59%|█████▉    | 300/505 [28:26<18:34,  5.44s/it]

  Processed 300/505 chunks - 2990 Q/A pairs generated


Processing chunks:  60%|██████    | 305/505 [28:52<17:18,  5.19s/it]

  Processed 305/505 chunks - 3040 Q/A pairs generated


Processing chunks:  61%|██████▏   | 310/505 [29:22<18:05,  5.57s/it]

  Processed 310/505 chunks - 3090 Q/A pairs generated


Processing chunks:  62%|██████▏   | 315/505 [29:53<18:47,  5.93s/it]

  Processed 315/505 chunks - 3140 Q/A pairs generated


Processing chunks:  63%|██████▎   | 320/505 [30:20<16:50,  5.46s/it]

  Processed 320/505 chunks - 3190 Q/A pairs generated


Processing chunks:  64%|██████▍   | 325/505 [30:47<16:54,  5.64s/it]

  Processed 325/505 chunks - 3240 Q/A pairs generated


Processing chunks:  65%|██████▌   | 330/505 [31:11<14:22,  4.93s/it]

  Processed 330/505 chunks - 3290 Q/A pairs generated


Processing chunks:  66%|██████▋   | 335/505 [31:37<13:57,  4.92s/it]

  Processed 335/505 chunks - 3340 Q/A pairs generated


Processing chunks:  67%|██████▋   | 340/505 [32:03<14:38,  5.33s/it]

  Processed 340/505 chunks - 3390 Q/A pairs generated


Processing chunks:  68%|██████▊   | 345/505 [32:29<13:47,  5.17s/it]

  Processed 345/505 chunks - 3440 Q/A pairs generated


Processing chunks:  69%|██████▉   | 350/505 [32:54<12:31,  4.85s/it]

  Processed 350/505 chunks - 3490 Q/A pairs generated


Processing chunks:  70%|███████   | 355/505 [33:18<12:31,  5.01s/it]

  Processed 355/505 chunks - 3540 Q/A pairs generated


Processing chunks:  71%|███████▏  | 360/505 [33:44<12:18,  5.09s/it]

  Processed 360/505 chunks - 3590 Q/A pairs generated


Processing chunks:  72%|███████▏  | 365/505 [34:10<12:15,  5.26s/it]

  Processed 365/505 chunks - 3640 Q/A pairs generated


Processing chunks:  73%|███████▎  | 370/505 [34:33<10:30,  4.67s/it]

  Processed 370/505 chunks - 3690 Q/A pairs generated


Processing chunks:  74%|███████▍  | 375/505 [35:02<12:10,  5.62s/it]

  Processed 375/505 chunks - 3740 Q/A pairs generated


Processing chunks:  75%|███████▌  | 380/505 [35:28<11:04,  5.32s/it]

  Processed 380/505 chunks - 3790 Q/A pairs generated


Processing chunks:  76%|███████▌  | 385/505 [35:57<11:42,  5.85s/it]

  Processed 385/505 chunks - 3840 Q/A pairs generated


Processing chunks:  77%|███████▋  | 390/505 [36:24<10:59,  5.74s/it]

  Processed 390/505 chunks - 3890 Q/A pairs generated


Processing chunks:  78%|███████▊  | 395/505 [36:57<12:25,  6.78s/it]

  Processed 395/505 chunks - 3940 Q/A pairs generated


Processing chunks:  79%|███████▉  | 400/505 [37:25<09:43,  5.56s/it]

  Processed 400/505 chunks - 3990 Q/A pairs generated


Processing chunks:  80%|████████  | 405/505 [37:51<08:45,  5.26s/it]

  Processed 405/505 chunks - 4040 Q/A pairs generated


Processing chunks:  81%|████████  | 410/505 [38:18<08:29,  5.36s/it]

  Processed 410/505 chunks - 4090 Q/A pairs generated


Processing chunks:  82%|████████▏ | 415/505 [38:41<07:00,  4.68s/it]

  Processed 415/505 chunks - 4140 Q/A pairs generated


Processing chunks:  83%|████████▎ | 420/505 [39:08<07:21,  5.19s/it]

  Processed 420/505 chunks - 4190 Q/A pairs generated


Processing chunks:  84%|████████▍ | 425/505 [39:37<07:53,  5.91s/it]

  Processed 425/505 chunks - 4240 Q/A pairs generated


Processing chunks:  85%|████████▌ | 430/505 [40:04<06:54,  5.52s/it]

  Processed 430/505 chunks - 4290 Q/A pairs generated


Processing chunks:  86%|████████▌ | 435/505 [40:36<07:13,  6.19s/it]

  Processed 435/505 chunks - 4340 Q/A pairs generated


Processing chunks:  87%|████████▋ | 440/505 [41:07<06:50,  6.32s/it]

  Processed 440/505 chunks - 4390 Q/A pairs generated


Processing chunks:  88%|████████▊ | 445/505 [41:38<05:59,  6.00s/it]

  Processed 445/505 chunks - 4440 Q/A pairs generated


Processing chunks:  89%|████████▉ | 450/505 [42:09<05:38,  6.16s/it]

  Processed 450/505 chunks - 4490 Q/A pairs generated


Processing chunks:  90%|█████████ | 455/505 [42:41<05:16,  6.33s/it]

  Processed 455/505 chunks - 4540 Q/A pairs generated


Processing chunks:  91%|█████████ | 460/505 [43:06<03:59,  5.33s/it]

  Processed 460/505 chunks - 4590 Q/A pairs generated


Processing chunks:  92%|█████████▏| 465/505 [43:37<04:02,  6.06s/it]

  Processed 465/505 chunks - 4640 Q/A pairs generated


Processing chunks:  93%|█████████▎| 470/505 [44:02<03:00,  5.16s/it]

  Processed 470/505 chunks - 4690 Q/A pairs generated


Processing chunks:  94%|█████████▍| 475/505 [44:25<02:24,  4.81s/it]

  Processed 475/505 chunks - 4740 Q/A pairs generated


Processing chunks:  95%|█████████▌| 480/505 [44:54<02:17,  5.49s/it]

  Processed 480/505 chunks - 4790 Q/A pairs generated


Processing chunks:  96%|█████████▌| 485/505 [45:26<01:58,  5.91s/it]

  Processed 485/505 chunks - 4840 Q/A pairs generated


Processing chunks:  97%|█████████▋| 490/505 [45:58<01:38,  6.54s/it]

  Processed 490/505 chunks - 4890 Q/A pairs generated


Processing chunks:  98%|█████████▊| 495/505 [46:22<00:49,  4.95s/it]

  Processed 495/505 chunks - 4940 Q/A pairs generated


Processing chunks:  99%|█████████▉| 500/505 [46:50<00:25,  5.07s/it]

  Processed 500/505 chunks - 4990 Q/A pairs generated


Processing chunks: 100%|██████████| 505/505 [47:16<00:00,  5.62s/it]

  Processed 505/505 chunks - 5040 Q/A pairs generated

✓ Q/A generation complete!
  Total Q/A pairs: 5040
  Expected: 5050
  Failed chunks: 1
  Failed chunk IDs: [275]





In [28]:
# Run the validity filter and report how many pairs it dropped
valid_pairs = filter_valid_pairs(all_qa_pairs)
invalid_count = len(all_qa_pairs) - len(valid_pairs)

print("Quality Check:")
print(f"  Valid pairs: {len(valid_pairs)}")
print(f"  Invalid pairs removed: {invalid_count}")

# Tally pairs per category; pairs without a category are counted as 'unknown'
category_counts = {}
for pair in valid_pairs:
    category = pair.get('category', 'unknown')
    if category in category_counts:
        category_counts[category] += 1
    else:
        category_counts[category] = 1

print("\nCategory Distribution:")
for category, count in category_counts.items():
    if valid_pairs:
        percentage = 100 * count / len(valid_pairs)
    else:
        percentage = 0
    print(f"  {category}: {count} ({percentage:.1f}%)")

# Show the first three pairs, each truncated to 100 characters
if valid_pairs:
    print("\nSample Q/A Pairs:")
    for i, pair in enumerate(valid_pairs[:3], 1):
        sample_category = pair.get('category', 'unknown')
        question_preview = pair['question'][:100]
        answer_preview = pair['answer'][:100]
        print(f"\n  Example {i} ({sample_category}):")
        print(f"    Q: {question_preview}...")
        print(f"    A: {answer_preview}...")

Quality Check:
  Valid pairs: 5040
  Invalid pairs removed: 0

Category Distribution:
  strategic_summary: 809 (16.1%)
  hard_facts: 3127 (62.0%)
  stylistic_creative: 1104 (21.9%)

Sample Q/A Pairs:

  Example 1 (strategic_summary):
    Q: What is Uber's mission as stated in the annual report?...
    A: We reimagine the way the world moves for the better...

  Example 2 (hard_facts):
    Q: For which fiscal year does this annual report apply?...
    A: For the fiscal year ended December 31, 2024...

  Example 3 (hard_facts):
    Q: What is the exact name of the registrant as specified in the charter?...
    A: UBER TECHNOLOGIES, INC....


## Step 5: Dataset Splitting & Storage

In [29]:
# Split dataset into train and test sets using the ratio, shuffle flag and
# seed from the dataset section of the config
print("Splitting dataset...")
train_pairs, test_pairs = split_dataset(
    all_pairs=valid_pairs,
    train_split=dataset_config['train_split'],
    shuffle=dataset_config['shuffle_before_split'],
    seed=dataset_config['seed']
)

# Guard the percentages the same way the quality-check cell does, so an empty
# set of valid pairs reports 0.0% instead of raising ZeroDivisionError.
total_pairs = len(valid_pairs)
train_pct = 100 * len(train_pairs) / total_pairs if total_pairs else 0
test_pct = 100 * len(test_pairs) / total_pairs if total_pairs else 0

print("✓ Dataset split:")
print(f"  Train set: {len(train_pairs)} pairs ({train_pct:.1f}%)")
print(f"  Test set: {len(test_pairs)} pairs ({test_pct:.1f}%)")

Splitting dataset...
✓ Dataset split:
  Train set: 4032 pairs (80.0%)
  Test set: 1008 pairs (20.0%)


In [30]:
# Resolve the output files (names come from the dataset config) and write
# each split as JSONL, train first and then test
train_file = output_path / dataset_config['train_file']
test_file = output_path / dataset_config['test_file']

print("\nSaving datasets...")
for split_pairs, split_file in ((train_pairs, train_file), (test_pairs, test_file)):
    save_to_jsonl(split_pairs, split_file)

print("✓ Datasets saved:")
print(f"  Train: {train_file}")
print(f"  Test: {test_file}")

# Verify files by reporting their on-disk size in KB
train_size_kb = train_file.stat().st_size / 1024
test_size_kb = test_file.stat().st_size / 1024
print("\nFile verification:")
print(f"  Train file size: {train_size_kb:.1f} KB")
print(f"  Test file size: {test_size_kb:.1f} KB")


Saving datasets...
✓ Datasets saved:
  Train: /content/operation-ledger-mind/data/output/train.jsonl
  Test: /content/operation-ledger-mind/data/output/golden_test_set.jsonl

File verification:
  Train file size: 3318.8 KB
  Test file size: 831.3 KB


## Step 6: Verification & Summary

In [31]:
# Load and verify a few samples from each file
def _read_first_records(path, limit=3):
    """Return up to `limit` parsed JSON records from the start of a JSONL file.

    Lines are streamed from the file object, so only the first `limit` lines
    are read instead of loading the whole file into memory.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # zip stops once range(limit) is exhausted, leaving the rest unread
        return [json.loads(line) for _, line in zip(range(limit), f)]


def _print_samples(samples):
    """Print the truncated question and the category of each sample record."""
    for i, sample in enumerate(samples, 1):
        print(f"  {i}. Q: {sample['question'][:80]}...")
        print(f"     Category: {sample.get('category', 'N/A')}")


print("Verifying saved files...\n")

# Check train file
train_samples = _read_first_records(train_file)
print("Train file samples (first 3):")
_print_samples(train_samples)

# Check test file
test_samples = _read_first_records(test_file)
print("\nTest file samples (first 3):")
_print_samples(test_samples)

print("\n" + "=" * 60)
print("✓ DATA FACTORY PIPELINE COMPLETE!")
print("=" * 60)
print("\nSummary:")
print(f"  • PDF processed: {pdf_filename}")
print(f"  • Chunks created: {len(chunks)}")
print(f"  • Total Q/A pairs: {len(valid_pairs)}")
print(f"  • Train set: {len(train_pairs)} pairs")
print(f"  • Test set: {len(test_pairs)} pairs")
print(f"  • Output directory: {output_path}")

Verifying saved files...

Train file samples (first 3):
  1. Q: What narrative approach does the company take in discussing its tax assets and p...
     Category: stylistic_creative
  2. Q: How does the company's approach to stockholder proposals reflect its overall gov...
     Category: strategic_summary
  3. Q: What factors can cause the effective tax rate to vary for the company?...
     Category: hard_facts

Test file samples (first 3):
  1. Q: How long is each offering period in the Employee Stock Purchase Plan (ESPP)?...
     Category: strategic_summary
  2. Q: What is the purpose of the Power of Attorney as described in the document?...
     Category: hard_facts
  3. Q: How does the company plan to improve its cash flows from operating activities in...
     Category: strategic_summary

✓ DATA FACTORY PIPELINE COMPLETE!

Summary:
  • PDF processed: 2024-Annual-Report.pdf
  • Chunks created: 505
  • Total Q/A pairs: 5040
  • Train set: 4032 pairs
  • Test set: 1008 pairs
  • Outpu