## Setup: Imports and Environment Configuration

In [12]:
import os
import sys
from pathlib import Path
from dotenv import load_dotenv
import json
from tqdm import tqdm

# Locate the repository root: when the kernel was started inside a directory
# named 'notebooks', the root is its parent; otherwise the working directory
# itself is treated as the root.
project_root = Path.cwd()
if project_root.name == 'notebooks':
    project_root = project_root.parent

# Put the root first on sys.path so the `src` package imported below resolves.
sys.path.insert(0, str(project_root))

# The .env-based key loading below is currently disabled (kept commented out).
# Load environment variables from .env file
#load_dotenv(project_root / '.env')

# Set OpenAI API key from environment
#os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', os.environ.get('OPENAI_API_KEY', ''))
#if not os.environ.get('OPENAI_API_KEY'):
    #raise ValueError("OPENAI_API_KEY not found in .env file or environment variables")

# Project modules (importable only after the sys.path change above)
from src.utils.config_loader import load_config
from src.ingestion.pdf_loader import (
    load_pdf,
    clean_text,
)
from src.ingestion.chunker import chunk_text
from src.ingestion.qa_generator import generate_qa_pairs
from src.ingestion.dataset_writer import (
    save_to_jsonl,
    split_dataset,
    filter_valid_pairs,
)

print("✓ Imports and environment setup complete")
#print(f"✓ Project root: {project_root}")
#print(f"✓ OpenAI API key loaded: {'Yes' if os.environ.get('OPENAI_API_KEY') else 'No'}")

✓ Imports and environment setup complete


In [13]:
import os

# Prefer Colab secrets when running on Colab; elsewhere fall back to a key
# that is already exported in the process environment.
try:
    from google.colab import userdata
except ImportError:
    userdata = None

# Retrieve API key from Colab secrets (or from the environment outside Colab)
if userdata is not None:
    openai_api_key = userdata.get('OPENAI_API_KEY')
else:
    openai_api_key = os.environ.get('OPENAI_API_KEY')

# Fail with a clear message instead of the TypeError that os.environ raises
# when it is assigned a non-string value such as None.
if not openai_api_key:
    raise ValueError(
        "OPENAI_API_KEY is not set. Add it to Colab secrets or export it "
        "as an environment variable before running this notebook."
    )

# Set environment variable for OpenAI client
os.environ['OPENAI_API_KEY'] = openai_api_key

print("✓ OpenAI API key loaded and set as environment variable")

✓ OpenAI API key loaded and set as environment variable


## Step 1: Load Configuration

In [14]:
# Load configuration
config_path = project_root / 'config' / 'config.yaml'
config = load_config(config_path)

# Extract data factory configuration
df_config = config['data_factory']
ingestion_config = df_config['ingestion']
chunking_config = df_config['chunking']
generation_config = df_config['generation']
dataset_config = df_config['dataset']

# Get paths (both are resolved relative to the project root)
raw_data_path = project_root / config['environment']['paths']['raw_data']
output_path = project_root / config['environment']['paths']['data_dir'] / 'output'
output_path.mkdir(parents=True, exist_ok=True)

# Verify raw_data_path exists
if not raw_data_path.exists():
    raise FileNotFoundError(f"Raw data directory not found: {raw_data_path}")

# Get PDF filename from config, with fallback to actual file
pdf_filename = config['project']['document']
pdf_path = raw_data_path / pdf_filename

# If configured PDF doesn't exist, try to find any PDF in the directory
if not pdf_path.exists():
    # glob() yields entries in arbitrary order; sorting makes the fallback
    # choice reproducible when several PDFs are present.
    pdf_files = sorted(raw_data_path.glob('*.pdf'))
    if not pdf_files:
        raise FileNotFoundError(f"PDF file not found in {raw_data_path}. Expected: {pdf_path}")
    pdf_path = pdf_files[0]
    pdf_filename = pdf_path.name
    print(f"⚠ Config PDF '{config['project']['document']}' not found, using: {pdf_filename}")

# At this point pdf_path either existed as configured or came from the glob
# above, so no further existence check is needed.

print("✓ Configuration loaded")
print(f"  Raw data directory: {raw_data_path} (exists: {raw_data_path.exists()})")
print(f"  PDF path: {pdf_path} (exists: {pdf_path.exists()})")
print(f"  Output path: {output_path}")
print(f"  Chunk size: {chunking_config['chunk_size']} characters")
print(f"  Questions per chunk: {generation_config['questions_per_chunk']}")
print(f"  Categories: {', '.join(generation_config['categories'])}")

✓ Configuration loaded
  Raw data directory: /content/operation-ledger-mind/data/raw (exists: True)
  PDF path: /content/operation-ledger-mind/data/raw/2024-Annual-Report.pdf (exists: True)
  Output path: /content/operation-ledger-mind/data/output
  Chunk size: 1500 characters
  Questions per chunk: 10
  Categories: hard_facts, strategic_summary, stylistic_creative


## Step 2: PDF Ingestion & Cleaning

In [None]:
# Load PDF
print(f"Loading PDF from: {pdf_path}")

try:
    raw_text = load_pdf(pdf_path)
    print(f"✓ PDF loaded: {len(raw_text):,} characters")
except ValueError as e:
    print(f"Error loading PDF: {e}. Attempting to install pdfplumber...")
    %pip install pdfplumber  # Install pdfplumber if it's missing
    raw_text = load_pdf(pdf_path) # Retry loading after installation
    print(f"✓ PDF loaded after installing pdfplumber: {len(raw_text):,} characters")

# Clean text
print("\nCleaning text...")
cleaned_text = clean_text(
    raw_text,
    remove_headers=ingestion_config['remove_headers'],
    remove_footers=ingestion_config['remove_footers'],
    normalize_whitespace=ingestion_config['normalize_whitespace']
)
print(f"✓ Text cleaned: {len(cleaned_text):,} characters")
print(f"  Reduction: {len(raw_text) - len(cleaned_text):,} characters ({100*(len(raw_text)-len(cleaned_text))/len(raw_text):.1f}%")

Loading PDF from: /content/operation-ledger-mind/data/raw/2024-Annual-Report.pdf
Error loading PDF: Failed to extract text from PDF. Please install one of: PyMuPDF (fitz), pdfplumber, or PyPDF2. Attempting to install pdfplumber...
Collecting pdfplumber
  Downloading pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20251230 (from pdfplumber)
  Downloading pdfminer_six-20251230-py3-none-any.whl.metadata (4.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.3.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.9-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m6.5 MB/s[0m eta [36m0

## Step 3: Chunking Strategy

In [16]:
# Chunk the document
print("Chunking document...")
chunks = chunk_text(
    cleaned_text,
    chunk_size=chunking_config['chunk_size'],
    overlap=chunking_config['overlap']
)

# Sum the chunk lengths once and reuse the total for both statistics.
total_chunk_chars = sum(len(c['text']) for c in chunks)
# Guard the average so an empty chunk list reports 0 instead of raising
# ZeroDivisionError (the preview below already tolerates an empty list).
average_chunk_chars = total_chunk_chars / len(chunks) if chunks else 0

print(f"✓ Document chunked into {len(chunks)} chunks")
print(f"  Average chunk size: {average_chunk_chars:.0f} characters")
print(f"  Total characters: {total_chunk_chars:,}")

# Display first chunk preview
if chunks:
    print(f"\nFirst chunk preview (chunk_id={chunks[0]['chunk_id']}):")
    print(f"  {chunks[0]['text'][:200]}...")

Chunking document...
✓ Document chunked into 414 chunks
  Average chunk size: 1497 characters
  Total characters: 619,806

First chunk preview (chunk_id=0):
  On Our Way
Uber’s Mission
We reimagine the way the world moves for the better
We are Uber. The go-getters. The kind of people who are relentless about our
mission to help people go anywhere and get an...


## Step 4: Q/A Generation Loop

In [17]:
# Prepare LLM configurations
def _assemble_llm_config(role_settings):
    """Build the settings dict passed to the Q/A generator for one LLM role.

    Copies provider, model, temperature and max_tokens from `role_settings`,
    then adds timeout_seconds and max_retries from config['providers']['openai'].
    """
    assembled = {
        field: role_settings[field]
        for field in ('provider', 'model', 'temperature', 'max_tokens')
    }
    # NOTE(review): timeout and retry limits are always read from the 'openai'
    # provider entry, whatever role_settings['provider'] says — confirm this is
    # intended if another provider is ever configured.
    shared_limits = config['providers']['openai']
    assembled['timeout_seconds'] = shared_limits['timeout_seconds']
    assembled['max_retries'] = shared_limits['max_retries']
    return assembled


question_llm_config = _assemble_llm_config(generation_config['question_llm'])
answer_llm_config = _assemble_llm_config(generation_config['answer_llm'])

categories = generation_config['categories']

# Echo the effective settings so the run is self-describing
print("LLM Configuration:")
for role_label, role_config in (("Question", question_llm_config), ("Answer", answer_llm_config)):
    print(f"  {role_label} LLM: {role_config['model']} (temp={role_config['temperature']})")
print(f"  Categories: {', '.join(categories)}")

LLM Configuration:
  Question LLM: gpt-4o-mini (temp=0.3)
  Answer LLM: gpt-4o (temp=0.2)
  Categories: hard_facts, strategic_summary, stylistic_creative


In [18]:
# Generate Q/A pairs for each chunk
all_qa_pairs = []
failed_chunks = []
total_chunks = len(chunks)

print(f"\nGenerating Q/A pairs for {total_chunks} chunks...")
print("=" * 60)

# enumerate() supplies the progress counter, so reporting does not depend on
# chunk_id being a zero-based integer.
for position, chunk in enumerate(tqdm(chunks, desc="Processing chunks"), start=1):
    chunk_id = chunk['chunk_id']

    try:
        # Generate Q/A pairs for this chunk
        qa_pairs = generate_qa_pairs(
            chunk=chunk,
            question_llm_config=question_llm_config,
            answer_llm_config=answer_llm_config,
            categories=categories
        )

        # Add chunk text reference to each pair (optional, for debugging).
        # Note this field travels with the pair into later cells.
        for pair in qa_pairs:
            pair['chunk_text'] = chunk['text'][:500]  # Store first 500 chars for reference

        all_qa_pairs.extend(qa_pairs)

    except Exception as e:
        # Best effort: record the failure and keep going with the next chunk.
        # tqdm.write keeps the message from breaking up the progress bar.
        tqdm.write(f"\n⚠ Error processing chunk {chunk_id}: {str(e)}")
        failed_chunks.append(chunk_id)
        continue

    # Progress update, kept outside the try block so a reporting problem can
    # never mark a successfully processed chunk as failed.
    if position % 5 == 0:
        tqdm.write(f"  Processed {position}/{total_chunks} chunks - {len(all_qa_pairs)} Q/A pairs generated")

print("=" * 60)
print(f"\n✓ Q/A generation complete!")
print(f"  Total Q/A pairs: {len(all_qa_pairs)}")
print(f"  Expected: {total_chunks * generation_config['questions_per_chunk']}")
print(f"  Failed chunks: {len(failed_chunks)}")
if failed_chunks:
    print(f"  Failed chunk IDs: {failed_chunks}")


Generating Q/A pairs for 414 chunks...


Processing chunks:   1%|          | 5/414 [00:44<54:59,  8.07s/it]  

  Processed 5/414 chunks - 50 Q/A pairs generated


Processing chunks:   2%|▏         | 10/414 [01:24<53:06,  7.89s/it]

  Processed 10/414 chunks - 100 Q/A pairs generated


Processing chunks:   4%|▎         | 15/414 [02:07<59:24,  8.93s/it]  

  Processed 15/414 chunks - 150 Q/A pairs generated


Processing chunks:   5%|▍         | 20/414 [02:44<51:08,  7.79s/it]

  Processed 20/414 chunks - 200 Q/A pairs generated


Processing chunks:   6%|▌         | 25/414 [03:36<1:10:15, 10.84s/it]

  Processed 25/414 chunks - 250 Q/A pairs generated


Processing chunks:   7%|▋         | 30/414 [04:17<58:41,  9.17s/it]

  Processed 30/414 chunks - 300 Q/A pairs generated


Processing chunks:   8%|▊         | 35/414 [05:03<58:55,  9.33s/it]

  Processed 35/414 chunks - 350 Q/A pairs generated


Processing chunks:  10%|▉         | 40/414 [05:53<59:40,  9.57s/it]  

  Processed 40/414 chunks - 400 Q/A pairs generated


Processing chunks:  11%|█         | 45/414 [06:34<51:51,  8.43s/it]

  Processed 45/414 chunks - 450 Q/A pairs generated


Processing chunks:  12%|█▏        | 50/414 [07:28<1:07:41, 11.16s/it]

  Processed 50/414 chunks - 500 Q/A pairs generated


Processing chunks:  13%|█▎        | 55/414 [08:07<48:59,  8.19s/it]

  Processed 55/414 chunks - 550 Q/A pairs generated


Processing chunks:  14%|█▍        | 60/414 [08:55<53:22,  9.05s/it]

  Processed 60/414 chunks - 600 Q/A pairs generated


Processing chunks:  16%|█▌        | 65/414 [09:47<57:09,  9.83s/it]  

  Processed 65/414 chunks - 650 Q/A pairs generated


Processing chunks:  17%|█▋        | 70/414 [10:40<1:00:26, 10.54s/it]

  Processed 70/414 chunks - 700 Q/A pairs generated


Processing chunks:  18%|█▊        | 75/414 [11:35<1:06:22, 11.75s/it]

  Processed 75/414 chunks - 750 Q/A pairs generated


Processing chunks:  19%|█▉        | 80/414 [12:22<52:55,  9.51s/it]

  Processed 80/414 chunks - 800 Q/A pairs generated


Processing chunks:  21%|██        | 85/414 [13:11<52:43,  9.61s/it]

  Processed 85/414 chunks - 850 Q/A pairs generated


Processing chunks:  22%|██▏       | 90/414 [14:04<55:28, 10.27s/it]

  Processed 90/414 chunks - 900 Q/A pairs generated


Processing chunks:  23%|██▎       | 95/414 [14:47<46:21,  8.72s/it]

  Processed 95/414 chunks - 950 Q/A pairs generated


Processing chunks:  24%|██▍       | 100/414 [15:32<49:58,  9.55s/it]

  Processed 100/414 chunks - 1000 Q/A pairs generated


Processing chunks:  25%|██▌       | 105/414 [16:17<48:49,  9.48s/it]

  Processed 105/414 chunks - 1050 Q/A pairs generated


Processing chunks:  27%|██▋       | 110/414 [17:02<46:36,  9.20s/it]

  Processed 110/414 chunks - 1100 Q/A pairs generated


Processing chunks:  28%|██▊       | 115/414 [17:59<53:09, 10.67s/it]

  Processed 115/414 chunks - 1150 Q/A pairs generated


Processing chunks:  29%|██▉       | 120/414 [18:50<45:26,  9.27s/it]

  Processed 120/414 chunks - 1200 Q/A pairs generated


Processing chunks:  30%|███       | 125/414 [19:48<53:08, 11.03s/it]

  Processed 125/414 chunks - 1250 Q/A pairs generated


Processing chunks:  31%|███▏      | 130/414 [20:32<42:52,  9.06s/it]

  Processed 130/414 chunks - 1300 Q/A pairs generated


Processing chunks:  33%|███▎      | 135/414 [21:13<37:32,  8.07s/it]

  Processed 135/414 chunks - 1350 Q/A pairs generated


Processing chunks:  34%|███▍      | 140/414 [21:58<42:12,  9.24s/it]

  Processed 140/414 chunks - 1400 Q/A pairs generated


Processing chunks:  35%|███▌      | 145/414 [22:39<36:08,  8.06s/it]

  Processed 145/414 chunks - 1450 Q/A pairs generated


Processing chunks:  36%|███▌      | 150/414 [23:17<35:01,  7.96s/it]

  Processed 150/414 chunks - 1500 Q/A pairs generated


Processing chunks:  37%|███▋      | 155/414 [23:55<31:15,  7.24s/it]

  Processed 155/414 chunks - 1550 Q/A pairs generated


Processing chunks:  39%|███▊      | 160/414 [24:46<39:59,  9.45s/it]

  Processed 160/414 chunks - 1600 Q/A pairs generated


Processing chunks:  40%|███▉      | 165/414 [25:34<40:14,  9.70s/it]

  Processed 165/414 chunks - 1650 Q/A pairs generated


Processing chunks:  41%|████      | 170/414 [26:31<50:30, 12.42s/it]

  Processed 170/414 chunks - 1700 Q/A pairs generated


Processing chunks:  42%|████▏     | 175/414 [27:18<40:36, 10.19s/it]

  Processed 175/414 chunks - 1750 Q/A pairs generated


Processing chunks:  43%|████▎     | 180/414 [27:57<31:57,  8.19s/it]

  Processed 180/414 chunks - 1800 Q/A pairs generated


Processing chunks:  45%|████▍     | 185/414 [28:48<32:34,  8.53s/it]

  Processed 185/414 chunks - 1850 Q/A pairs generated


Processing chunks:  46%|████▌     | 190/414 [29:24<27:47,  7.44s/it]

  Processed 190/414 chunks - 1900 Q/A pairs generated


Processing chunks:  47%|████▋     | 195/414 [29:59<24:58,  6.84s/it]

  Processed 195/414 chunks - 1950 Q/A pairs generated


Processing chunks:  48%|████▊     | 200/414 [30:35<24:17,  6.81s/it]

  Processed 200/414 chunks - 2000 Q/A pairs generated


Processing chunks:  50%|████▉     | 205/414 [31:15<28:07,  8.07s/it]

  Processed 205/414 chunks - 2050 Q/A pairs generated


Processing chunks:  51%|█████     | 210/414 [31:52<24:01,  7.06s/it]

  Processed 210/414 chunks - 2100 Q/A pairs generated


Processing chunks:  52%|█████▏    | 215/414 [32:33<26:05,  7.87s/it]

  Processed 215/414 chunks - 2150 Q/A pairs generated


Processing chunks:  53%|█████▎    | 220/414 [33:07<21:51,  6.76s/it]

  Processed 220/414 chunks - 2200 Q/A pairs generated


Processing chunks:  54%|█████▍    | 225/414 [33:52<29:04,  9.23s/it]

  Processed 225/414 chunks - 2250 Q/A pairs generated


Processing chunks:  56%|█████▌    | 230/414 [34:41<29:18,  9.56s/it]

  Processed 230/414 chunks - 2300 Q/A pairs generated


Processing chunks:  57%|█████▋    | 235/414 [35:33<30:38, 10.27s/it]

  Processed 235/414 chunks - 2350 Q/A pairs generated


Processing chunks:  58%|█████▊    | 240/414 [36:20<26:07,  9.01s/it]

  Processed 240/414 chunks - 2400 Q/A pairs generated


Processing chunks:  59%|█████▉    | 245/414 [37:09<27:49,  9.88s/it]

  Processed 245/414 chunks - 2450 Q/A pairs generated


Processing chunks:  60%|██████    | 250/414 [37:48<21:52,  8.01s/it]

  Processed 250/414 chunks - 2500 Q/A pairs generated


Processing chunks:  62%|██████▏   | 255/414 [38:32<23:21,  8.81s/it]

  Processed 255/414 chunks - 2550 Q/A pairs generated


Processing chunks:  63%|██████▎   | 260/414 [39:12<21:20,  8.32s/it]

  Processed 260/414 chunks - 2600 Q/A pairs generated


Processing chunks:  64%|██████▍   | 265/414 [39:53<20:16,  8.16s/it]

  Processed 265/414 chunks - 2650 Q/A pairs generated


Processing chunks:  65%|██████▌   | 270/414 [40:37<20:06,  8.38s/it]

  Processed 270/414 chunks - 2700 Q/A pairs generated


Processing chunks:  66%|██████▋   | 275/414 [41:24<21:54,  9.45s/it]

  Processed 275/414 chunks - 2750 Q/A pairs generated


Processing chunks:  68%|██████▊   | 280/414 [41:58<16:08,  7.23s/it]

  Processed 280/414 chunks - 2800 Q/A pairs generated


Processing chunks:  69%|██████▉   | 285/414 [42:41<16:46,  7.80s/it]

  Processed 285/414 chunks - 2850 Q/A pairs generated


Processing chunks:  70%|███████   | 290/414 [43:25<19:48,  9.58s/it]

  Processed 290/414 chunks - 2900 Q/A pairs generated


Processing chunks:  71%|███████▏  | 295/414 [44:06<15:54,  8.02s/it]

  Processed 295/414 chunks - 2950 Q/A pairs generated


Processing chunks:  72%|███████▏  | 300/414 [44:46<15:00,  7.90s/it]

  Processed 300/414 chunks - 3000 Q/A pairs generated


Processing chunks:  74%|███████▎  | 305/414 [45:31<15:57,  8.78s/it]

  Processed 305/414 chunks - 3050 Q/A pairs generated


Processing chunks:  75%|███████▍  | 310/414 [46:12<14:30,  8.37s/it]

  Processed 310/414 chunks - 3100 Q/A pairs generated


Processing chunks:  76%|███████▌  | 315/414 [46:55<13:28,  8.17s/it]

  Processed 315/414 chunks - 3150 Q/A pairs generated


Processing chunks:  77%|███████▋  | 320/414 [47:42<14:04,  8.98s/it]

  Processed 320/414 chunks - 3200 Q/A pairs generated


Processing chunks:  79%|███████▊  | 325/414 [48:23<12:28,  8.41s/it]

  Processed 325/414 chunks - 3250 Q/A pairs generated


Processing chunks:  80%|███████▉  | 330/414 [48:57<09:54,  7.08s/it]

  Processed 330/414 chunks - 3300 Q/A pairs generated


Processing chunks:  81%|████████  | 335/414 [49:27<08:09,  6.19s/it]

  Processed 335/414 chunks - 3350 Q/A pairs generated


Processing chunks:  82%|████████▏ | 340/414 [50:06<10:03,  8.16s/it]

  Processed 340/414 chunks - 3400 Q/A pairs generated


Processing chunks:  83%|████████▎ | 345/414 [50:44<08:24,  7.31s/it]

  Processed 345/414 chunks - 3450 Q/A pairs generated


Processing chunks:  85%|████████▍ | 350/414 [51:20<07:20,  6.88s/it]

  Processed 350/414 chunks - 3500 Q/A pairs generated


Processing chunks:  86%|████████▌ | 355/414 [51:58<07:06,  7.22s/it]

  Processed 355/414 chunks - 3550 Q/A pairs generated


Processing chunks:  87%|████████▋ | 360/414 [52:45<08:36,  9.56s/it]

  Processed 360/414 chunks - 3600 Q/A pairs generated


Processing chunks:  88%|████████▊ | 365/414 [53:28<07:34,  9.27s/it]

  Processed 365/414 chunks - 3650 Q/A pairs generated


Processing chunks:  89%|████████▉ | 370/414 [54:09<06:13,  8.50s/it]

  Processed 370/414 chunks - 3700 Q/A pairs generated


Processing chunks:  91%|█████████ | 375/414 [54:45<04:39,  7.16s/it]

  Processed 375/414 chunks - 3750 Q/A pairs generated


Processing chunks:  92%|█████████▏| 380/414 [55:28<04:25,  7.81s/it]

  Processed 380/414 chunks - 3800 Q/A pairs generated


Processing chunks:  93%|█████████▎| 385/414 [56:08<03:44,  7.73s/it]

  Processed 385/414 chunks - 3850 Q/A pairs generated


Processing chunks:  94%|█████████▍| 390/414 [56:45<02:50,  7.12s/it]

  Processed 390/414 chunks - 3900 Q/A pairs generated


Processing chunks:  95%|█████████▌| 395/414 [57:18<02:13,  7.02s/it]

  Processed 395/414 chunks - 3950 Q/A pairs generated


Processing chunks:  97%|█████████▋| 400/414 [57:54<01:42,  7.29s/it]

  Processed 400/414 chunks - 4000 Q/A pairs generated


Processing chunks:  98%|█████████▊| 405/414 [58:27<00:58,  6.49s/it]

  Processed 405/414 chunks - 4050 Q/A pairs generated


Processing chunks:  99%|█████████▉| 410/414 [58:56<00:23,  5.99s/it]

  Processed 410/414 chunks - 4100 Q/A pairs generated


Processing chunks: 100%|██████████| 414/414 [59:21<00:00,  8.60s/it]


✓ Q/A generation complete!
  Total Q/A pairs: 4140
  Expected: 4140
  Failed chunks: 0





In [19]:
# Keep only the pairs accepted by filter_valid_pairs and report how many were dropped
valid_pairs = filter_valid_pairs(all_qa_pairs)
invalid_count = len(all_qa_pairs) - len(valid_pairs)

print("Quality Check:")
print(f"  Valid pairs: {len(valid_pairs)}")
print(f"  Invalid pairs removed: {invalid_count}")

# Tally pairs per category; pairs without a 'category' key count as 'unknown'
category_counts = {}
for qa in valid_pairs:
    label = qa.get('category', 'unknown')
    category_counts.setdefault(label, 0)
    category_counts[label] += 1

print("\nCategory Distribution:")
for label, tally in category_counts.items():
    if valid_pairs:
        share = 100 * tally / len(valid_pairs)
    else:
        share = 0
    print(f"  {label}: {tally} ({share:.1f}%)")

# Show up to three pairs, each truncated to 100 characters
preview_pairs = valid_pairs[:3]
if valid_pairs:
    print("\nSample Q/A Pairs:")
    for example_no, qa in enumerate(preview_pairs, start=1):
        print(f"\n  Example {example_no} ({qa.get('category', 'unknown')}):")
        print(f"    Q: {qa['question'][:100]}...")
        print(f"    A: {qa['answer'][:100]}...")

Quality Check:
  Valid pairs: 4140
  Invalid pairs removed: 0

Category Distribution:
  strategic_summary: 708 (17.1%)
  hard_facts: 2533 (61.2%)
  stylistic_creative: 899 (21.7%)

Sample Q/A Pairs:

  Example 1 (strategic_summary):
    Q: What is Uber's mission as stated in the annual report?...
    A: We reimagine the way the world moves for the better...

  Example 2 (hard_facts):
    Q: For which fiscal year does this annual report apply?...
    A: For the fiscal year ended December 31, 2024...

  Example 3 (hard_facts):
    Q: What is the commission file number associated with Uber Technologies, Inc.?...
    A: 001-38902...


## Step 5: Dataset Splitting & Storage

In [21]:
# Split dataset into train and test sets
print("Splitting dataset...")
train_pairs, test_pairs = split_dataset(
    all_pairs=valid_pairs,
    train_split=dataset_config['train_split'],
    shuffle=dataset_config['shuffle_before_split'],
    seed=dataset_config['seed']
)

# Guard the percentages so an empty dataset reports 0.0% instead of raising
# ZeroDivisionError (the category report in the quality-check cell guards the
# same division).
total_valid = len(valid_pairs)
train_pct = 100 * len(train_pairs) / total_valid if total_valid else 0
test_pct = 100 * len(test_pairs) / total_valid if total_valid else 0

print(f"✓ Dataset split:")
print(f"  Train set: {len(train_pairs)} pairs ({train_pct:.1f}%)")
print(f"  Test set: {len(test_pairs)} pairs ({test_pct:.1f}%)")

Splitting dataset...
✓ Dataset split:
  Train set: 3312 pairs (80.0%)
  Test set: 828 pairs (20.0%)


In [22]:
# Destination files for the two splits; file names come from the dataset config
train_file = output_path / dataset_config['train_file']
test_file = output_path / dataset_config['test_file']

print("\nSaving datasets...")
for split_pairs, split_file in ((train_pairs, train_file), (test_pairs, test_file)):
    save_to_jsonl(split_pairs, split_file)

print("✓ Datasets saved:")
for split_label, split_file in (("Train", train_file), ("Test", test_file)):
    print(f"  {split_label}: {split_file}")

# Confirm both files are on disk by reporting their sizes in kilobytes
print("\nFile verification:")
for split_label, split_file in (("Train", train_file), ("Test", test_file)):
    size_kb = split_file.stat().st_size / 1024
    print(f"  {split_label} file size: {size_kb:.1f} KB")


Saving datasets...
✓ Datasets saved:
  Train: /content/operation-ledger-mind/data/output/train.jsonl
  Test: /content/operation-ledger-mind/data/output/golden_test_set.jsonl

File verification:
  Train file size: 2620.1 KB
  Test file size: 659.1 KB


## Step 6: Verification & Summary

In [23]:
# Load and verify a few samples from each file
def _peek_jsonl(path, limit=3):
    """Return the first `limit` records of a JSONL file as parsed objects.

    Only the needed lines are read: zip() stops as soon as range(limit) is
    exhausted, so the rest of the file is never loaded into memory.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return [json.loads(line) for _, line in zip(range(limit), handle)]


def _show_samples(samples):
    """Print the truncated question and the category of each sample."""
    for i, sample in enumerate(samples, 1):
        print(f"  {i}. Q: {sample['question'][:80]}...")
        print(f"     Category: {sample.get('category', 'N/A')}")


print("Verifying saved files...\n")

# Check train file
train_samples = _peek_jsonl(train_file)
print(f"Train file samples (first 3):")
_show_samples(train_samples)

# Check test file
test_samples = _peek_jsonl(test_file)
print(f"\nTest file samples (first 3):")
_show_samples(test_samples)

print("\n" + "=" * 60)
print("✓ DATA FACTORY PIPELINE COMPLETE!")
print("=" * 60)
print(f"\nSummary:")
print(f"  • PDF processed: {pdf_filename}")
print(f"  • Chunks created: {len(chunks)}")
print(f"  • Total Q/A pairs: {len(valid_pairs)}")
print(f"  • Train set: {len(train_pairs)} pairs")
print(f"  • Test set: {len(test_pairs)} pairs")
print(f"  • Output directory: {output_path}")

Verifying saved files...

Train file samples (first 3):
  1. Q: What valuation model is used to determine the grant-date fair value of market-ba...
     Category: strategic_summary
  2. Q: What is the tone of the report regarding the urgency of addressing climate chang...
     Category: stylistic_creative
  3. Q: What does Adjusted EBITDA represent in the context of the company's financial me...
     Category: hard_facts

Test file samples (first 3):
  1. Q: In what ways could the potential disclosure of personal data impact the company'...
     Category: hard_facts
  2. Q: What types of assets are considered when allocating the fair value of purchase c...
     Category: hard_facts
  3. Q: How does the company plan to evaluate the impact of the new accounting standards...
     Category: strategic_summary

✓ DATA FACTORY PIPELINE COMPLETE!

Summary:
  • PDF processed: 2024-Annual-Report.pdf
  • Chunks created: 414
  • Total Q/A pairs: 4140
  • Train set: 3312 pairs
  • Test set: 828 pai