## Setup: Imports and Environment Configuration

In [1]:
import os
import sys
from pathlib import Path
from dotenv import load_dotenv
import json
from tqdm import tqdm

# Resolve the project root: when the kernel was started inside `notebooks/`
# the repository root is one level up; otherwise the working directory is used.
project_root = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
# Put the root first on sys.path so the `src.*` imports below can resolve.
sys.path.insert(0, str(project_root))

# Load environment variables from the project's .env file (if present).
load_dotenv(project_root / '.env')

# Validate the OpenAI API key once, after .env has been merged into the
# environment. Surrounding whitespace (e.g. a trailing newline in .env) is
# stripped, and the variable is only written back when it holds a real value,
# so a missing key never leaves an empty OPENAI_API_KEY behind.
openai_api_key = os.environ.get('OPENAI_API_KEY', '').strip()
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY not found in .env file or environment variables")
os.environ['OPENAI_API_KEY'] = openai_api_key

# Project modules are imported here (not at the top of the cell) because they
# only become importable after project_root has been added to sys.path.
from src.utils.config_loader import load_config
from src.ingestion.pdf_loader import load_pdf, clean_text
from src.ingestion.chunker import chunk_text
from src.ingestion.qa_generator import generate_qa_pairs
from src.ingestion.dataset_writer import save_to_jsonl, split_dataset, filter_valid_pairs

# Report what was resolved; the key itself is never printed, only its presence.
print("✓ Imports and environment setup complete")
print(f"✓ Project root: {project_root}")
print(f"✓ OpenAI API key loaded: {'Yes' if os.environ.get('OPENAI_API_KEY') else 'No'}")

✓ Imports and environment setup complete
✓ Project root: c:\Users\Lochani\Documents\AEE-Bootcamp-Projects\AI Engineer Essentials - Mini Project 01\operation-ledger-mind
✓ OpenAI API key loaded: Yes


## Step 1: Load Configuration

In [3]:
# Load the project configuration from config/config.yaml via load_config.
config_path = project_root / 'config' / 'config.yaml'
config = load_config(config_path)

# Pull out the data-factory sections used by the later pipeline steps.
df_config = config['data_factory']
ingestion_config = df_config['ingestion']
chunking_config = df_config['chunking']
generation_config = df_config['generation']
dataset_config = df_config['dataset']

# Resolve input/output locations relative to the project root and make sure
# the output directory exists before anything tries to write into it.
raw_data_path = project_root / config['environment']['paths']['raw_data']
output_path = project_root / config['environment']['paths']['data_dir'] / 'output'
output_path.mkdir(parents=True, exist_ok=True)

# Fail early with a clear message when the raw data directory is missing.
if not raw_data_path.exists():
    raise FileNotFoundError(f"Raw data directory not found: {raw_data_path}")

# The configured document is preferred.
pdf_filename = config['project']['document']
pdf_path = raw_data_path / pdf_filename

# Fallback: if the configured PDF is absent, use a PDF found in the directory.
# Candidates are sorted so the choice is the same on every run and platform
# (directory listing order is otherwise filesystem-dependent).
if not pdf_path.exists():
    pdf_files = sorted(raw_data_path.glob('*.pdf'))
    if not pdf_files:
        raise FileNotFoundError(f"PDF file not found in {raw_data_path}. Expected: {pdf_path}")
    pdf_path = pdf_files[0]
    pdf_filename = pdf_path.name
    print(f"⚠ Config PDF '{config['project']['document']}' not found, using: {pdf_filename}")

# At this point pdf_path either existed as configured or was just discovered
# by the glob above, so no further existence check is needed.
print("✓ Configuration loaded")
print(f"  Raw data directory: {raw_data_path} (exists: {raw_data_path.exists()})")
print(f"  PDF path: {pdf_path} (exists: {pdf_path.exists()})")
print(f"  Output path: {output_path}")
print(f"  Chunk size: {chunking_config['chunk_size']} characters")
print(f"  Questions per chunk: {generation_config['questions_per_chunk']}")
print(f"  Categories: {', '.join(generation_config['categories'])}")

✓ Configuration loaded
  Raw data directory: c:\Users\Lochani\Documents\AEE-Bootcamp-Projects\AI Engineer Essentials - Mini Project 01\operation-ledger-mind\data\raw (exists: True)
  PDF path: c:\Users\Lochani\Documents\AEE-Bootcamp-Projects\AI Engineer Essentials - Mini Project 01\operation-ledger-mind\data\raw\2024-Annual-Report.pdf (exists: True)
  Output path: c:\Users\Lochani\Documents\AEE-Bootcamp-Projects\AI Engineer Essentials - Mini Project 01\operation-ledger-mind\data\output
  Chunk size: 1500 characters
  Questions per chunk: 10
  Categories: hard_facts, strategic_summary, stylistic_creative


## Step 2: PDF Ingestion & Cleaning

In [4]:
# Extract the raw text of the PDF selected in the configuration step.
print(f"Loading PDF from: {pdf_path}")
raw_text = load_pdf(pdf_path)
print(f"✓ PDF loaded: {len(raw_text):,} characters")

# Clean the extracted text using the flags from the ingestion config section.
print("\nCleaning text...")
cleaned_text = clean_text(
    raw_text,
    remove_headers=ingestion_config['remove_headers'],
    remove_footers=ingestion_config['remove_footers'],
    normalize_whitespace=ingestion_config['normalize_whitespace']
)
print(f"✓ Text cleaned: {len(cleaned_text):,} characters")

# Report how much the cleaning removed. The percentage is guarded so that a
# PDF yielding no text (e.g. no extractable text layer) produces a readable
# warning instead of a ZeroDivisionError.
raw_length = len(raw_text)
chars_removed = raw_length - len(cleaned_text)
reduction_pct = 100 * chars_removed / raw_length if raw_length else 0.0
print(f"  Reduction: {chars_removed:,} characters ({reduction_pct:.1f}%)")
if not raw_length:
    print("⚠ No text was extracted from the PDF; later steps will have nothing to process")

Loading PDF from: c:\Users\Lochani\Documents\AEE-Bootcamp-Projects\AI Engineer Essentials - Mini Project 01\operation-ledger-mind\data\raw\2024-Annual-Report.pdf
✓ PDF loaded: 637,048 characters

Cleaning text...
✓ Text cleaned: 621,413 characters
  Reduction: 15,635 characters (2.5%)


## Step 3: Chunking Strategy

In [5]:
# Split the cleaned text into chunks using the configured size and overlap.
print("Chunking document...")
chunks = chunk_text(
    cleaned_text,
    chunk_size=chunking_config['chunk_size'],
    overlap=chunking_config['overlap']
)

# Summary statistics. The character total is computed once and the average is
# guarded, so an empty chunk list prints zeros instead of dividing by zero
# (the preview below was already guarded the same way).
total_chunk_chars = sum(len(c['text']) for c in chunks)
average_chunk_chars = total_chunk_chars / len(chunks) if chunks else 0

print(f"✓ Document chunked into {len(chunks)} chunks")
print(f"  Average chunk size: {average_chunk_chars:.0f} characters")
print(f"  Total characters: {total_chunk_chars:,}")

# Show the beginning of the first chunk as a quick sanity check.
if chunks:
    print(f"\nFirst chunk preview (chunk_id={chunks[0]['chunk_id']}):")
    print(f"  {chunks[0]['text'][:200]}...")

Chunking document...
✓ Document chunked into 415 chunks
  Average chunk size: 1497 characters
  Total characters: 621,413

First chunk preview (chunk_id=0):
  On Our Way

Uber’s Mission
We reimagine the way the world moves for the better
We are Uber. The go-getters. The kind of people who are relentless about our
mission to help people go anywhere and get a...


## Step 4: Q/A Generation Loop

In [6]:
# Build one settings dict per LLM role. Model-specific values come from the
# role's entry in the generation config; connection values (timeout, retries)
# come from the OpenAI provider section and are the same for both roles.
model_keys = ('provider', 'model', 'temperature', 'max_tokens')
connection_keys = ('timeout_seconds', 'max_retries')

question_llm_config = {key: generation_config['question_llm'][key] for key in model_keys}
question_llm_config.update(
    (key, config['providers']['openai'][key]) for key in connection_keys
)

answer_llm_config = {key: generation_config['answer_llm'][key] for key in model_keys}
answer_llm_config.update(
    (key, config['providers']['openai'][key]) for key in connection_keys
)

# Question categories passed to the generator for every chunk.
categories = generation_config['categories']

# Echo the effective settings so the run is self-describing.
print("LLM Configuration:")
print(
    f"  Question LLM: {question_llm_config['model']} "
    f"(temp={question_llm_config['temperature']})"
)
print(
    f"  Answer LLM: {answer_llm_config['model']} "
    f"(temp={answer_llm_config['temperature']})"
)
print(f"  Categories: {', '.join(categories)}")

LLM Configuration:
  Question LLM: gpt-4o-mini (temp=0.3)
  Answer LLM: gpt-4o (temp=0.2)
  Categories: hard_facts, strategic_summary, stylistic_creative


In [7]:
# Generate Q/A pairs for every chunk.
#
# Results accumulate in `all_qa_pairs`; ids of chunks whose processing raised
# are collected in `failed_chunks` so one bad chunk does not stop the run.
# NOTE: re-running this cell resets both lists and starts over from chunk 0.
all_qa_pairs = []
failed_chunks = []

print(f"\nGenerating Q/A pairs for {len(chunks)} chunks...")
print("=" * 60)

try:
    # `position` counts chunks visited (1-based). Progress reporting uses it
    # instead of `chunk_id + 1`, so a non-integer or non-sequential chunk_id
    # can no longer raise inside the try block and mark a successful chunk
    # as failed.
    for position, chunk in enumerate(tqdm(chunks, desc="Processing chunks"), start=1):
        chunk_id = chunk['chunk_id']

        try:
            # Generate Q/A pairs for this chunk
            qa_pairs = generate_qa_pairs(
                chunk=chunk,
                question_llm_config=question_llm_config,
                answer_llm_config=answer_llm_config,
                categories=categories
            )

            # Attach the first 500 characters of the source chunk to each pair
            # (kept for debugging / traceability).
            for pair in qa_pairs:
                pair['chunk_text'] = chunk['text'][:500]

            all_qa_pairs.extend(qa_pairs)

            # Periodic progress line; tqdm.write keeps the progress bar intact.
            if position % 5 == 0:
                tqdm.write(
                    f"  Processed {position}/{len(chunks)} chunks - "
                    f"{len(all_qa_pairs)} Q/A pairs generated"
                )

        except Exception as e:
            # Best effort: record the failure and move on to the next chunk.
            tqdm.write(f"\n⚠ Error processing chunk {chunk_id}: {e}")
            failed_chunks.append(chunk_id)
            continue
except KeyboardInterrupt:
    # A manual interrupt still stops the cell (re-raised below), but first
    # tell the user that the pairs gathered so far are still in memory.
    tqdm.write(
        f"\n⚠ Interrupted: {len(all_qa_pairs)} Q/A pairs collected so far "
        f"remain available in all_qa_pairs"
    )
    raise

print("=" * 60)
print("\n✓ Q/A generation complete!")
print(f"  Total Q/A pairs: {len(all_qa_pairs)}")
print(f"  Expected: {len(chunks) * generation_config['questions_per_chunk']}")
print(f"  Failed chunks: {len(failed_chunks)}")
if failed_chunks:
    print(f"  Failed chunk IDs: {failed_chunks}")


Generating Q/A pairs for 415 chunks...


Processing chunks:   1%|          | 5/415 [01:34<1:54:49, 16.80s/it]

  Processed 5/415 chunks - 50 Q/A pairs generated


Processing chunks:   2%|▏         | 10/415 [03:30<2:28:15, 21.96s/it]

  Processed 10/415 chunks - 100 Q/A pairs generated


Processing chunks:   4%|▎         | 15/415 [05:27<2:37:07, 23.57s/it]

  Processed 15/415 chunks - 150 Q/A pairs generated


Processing chunks:   5%|▍         | 20/415 [07:17<2:31:36, 23.03s/it]

  Processed 20/415 chunks - 200 Q/A pairs generated


Processing chunks:   6%|▌         | 25/415 [09:03<2:19:34, 21.47s/it]

  Processed 25/415 chunks - 250 Q/A pairs generated


Processing chunks:   7%|▋         | 30/415 [11:00<2:35:24, 24.22s/it]

  Processed 30/415 chunks - 300 Q/A pairs generated


Processing chunks:   8%|▊         | 35/415 [12:33<2:06:17, 19.94s/it]

  Processed 35/415 chunks - 350 Q/A pairs generated


Processing chunks:  10%|▉         | 40/415 [14:10<1:57:22, 18.78s/it]

  Processed 40/415 chunks - 400 Q/A pairs generated


Processing chunks:  11%|█         | 45/415 [16:11<2:31:31, 24.57s/it]

  Processed 45/415 chunks - 450 Q/A pairs generated


Processing chunks:  12%|█▏        | 50/415 [18:33<2:45:14, 27.16s/it]

  Processed 50/415 chunks - 500 Q/A pairs generated


Processing chunks:  13%|█▎        | 55/415 [20:30<2:20:19, 23.39s/it]

  Processed 55/415 chunks - 550 Q/A pairs generated


Processing chunks:  14%|█▍        | 60/415 [22:51<2:44:31, 27.81s/it]

  Processed 60/415 chunks - 600 Q/A pairs generated


Processing chunks:  16%|█▌        | 65/415 [24:40<2:17:12, 23.52s/it]

  Processed 65/415 chunks - 650 Q/A pairs generated


Processing chunks:  17%|█▋        | 70/415 [26:39<2:17:48, 23.97s/it]

  Processed 70/415 chunks - 700 Q/A pairs generated


Processing chunks:  18%|█▊        | 75/415 [28:36<2:19:06, 24.55s/it]

  Processed 75/415 chunks - 750 Q/A pairs generated


Processing chunks:  19%|█▉        | 80/415 [30:30<2:10:48, 23.43s/it]

  Processed 80/415 chunks - 800 Q/A pairs generated


Processing chunks:  20%|██        | 85/415 [32:40<2:22:42, 25.95s/it]

  Processed 85/415 chunks - 850 Q/A pairs generated


Processing chunks:  21%|██        | 86/415 [33:04<2:06:32, 23.08s/it]


KeyboardInterrupt: 

In [None]:
# Keep only the pairs accepted by filter_valid_pairs and report how many
# were dropped.
valid_pairs = filter_valid_pairs(all_qa_pairs)
invalid_count = len(all_qa_pairs) - len(valid_pairs)

print("Quality Check:")
print(f"  Valid pairs: {len(valid_pairs)}")
print(f"  Invalid pairs removed: {invalid_count}")

# Tally pairs per category; pairs without a 'category' key count as 'unknown'.
category_counts = {}
for qa in valid_pairs:
    label = qa.get('category', 'unknown')
    if label in category_counts:
        category_counts[label] += 1
    else:
        category_counts[label] = 1

print("\nCategory Distribution:")
for label, tally in category_counts.items():
    if valid_pairs:
        share = 100 * tally / len(valid_pairs)
    else:
        share = 0
    print(f"  {label}: {tally} ({share:.1f}%)")

# Show up to three example pairs, each field truncated to 100 characters.
if valid_pairs:
    print("\nSample Q/A Pairs:")
    example_no = 0
    for qa in valid_pairs[:3]:
        example_no += 1
        print(f"\n  Example {example_no} ({qa.get('category', 'unknown')}):")
        print(f"    Q: {qa['question'][:100]}...")
        print(f"    A: {qa['answer'][:100]}...")

## Step 5: Dataset Splitting & Storage

In [None]:
# Split the validated pairs into train and test sets using the dataset config
# (split ratio, optional shuffle, and seed).
#
# Guard first: with no valid pairs the percentage report below would divide
# by zero, and there is nothing meaningful to split or save.
if not valid_pairs:
    raise ValueError(
        "No valid Q/A pairs available to split; "
        "run the generation and quality-check steps first"
    )

print("Splitting dataset...")
train_pairs, test_pairs = split_dataset(
    all_pairs=valid_pairs,
    train_split=dataset_config['train_split'],
    shuffle=dataset_config['shuffle_before_split'],
    seed=dataset_config['seed']
)

# Report the resulting sizes as absolute counts and as a share of all valid pairs.
total_valid = len(valid_pairs)
print("✓ Dataset split:")
print(f"  Train set: {len(train_pairs)} pairs ({100*len(train_pairs)/total_valid:.1f}%)")
print(f"  Test set: {len(test_pairs)} pairs ({100*len(test_pairs)/total_valid:.1f}%)")

In [None]:
# Destination files live in the output directory; names come from the
# dataset section of the config.
train_file = output_path / dataset_config['train_file']
test_file = output_path / dataset_config['test_file']

# Write both splits via save_to_jsonl, train first, then test.
print("\nSaving datasets...")
for split_pairs, destination in ((train_pairs, train_file), (test_pairs, test_file)):
    save_to_jsonl(split_pairs, destination)

print("✓ Datasets saved:")
print(f"  Train: {train_file}")
print(f"  Test: {test_file}")

# Confirm the files are on disk by reporting their sizes (bytes / 1024).
print("\nFile verification:")
train_size_kb = train_file.stat().st_size / 1024
print(f"  Train file size: {train_size_kb:.1f} KB")
test_size_kb = test_file.stat().st_size / 1024
print(f"  Test file size: {test_size_kb:.1f} KB")

## Step 6: Verification & Summary

In [None]:
def _read_first_records(jsonl_path, limit=3):
    """Return up to `limit` JSON records from the start of a JSONL file.

    Lines are read lazily, so only the first `limit` lines are pulled from
    the file instead of loading it whole.

    Args:
        jsonl_path: Path of the JSONL file to read (one JSON document per line).
        limit: Maximum number of records to return.

    Returns:
        A list with the parsed records, shorter than `limit` if the file has
        fewer lines.

    Raises:
        json.JSONDecodeError: If one of the inspected lines is not valid JSON.
    """
    with open(jsonl_path, 'r', encoding='utf-8') as handle:
        # zip stops once range(limit) is exhausted, without reading further.
        return [json.loads(line) for _, line in zip(range(limit), handle)]


def _print_samples(samples):
    """Print the question (first 80 characters) and category of each sample."""
    for i, sample in enumerate(samples, 1):
        print(f"  {i}. Q: {sample['question'][:80]}...")
        print(f"     Category: {sample.get('category', 'N/A')}")


# Read a few records back from each saved file to confirm they parse.
print("Verifying saved files...\n")

# Check train file
train_samples = _read_first_records(train_file)
print("Train file samples (first 3):")
_print_samples(train_samples)

# Check test file
test_samples = _read_first_records(test_file)
print("\nTest file samples (first 3):")
_print_samples(test_samples)

# Final pipeline summary.
print("\n" + "=" * 60)
print("✓ DATA FACTORY PIPELINE COMPLETE!")
print("=" * 60)
print("\nSummary:")
print(f"  • PDF processed: {pdf_filename}")
print(f"  • Chunks created: {len(chunks)}")
print(f"  • Total Q/A pairs: {len(valid_pairs)}")
print(f"  • Train set: {len(train_pairs)} pairs")
print(f"  • Test set: {len(test_pairs)} pairs")
print(f"  • Output directory: {output_path}")