# 08 - Incremental Update

This notebook performs an incremental update of the ITA Law document database:
1. Re-scrapes all case pages to detect new documents
2. Updates metadata for existing documents if changed
3. Downloads only new PDFs that don't exist locally

**Prerequisites:** You should have already run notebooks 01-05 at least once.

In [None]:
import pandas as pd
import json
from scraper.incremental import (
    load_existing_documents,
    scrape_case_documents,
    compare_documents,
    merge_updates,
    get_missing_pdfs,
    run_incremental_update
)
from doc_download.download_docs import parallel_download_pdfs

## 1. Load Existing Data

In [None]:
# Read the current document-level table, then report its size one metric at a
# time (each value is computed immediately before it is printed).
existing_df = load_existing_documents('data/unctad_document_level_data.csv')

n_existing_docs = len(existing_df)
print(f"Existing documents: {n_existing_docs:,}")

n_existing_cases = existing_df['arbitration_id'].nunique()
print(f"Existing cases: {n_existing_cases:,}")

n_unique_doc_ids = existing_df['doc_id'].nunique()
print(f"Unique doc_ids: {n_unique_doc_ids:,}")

## 2. Load Case URLs to Scrape

Get the unique case page URLs from the existing data.

In [None]:
# Build the case-level table used as scraper input: one row per distinct
# case-page URL, carrying only the case-level columns.
case_url_col = 'link_to_italaws_case_page'
case_cols = [
    'year_of_initiation',
    'short_case_name',
    'full_case_name',
    case_url_col,
    'respondent_state',
    'home_state_of_investor',
]
case_urls_df = (
    existing_df[case_cols]
    # keep the first row seen for each URL
    .drop_duplicates(subset=[case_url_col])
    # then discard the row (if any) whose URL is missing
    .loc[lambda frame: frame[case_url_col].notna()]
)
print(f"Case URLs to scrape: {len(case_urls_df):,}")

## 3. Test on Small Subset (Optional)

Before running the full scrape, test on a small subset to verify the logic works.

In [None]:
# Dry run: scrape only the first 5 case pages, with a shorter delay range
# than the full run uses.
test_df = case_urls_df.head(5).copy()
print(f"Testing with {len(test_df)} cases...")

test_cases = scrape_case_documents(test_df, delay_range=(0.5, 1.0))

# One summary line per scraped case: its short name and how many documents
# were found (a case without a 'documents' entry counts as zero).
for scraped_case in test_cases:
    found_docs = scraped_case.get('documents', [])
    case_label = scraped_case.get('short_case_name')
    print(f"  {case_label}: {len(found_docs)} documents")

In [None]:
# Diff the test scrape against the existing table and report the size of
# each bucket returned by compare_documents.
comparison = compare_documents(existing_df, test_cases)

bucket_labels = (
    ("New documents", 'new'),
    ("Updated documents", 'updated'),
    ("Unchanged documents", 'unchanged'),
)
for bucket_label, bucket_key in bucket_labels:
    print(f"{bucket_label}: {len(comparison[bucket_key])}")

## 4. Full Incremental Scrape

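**Warning:** This will scrape all ~1,300 case pages. With polite delays, expect ~25-30 minutes. Note that `output_csv` is the same path as `existing_csv`, so the run updates `data/unctad_document_level_data.csv` in place — back it up first if you want to keep the previous version.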

In [None]:
# Uncomment to run full scrape
#
# NOTE(review): output_csv below is the same path as existing_csv, so this
# presumably overwrites the existing CSV in place — confirm against
# run_incremental_update and back the file up first if the old version matters.
# The download section further down refers to this return value as
# result['missing_pdfs'].
# result = run_incremental_update(
#     existing_csv='data/unctad_document_level_data.csv',
#     case_urls_df=case_urls_df,
#     output_csv='data/unctad_document_level_data.csv',
#     delay_range=(0.5, 1.5),
#     documents_dir='documents'
# )

### Or run step-by-step for more control:

In [None]:
# Step 1: Scrape all case pages
# scraped_cases = scrape_case_documents(case_urls_df, delay_range=(0.5, 1.5))

In [None]:
# Step 2: Compare documents
# comparison = compare_documents(existing_df, scraped_cases)
# print(f"New documents: {len(comparison['new'])}")
# print(f"Updated metadata: {len(comparison['updated'])}")
# print(f"Unchanged: {len(comparison['unchanged'])}")
# print(f"New cases: {len(comparison['new_cases'])}")

In [None]:
# Step 3: Merge updates
# updated_df = merge_updates(existing_df, comparison)
# print(f"Total documents: {len(updated_df)}")

In [None]:
# Step 4: Save updated data
# NOTE: this is the same path the existing data was loaded from in section 1,
# so uncommenting it replaces that file with updated_df.
# updated_df.to_csv('data/unctad_document_level_data.csv', index=False)

## 5. Download New PDFs

Download only PDFs that don't exist in the documents folder.

In [None]:
# Re-read the document table from disk and work out which PDFs are absent
# from the local documents folder.
# (Alternatively, use result['missing_pdfs'] from the full-run cell above.)
updated_df = pd.read_csv('data/unctad_document_level_data.csv')
missing_pdfs = get_missing_pdfs(updated_df, documents_dir='documents')

n_missing_pdfs = len(missing_pdfs)
print(f"Documents needing download: {n_missing_pdfs:,}")

In [None]:
# Download missing PDFs.
#
# The download is opt-in so that Restart & Run All never starts a bulk
# download by accident. Set RUN_DOWNLOAD to True to actually fetch the files.
# With the flag off, the cell reports how many PDFs are pending instead of
# claiming that a download is in progress.
RUN_DOWNLOAD = False

n_pending_pdfs = len(missing_pdfs)
if n_pending_pdfs == 0:
    print("No new PDFs to download.")
elif not RUN_DOWNLOAD:
    print(
        f"{n_pending_pdfs:,} PDFs are missing locally; "
        "set RUN_DOWNLOAD = True in this cell to download them."
    )
else:
    print(f"Downloading {n_pending_pdfs} PDFs...")
    results = parallel_download_pdfs(missing_pdfs)

    # Save download results
    # NOTE(review): assumes `results` is JSON-serializable — confirm against
    # parallel_download_pdfs.
    with open('data/download_results_incremental.json', 'w') as f:
        json.dump(results, f, indent=2)

## 6. Summary

In [None]:
# Final report: re-read the CSV from disk so the numbers reflect what is
# actually saved, then print one metric at a time.
final_df = pd.read_csv('data/unctad_document_level_data.csv')

n_final_docs = len(final_df)
print(f"Total documents: {n_final_docs:,}")

n_final_cases = final_df['arbitration_id'].nunique()
print(f"Total cases: {n_final_cases:,}")

# Series.count() counts the non-missing entries of the column.
n_with_pdf_link = final_df['doc_link'].count()
print(f"Documents with PDF links: {n_with_pdf_link:,}")