# ISO 3166-3 Newsletter Parser - Google Colab Edition

Parse ISO 3166-3 newsletter PDFs using Google Colab's free GPU!

**Setup Steps:**
1. Upload your 6 PDF files to this Colab
2. Enable GPU (only needed for Method 2, DeepSeek-OCR): Runtime → Change runtime type → GPU
3. Run all cells
4. Download the output JSON

**Methods Available:**
- Method 1: OpenAI GPT-4 Vision (Recommended - Fast & Accurate)
- Method 2: DeepSeek-OCR with GPU (Free but slower)
- Method 3: Tesseract OCR (Lightweight backup)

## Step 1: Install Dependencies

In [None]:
%%capture
# Install required packages
!pip install -q transformers torch pillow pdf2image PyPDF2 pytesseract openai accelerate
!apt-get install -y poppler-utils tesseract-ocr

print("✓ All dependencies installed!")

## Step 2: Upload Your PDF Files

Click the folder icon on the left sidebar, then upload your 6 PDF files:
- ISO-TC_46_iso_3166-3_nl_i-1en.pdf
- ISO-TC_46_iso_3166-3_nl_i-2en.pdf
- ... (through i-6en.pdf)

In [None]:
# Check uploaded files
import os
from pathlib import Path

# Collect every PDF in the current working directory (non-recursive).
# The list is kept in glob order; later cells sort it when iterating.
pdf_files = [*Path('.').glob('*.pdf')]

pdf_count = len(pdf_files)
print(f"Found {pdf_count} PDF files:")
for pdf_path in sorted(pdf_files):
    print(f"  - {pdf_path.name}")

# Nudge the user if nothing has been uploaded yet.
if not pdf_files:
    print("\n⚠️ No PDFs found! Please upload your files.")

## Step 3: Check GPU Availability

In [None]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, describe device 0.
gpu_ready = torch.cuda.is_available()

if not gpu_ready:
    print("⚠️ GPU not available. Go to Runtime → Change runtime type → GPU")
    print("   (You can still run with CPU but it will be slower)")
else:
    print("✓ GPU is available!")
    device_name = torch.cuda.get_device_name(0)
    print(f"  Device: {device_name}")
    # total_memory is divided by 1e9 to display it in GB.
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"  Memory: {total_gb:.1f} GB")

## Method 1: OpenAI GPT-4 Vision (Recommended)

**Pros:** Fast, accurate, no GPU needed

**Cost:** ~$0.20-$0.50 for all 6 PDFs

**Setup:** Enter your OpenAI API key below

In [None]:
# Set your OpenAI API key
import getpass

# Prompt without echoing the key. Pasted keys often carry stray whitespace or
# a trailing newline, so strip it before use.
OPENAI_API_KEY = getpass.getpass("Enter your OpenAI API key: ").strip()

# Fail here, with a clear message, rather than later inside the first API call.
if not OPENAI_API_KEY:
    raise ValueError("No API key entered. Re-run this cell and paste your OpenAI API key.")

# Also export the key through the process environment.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

print("✓ API key set!")

In [None]:
# OpenAI Parser Code
import json
import base64
import re
from datetime import datetime
from io import BytesIO
from openai import OpenAI
from pdf2image import convert_from_path

class OpenAIParser:
    """Extract structured ISO 3166-3 newsletter data from a PDF via the OpenAI chat API.

    Every page of the PDF is rendered to a JPEG, all pages are sent in one
    chat-completion request together with a schema prompt, and the JSON found
    in the reply is decoded into a dict.
    """

    # A Markdown code fence, with or without a "json" language tag.
    _FENCE_RE = re.compile(r'```(?:json)?\s*(.*?)\s*```', re.DOTALL | re.IGNORECASE)

    def __init__(self, api_key):
        """Create the OpenAI client used for all requests.

        Args:
            api_key: OpenAI API key passed to the client.
        """
        self.client = OpenAI(api_key=api_key)

    def image_to_base64(self, image):
        """Return *image* encoded as a base64 JPEG string.

        Args:
            image: A PIL-style image exposing ``mode``, ``convert`` and ``save``.

        Returns:
            The JPEG bytes, base64-encoded and decoded to ``str``.
        """
        buffered = BytesIO()
        # Normalise to RGB before saving as JPEG.
        if image.mode != 'RGB':
            image = image.convert('RGB')
        image.save(buffered, format="JPEG", quality=95)
        return base64.b64encode(buffered.getvalue()).decode('utf-8')

    @classmethod
    def _extract_json_text(cls, response_text):
        """Return the JSON portion of a model reply.

        Handles replies wrapped in a code fence (tagged ``json`` or untagged)
        and replies with prose around the JSON object.

        Raises:
            ValueError: If the reply is ``None``, empty or whitespace only.
        """
        text = (response_text or '').strip()
        if not text:
            raise ValueError("OpenAI returned an empty response")

        fenced = cls._FENCE_RE.search(text)
        if fenced:
            text = fenced.group(1).strip()

        # Drop any leading/trailing chatter around the outermost JSON object.
        if not text.startswith('{'):
            start, end = text.find('{'), text.rfind('}')
            if start != -1 and end > start:
                text = text[start:end + 1]
        return text

    def parse_pdf(self, pdf_path):
        """Parse one newsletter PDF into a dict.

        Args:
            pdf_path: Path of the PDF file to process.

        Returns:
            The decoded JSON object from the model, with ``source_file``,
            ``parsed_at`` and ``tokens_used`` keys added.

        Raises:
            ValueError: If the reply is empty, was cut off at the token limit,
                is not valid JSON (``json.JSONDecodeError`` is a subclass), or
                is not a JSON object.
        """
        print(f"\nProcessing: {os.path.basename(pdf_path)}")

        # Convert to images
        print("  Converting to images...")
        images = convert_from_path(pdf_path, dpi=200)
        print(f"  ✓ {len(images)} pages")

        # Prepare images for API: one data-URL image part per page.
        print("  Preparing for GPT-4...")
        image_urls = [
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{self.image_to_base64(img)}",
                    "detail": "high",
                },
            }
            for img in images
        ]

        # Call API
        print("  Calling OpenAI API...")
        prompt = '''Extract data from this ISO 3166-3 newsletter and return JSON:
{
  "newsletter": {"id": "I-X", "date_issued": "YYYY-MM-DD", "description": "..."},
  "entries": [{
    "former_country": {
      "name": "...", "alpha2": "XX", "alpha3": "XXX", "numeric": "NNN",
      "iso_3166_3_alpha4": "XXXX", "alternative_names": ["..."]
    },
    "validity_period": {"start": 1974, "end": 2002},
    "transition": {
      "type": "name_changed|merged|divided",
      "successors": [{"name": "...", "alpha2": "XX", "alpha3": "XXX", "numeric": "NNN"}]
    },
    "additional_notes": {
      "historical_context": "...",
      "implementation_notes": "...",
      "reason_for_change": "..."
    }
  }]
}
Extract alternative names from notes. Return ONLY valid JSON.'''

        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": [{"type": "text", "text": prompt}, *image_urls]}],
            max_tokens=4096,
            temperature=0
        )

        # Parse response
        choice = response.choices[0]
        if choice.finish_reason == 'length':
            # The reply was cut off at max_tokens, so the JSON is incomplete.
            raise ValueError("OpenAI response was truncated at the token limit; JSON is incomplete")

        data = json.loads(self._extract_json_text(choice.message.content))
        if not isinstance(data, dict):
            raise ValueError(f"Expected a JSON object from the model, got {type(data).__name__}")

        tokens_used = response.usage.total_tokens if response.usage else 0
        data['source_file'] = os.path.basename(pdf_path)
        data['parsed_at'] = datetime.now().isoformat()
        data['tokens_used'] = tokens_used

        print(f"  ✓ Done! ({tokens_used} tokens)")
        return data

print("✓ OpenAI parser ready!")

In [None]:
# Run OpenAI parsing: parse every uploaded PDF, then write the combined
# results (per-newsletter plus a flattened list of entries) to a JSON file.
parser = OpenAIParser(OPENAI_API_KEY)

all_results = []
total_tokens = 0

for pdf_file in sorted(pdf_files):
    # A failure on one PDF is reported and skipped so the rest of the batch
    # still runs.
    try:
        result = parser.parse_pdf(str(pdf_file))
    except Exception as e:
        print(f"  ✗ Error: {e}")
    else:
        all_results.append(result)
        total_tokens += result['tokens_used']

print(f"\n{'='*60}")
print(f"Completed! Processed {len(all_results)}/{len(pdf_files)} files")
print(f"Total tokens: {total_tokens:,}")
# Rough estimate only: applies a flat rate of $10 per million tokens.
print(f"Estimated cost: ${total_tokens * 0.00001:.4f}")
print(f"{'='*60}")

# Save results
output = {
    "metadata": {
        "title": "ISO 3166-3 Newsletter Data",
        "total_newsletters": len(all_results),
        "parsed_at": datetime.now().isoformat(),
        "parsing_method": "openai-gpt4-vision"
    },
    "newsletters": all_results
}

# Add flattened countries list.
# The model's output is not guaranteed to follow the requested schema, so
# missing or malformed "newsletter"/"entries" fields are tolerated here. A
# KeyError at this point would abort the cell before anything is saved and
# lose results that were already paid for.
all_countries = []
for newsletter in all_results:
    info = newsletter.get('newsletter')
    if not isinstance(info, dict):
        info = {}
    entries = newsletter.get('entries')
    if not isinstance(entries, list):
        entries = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        entry_copy = entry.copy()
        entry_copy['source_newsletter'] = {
            'id': info.get('id'),
            'date': info.get('date_issued')
        }
        all_countries.append(entry_copy)

output['all_countries'] = all_countries
output['metadata']['total_countries'] = len(all_countries)

# Save to file
with open('iso_3166_3_parsed.json', 'w', encoding='utf-8') as f:
    json.dump(output, f, indent=2, ensure_ascii=False)

print("\n✓ Saved to: iso_3166_3_parsed.json")
print(f"  Newsletters: {len(all_results)}")
print(f"  Countries: {len(all_countries)}")

## Step 4: Download Results

Your parsed JSON file is ready! Run the cell below to start the download (you can also find the file under the folder icon in the left sidebar).

In [None]:
# Display download link
from google.colab import files

# Trigger a browser download of the parsed JSON, provided a parsing cell has
# already written it to the working directory.
output_name = 'iso_3166_3_parsed.json'

if not os.path.exists(output_name):
    print("No output file found. Make sure you ran one of the parsing methods above.")
else:
    print("✓ Your file is ready!")
    print("\nClick below to download:")
    files.download(output_name)

## Step 5: Preview Results

In [None]:
# Show summary
def format_country_line(country):
    """Return the "[newsletter] name (alpha4)" summary line for one flattened entry.

    The entries come from model output, which may not follow the requested
    schema, so a missing, null or malformed field is shown as a placeholder
    instead of raising.

    Args:
        country: One item of the saved ``all_countries`` list.

    Returns:
        The formatted summary line.
    """
    former = country.get('former_country')
    if not isinstance(former, dict):
        former = {}
    source = country.get('source_newsletter')
    if not isinstance(source, dict):
        source = {}

    name = former.get('name') or 'Unknown'
    alpha4 = former.get('iso_3166_3_alpha4') or 'N/A'
    newsletter = source.get('id') or 'N/A'
    return f"[{newsletter}] {name} ({alpha4})"


if os.path.exists('iso_3166_3_parsed.json'):
    with open('iso_3166_3_parsed.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    print("="*60)
    print("PARSED RESULTS SUMMARY")
    print("="*60)
    print(f"Total Newsletters: {data['metadata']['total_newsletters']}")
    print(f"Total Countries: {data['metadata']['total_countries']}")
    print("\nExtracted Countries:")
    print("-"*60)

    for country in data['all_countries']:
        print(format_country_line(country))

    print("\n" + "="*60)
    print("✓ All data successfully extracted!")
    print("="*60)