In [None]:
# Optional: Preview first few publications
# Slicing keeps this safe when fewer than five (or zero) publications exist.
print("üìö Preview of first 5 publications:\n")
for i, pub in enumerate(filtered_publications[:5], 1):
    print(f"{i}. {pub['title']}")
    print(f"   Authors: {pub['creators']}")
    print(f"   Year: {pub['year']}, Region: {pub['region']}")
    # DOI and URL are stored as None when missing, so only print them when present.
    if pub['doi']:
        print(f"   DOI: {pub['doi']}")
    if pub['url']:
        print(f"   URL: {pub['url']}")
    print()

# Optional: Show a sample YAML entry
print("üìã Sample YAML entry (how it will appear in Jekyll):")
if filtered_publications:
    # sort_keys=False keeps the record's insertion order, matching the options used
    # for the exported YAML file, so the sample really looks like the exported data.
    print(yaml.dump([filtered_publications[0]], default_flow_style=False, allow_unicode=True, sort_keys=False))
else:
    # Indexing [0] on an empty list would raise IndexError.
    print("(no publications to preview)")


## Step 6: Deploy to GitHub Repository

**Instructions to integrate with your GitHub Pages site:**

1. **Copy the YAML file to your repo:**
   - Take the file `filtered_pifsc_publications.yml` generated above
   - Place it in: `_data/filtered_pifsc_publications.yml` in your GitHub repository
   - (Create the `_data` folder if it doesn't exist)

2. **Verify the Jekyll template:**
   - Your `_layouts/publications.html` expects the data at `site.data.filtered_pifsc_publications`
   - The YAML file will be automatically loaded by Jekyll with this key

3. **Push to GitHub:**
   - Commit and push the updated `_data/filtered_pifsc_publications.yml` file
   - GitHub Pages will rebuild and the publications will appear on your site

4. **Testing locally (optional):**
   - If you have Jekyll installed locally, run `bundle exec jekyll serve` in your repo root
   - Navigate to the publications page to verify rendering

**File mapping:**
- Your repo should have this structure:
  ```
  esd-arp-resources/
  ├── _data/
  │   └── filtered_pifsc_publications.yml  ← Place the file here
  ├── _layouts/
  │   └── publications.html
  └── ... (other files)
  ```

In [None]:
# Export to CSV (for easy spreadsheet viewing/editing)
csv_file = "filtered_pifsc_publications.csv"
with open(csv_file, "w", newline="", encoding="utf-8") as f:
    # With no records there are no field names to derive, so the file is left empty.
    if filtered_publications:
        # Column order follows the key order of the first record.
        writer = csv.DictWriter(f, fieldnames=list(filtered_publications[0].keys()))
        writer.writeheader()
        writer.writerows(filtered_publications)
print(f"‚úì CSV export: {csv_file}")

# Export to JSON (for debugging/reference)
json_file = "filtered_pifsc_publications.json"
with open(json_file, "w", encoding="utf-8") as f:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX escapes.
    json.dump(filtered_publications, f, indent=2, ensure_ascii=False)
print(f"‚úì JSON export: {json_file}")

# Export to YAML (for Jekyll/GitHub Pages)
# The Jekyll template expects: site.data.filtered_pifsc_publications
yaml_file = "filtered_pifsc_publications.yml"
with open(yaml_file, "w", encoding="utf-8") as f:
    # YAML format for Jekyll _data folder; sort_keys=False preserves the record key order.
    yaml.dump(filtered_publications, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
print(f"‚úì YAML export: {yaml_file}")

print(f"\nüìä Summary:")
print(f"  Total publications: {len(filtered_publications)}")
# Collect the known years once; min()/max() raise ValueError on an empty sequence,
# which happens when there are no publications or none of them has a year.
known_years = [pub['year'] for pub in filtered_publications if pub['year']]
if known_years:
    print(f"  Years covered: {min(known_years)} - {max(known_years)}")
else:
    print("  Years covered: n/a")


## Step 5: Export Data for GitHub Pages

Generate outputs in multiple formats: CSV for spreadsheets, JSON for debugging, and YAML for Jekyll integration.

In [None]:
# Item types that are not publications themselves. The Zotero collection "items"
# endpoint returns child items alongside their parents, so without this filter
# entries such as PDF attachments would be listed as publications.
NON_PUBLICATION_ITEM_TYPES = {"attachment", "note", "annotation"}


def _clean_text(value):
    """Return ``value`` stripped of surrounding whitespace, or None if missing/empty."""
    return (value or "").strip() or None


filtered_publications = []
duplicate_checker = set()

for entry in all_items:
    try:
        data = entry.get("data", {})

        # Skip attachments/notes: they carry titles but are not publications.
        if data.get("itemType") in NON_PUBLICATION_ITEM_TYPES:
            continue

        # Extract fields
        title = _clean_text(data.get("title"))
        if not title:
            continue  # Skip entries without title

        # Check for duplicates (by exact title)
        if title in duplicate_checker:
            continue
        duplicate_checker.add(title)

        # Build publication record; optional text fields become None when absent
        # or blank so the exports and the preview can test them for truthiness.
        pub = {
            "title": title,
            "creators": clean_creators(data.get("creators", [])),
            "year": extract_year(data.get("date", "")),
            "doi": _clean_text(data.get("DOI")),
            "issn": _clean_text(data.get("ISSN")),
            "url": _clean_text(data.get("url")),
            "region": assign_region(title),
            # Additional useful fields
            "item_type": data.get("itemType", ""),
            "publication_title": _clean_text(data.get("publicationTitle")),
        }

        filtered_publications.append(pub)

    except Exception as e:
        # Best effort: report the malformed entry and keep processing the rest.
        print(f"‚ö†Ô∏è Error processing entry: {e}")
        continue

# Sort by year (descending); records without a year sort last.
filtered_publications.sort(key=lambda x: x["year"] or 0, reverse=True)

print(f"\n‚úì Processed {len(filtered_publications)} publications")
print(f"‚úì Removed {len(all_items) - len(filtered_publications)} duplicates/invalid entries")
print(f"\nRegion distribution:")
regions = {}
for pub in filtered_publications:
    region = pub["region"]
    regions[region] = regions.get(region, 0) + 1
for region, count in sorted(regions.items()):
    print(f"  {region}: {count}")


## Step 4: Process and Filter Publications

Extract relevant fields from each publication and apply cleaning/enrichment.

In [None]:
def extract_year(date_str):
    """Return the first standalone 4-digit number found in ``date_str`` as an int.

    Falsy input (empty string, None, 0) and text without a standalone
    4-digit run both yield None. Non-string input is converted with ``str``.
    """
    found = re.search(r"\b\d{4}\b", str(date_str)) if date_str else None
    if found is None:
        return None
    return int(found.group(0))

def assign_region(title):
    """Assign a region label to a publication based on keywords in its title.

    Regions are checked in a fixed priority order and the first match wins:
    Hawaiian Archipelago, American Samoa, Mariana Archipelago, Pacific Remote
    Island Areas, then the 'Pacific-wide' catch-all.

    Args:
        title: Publication title (str). Falsy values are treated as missing.

    Returns:
        One of 'Hawaiian Archipelago', 'American Samoa', 'Mariana Archipelago',
        'Pacific Remote Island Areas', 'Pacific-wide' or 'Unknown'.
    """
    if not title:
        return 'Unknown'

    # Remove okina/apostrophe variants so spellings such as "Kaua\u02bbi" or
    # "O'ahu" match the plain keywords below.
    title_lower = re.sub("[\u02bb\u02bc\u2018\u2019'`]", "", title.lower())

    # (region, substring keywords, whole-word keywords), in priority order.
    # Short keywords that occur inside unrelated words ("rota" in "rotation",
    # "aua" in "kauai", "wake" in "awake") are matched as whole words only.
    region_rules = (
        ('Hawaiian Archipelago',
         ('hawai', 'kahekili', 'maui', 'ahu', 'northwestern',
          'papah\u0101naumoku\u0101kea', 'papahanaumokuakea', 'kauai', 'big island'),
         ()),
        ('American Samoa', ('samoa', 'swains'), ('aua',)),
        ('Mariana Archipelago', ('guam', 'mariana', 'saipan', 'tinian'), ('rota',)),
        ('Pacific Remote Island Areas',
         ('baker', 'howland', 'jarvis', 'palmyra', 'kingman', 'johnston'),
         ('wake',)),
        # Pacific-wide (catch-all for broad Pacific studies)
        ('Pacific-wide', ('pacific',), ()),
    )

    for region, substrings, whole_words in region_rules:
        if any(keyword in title_lower for keyword in substrings):
            return region
        if any(re.search(rf"\b{re.escape(word)}\b", title_lower) for word in whole_words):
            return region

    return 'Unknown'

def clean_creators(creators):
    """Format creator names from the API response into one display string.

    Each creator dict is rendered as "firstName lastName". Creators that only
    carry a single-field ``name`` (e.g. institutional authors) fall back to
    that value instead of being dropped. Missing or None name parts are
    treated as empty.

    Args:
        creators: List of creator dicts, or a falsy value when there are none.

    Returns:
        Names joined with "; ", or "" when no usable name is found.
    """
    if not creators:
        return ""
    names = []
    for creator in creators:
        first = (creator.get('firstName') or '').strip()
        last = (creator.get('lastName') or '').strip()
        full_name = f"{first} {last}".strip()
        if not full_name:
            # Single-field creators store their whole name under 'name'.
            full_name = (creator.get('name') or '').strip()
        if full_name:
            names.append(full_name)
    return "; ".join(names)

# Confirm the helper cell ran and summarise what each helper is for
print(
    "‚úì Helper functions defined",
    "  - extract_year: Extracts 4-digit years from dates",
    "  - assign_region: Classifies publications by geographic region",
    "  - clean_creators: Formats author names",
    sep="\n",
)


## Step 3: Clean and Transform Publication Data

Define helper functions to extract year, assign regions, and standardize the publication metadata.

In [None]:
def fetch_all_items(base_url, headers, batch_size=100):
    """
    Fetch all items from Zotero API with pagination support.

    Pages are requested until the API returns an empty page. If a request
    fails, the error is printed and the items collected so far are returned
    (best effort), so callers may receive a partial list.

    Args:
        base_url: Zotero API endpoint URL
        headers: Request headers with API key
        batch_size: Items per request (max 100)

    Returns:
        List of all items from the collection
    """
    all_items = []
    start = 0

    while True:
        params = {"format": "json", "limit": batch_size, "start": start}
        print(f"Fetching items {start} to {start + batch_size}...")

        # Only the network call and response decoding are guarded here.
        try:
            response = requests.get(base_url, headers=headers, params=params, timeout=10)
            response.raise_for_status()
            items = response.json()
        except requests.exceptions.RequestException as e:
            print(f"‚ùå Error fetching data: {e}")
            break

        if not items:  # No more items
            break

        all_items.extend(items)
        # Advance by the number of items actually received: the server may return
        # fewer than `batch_size` (e.g. when the limit is capped), and stepping by
        # `batch_size` would then skip the items in between.
        start += len(items)

        # Respect rate limits: a Backoff header asks the client to pause before
        # sending further requests.
        backoff = response.headers.get('Backoff')
        if backoff:
            try:
                backoff_seconds = int(backoff)
            except ValueError:
                # Malformed header value: ignore it rather than abort the fetch.
                backoff_seconds = 0
            if backoff_seconds > 0:
                print(f"Rate limited. Waiting {backoff_seconds} seconds...")
                time.sleep(backoff_seconds)

    return all_items

# Download every item in the configured collection
print("üîÑ Fetching publications from Zotero...")
all_items = fetch_all_items(BASE_URL, HEADERS)
print(f"‚úì Fetched {len(all_items)} publications")

# Keep an untouched copy of the API response on disk for reference
raw_json_text = json.dumps(all_items, indent=4)
with open("pifsc_arp_publications_raw.json", "w", encoding="utf-8") as raw_file:
    raw_file.write(raw_json_text)
print("‚úì Raw data saved to 'pifsc_arp_publications_raw.json'")


## Step 2: Fetch Publications from Zotero API

This function fetches all publications from your Zotero collection, handling pagination automatically.

In [None]:
# Zotero API Credentials
# Each value is read from an environment variable when it is set, so the real API
# key never has to be saved in the notebook. When a variable is not set, the
# placeholder/default below is used and must be edited before running.
GROUP_ID = os.environ.get("ZOTERO_GROUP_ID", "UPDATEGROUPID")
COLLECTION_KEY = os.environ.get("ZOTERO_COLLECTION_KEY", "VD8Z582Z")  # default if pulling all publications from group id
API_KEY = os.environ.get("ZOTERO_API_KEY", "UPDATEWITHAPIKEY")
BASE_URL = f"https://api.zotero.org/groups/{GROUP_ID}/collections/{COLLECTION_KEY}/items"

# The API key is sent as a request header; it is deliberately not printed below.
HEADERS = {
    "Zotero-API-Key": API_KEY,
    "Accept": "application/json",
}

print(f"‚úì Zotero API configured for collection: {COLLECTION_KEY}")
print(f"‚úì Base URL: {BASE_URL}")


## Step 1: Zotero API Configuration

Update the variables below with your Zotero credentials. Obtain your API key from: https://www.zotero.org/settings/keys

In [None]:
import csv
import json
import os
import re
import time
from pathlib import Path

import requests
import yaml

# Show the directory the kernel is running in before any files are written
working_dir = os.getcwd()
print("Current working directory:", working_dir)
# To use a different directory, uncomment and adapt the line below:
# os.chdir('C:/Users/YOUR_USER/Desktop/')


# Zotero Publications Data Pipeline for GitHub Pages

This notebook fetches publications from a Zotero collection, cleans and enriches the data with region information, and exports it as YAML for rendering on GitHub Pages via Jekyll.

**Workflow:**
1. Connect to Zotero API and fetch publications
2. Clean and standardize publication metadata
3. Assign regions based on title keywords
4. Export as YAML for `_data/filtered_pifsc_publications.yml` in your GitHub repo
5. The Jekyll template will load and render the data in the publications table

**Prerequisites:**
- Zotero API key (obtain from https://www.zotero.org/settings/keys)
- Group ID and Collection Key from your Zotero account
- Working directory set up for file output