# IoS-001 Asset Backfill - FjordHQ

**STIG-2025-001 Directive Compliant**

Denne notebooken henter prisdata for 500+ assets med:
- Rate limiting optimalisert for Colab
- Checkpoint/resume ved disconnect
- Iron Curtain compliance (IoS-001 §4.1)
- 3+ års historie for StatArb

## Pools:
- **A**: Top ETFs (SPY, QQQ, etc.)
- **B**: Mag7 + Sector Leaders
- **C**: Crypto Top 25
- **D**: S&P 500 Extended
- **E**: FX Majors
- **F**: Oslo Børs
- **G**: European Majors

## 1. Setup

In [None]:
# Install dependencies (quiet mode)
!pip install yfinance pandas tqdm -q

# Mount Google Drive; checkpoints and CSV files are written under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Create the checkpoint folder, including its data/ subfolder
!mkdir -p "/content/drive/MyDrive/FjordHQ/ios001_backfill/data"

print("Setup fullført!")

## 2. Konfigurasjon

In [None]:
import os
import json
import time
import logging
from datetime import datetime, timedelta, date
from pathlib import Path
from typing import Dict, List, Optional
from dataclasses import dataclass, field

import pandas as pd
import yfinance as yf
from tqdm import tqdm

# Configuration
# Root folder on the mounted Google Drive; holds checkpoint.json and the data/ CSV folder
CHECKPOINT_DIR = "/content/drive/MyDrive/FjordHQ/ios001_backfill"

# Rate limits (tuned for Colab)
BATCH_SIZE = 5  # tickers processed between two long batch pauses
DELAY_BETWEEN_ASSETS = 8.0  # seconds
DELAY_BETWEEN_BATCHES = 180.0  # seconds (3 min)
MAX_RETRIES = 5  # download attempts per ticker in fetch_with_backoff
RETRY_BASE_DELAY = 30.0  # seconds; doubled for every further retry
RATE_LIMIT_BACKOFF = 600.0  # seconds (10 min), slept when an error looks like rate limiting

# History
MAX_HISTORY_YEARS = 10  # requested look-back window, in years

# Iron Curtain thresholds (IoS-001 §4.1), compared against the number of downloaded rows:
# fewer rows than *_QUARANTINE -> QUARANTINED, fewer than *_FULL_HISTORY -> SHORT_HISTORY,
# otherwise FULL_HISTORY. The CRYPTO_* pair is used for tickers classified as crypto.
EQUITY_FX_QUARANTINE = 252
EQUITY_FX_FULL_HISTORY = 1260
CRYPTO_QUARANTINE = 365
CRYPTO_FULL_HISTORY = 1825

print(f"Checkpoint dir: {CHECKPOINT_DIR}")
print(f"Rate limits: {DELAY_BETWEEN_ASSETS}s mellom assets, {DELAY_BETWEEN_BATCHES}s mellom batches")

## 3. Asset Pools (STIG-2025-001)

In [None]:
# Asset universe pools.
# Maps a pool id ("A".."G") to a display name and the list of ticker symbols
# that run_backfill passes to yfinance for that pool.
# NOTE(review): some symbols may no longer be available from Yahoo
# (e.g. PXD, MATIC-USD) — TODO confirm before a full run.
POOLS = {
    # Pool A: broad-market, sector and commodity ETFs
    "A": {
        "name": "Top ETFs & Index",
        "tickers": [
            "SPY", "QQQ", "IWM", "DIA", "XLF", "XLE", "XLK", "XLV",
            "XLI", "XLU", "XLP", "XLY", "XLB", "XLRE", "XLC",
            "VTI", "VOO", "VEA", "VWO", "BND", "GLD", "SLV", "USO"
        ]
    },
    # Pool B: mega-cap and sector-leading single stocks
    "B": {
        "name": "Mag7 + Sector Leaders",
        "tickers": [
            "AAPL", "MSFT", "GOOGL", "AMZN", "NVDA", "META", "TSLA",
            "JPM", "V", "MA", "JNJ", "UNH", "PG", "HD", "BAC",
            "XOM", "CVX", "PFE", "ABBV", "KO", "PEP", "MRK", "TMO",
            "COST", "WMT", "DIS", "NFLX", "ADBE", "CRM", "ORCL"
        ]
    },
    # Pool C: crypto pairs; the "-USD" suffix is what the backfill uses to
    # select the crypto Iron Curtain thresholds
    "C": {
        "name": "Crypto Top 25",
        "tickers": [
            "BTC-USD", "ETH-USD", "BNB-USD", "XRP-USD", "ADA-USD",
            "SOL-USD", "DOGE-USD", "DOT-USD", "MATIC-USD", "SHIB-USD",
            "LTC-USD", "TRX-USD", "AVAX-USD", "LINK-USD", "ATOM-USD",
            "UNI-USD", "XMR-USD", "ETC-USD", "XLM-USD", "BCH-USD",
            "ALGO-USD", "VET-USD", "FIL-USD", "ICP-USD", "AAVE-USD"
        ]
    },
    # Pool D: additional US large caps
    "D": {
        "name": "S&P 500 Extended",
        "tickers": [
            "GS", "MS", "C", "WFC", "AXP", "BLK", "SCHW", "CME",
            "AMD", "INTC", "QCOM", "TXN", "MU", "AMAT", "LRCX",
            "LLY", "BMY", "GILD", "AMGN", "REGN", "VRTX", "ISRG",
            "MCD", "SBUX", "NKE", "TGT", "LOW", "TJX", "ORLY",
            "CAT", "DE", "HON", "UPS", "RTX", "LMT", "GE",
            "COP", "SLB", "EOG", "PXD", "MPC", "VLO", "PSX"
        ]
    },
    # Pool E: FX pairs; "=" in these symbols is replaced with "_" in CSV file names
    "E": {
        "name": "FX Majors",
        "tickers": [
            "EURUSD=X", "GBPUSD=X", "USDJPY=X", "USDCHF=X",
            "AUDUSD=X", "USDCAD=X", "NZDUSD=X",
            "EURGBP=X", "EURJPY=X", "GBPJPY=X"
        ]
    },
    # Pool F: Oslo-listed stocks (".OL" suffix)
    "F": {
        "name": "Oslo Børs",
        "tickers": [
            "EQNR.OL", "DNB.OL", "TEL.OL", "MOWI.OL", "ORK.OL",
            "YAR.OL", "AKRBP.OL", "SALM.OL", "SUBC.OL", "FRO.OL",
            "AKER.OL", "STB.OL", "GOGL.OL", "BAKKA.OL", "KOG.OL"
        ]
    },
    # Pool G: European stocks with ".DE", ".PA" and ".L" suffixes
    "G": {
        "name": "European Majors",
        "tickers": [
            "SAP.DE", "SIE.DE", "ALV.DE", "DTE.DE", "BAS.DE",
            "BMW.DE", "MBG.DE", "VOW3.DE", "ADS.DE", "MUV2.DE",
            "MC.PA", "OR.PA", "SAN.PA", "AI.PA", "BNP.PA",
            "AIR.PA", "TTE.PA", "SU.PA", "CS.PA", "CAP.PA",
            "SHEL.L", "AZN.L", "HSBA.L", "BP.L", "GSK.L",
            "RIO.L", "ULVR.L", "DGE.L", "LLOY.L", "VOD.L"
        ]
    }
}

# Show each pool with its ticker count, followed by the overall total
divider = "-" * 40
pool_sizes = {pool_id: len(pool['tickers']) for pool_id, pool in POOLS.items()}

print("Asset Pools:")
print(divider)
for pool_id, count in pool_sizes.items():
    print(f"  {pool_id}: {POOLS[pool_id]['name']} ({count} tickers)")
print(divider)

total = sum(pool_sizes.values())
print(f"Total: {total} tickers")

## 4. Checkpoint System

In [None]:
def load_checkpoint():
    """Load the backfill checkpoint from CHECKPOINT_DIR/checkpoint.json.

    Returns:
        A dict that always contains the keys "completed_tickers",
        "failed_tickers" and "last_update". Keys missing from the stored
        file are filled with their empty defaults. If the file does not
        exist, or cannot be parsed as JSON (for example because a write was
        cut off), an empty checkpoint is returned instead of raising.
    """
    state = {
        "completed_tickers": [],
        "failed_tickers": [],
        "last_update": None
    }
    checkpoint_file = Path(CHECKPOINT_DIR) / "checkpoint.json"
    if not checkpoint_file.exists():
        return state

    try:
        with open(checkpoint_file, 'r') as f:
            stored = json.load(f)
    except json.JSONDecodeError as e:
        # A truncated file must not block the notebook; already saved CSVs
        # stay on disk and the affected tickers are simply fetched again.
        print(f"Checkpoint er korrupt ({e}); starter med tom checkpoint")
        return state

    # Only a JSON object is a usable checkpoint; anything else keeps the defaults
    if isinstance(stored, dict):
        state.update(stored)
    return state

def save_checkpoint(checkpoint):
    """Persist the checkpoint dict to CHECKPOINT_DIR/checkpoint.json.

    Args:
        checkpoint: The checkpoint dict. Its "last_update" entry is set to
            the current local time (ISO format) before it is written.

    The JSON is written to a temporary sibling file first and then moved
    over the real file, so an interrupted write cannot leave a half-written
    checkpoint.json behind.
    """
    checkpoint["last_update"] = datetime.now().isoformat()
    checkpoint_file = Path(CHECKPOINT_DIR) / "checkpoint.json"
    checkpoint_file.parent.mkdir(parents=True, exist_ok=True)

    tmp_file = checkpoint_file.with_name(checkpoint_file.name + ".tmp")
    with open(tmp_file, 'w') as f:
        json.dump(checkpoint, f, indent=2)
    # Replace the previous checkpoint in a single rename step
    tmp_file.replace(checkpoint_file)

def save_ticker_data(ticker, df):
    """Write one ticker's price data to CHECKPOINT_DIR/data/<ticker>.csv.

    Args:
        ticker: Ticker symbol; "/" and "=" are replaced with "_" to form the
            file name (e.g. "EURUSD=X" -> "EURUSD_X.csv").
        df: DataFrame to store; written with DataFrame.to_csv defaults.

    Returns:
        The Path of the written CSV file.
    """
    data_dir = Path(CHECKPOINT_DIR) / "data"
    # parents=True: also works when the checkpoint folder itself is missing
    data_dir.mkdir(parents=True, exist_ok=True)
    safe_ticker = ticker.replace("/", "_").replace("=", "_")
    filepath = data_dir / f"{safe_ticker}.csv"
    df.to_csv(filepath)
    return filepath

# Load the existing checkpoint so later cells can resume from it
checkpoint = load_checkpoint()
print(f"Checkpoint status:")
print(f"  Fullført: {len(checkpoint['completed_tickers'])}")
print(f"  Feilet: {len(checkpoint['failed_tickers'])}")
# "last_update" is present with value None on a fresh checkpoint, so a
# .get() default would never apply; fall back on any empty value instead.
print(f"  Sist oppdatert: {checkpoint.get('last_update') or 'Aldri'}")

## 5. Fetch-funksjon med Rate Limiting

In [None]:
def fetch_with_backoff(ticker, start_date, end_date):
    """Download daily price data for one ticker, retrying with backoff.

    Up to MAX_RETRIES attempts are made. Before every retry the function
    sleeps RETRY_BASE_DELAY seconds, doubled for each further retry. When an
    exception message looks like rate limiting, it additionally sleeps
    RATE_LIMIT_BACKOFF seconds, unless no attempt is left.

    Args:
        ticker: Symbol passed to yf.download.
        start_date: First date to request (anything with strftime).
        end_date: Last date to include; one day is added because the
            download call is given this value as its "end" bound.

    Returns:
        A DataFrame with single-level columns and no rows lacking a "Close"
        value, or None when every attempt failed or returned no usable data.
    """
    for attempt in range(MAX_RETRIES):
        try:
            if attempt > 0:
                delay = RETRY_BASE_DELAY * (2 ** (attempt - 1))
                print(f"  Retry {attempt + 1}/{MAX_RETRIES}, venter {delay:.0f}s...")
                time.sleep(delay)

            df = yf.download(
                ticker,
                start=start_date.strftime('%Y-%m-%d'),
                end=(end_date + timedelta(days=1)).strftime('%Y-%m-%d'),
                interval='1d',
                auto_adjust=False,
                progress=False,
                threads=False
            )

            if df is None or df.empty:
                print(f"  [{ticker}] Tom respons")
                continue

            # yf.download may return (field, ticker) MultiIndex columns even
            # for a single ticker. Keep only the field level so "Close" can be
            # addressed directly and the CSV gets a single header row.
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = df.columns.get_level_values(0)

            df = df.dropna(subset=['Close'])
            if df.empty:
                print(f"  [{ticker}] Ingen gyldige data etter cleaning")
                continue

            return df

        except Exception as e:
            error_str = str(e).lower()
            is_rate_limit = any(x in error_str for x in [
                "too many requests", "rate limit", "429", "throttle"
            ])

            if not is_rate_limit:
                print(f"  [{ticker}] Feil: {e}")
            elif attempt < MAX_RETRIES - 1:
                print(f"  [{ticker}] RATE LIMITED! Venter {RATE_LIMIT_BACKOFF}s...")
                time.sleep(RATE_LIMIT_BACKOFF)
            else:
                # No attempt left: waiting out the backoff would only delay giving up
                print(f"  [{ticker}] RATE LIMITED! Ingen flere forsøk.")

    return None

print("Fetch-funksjon klar!")

## 6. Hovedfunksjon - Kjør Backfill

In [None]:
def _select_tickers(pool_ids):
    """Return the de-duplicated tickers of the requested pools, in pool order."""
    if pool_ids:
        unknown = [pid for pid in pool_ids if pid not in POOLS]
        if unknown:
            print(f"Ukjente pools ignorert: {unknown}")
        pools = [POOLS[pid] for pid in pool_ids if pid in POOLS]
    else:
        pools = list(POOLS.values())

    tickers = [ticker for pool in pools for ticker in pool['tickers']]
    # dict.fromkeys drops duplicates while keeping a stable, repeatable order
    return list(dict.fromkeys(tickers))


def _iron_curtain_status(ticker, rows):
    """Classify a ticker by row count against the Iron Curtain thresholds."""
    if ticker.endswith("-USD"):
        quarantine, full = CRYPTO_QUARANTINE, CRYPTO_FULL_HISTORY
    else:
        quarantine, full = EQUITY_FX_QUARANTINE, EQUITY_FX_FULL_HISTORY

    if rows < quarantine:
        return "QUARANTINED"
    if rows < full:
        return "SHORT_HISTORY"
    return "FULL_HISTORY"


def run_backfill(pool_ids=None, resume=True):
    """Download and store price history for the selected pools.

    Each pending ticker is fetched with fetch_with_backoff, written to CSV
    with save_ticker_data and recorded in the global checkpoint, which is
    saved after every ticker. Between tickers the function sleeps
    DELAY_BETWEEN_ASSETS seconds, and DELAY_BETWEEN_BATCHES seconds after
    every BATCH_SIZE tickers; no pause follows the last ticker.

    Args:
        pool_ids: Iterable of pool ids from POOLS; None or empty selects all
            pools. Unknown ids are reported and skipped.
        resume: When True, tickers listed as completed in the checkpoint are
            skipped. When False, the global checkpoint is reset and every
            selected ticker is processed.

    Returns:
        A dict with the counters "processed", "success", "failed" and
        "total_rows" (also when interrupted with Ctrl-C / stop), or None if
        there was nothing to process.
    """
    global checkpoint

    # Determine tickers
    tickers = _select_tickers(pool_ids)

    # Filter out tickers that are already done
    if resume:
        completed = set(checkpoint['completed_tickers'])
        pending = [t for t in tickers if t not in completed]
    else:
        checkpoint = {"completed_tickers": [], "failed_tickers": [], "last_update": None}
        pending = tickers

    print("=" * 60)
    print("IoS-001 BACKFILL - STIG-2025-001")
    print("=" * 60)
    print(f"Pools: {pool_ids or 'ALL'}")
    print(f"Total tickers: {len(tickers)}")
    # Count only completed tickers within this selection so the three numbers add up
    print(f"Allerede fullført: {len(tickers) - len(pending)}")
    print(f"Gjenstår: {len(pending)}")
    print("=" * 60)

    if not pending:
        print("Ingen tickers å prosessere!")
        return

    # Date range: up to yesterday, MAX_HISTORY_YEARS back
    end_date = date.today() - timedelta(days=1)
    start_date = date.today() - timedelta(days=MAX_HISTORY_YEARS * 365)

    results = {
        "processed": 0,
        "success": 0,
        "failed": 0,
        "total_rows": 0
    }

    batch_count = 0
    last_index = len(pending) - 1

    # The outer try also covers the throttle sleeps, where an interrupt is
    # most likely to arrive.
    try:
        for i, ticker in enumerate(tqdm(pending, desc="Backfill")):
            results["processed"] += 1

            try:
                print(f"\n[{i+1}/{len(pending)}] {ticker}...")

                df = fetch_with_backoff(ticker, start_date, end_date)

                if df is None or df.empty:
                    print(f"  FEILET: Ingen data")
                    results["failed"] += 1
                    if ticker not in checkpoint["failed_tickers"]:
                        checkpoint["failed_tickers"].append(ticker)
                else:
                    # Store as CSV
                    save_ticker_data(ticker, df)
                    rows = len(df)
                    results["total_rows"] += rows

                    status = _iron_curtain_status(ticker, rows)
                    print(f"  OK: {rows} rader, {status}")
                    results["success"] += 1

                    if ticker not in checkpoint["completed_tickers"]:
                        checkpoint["completed_tickers"].append(ticker)
                    if ticker in checkpoint["failed_tickers"]:
                        checkpoint["failed_tickers"].remove(ticker)

                # Save the checkpoint after every ticker
                save_checkpoint(checkpoint)

            except Exception as e:
                print(f"  ERROR: {e}")
                results["failed"] += 1
                # Remember the failure in the in-memory checkpoint; it is
                # written out by the next successful save_checkpoint call.
                if (ticker not in checkpoint["completed_tickers"]
                        and ticker not in checkpoint["failed_tickers"]):
                    checkpoint["failed_tickers"].append(ticker)

            # Rate limiting (pointless after the final ticker)
            if i < last_index:
                batch_count += 1
                time.sleep(DELAY_BETWEEN_ASSETS)

                if batch_count >= BATCH_SIZE:
                    batch_count = 0
                    print(f"\nBatch pause: {DELAY_BETWEEN_BATCHES}s...")
                    time.sleep(DELAY_BETWEEN_BATCHES)

    except KeyboardInterrupt:
        print("\n\nAvbrutt! Checkpoint lagret.")
        save_checkpoint(checkpoint)
        return results

    # Summary
    print("\n" + "=" * 60)
    print("FULLFØRT")
    print("=" * 60)
    print(f"Prosessert: {results['processed']}")
    print(f"Suksess: {results['success']}")
    print(f"Feilet: {results['failed']}")
    print(f"Totalt rader: {results['total_rows']}")

    return results

## 7. Kjør Pool A (ETFs) - START HER

In [None]:
# Run the backfill for pool A; with resume=True, tickers already completed in the checkpoint are skipped
results_a = run_backfill(pool_ids=["A"], resume=True)

## 8. Kjør Pool B (Mag7)

In [None]:
# Run the backfill for pool B; with resume=True, tickers already completed in the checkpoint are skipped
results_b = run_backfill(pool_ids=["B"], resume=True)

## 9. Kjør Pool C (Crypto)

In [None]:
# Run the backfill for pool C; with resume=True, tickers already completed in the checkpoint are skipped
results_c = run_backfill(pool_ids=["C"], resume=True)

## 10. Kjør Alle Gjenværende Pools

In [None]:
# Run the backfill for pools D, E, F and G in one call; completed tickers are skipped (resume=True)
results_rest = run_backfill(pool_ids=["D", "E", "F", "G"], resume=True)

## 11. Generer Import Script for Lokal Database

In [None]:
# Generate a standalone Python script that imports the downloaded CSV files
# into the local database, and store it next to the checkpoint on Drive.
data_dir = Path(CHECKPOINT_DIR) / "data"
csv_files = list(data_dir.glob("*.csv"))

print(f"Fant {len(csv_files)} CSV-filer")

# The template is an f-string: only the generation timestamp and file count are
# filled in here; doubled braces are literal braces in the generated script.
# The text starts directly after the opening quotes so that the shebang is the
# first line of the generated file.
import_script = f'''#!/usr/bin/env python3
"""
IoS-001 IMPORT SCRIPT
Generert: {datetime.now().isoformat()}
Filer: {len(csv_files)}
"""
import pandas as pd
import psycopg2
from psycopg2.extras import execute_values
from pathlib import Path
from datetime import datetime

# Database config
DB_CONFIG = {{
    "host": "127.0.0.1",
    "port": "54322",
    "database": "postgres",
    "user": "postgres",
    "password": "postgres"
}}

DATA_DIR = Path("./ios001_data")  # Kopier CSV-filene hit

def import_csv(conn, filepath):
    # Reverse the file-name mapping used when the CSV was saved: a trailing
    # "_X" stands for the "=X" FX suffix, every other "_" becomes "-".
    stem = filepath.stem
    if stem.endswith("_X"):
        ticker = stem[:-2].replace("_", "-") + "=X"
    else:
        ticker = stem.replace("_", "-")
    df = pd.read_csv(filepath, index_col=0, parse_dates=True)

    insert_sql = """
        INSERT INTO fhq_data.price_series (
            listing_id, timestamp, vendor_id, frequency, price_type,
            open, high, low, close, volume, adj_close,
            source_id, is_verified
        ) VALUES %s
        ON CONFLICT (listing_id, timestamp, vendor_id) DO UPDATE SET
            open = EXCLUDED.open,
            high = EXCLUDED.high,
            low = EXCLUDED.low,
            close = EXCLUDED.close,
            volume = EXCLUDED.volume,
            adj_close = EXCLUDED.adj_close
    """

    values = []
    for idx, row in df.iterrows():
        ts = idx.tz_localize(None) if hasattr(idx, "tz_localize") and idx.tzinfo else idx
        values.append((
            ticker, ts, "YFINANCE_COLAB", "DAILY", "RAW",
            row.get("Open"), row.get("High"), row.get("Low"),
            row.get("Close"), row.get("Volume"),
            row.get("Adj Close", row.get("Close")),
            "COLAB_IMPORT", False
        ))

    with conn.cursor() as cur:
        execute_values(cur, insert_sql, values)
    conn.commit()
    return len(values)

if __name__ == "__main__":
    conn = psycopg2.connect(**DB_CONFIG)
    csv_files = list(DATA_DIR.glob("*.csv"))
    print(f"Importerer {{len(csv_files)}} filer...")

    total = 0
    try:
        for f in csv_files:
            try:
                rows = import_csv(conn, f)
                total += rows
                print(f"  {{f.stem}}: {{rows}} rader")
            except Exception as e:
                # A failed statement leaves the transaction aborted; roll back
                # so the remaining files can still be imported.
                conn.rollback()
                print(f"  {{f.stem}}: FEIL - {{e}}")
    finally:
        conn.close()
    print(f"Totalt: {{total}} rader importert")
'''

import_script_path = Path(CHECKPOINT_DIR) / "import_to_db.py"
with open(import_script_path, 'w') as f:
    f.write(import_script)

print(f"Import script generert: {import_script_path}")
print("\nInstruksjoner:")
print("1. Last ned CSV-filene fra Google Drive")
print("2. Kopier til ./ios001_data/")
print("3. Kjør: python import_to_db.py")

## 12. Sjekk Status

In [None]:
# Re-read the checkpoint so the report reflects what is stored on Drive
checkpoint = load_checkpoint()

# Classify every stored CSV file by its number of rows
data_dir = Path(CHECKPOINT_DIR) / "data"
csv_files = list(data_dir.glob("*.csv"))

status_counts = {"FULL_HISTORY": 0, "SHORT_HISTORY": 0, "QUARANTINED": 0}

for csv_file in csv_files:
    ticker = csv_file.stem
    df = pd.read_csv(csv_file)
    rows = len(df)

    # File names containing "-USD" use the crypto thresholds, all others the equity/FX ones
    is_crypto = "-USD" in ticker
    quarantine, full = (
        (CRYPTO_QUARANTINE, CRYPTO_FULL_HISTORY)
        if is_crypto
        else (EQUITY_FX_QUARANTINE, EQUITY_FX_FULL_HISTORY)
    )

    bucket = (
        "QUARANTINED" if rows < quarantine
        else "SHORT_HISTORY" if rows < full
        else "FULL_HISTORY"
    )
    status_counts[bucket] += 1

# Print the report
banner = "=" * 60
eligible = status_counts['FULL_HISTORY'] + status_counts['SHORT_HISTORY']

print(banner)
print("IoS-001 IRON CURTAIN STATUS")
print(banner)
print(f"Fullført tickers: {len(checkpoint['completed_tickers'])}")
print(f"Feilet tickers: {len(checkpoint['failed_tickers'])}")
print(f"CSV-filer: {len(csv_files)}")
print()
print("Data Quality Status:")
print(f"  FULL_HISTORY (5+ år): {status_counts['FULL_HISTORY']}")
print(f"  SHORT_HISTORY (1-5 år): {status_counts['SHORT_HISTORY']}")
print(f"  QUARANTINED (<1 år): {status_counts['QUARANTINED']}")
print()
print(f"IoS-003 Eligible: {eligible}")
print(banner)