<a href="https://colab.research.google.com/github/Jessietbl/aviation-scsirisk-showcase/blob/main/03_revenue_extraction_upsampling_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Revenue Extraction & Monthly Upsampling — Showcase Demo

This demo shows:
1) Parse quarterly revenue (AAX-style) from PDFs or a sample CSV
2) Shape-preserving monthly interpolation with **PCHIP**
3) Plot + save outputs

> The PDF parsing uses `src/revenue_utils.py`. For the showcase, we use a tiny CSV so it runs fast; you can switch to PDFs later.


In [None]:
# Imports
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# NOTE: `src/revenue_utils.py` is written by the `%%writefile` cell further
# down in this notebook; on a fresh runtime, run that cell before this import.
from src.revenue_utils import (
    harvest_from_pdfs,
    piecewise_monthly_pchip,
)

# Working folders; `outputs/` is created on the spot if it does not exist.
DATA_DIR = Path("data")
OUT_DIR  = Path("outputs"); OUT_DIR.mkdir(parents=True, exist_ok=True)


In [None]:
# Create the `src/` folder that the %%writefile cell below writes into
# (`-p` makes this a no-op when the folder already exists).
!mkdir -p src


In [None]:
%%writefile src/revenue_utils.py
from __future__ import annotations
import re
from pathlib import Path
from typing import Iterable, Optional, List, Tuple

import numpy as np
import pandas as pd

# Optional PDF text extraction (only used if you pass PDF paths)
def extract_text_pdf(path: str) -> str:
    """
    Read every page of the PDF at ``path`` and return the combined text.

    pdfplumber is imported lazily, so the rest of the module stays usable
    without it. Page texts are joined with newlines; a page for which
    pdfplumber returns no text contributes an empty string.

    Raises:
        ImportError: if pdfplumber is not installed.
    """
    try:
        import pdfplumber
    except ImportError as exc:
        raise ImportError("pdfplumber is required to read PDFs in this demo. "
                          "Install it or provide pre-extracted text/CSV.") from exc

    with pdfplumber.open(path) as document:
        page_texts = [page.extract_text() or "" for page in document.pages]
    return "\n".join(page_texts)


# --- Regex patterns (robust to AAX formats) ---
# Matches "Quarter ended" / "Year ended" (case-insensitive, optional colon)
# followed by a numeric dd/dd/dddd date; group 1 captures the date string.
DATE_PAT_DDMMYYYY = re.compile(
    r"(?:Quarter|Year)\s+ended\s*:?\s*(\d{2}/\d{2}/\d{4})",
    re.IGNORECASE
)
# Same prefix, but the date is written as "<1-2 digits> <word> <4 digits>",
# e.g. "31 March 2020"; group 1 captures the date string.
DATE_PAT_WORD = re.compile(
    r"(?:Quarter|Year)\s+ended\s*:?\s*([0-9]{1,2}\s+\w+\s+\d{4})",
    re.IGNORECASE
)

# "Revenue" followed by optional footnote digits, then first large number.
# Group 1 captures either a comma-grouped number (at least one ",ddd" group)
# or a bare run of four or more digits.
REV_PAT = re.compile(
    r"Revenue(?:\s+\d+)*\s+((?:\d{1,3}(?:,\d{3})+|\d{4,}))",
    re.IGNORECASE
)


def parse_quarter_end(text: str) -> Optional[pd.Timestamp]:
    """
    Parse 'Quarter ended DD/MM/YYYY' or 'Quarter ended 31 March 2020'.

    The numeric pattern is tried first, then the spelled-out one. The first
    match that converts to a valid date wins.

    Args:
        text: Raw report text; ``None`` or an empty string is tolerated.

    Returns:
        The parsed date as a ``pd.Timestamp``, or ``None`` when no pattern
        matches or the matched string is not a valid date (e.g. 31/02/2020).
    """
    haystack = text or ""
    # Each pattern is paired with the to_datetime options that suit it.
    attempts = (
        (DATE_PAT_DDMMYYYY, {"format": "%d/%m/%Y"}),
        (DATE_PAT_WORD, {"dayfirst": True}),
    )
    for pattern, options in attempts:
        m = pattern.search(haystack)
        if not m:
            continue
        parsed = pd.to_datetime(m.group(1), errors="coerce", **options)
        # errors="coerce" turns an unparseable string into NaT; treat that as
        # "not found" so callers only ever see a real Timestamp or None.
        if pd.notna(parsed):
            return parsed
    return None


def parse_revenue_rm000(text: str) -> Optional[int]:
    """
    Pull the first revenue figure out of ``text``.

    The number is returned exactly as printed (RM thousands), with thousands
    separators stripped; multiply by 1000 for RM. Returns ``None`` when the
    revenue pattern does not match.
    """
    match = REV_PAT.search(text or "")
    return int(match.group(1).replace(",", "")) if match else None


def harvest_from_pdfs(pdf_paths: Iterable[str]) -> pd.DataFrame:
    """
    Extract (quarter_end, revenue_RM000) from a list of PDF files.

    Paths are de-duplicated and processed in sorted order. A file that cannot
    be read, or from which either the quarter-end date or the revenue figure
    cannot be parsed, is skipped; all skipped file names are reported in a
    single warning instead of being dropped silently.

    Args:
        pdf_paths: Paths (str or path-like) of the PDFs to scan.

    Returns:
        A DataFrame sorted by ``quarter_end`` with columns ``file``,
        ``quarter_end``, ``revenue_RM000``, ``calendar_year``,
        ``q_seq_in_year`` (1-based position within the calendar year) and
        ``revenue_RM``. The same columns are present when nothing could be
        extracted (the frame is then empty).

    Raises:
        ImportError: if the PDF backend is missing; that affects every file,
            so it is not treated as a per-file failure.
    """
    import warnings

    rows, miss = [], []
    for p in sorted(set(map(str, pdf_paths))):
        name = Path(p).name
        try:
            t = extract_text_pdf(p)
            qe = parse_quarter_end(t)
            rev = parse_revenue_rm000(t)
        except ImportError:
            raise
        except Exception:
            # Best effort: one unreadable PDF must not abort the whole batch.
            miss.append(name)
            continue
        # pd.isna also catches NaT, which a coerced date parse can produce.
        if qe is None or pd.isna(qe) or rev is None:
            miss.append(name)
            continue
        rows.append({"file": name, "quarter_end": qe, "revenue_RM000": rev})

    if miss:
        warnings.warn(
            f"Could not extract quarter end and revenue from {len(miss)} file(s): "
            + ", ".join(miss),
            stacklevel=2,
        )

    # Explicit columns and dtypes keep the pipeline working when `rows` is empty.
    df = pd.DataFrame(rows, columns=["file", "quarter_end", "revenue_RM000"])
    df["quarter_end"] = pd.to_datetime(df["quarter_end"])
    df["revenue_RM000"] = df["revenue_RM000"].astype("int64")
    df = df.sort_values("quarter_end", kind="stable").reset_index(drop=True)

    df["calendar_year"] = df["quarter_end"].dt.year
    # Rows are already date-sorted, so the running count within each year is
    # the quarter's sequence number.
    df["q_seq_in_year"] = df.groupby("calendar_year").cumcount() + 1
    df["revenue_RM"] = df["revenue_RM000"] * 1000

    return df


# ---------- Piecewise monthly interpolation (shape-preserving) ----------
def _months_from_start(dts: pd.Series, start_m: pd.Timestamp) -> np.ndarray:
    """Return the whole-month offset of each date in ``dts`` from ``start_m`` (days are ignored)."""
    return ((dts.dt.year - start_m.year) * 12 + (dts.dt.month - start_m.month)).astype(int).to_numpy()


def piecewise_monthly_pchip(qdf: pd.DataFrame, value_col: str = "revenue_RM", gap_days: int = 150) -> pd.DataFrame:
    """
    Interpolate monthly values within contiguous segments using PCHIP.
    Large gaps (e.g., missing a year) are NOT bridged.
    Requires SciPy for PchipInterpolator.

    Args:
        qdf: Quarterly data with a ``quarter_end`` column (datetimes or
            date strings) and the column named by ``value_col``. Row order and
            index labels do not matter. Rows missing either field are ignored.
        value_col: Name of the numeric column to interpolate.
        gap_days: Two consecutive observations more than this many days apart
            start a new segment.

    Returns:
        A DataFrame with columns ``month_end`` and ``f"{value_col}_monthly"``,
        one row per month end covered by a segment of at least two
        observations, sorted by ``month_end``. Empty (same columns) when no
        segment qualifies.

    Raises:
        ImportError: if SciPy is not installed.
    """
    try:
        from scipy.interpolate import PchipInterpolator
    except ImportError as e:
        raise ImportError("scipy is required for PCHIP interpolation. Install scipy or replace with linear.") from e

    out_cols = ["month_end", f"{value_col}_monthly"]

    qdf = qdf.copy()
    # Accept string dates (e.g. straight from a CSV) as well as datetimes.
    qdf["quarter_end"] = pd.to_datetime(qdf["quarter_end"])
    qdf = qdf.dropna(subset=["quarter_end", value_col])
    # Reset the index so positions and labels agree; the segment boundaries
    # below are positional.
    qdf = qdf.sort_values("quarter_end").reset_index(drop=True)

    # Segment on big gaps: a row whose distance to the previous row exceeds
    # gap_days opens a new segment (the first row's NaN gap compares False).
    is_break = (qdf["quarter_end"].diff().dt.days > gap_days).to_numpy()
    bounds = [0, *np.flatnonzero(is_break).tolist(), len(qdf)]

    monthly_parts = []
    for lo, hi in zip(bounds[:-1], bounds[1:]):
        seg = qdf.iloc[lo:hi]
        if len(seg) < 2:
            # A lone observation cannot be interpolated.
            continue

        # Snap the first/last observation to its month end (MonthEnd(0) leaves
        # a date that already is a month end untouched).
        start_m = seg["quarter_end"].iloc[0].normalize() + pd.offsets.MonthEnd(0)
        end_m = seg["quarter_end"].iloc[-1].normalize() + pd.offsets.MonthEnd(0)
        # An offset object is used instead of the "M" alias, which newer
        # pandas versions deprecate for date_range.
        m_idx = pd.date_range(start=start_m, end=end_m, freq=pd.offsets.MonthEnd())

        x_q = _months_from_start(seg["quarter_end"], start_m)
        y_q = seg[value_col].astype(float).to_numpy()
        x_m = np.arange(len(m_idx))

        y_m = PchipInterpolator(x_q, y_q)(x_m)
        monthly_parts.append(pd.DataFrame({"month_end": m_idx, out_cols[1]: y_m}))

    if not monthly_parts:
        return pd.DataFrame(columns=out_cols)
    mdf = pd.concat(monthly_parts, axis=0).drop_duplicates("month_end").sort_values("month_end")
    return mdf.reset_index(drop=True)


In [None]:
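# ---- Parse real PDFs (uncomment & provide paths) ----
# NOTE: the cells below expect a DataFrame named `df` with `quarter_end` and
# `revenue_RM` columns. No other cell shown here defines it, so either
# uncomment this block or load your own quarterly data into `df` first.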
# pdfs = list(Path("inputs").glob("*.pdf"))
# df = harvest_from_pdfs(pdfs)
# df.to_csv(OUT_DIR / "quarterly_revenue_unified.csv", index=False)
# df.head()


In [None]:
# Monthly interpolation (piecewise PCHIP)
# `df` must already hold the quarterly data (columns `quarter_end` and
# `revenue_RM`); observations more than 150 days apart are not bridged.
mdf = piecewise_monthly_pchip(df, value_col="revenue_RM", gap_days=150)
# Persist the monthly series next to the other outputs.
mdf.to_csv(OUT_DIR / "monthly_revenue_piecewise_pchip.csv", index=False)
# Last expression of the cell: the notebook displays the first rows.
mdf.head()


In [None]:
# Plot: quarterly observations, with the monthly PCHIP curve overlaid when available
plt.figure(figsize=(11, 5))
plt.plot(df["quarter_end"], df["revenue_RM"], "o-", label="Quarterly (RM)")
if not mdf.empty:
    plt.plot(mdf["month_end"], mdf["revenue_RM_monthly"], "--", label="Monthly (PCHIP)")

# Axis decoration
plt.xlabel("Date")
plt.ylabel("RM")
plt.title("Revenue: Quarterly + Piecewise Monthly Interpolation")
plt.grid(True)

# The legend is built after both series exist; layout is tightened last.
plt.legend()
plt.tight_layout()
plt.show()
