In [21]:
import pandas as pd
import re
from pathlib import Path

# ---------- CONFIG ----------
# Point this to your file. Works for .xlsx or .csv
DATA_PATH = Path(r"C:\Users\kfq6\Documents\Data\labs_3m_windows_avg.csv")
TOP_N = 25
OUTPUT_CSV = DATA_PATH.with_name("labs_3m_windows_analyte_availability.csv")

# Columns that are NOT analytes
META_COLS = {
    'DW_EK_Borger', 'window_idx', 'window_start', 'window_end',
    'first_sample_in_window', 'last_sample_in_window', 'n_samples_in_window'
}

In [22]:
# ---------- HELPERS ----------
def read_table(path: Path) -> pd.DataFrame:
    """Read Excel or CSV with sensible defaults."""
    if path.suffix.lower() in {'.xlsx', '.xls'}:
        return pd.read_excel(path, engine="openpyxl")
    elif path.suffix.lower() == '.csv':
        # If your CSV has semicolons or commas as decimal separators,
        # tweak sep and decimal here.
        return pd.read_csv(path)
    else:
        raise ValueError(f"Unsupported file extension: {path.suffix}")

def decode_excel_escapes(s: str) -> str:
    """
    Clean up Excel-style escapes like _x0020_ (space), _x002F_ (/), _x00B5_ (µ),
    and remove noisy superscript fragments seen in some LABKA exports.
    """
    if not isinstance(s, str):
        return s

    # Common direct replacements
    replacements = {
        "_x0020_": " ",
        "_x002F_": "/",
        "_x00B5_": "µ",
        "_x003A_": ":",      # colon
        "_x002C_": ",",      # comma
        "_x0028_": "(",      # (
        "_x0029_": ")",      # )
        "_x005B_": "[",      # [
        "_x005D_": "]",      # ]
        "_x003D_": "=",      # =
        "_x002B_": "+",      # +
        "_x003C_sup_x003E_": "",   # <sup> (strip)
        "_x003C__x002F_sup_x003E_": "",  # </sup> (strip)
    }
    for k, v in replacements.items():
        s = s.replace(k, v)

    # Remove duplicated encodings that sometimes appear like "__"
    s = re.sub(r"__+", "_", s)

    # Collapse multiple spaces
    s = re.sub(r"\s{2,}", " ", s).strip()

    # Drop trailing "_mean"
    s = re.sub(r"_mean$", "", s)

    return s

def prettify_columns(cols):
    return [decode_excel_escapes(c) for c in cols]

In [23]:
# ---------- LOAD ----------
labs = read_table(DATA_PATH)

# ---------- BASIC INFO ----------
print(f"Loaded: {DATA_PATH}")
print(f"Shape: {labs.shape[0]} rows × {labs.shape[1]} cols")
pd.set_option("display.max_colwidth", 120)
pd.set_option("display.width", 140)

Loaded: C:\Users\kfq6\Documents\Data\labs_3m_windows_avg.csv
Shape: 24209 rows × 1 cols


In [24]:
# ---------- IDENTIFY ANALYTE COLUMNS ----------
# If your file is fully pivoted (one column per analyte), the non-meta columns are analytes.
analysis_cols = [c for c in labs.columns if c not in META_COLS]

# Optional: if you want to be extra strict, keep only columns that look like measurements:
# analysis_cols = [c for c in analysis_cols if c.endswith("_mean") or c.startswith(("P-", "B-", "U-", "eGFR"))]

# Attempt to coerce analysis columns to numeric where possible (won't harm numeric cols; strings become NaN)
labs[analysis_cols] = labs[analysis_cols].apply(pd.to_numeric, errors="coerce")

# ---------- AVAILABILITY (COUNT + COVERAGE %) ----------
availability = labs[analysis_cols].notna().sum().sort_values(ascending=False)
coverage = (availability / len(labs) * 100)

summary = pd.DataFrame({
    "Test_raw": availability.index,
    "Count": availability.values,
    "Coverage_%": coverage.values
})

# Prettify test names
summary["Test"] = prettify_columns(summary["Test_raw"])
# If prettified names collide (rare), keep the raw too
summary = summary[["Test", "Test_raw", "Count", "Coverage_%"]].sort_values(by="Count", ascending=False)
summary["Coverage_%"] = summary["Coverage_%"].round(1)

# ---------- PRINT TOP N ----------
print("\nMost frequent analytes (top N):")
print(summary.head(TOP_N).to_string(index=False))


Most frequent analytes (top N):
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  Test                                                                                                                                                                                                                                                                                                 