## Set Environment Configurations and Variables 

In [None]:
# SET GLOBAL VARIABLES
import os
from pathlib import Path

# --- Settings read from the environment, each with a fallback default ---

# Cloud identifiers; all default to the empty string when unset.
WORKSPACE_BUCKET = os.environ.get("WORKSPACE_BUCKET", "").rstrip("/")  # trailing "/" removed
GOOGLE_PROJECT = os.environ.get("GOOGLE_PROJECT", "")
PET_SA_EMAIL = os.environ.get("PET_SA_EMAIL", "")

# Run parameters; PORTID and USE_MEM are parsed as integers.
outputFold = os.environ.get("outputFold", "mtDNA_v25_pilot_5")
PORTID = int(os.environ.get("PORTID", "8094"))
USE_MEM = int(os.environ.get("USE_MEM", "32"))
SQL_DB_NAME = os.environ.get("SQL_DB_NAME", "local_cromwell_run.db")

# Project directory, resolved to an absolute path.
PROJECT_ROOT = Path(
    os.environ.get("PROJECT_ROOT", "/mnt/f/research_drive/mtdna/leelab/mtDNA-analysis")
).resolve()

# Echo the effective configuration.
print("WORKSPACE_BUCKET:", WORKSPACE_BUCKET)
print("GOOGLE_PROJECT:", GOOGLE_PROJECT)
print("PET_SA_EMAIL:", PET_SA_EMAIL)
print("PROJECT_ROOT:", PROJECT_ROOT)
print("PORTID:", PORTID, "USE_MEM:", USE_MEM)


## Helper Functions 

In [None]:
import pandas as pd
import subprocess
import tempfile
from pathlib import Path

def _read_variant_tsv(path, cols):
    """Read a headerless tab-separated file; an empty file yields an empty frame."""
    try:
        # FORMAT/SAMPLE are forced to text so a purely numeric field is not
        # turned into a float before it can be split on ":".
        return pd.read_csv(
            path,
            sep="\t",
            header=None,
            names=cols,
            dtype={"FORMAT": str, "SAMPLE": str},
        )
    except pd.errors.EmptyDataError:
        return pd.DataFrame(columns=cols)


def _format_value(fmt, sample, key):
    """Return the SAMPLE field at the position of *key* in FORMAT, or None."""
    if not isinstance(fmt, str) or not isinstance(sample, str):
        return None  # missing cell (NaN)
    keys = fmt.split(":")
    if key not in keys:
        return None
    values = sample.split(":")
    idx = keys.index(key)
    return values[idx] if idx < len(values) else None


def load_one_tsv_gcs(tsv_gs_path):
    """Load one per-sample variant TSV into a DataFrame and derive AF columns.

    The file is read without a header row and its nine columns are named
    CHROM, POS, REF, ALT, QUAL, FILTER, DP, FORMAT, SAMPLE.

    Args:
        tsv_gs_path: Location of the TSV. A value containing "://" (for
            example a gs:// URI) is copied with ``gsutil cp`` into a temporary
            directory that is removed once the file has been read. Any other
            value is treated as a local path and read directly. The file name
            is expected to look like ``<sample>_<age>_<sex>_mt.vaf0.01.tsv``.

    Returns:
        pandas.DataFrame with the nine file columns plus:
          * ``sample_id``, ``age``, ``sex`` - the first three "_"-separated
            parts of the file name, as strings (None when a part is absent).
          * ``AF_raw``, ``DP_fmt`` - the AF / DP entries of SAMPLE, located via
            that row's own FORMAT string (None when unavailable).
          * ``AF`` - float; the first comma-separated value of ``AF_raw``,
            NaN when missing or not numeric (e.g. ".").
        An empty input file produces an empty frame with the same columns.

    Raises:
        subprocess.CalledProcessError: if ``gsutil cp`` exits non-zero.
    """
    cols = ["CHROM", "POS", "REF", "ALT", "QUAL", "FILTER", "DP", "FORMAT", "SAMPLE"]
    source = str(tsv_gs_path)
    fname = Path(source).name

    if "://" in source:
        # Argument list (no shell): the path is never interpreted by a shell,
        # so spaces or metacharacters in it are harmless.
        with tempfile.TemporaryDirectory() as tmpdir:
            local_tsv = Path(tmpdir) / fname
            subprocess.run(["gsutil", "cp", source, str(local_tsv)], check=True)
            df = _read_variant_tsv(local_tsv, cols)
    else:
        df = _read_variant_tsv(source, cols)

    # parse sample_id, age, sex from filename: <sample>_<age>_<sex>_mt.vaf0.01.tsv
    sample_id, age, sex = (fname.split("_") + [None, None])[:3]
    df["sample_id"] = sample_id
    df["age"] = age
    df["sex"] = sex

    # Parse FORMAT/SAMPLE row by row: rows are not assumed to share the key
    # order of the first row.
    pairs = list(zip(df["FORMAT"], df["SAMPLE"]))
    df["AF_raw"] = [_format_value(fmt, smp, "AF") for fmt, smp in pairs]
    df["DP_fmt"] = [_format_value(fmt, smp, "DP") for fmt, smp in pairs]

    # For multi-allelic sites AF can be comma-separated. Keep first AF for now.
    first_af = [
        raw.split(",")[0] if isinstance(raw, str) else None
        for raw in df["AF_raw"]
    ]
    df["AF"] = pd.to_numeric(
        pd.Series(first_af, index=df.index, dtype=object), errors="coerce"
    ).astype(float)

    return df


In [None]:
# Smoke test: load the TSV referenced by the row labelled 0 of `df_runs`
# (`df_runs` is defined outside this section) and preview the parsed frame.
# NOTE(review): the column is named "s3_tsv" but load_one_tsv_gcs copies with
# gsutil - confirm these paths are gs:// URIs.
one_tsv = df_runs.loc[0, "s3_tsv"]
df_one = load_one_tsv_gcs(one_tsv)
df_one.head()


In [None]:
# Load every run's variant table into a dict keyed by person_id.
# Keys are normalised to str so that string lookups such as
# sample_frames["1000093"] also work when person_id is stored as an integer.
sample_frames = {}
for _, row in df_runs.iterrows():
    tsv_path = row["s3_tsv"]
    sample_id = str(row["person_id"])
    sample_frames[sample_id] = load_one_tsv_gcs(tsv_path)

# example
sample_frames["1000093"].head()


In [None]:
import pandas as pd
import numpy as np

# Load each run's variant table, keyed by person_id as a string.
sample_frames = {
    str(person_id): load_one_tsv_gcs(tsv_path)
    for person_id, tsv_path in zip(df_runs["person_id"], df_runs["s3_tsv"])
}

# One summary row per sample that has at least one usable AF value.
summary_columns = [
    "sample_id",
    "age",
    "sex",
    "variant_count",
    "mean_AF",
    "max_AF",
    "median_AF",
]
summary_rows = []
for sample_id, df in sample_frames.items():
    af = pd.to_numeric(df["AF"], errors="coerce").dropna()
    if af.empty:
        # No numeric AF for this sample: nothing to summarise.
        continue

    summary_rows.append({
        "sample_id": sample_id,
        "age": df["age"].iloc[0],
        "sex": df["sex"].iloc[0],
        "variant_count": len(af),
        "mean_AF": af.mean(),
        "max_AF": af.max(),
        "median_AF": af.median(),
    })

# Passing the column list keeps the expected columns present even when no
# sample produced a row, so later cells can still select summary["age"] etc.
summary = pd.DataFrame(summary_rows, columns=summary_columns)
summary.head()


In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr

# Scatter of per-sample variant count against age.
# Ages come from file names as strings; plotted unconverted they would form a
# categorical axis, so a numeric copy is used here (summary is not modified).
plot_df = summary.assign(age=pd.to_numeric(summary["age"], errors="coerce"))

plt.figure(figsize=(8,5))
sns.scatterplot(data=plot_df, x="age", y="variant_count")
# Title and axis label now describe the column that is actually plotted.
plt.title("Variant Count by Age")
plt.ylabel("Variant Count")
plt.xlabel("Age")
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter of per-sample mean allele fraction against age, coloured by sex.
# Ages that cannot be parsed as numbers become NaN.
summary["age"] = pd.to_numeric(summary["age"], errors="coerce")

fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=summary, x="age", y="mean_AF", hue="sex", ax=ax)
ax.set_title("Mean Heteroplasmy (AF) by Age")
ax.set_ylabel("Mean AF")
ax.set_xlabel("Age")
plt.show()


In [None]:
# Scatter of per-sample variant count against age, coloured by sex.
plt.figure(figsize=(8,5))
sns.scatterplot(data=summary, x="age", y="variant_count", hue="sex")
# "\u2265" is the greater-than-or-equal sign; the escape keeps the title
# intact even if the file is re-saved with a different text encoding.
plt.title("Variant Count (AF\u22650.01) by Age")
plt.ylabel("Variant Count")
plt.xlabel("Age")
plt.show()


In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr

# Rank correlation between age and mean allele fraction, followed by a
# LOWESS-smoothed scatter of the same two columns.
summary["age"] = pd.to_numeric(summary["age"], errors="coerce")

# Spearman (robust to nonlinearity); pairs containing NaN are omitted
rho, p = spearmanr(
    summary["age"],
    summary["mean_AF"],
    nan_policy="omit",
)
print(f"Spearman rho={rho:.3f}, p={p:.3g}")

fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(data=summary, x="age", y="mean_AF", scatter=True, lowess=True, ax=ax)
ax.set_title("Mean Heteroplasmy vs Age (LOWESS)")
plt.show()


In [None]:
# LOWESS-smoothed scatter of each sample's maximum allele fraction against age.
fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(data=summary, x="age", y="max_AF", scatter=True, lowess=True, ax=ax)
ax.set_title("Max Heteroplasmy vs Age (LOWESS)")
plt.show()


In [None]:
# Box plot of mean allele fraction per age bin.
# Bin edges in years. include_lowest=True makes the first bin left-inclusive so
# that age 18 itself is binned rather than dropped; ages outside 18-120 still
# get no bin (NaN).
age_edges = [18,30,40,50,60,70,80,90,120]
summary["age_bin"] = pd.cut(
    # Coerce here as well so the cell does not rely on an earlier cell having
    # already converted the string ages to numbers.
    pd.to_numeric(summary["age"], errors="coerce"),
    bins=age_edges,
    include_lowest=True,
)
plt.figure(figsize=(10,5))
sns.boxplot(data=summary, x="age_bin", y="mean_AF")
plt.xticks(rotation=45)
plt.title("Mean Heteroplasmy by Age Bin")
plt.show()
