In [None]:
import pandas as pd
from scipy.stats import hypergeom

In [None]:
# Load the ortholog table that every later cell reads from. Downstream cells
# use its `top_10_pct_mi`, `ortholog_univ`, `human_gene`, `mouse_gene`,
# `human_mi_rank` and `mouse_mi_rank` columns.
pt_orthologs = pd.read_csv(filepath_or_buffer="./pseudotime_orthologs.csv")

In [None]:
# Show the loaded table as this cell's output for a quick visual sanity check.
pt_orthologs

In [None]:
# Build the per-species "top" gene sets, restricted to rows flagged in
# `ortholog_univ`. A row counts for a species when its `top_10_pct_mi` label is
# that species or "both".
# Both sets are deliberately collected from the `human_gene` column so that
# they share one naming scheme and can be intersected directly.
in_ortholog_universe = pt_orthologs["ortholog_univ"]
top_species_label = pt_orthologs["top_10_pct_mi"]

human_top_genes = set(
    pt_orthologs.loc[
        top_species_label.isin(["human", "both"]) & in_ortholog_universe,
        "human_gene",
    ]
)
mouse_top_genes = set(
    pt_orthologs.loc[
        top_species_label.isin(["mouse", "both"]) & in_ortholog_universe,
        "human_gene",
    ]
)

# Genes that are "top" in both species.
overlap = human_top_genes.intersection(mouse_top_genes)

In [None]:
# Hypergeometric enrichment test for the human/mouse top-gene overlap.
#
# All four counts are computed before anything is printed: the "exclusive to"
# lines need k, so it has to exist before the first print that uses it
# (previously k was assigned further down, which raised NameError on a fresh
# kernel / Restart & Run All).
N = int(pt_orthologs["ortholog_univ"].sum())  # number of 1-1 orthologs (universe size)
n = len(human_top_genes)  # number of significant human genes
K = len(mouse_top_genes)  # number of significant mouse genes
k = len(overlap)  # overlapping genes

print(f"ortholog universe: {N}\n")

print(f"number of human top genes: {n}")
print(f"      exclusive to humans: {n - k}\n")

print(f"number of mouse top genes: {K}")
print(f"        exclusive to mice: {K - k}\n")

print(f"size of overlap: {k}")

print(f"percent overlap in human genes: {k / n * 100:.1f}%")

# Upper-tail hypergeometric test: probability of an overlap of at least k.
# sf(k - 1) = 1 - cdf(k - 1) = P(X >= k), hence the "- 1".
# scipy's positional order is (k, M, n, N) = (observed, population size,
# successes in population, number of draws); the notebook's N plays the role
# of scipy's M and the notebook's K plays the role of scipy's N.
pval = hypergeom.sf(k - 1, N, n, K)

print()
print(f"hypergeometric p-value: {pval:.3e}")
# K * n / N is the mean of that hypergeometric distribution, i.e. the overlap
# expected under the null.
print(f"mean overlap of null: {K * n / N:.2f}, or ~{round(K * n / N)} genes")

In [None]:
# How many best-ranked genes to list per species.
n_top_genes = 50

# NOTE(review): the rankings below are taken over every row of `pt_orthologs`
# with no `ortholog_univ` filter, while the printed header says "with 1–1
# orthologs" — confirm the table already guarantees that.

# Mouse: rows with the smallest `mouse_mi_rank`; missing ranks sort to the end.
top_mouse_df = pt_orthologs.sort_values(by="mouse_mi_rank", na_position="last").head(n_top_genes)
top_mouse = top_mouse_df["mouse_gene"].tolist()

# Mouse rows whose human symbol is in the cross-species overlap
# (membership is tested on `human_gene`, names are reported as `mouse_gene`).
mouse_in_overlap = top_mouse_df["human_gene"].isin(overlap)
top_mouse_overlap_df = top_mouse_df.loc[mouse_in_overlap]
top_mouse_overlap = top_mouse_overlap_df["mouse_gene"].tolist()

# Human: same procedure, ranked by `human_mi_rank`.
top_human_df = pt_orthologs.sort_values(by="human_mi_rank", na_position="last").head(n_top_genes)
top_human = top_human_df["human_gene"].tolist()

# Human rows whose symbol is in the cross-species overlap.
human_in_overlap = top_human_df["human_gene"].isin(overlap)
top_human_overlap_df = top_human_df.loc[human_in_overlap]
top_human_overlap = top_human_overlap_df["human_gene"].tolist()

# --- Table 1: numbered top lists, mouse on the left, human on the right ---
print(f"\nTop {n_top_genes} pseudotime genes with 1–1 orthologs:")
print(f"{'Mouse':<20} | {'Human':<20}")
print("-" * 43)

# Pad the shorter column with empty strings so both have `max_len` entries.
max_len = max(len(top_mouse), len(top_human))
mouse_column = top_mouse + [""] * (max_len - len(top_mouse))
human_column = top_human + [""] * (max_len - len(top_human))
for i, (m, h) in enumerate(zip(mouse_column, human_column)):
    print(f"{i+1:>2}) {m:<17} | {i+1:>2}) {h:<17}")

# --- Table 2: only the entries of each top list that fall in the overlap ---
# The two columns are filled independently, so a row does not necessarily
# pair a mouse gene with its own human ortholog.
print("\nOverlap subset (within top list):")
print(f"{'Mouse':<20} | {'Human':<20}")
print("-" * 43)

max_len = max(len(top_mouse_overlap), len(top_human_overlap))
mouse_column = top_mouse_overlap + [""] * (max_len - len(top_mouse_overlap))
human_column = top_human_overlap + [""] * (max_len - len(top_human_overlap))
for m, h in zip(mouse_column, human_column):
    print(f"{m:<20} | {h:<20}")

In [None]:
# Dump the complete overlap (human gene symbols). `overlap` is a set, so the
# elements are printed in no particular order.
print("\nFull overlap (as human genes):\n", overlap, sep="\n")