In [None]:
import pandas as pd
import os

In [None]:
# Read the raw homology table; read_table splits on tabs by default.
orthology = pd.read_table('./HOM_MouseHumanSequence.txt')

In [None]:
# Column of the homology table that names the organism of each row
species_col = "Common Organism Name"

# Organism names, as spelled in that column, for the two species that are kept
mouse_label = "mouse, laboratory"
human_label = "human"

# Column names given to the gene symbols in the mouse/human pair table
mouse_symbol_col = "mouse_symbol"
human_symbol_col = "human_symbol"

In [None]:
# Narrow the table down to mouse and human rows only.
is_mouse_or_human = orthology[species_col].isin([mouse_label, human_label])
orthology = orthology[is_mouse_or_human]


def _is_strict_pair(group):
    """True when a homology group is exactly one mouse row plus one human row."""
    if len(group) != 2:
        return False
    return set(group[species_col]) == {mouse_label, human_label}


# Keep only the homology groups that pass the strict 1:1 test.
g = orthology.groupby("DB Class Key")
strict = g.filter(_is_strict_pair)

# Reshape to one row per group; the columns become (field, species) pairs.
wide = strict.pivot(index="DB Class Key", columns=species_col)


def col(name, sp):
    """Return field `name` of species `sp` from `wide`, cast to str."""
    selected = wide[(name, sp)]
    return selected.astype(str)


# (output column, source field, species) for each column of the pair table,
# listed in the order the columns should appear.
pair_fields = [
    (mouse_symbol_col, "Symbol", mouse_label),
    (human_symbol_col, "Symbol", human_label),
    ("mouse_mgi", "Mouse MGI ID", mouse_label),
    ("human_hgnc", "HGNC ID", human_label),
    ("mouse_entrez", "EntrezGene ID", mouse_label),
    ("human_entrez", "EntrezGene ID", human_label),
]
pairs = pd.DataFrame({out: col(field, sp) for out, field, sp in pair_fields})

print("Strict mouse-human 1:1 groups:", len(pairs))
print(pairs.head(10))

In [None]:
# Independent cross-check of the strict 1:1 selection: recompute the species
# composition of every homology group as a count table (one row per
# "DB Class Key", one column per species) and compare it with `pairs`.
counts = orthology.groupby(["DB Class Key", species_col]).size().unstack(fill_value=0)

# keys where both species appear exactly once
is_one2one = (counts[mouse_label] == 1) & (counts[human_label] == 1)
one2one_keys = counts.loc[is_one2one].index

# `pairs` is indexed by "DB Class Key" (inherited from the pivot), so its index
# can be compared with `one2one_keys` directly. Comparing against `pairs`,
# rather than re-testing `one2one_keys` against the condition that defined
# them, is what allows this check to actually fail.
bad_keys = pairs.index.difference(one2one_keys)
missing_keys = one2one_keys.difference(pairs.index)
print(f"non-1:1 keys sneaking in: {len(bad_keys)}")
print(f"1:1 keys missing from pairs: {len(missing_keys)}")

# check uniqueness of mapping: symbols that appear again after their first
# occurrence (the first occurrence itself is not counted)
mouse_dupes = pairs.loc[pairs[mouse_symbol_col].duplicated(), mouse_symbol_col]
human_dupes = pairs.loc[pairs[human_symbol_col].duplicated(), human_symbol_col]

print(f"duplicate mouse symbols: {len(mouse_dupes)}")
print(f"duplicate human symbols: {len(human_dupes)}")

In [None]:
# Occurrences of every mouse / human symbol across all strict pairs.
m_counts = pairs[mouse_symbol_col].value_counts()
h_counts = pairs[human_symbol_col].value_counts()

# A pair survives only when its mouse symbol and its human symbol each occur
# exactly once in the whole table.
mouse_is_unique = pairs[mouse_symbol_col].map(m_counts) == 1
human_is_unique = pairs[human_symbol_col].map(h_counts) == 1
pairs_1to1 = pairs[mouse_is_unique & human_is_unique].reset_index(drop=True)

print("After enforcing global uniqueness,")
print("mouse symbols:", pairs_1to1[mouse_symbol_col].nunique())
print("human symbols:", pairs_1to1[human_symbol_col].nunique())
print("rows:", len(pairs_1to1))

In [None]:
# Show the globally unique mouse-human pair table as this cell's output.
pairs_1to1

In [None]:
# Save the globally unique pairs as CSV in the current working directory;
# index=False keeps the row index out of the file.
pairs_1to1.to_csv('./jax_orthology.csv', index = False)

In [None]:
def load_orthology(path, mouse_symbol_col = "mouse_symbol", human_symbol_col = "human_symbol"):
    """
    Read the pruned JAX orthology CSV and build symbol lookups in both directions.

    Parameters
    ----------
    path
        Location of the CSV file; every column is read as text.
    mouse_symbol_col, human_symbol_col
        Names of the columns that hold the mouse and human gene symbols.

    Returns
    -------
    (mouse2human, human2mouse)
        Two dicts: mouse_symbol -> human_symbol and human_symbol -> mouse_symbol.

    Rows with a missing value in either symbol column are skipped. If a symbol
    occurs in more than one row, the last such row determines its mapping.
    """
    table = pd.read_csv(path, dtype=str)

    # Only rows carrying both symbols can contribute to the lookups.
    symbol_cols = [mouse_symbol_col, human_symbol_col]
    complete = table.dropna(subset=symbol_cols)

    mouse2human = {}
    human2mouse = {}
    for mouse_symbol, human_symbol in zip(complete[mouse_symbol_col], complete[human_symbol_col]):
        mouse2human[mouse_symbol] = human_symbol
        human2mouse[human_symbol] = mouse_symbol

    return mouse2human, human2mouse

In [None]:
# Location of the pruned orthology table (the CSV written by this notebook).
base_path = "."
orth_file = "jax_orthology.csv"
orth_path = os.path.join(base_path, orth_file)

# Example usage: build both lookup dicts, then translate one gene each way.
mouse2human, human2mouse = load_orthology(orth_path)

mm_gene = "Foxo1"
hs_gene = "FOXO1"

# .get returns None for a symbol that is not in the table.
mm_ortholog = mouse2human.get(mm_gene)
hs_ortholog = human2mouse.get(hs_gene)

print(f"{mm_gene} is", mm_ortholog)
print(f"{hs_gene} is", hs_ortholog)