In [16]:
# Extract and save IDs to files for BioMart upload
import pandas as pd

# Read the tab-separated ortholog table. Column names are supplied explicitly,
# so the first line of the file is treated as data rather than as a header.
df = pd.read_table(
    'SQLtable.9606.fa-9913.fa',
    header=None,
    names='group_id cluster_id species score uniprot_id bootstrap'.split(),
)
df.head()

Unnamed: 0,group_id,cluster_id,species,score,uniprot_id,bootstrap
0,1,11349,9606.fa,1.0,Q9UPN3,1.0
1,1,11349,9913.fa,1.0,F1N6H4,1.0
2,2,10792,9606.fa,1.0,Q8WXG9,1.0
3,2,10792,9913.fa,1.0,F1N0A6,1.0
4,3,10774,9606.fa,1.0,Q8WXH0,1.0


In [17]:

# Keep only the rows whose score is exactly 1.0
main_orthologs = df.loc[df['score'].eq(1.0)]

# Unique UniProt IDs for the '9606.fa' (human) rows, written one per line
human_ids = main_orthologs.loc[
    main_orthologs['species'].eq('9606.fa'), 'uniprot_id'
].unique()
with open('human_uniprot_ids.txt', 'w') as f:
    f.writelines(f'{uid}\n' for uid in human_ids)

# Unique UniProt IDs for the '9913.fa' (cattle) rows, written one per line
cattle_ids = main_orthologs.loc[
    main_orthologs['species'].eq('9913.fa'), 'uniprot_id'
].unique()
with open('cattle_uniprot_ids.txt', 'w') as f:
    f.writelines(f'{uid}\n' for uid in cattle_ids)

# Report how many IDs went into each file
print(f"Created human_uniprot_ids.txt with {len(human_ids)} IDs")
print(f"Created cattle_uniprot_ids.txt with {len(cattle_ids)} IDs")

Created human_uniprot_ids.txt with 16754 IDs
Created cattle_uniprot_ids.txt with 17046 IDs


In [8]:
!ls -l

total 1560
-rw-r--r-- 1 bbf3630 7073630  116373 Jun  4 15:37 human_uniprot_ids.txt
-rw-r--r-- 1 bbf3630 7073630   23173 Jun  4 15:33 prepare_orthologs.ipynb
-rw-r--r-- 1 bbf3630 7073630  121081 Jun  4 15:37 rat_uniprot_ids.txt
-rw-r--r-- 1 bbf3630 7073630 1158427 Jun  4 15:32 SQLtable.9606.fa-10116.fa


In [4]:
# Main orthologs are the rows scoring exactly 1.0
main_orthologs = df.loc[df['score'].eq(1.0)]

# Split the main orthologs into one frame per species tag
human_orthologs = main_orthologs.loc[main_orthologs['species'].eq('9606.fa')]
cattle_orthologs = main_orthologs.loc[main_orthologs['species'].eq('9913.fa')]


In [6]:
cattle_orthologs.head()

Unnamed: 0,group_id,cluster_id,species,score,uniprot_id,bootstrap
1,1,11349,9913.fa,1.0,F1N6H4,1.0
3,2,10792,9913.fa,1.0,F1N0A6,1.0
5,3,10774,9913.fa,1.0,F1MF78,1.0
7,4,10272,9913.fa,1.0,E1BE11,1.0
9,5,9733,9913.fa,1.0,E1BHT5,1.0


In [7]:
human_orthologs.head()


Unnamed: 0,group_id,cluster_id,species,score,uniprot_id,bootstrap
0,1,11349,9606.fa,1.0,Q9UPN3,1.0
2,2,10792,9606.fa,1.0,Q8WXG9,1.0
4,3,10774,9606.fa,1.0,Q8WXH0,1.0
6,4,10272,9606.fa,1.0,Q96RW7,1.0
8,5,9733,9606.fa,1.0,Q5T4S7,1.0


In [9]:

# Find ortholog pairs
# Index each species' UniProt IDs by group once, so pairing a group is a
# dictionary lookup instead of a full scan of both frames per group.
# sort=False and list aggregation keep the IDs in their original row order.
human_by_group = (
    human_orthologs.groupby('group_id', sort=False)['uniprot_id'].agg(list).to_dict()
)
cattle_by_group = (
    cattle_orthologs.groupby('group_id', sort=False)['uniprot_id'].agg(list).to_dict()
)

# Cross every human ID with every cattle ID of the same group (this covers
# 1:many and many:many groups). Groups are visited in order of first
# appearance; a group lacking either species contributes no pairs.
ortholog_pairs = [
    (h_gene, c_gene)
    for group_id in main_orthologs['group_id'].unique()
    for h_gene in human_by_group.get(group_id, [])
    for c_gene in cattle_by_group.get(group_id, [])
]

# Save UniProt pairs for conversion (tab-separated, with a header line)
with open('uniprot_pairs.txt', 'w') as f:
    f.write('human_uniprot\tcattle_uniprot\n')
    f.writelines(f'{h_gene}\t{c_gene}\n' for h_gene, c_gene in ortholog_pairs)

In [13]:
!ls -l

total 2304
-rw-r--r-- 1 bbf3630 7073630  116373 Jun  4 15:37 human_uniprot_ids.txt
-rw-r--r-- 1 bbf3630 7073630   23173 Jun  4 15:33 prepare_orthologs.ipynb
-rw-r--r-- 1 bbf3630 7073630  121081 Jun  4 15:37 rat_uniprot_ids.txt
-rw-r--r-- 1 bbf3630 7073630 1158427 Jun  4 15:32 SQLtable.9606.fa-10116.fa
-rw-r--r-- 1 bbf3630 7073630  239330 Jun  4 15:41 uniprot_pairs.txt


In [10]:
!head uniprot_pairs.txt

human_uniprot	cattle_uniprot
Q9UPN3	F1N6H4
Q8WXG9	F1N0A6
Q8WXH0	F1MF78
Q96RW7	E1BE11
Q5T4S7	E1BHT5
Q9NU22	E1BC24
Q92736	E1BHV1
O95714	E1B782
Q6V0I7	E1B949


In [17]:
!ls -l

total 4936
-rw-r--r-- 1 bbf3630 7073630  161848 Jun  4 16:00 homogene.txt
-rw-r--r-- 1 bbf3630 7073630  116373 Jun  4 15:37 human_uniprot_ids.txt
-rw-r--r-- 1 bbf3630 7073630 1017152 Jun  4 15:58 mart_export_human.txt
-rw-r--r-- 1 bbf3630 7073630  136811 Jun  4 15:58 mart_export_rat.txt
-rw-r--r-- 1 bbf3630 7073630   23173 Jun  4 15:33 prepare_orthologs.ipynb
-rw-r--r-- 1 bbf3630 7073630  121081 Jun  4 15:37 rat_uniprot_ids.txt
-rw-r--r-- 1 bbf3630 7073630 1158427 Jun  4 15:32 SQLtable.9606.fa-10116.fa
-rw-r--r-- 1 bbf3630 7073630  239330 Jun  4 15:41 uniprot_pairs.txt


In [20]:
!wc -l homogene.txt

4625 homogene.txt


In [18]:
import pandas as pd
import re

def load_biomart_map(path, gene_col="Gene stable ID",
                     sp_col="UniProtKB/Swiss-Prot ID",
                     trembl_col="UniProtKB/TrEMBL ID",
                     species_tag="hsa"):
    """Load a tab-separated BioMart export as a long gene-to-UniProt table.

    The Swiss-Prot and TrEMBL columns (whichever are present) are stacked into
    a single ``uniprot`` column, so each output row is one (gene, UniProt ID)
    association.

    Parameters
    ----------
    path : str or path-like
        Tab-separated file with a header row.
    gene_col : str
        Header of the gene ID column.
    sp_col, trembl_col : str
        Headers of the UniProt ID columns; at least one must be present.
    species_tag : str
        Name given to the gene ID column in the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``[species_tag, "uniprot"]`` with no missing values, no empty
        UniProt IDs and no duplicate rows.

    Raises
    ------
    ValueError
        If the gene column or both UniProt columns are absent from the file.
    """
    # Read everything as strings; header names are stripped of stray whitespace.
    df = pd.read_csv(path, sep="\t", dtype=str).rename(columns=lambda c: c.strip())

    if gene_col not in df.columns:
        raise ValueError(f"{path}: gene column {gene_col!r} not found.")

    # Melt Swiss-Prot + TrEMBL into one 'uniprot' column (extra export columns
    # are ignored because only id_vars/value_vars survive the melt).
    value_vars = [c for c in (sp_col, trembl_col) if c in df.columns]
    if not value_vars:
        raise ValueError(f"{path}: no UniProt columns found (Swiss-Prot/TrEMBL). Re-export with both.")

    return (
        df.melt(id_vars=[gene_col], value_vars=value_vars, value_name="uniprot")
          .drop(columns="variable")
          .dropna(subset=["uniprot", gene_col])
          # Strip BEFORE de-duplicating so IDs that differ only by surrounding
          # whitespace collapse into a single row.
          .assign(uniprot=lambda d: d["uniprot"].str.strip())
          .loc[lambda d: d["uniprot"] != ""]
          .drop_duplicates()
          # Rename gene column to species code for clarity
          .rename(columns={gene_col: species_tag})
    )  # columns: [species_tag, uniprot]

def create_egio_ortholog_file(uniprot_pairs_path,
                              human_map_path, cattle_map_path,
                              out_pairs="homogene1.txt"):
    """Convert UniProt ortholog pairs to Ensembl gene pairs and write them out.

    Parameters
    ----------
    uniprot_pairs_path : str or path-like
        Tab-separated file with ``human_uniprot`` and ``cattle_uniprot`` columns.
    human_map_path, cattle_map_path : str or path-like
        BioMart exports read with ``load_biomart_map``.
    out_pairs : str or path-like
        Destination file; written tab-separated with the header ``hsa``/``bta``.

    Raises
    ------
    ValueError
        If the pairs file lacks either required column.

    Side effects: writes ``out_pairs`` and prints mapping statistics plus up
    to ten example UniProt IDs per species that are absent from the maps.
    Returns ``None``.
    """
    # Load mappings (include BOTH Swiss-Prot & TrEMBL)
    hmap = load_biomart_map(human_map_path, species_tag="hsa")
    bmap = load_biomart_map(cattle_map_path, species_tag="bta")

    # Load UniProt ortholog pairs (two columns: human_uniprot, cattle_uniprot)
    pairs = pd.read_csv(uniprot_pairs_path, sep="\t", dtype=str).rename(columns=lambda c: c.strip())
    if not {"human_uniprot", "cattle_uniprot"}.issubset(pairs.columns):
        raise ValueError(f"{uniprot_pairs_path} must have columns: human_uniprot, cattle_uniprot")

    # Merge to get Ensembl Gene IDs. Left joins keep unmapped pairs (as NaN)
    # until the filter below; the helper 'uniprot' key is dropped after each
    # join so the two merges never share a column name.
    tmp = (pairs
           .merge(hmap, left_on="human_uniprot", right_on="uniprot", how="left")
           .drop(columns=["uniprot"])
           .merge(bmap, left_on="cattle_uniprot", right_on="uniprot", how="left")
           .drop(columns=["uniprot"]))

    # Keep rows that mapped to both genes. na=False makes unmapped (NaN) rows
    # an explicit False instead of relying on NaN coercion inside `&`.
    pat_hsa = r"^ENSG[0-9]+(\.[0-9]+)?$"
    pat_bta = r"^ENSBTAG[0-9]+(\.[0-9]+)?$"
    has_hsa = tmp["hsa"].str.match(pat_hsa, na=False)
    has_bta = tmp["bta"].str.match(pat_bta, na=False)
    mapped = tmp[has_hsa & has_bta].copy()

    # Strip version suffixes like ".12"
    mapped["hsa"] = mapped["hsa"].str.replace(r"\.[0-9]+$", "", regex=True)
    mapped["bta"] = mapped["bta"].str.replace(r"\.[0-9]+$", "", regex=True)

    # De-duplicate gene pairs
    mapped = mapped[["hsa", "bta"]].drop_duplicates()

    # Write EGIO file, sorted by human then cattle gene ID
    with open(out_pairs, "w") as f:
        f.write("hsa\tbta\n")
        f.writelines(
            f"{h}\t{b}\n"
            for h, b in mapped.sort_values(["hsa", "bta"]).itertuples(index=False)
        )

    # Simple stats. Note the numerator counts de-duplicated gene pairs while
    # the denominator counts input UniProt pairs, so this is a rough yield
    # figure rather than a strict fraction.
    total_pairs = len(pairs)
    ok = len(mapped)
    # An empty pairs file would otherwise raise ZeroDivisionError here.
    pct = f"{ok/total_pairs:.1%}" if total_pairs else "n/a"
    print(f"Pairs in uniprot_pairs: {total_pairs}")
    print(f"Mapped to Ensembl genes: {ok} ({pct})")
    # Helpful misses: first ten UniProt IDs per species absent from the maps
    missing_h = pairs[~pairs["human_uniprot"].isin(hmap["uniprot"])].human_uniprot.unique()[:10]
    missing_b = pairs[~pairs["cattle_uniprot"].isin(bmap["uniprot"])].cattle_uniprot.unique()[:10]
    print(f"Example human UniProt with no mapping: {list(missing_h)}")
    print(f"Example cattle UniProt with no mapping: {list(missing_b)}")

In [20]:
create_egio_ortholog_file("uniprot_pairs.txt", "mart_export_human.txt", "mart_export_cattle.txt")


Pairs in uniprot_pairs: 17136
Mapped to Ensembl genes: 14385 (83.9%)
Example human UniProt with no mapping: ['A2VEC9', 'Q9Y4D8', 'Q9UMZ3', 'Q7Z2Y8', 'P24928', 'Q2KHR3', 'Q9BYB0', 'Q02817', 'Q96NW7', 'Q8TE59']
Example cattle UniProt with no mapping: ['A0A3Q1MUQ4', 'A0A3Q1N7T9', 'A0A3Q1M7H2', 'F1MER7', 'A0A3Q1MGJ7', 'A0A3Q1MEB6', 'A0A3Q1LNJ1', 'A0A3Q1LTH1', 'E1BNF6', 'A0A3Q1MFV7']


In [21]:
!ls -l

total 14008
-rw-r--r-- 1 bbf3630 7073630 1921041 Sep 24 15:24 biomart_hsa_bta.tsv
-rw-r--r-- 1 bbf3630 7073630  422237 Sep 24 15:26 bta_ensembl_ids.txt
-rw-r--r-- 1 bbf3630 7073630  132218 Sep 24 16:26 cattle_uniprot_ids.txt
-rw-r--r-- 1 bbf3630 7073630  503483 Sep 24 16:27 homogene1.txt
-rw-r--r-- 1 bbf3630 7073630  777813 Sep 24 15:27 homogene.txt
-rw-r--r-- 1 bbf3630 7073630  355568 Sep 24 15:26 hsa_ensembl_ids.txt
-rw-r--r-- 1 bbf3630 7073630  117654 Sep 24 16:26 human_uniprot_ids.txt
-rw-r--r-- 1 bbf3630 7073630 1495493 Sep 24 16:25 mart_export_cattle.txt
-rw-r--r-- 1 bbf3630 7073630 5395140 Sep 24 16:24 mart_export_human.txt
-rw-r--r-- 1 bbf3630 7073630  777805 Sep 24 15:26 pairs_dedup.tsv
-rw-r--r-- 1 bbf3630 7073630  777805 Sep 24 15:25 pairs_raw.tsv
-rw-r--r-- 1 bbf3630 7073630  777805 Sep 24 15:26 pairs.tsv
-rw-r--r-- 1 bbf3630 7073630   19930 Aug 27 22:45 prepare_orthologs.ipynb
-rw-r--r-- 1 bbf3630 7073630       0 Sep 24 15:33 rat_uniprot_ids.txt
-rw-r--r-- 1 bbf3630 707363