In [2]:
# Extract and save IDs to files for BioMart upload
import pandas as pd

# Read the tab-separated ortholog table for species 9606.fa vs 10116.fa.
# Column names are supplied explicitly, so pandas treats every line of the
# file (including the first) as data.
column_names = ['group_id', 'cluster_id', 'species', 'score', 'uniprot_id', 'bootstrap']
df = pd.read_csv('SQLtable.9606.fa-10116.fa', sep='\t', names=column_names)
df.head()

Unnamed: 0,group_id,cluster_id,species,score,uniprot_id,bootstrap
0,1,11982,9606.fa,1.0,Q5VST9,1.0
1,1,11982,10116.fa,1.0,A0A0G2K8N1,1.0
2,2,10721,9606.fa,1.0,Q03001,1.0
3,2,10721,10116.fa,1.0,D3ZC56,1.0
4,3,10047,9606.fa,1.0,Q8WXG9,1.0


In [7]:

# Keep only the rows with score == 1.0 (treated in this notebook as the main orthologs)
main_orthologs = df[df['score'] == 1.0]

# Extract the unique human (9606.fa) UniProt IDs and write them one per line
human_ids = main_orthologs[main_orthologs['species'] == '9606.fa']['uniprot_id'].unique()
with open('human_uniprot_ids.txt', 'w') as f:
    f.writelines(f'{uid}\n' for uid in human_ids)

# Extract the unique rat (10116.fa) UniProt IDs and write them one per line
rat_ids = main_orthologs[main_orthologs['species'] == '10116.fa']['uniprot_id'].unique()
with open('rat_uniprot_ids.txt', 'w') as f:
    f.writelines(f'{uid}\n' for uid in rat_ids)

print(f"Created human_uniprot_ids.txt with {len(human_ids)} IDs")
print(f"Created rat_uniprot_ids.txt with {len(rat_ids)} IDs")

Created human_uniprot_ids.txt with 16579 IDs
Created rat_uniprot_ids.txt with 16579 IDs


In [8]:
!ls -l

total 1560
-rw-r--r-- 1 bbf3630 7073630  116373 Jun  4 15:37 human_uniprot_ids.txt
-rw-r--r-- 1 bbf3630 7073630   23173 Jun  4 15:33 prepare_orthologs.ipynb
-rw-r--r-- 1 bbf3630 7073630  121081 Jun  4 15:37 rat_uniprot_ids.txt
-rw-r--r-- 1 bbf3630 7073630 1158427 Jun  4 15:32 SQLtable.9606.fa-10116.fa


In [9]:
# Main orthologs are the rows whose score is exactly 1.0
main_orthologs = df.loc[df['score'].eq(1.0)]

# One frame per species, selected by the value of the 'species' column
human_orthologs = main_orthologs.loc[main_orthologs['species'].eq('9606.fa')]
rat_orthologs = main_orthologs.loc[main_orthologs['species'].eq('10116.fa')]


In [10]:
# Preview the first rows of the rat (10116.fa) main-ortholog frame
rat_orthologs.head()

Unnamed: 0,group_id,cluster_id,species,score,uniprot_id,bootstrap
1,1,11982,10116.fa,1.0,A0A0G2K8N1,1.0
3,2,10721,10116.fa,1.0,D3ZC56,1.0
5,3,10047,10116.fa,1.0,A0A096MK89,1.0
7,4,9941,10116.fa,1.0,F1M4Q3,1.0
9,5,9710,10116.fa,1.0,Q2TL32,1.0


In [11]:
# Preview the first rows of the human (9606.fa) main-ortholog frame
human_orthologs.head()


Unnamed: 0,group_id,cluster_id,species,score,uniprot_id,bootstrap
0,1,11982,9606.fa,1.0,Q5VST9,1.0
2,2,10721,9606.fa,1.0,Q03001,1.0
4,3,10047,9606.fa,1.0,Q8WXG9,1.0
6,4,9941,9606.fa,1.0,Q96RW7,1.0
8,5,9710,9606.fa,1.0,Q5T4S7,1.0


In [12]:

# Find ortholog pairs.
# Collect the UniProt IDs of each group once per species, instead of
# re-filtering both frames for every group_id (one full scan per group).
human_by_group = human_orthologs.groupby('group_id', sort=False)['uniprot_id'].agg(list).to_dict()
rat_by_group = rat_orthologs.groupby('group_id', sort=False)['uniprot_id'].agg(list).to_dict()

# Pair every human ID with every rat ID of the same group (handles 1:many and
# many:many groups). Groups are visited in order of first appearance, and IDs
# keep their row order within a group, so the pair order is unchanged.
ortholog_pairs = [
    (h_gene, r_gene)
    for group_id in main_orthologs['group_id'].unique()
    for h_gene in human_by_group.get(group_id, [])
    for r_gene in rat_by_group.get(group_id, [])
]

# Save UniProt pairs for conversion
with open('uniprot_pairs.txt', 'w') as f:
    f.write('human_uniprot\trat_uniprot\n')
    f.writelines(f'{h}\t{r}\n' for h, r in ortholog_pairs)

In [13]:
!ls -l

total 2304
-rw-r--r-- 1 bbf3630 7073630  116373 Jun  4 15:37 human_uniprot_ids.txt
-rw-r--r-- 1 bbf3630 7073630   23173 Jun  4 15:33 prepare_orthologs.ipynb
-rw-r--r-- 1 bbf3630 7073630  121081 Jun  4 15:37 rat_uniprot_ids.txt
-rw-r--r-- 1 bbf3630 7073630 1158427 Jun  4 15:32 SQLtable.9606.fa-10116.fa
-rw-r--r-- 1 bbf3630 7073630  239330 Jun  4 15:41 uniprot_pairs.txt


In [14]:
!head uniprot_pairs.txt

human_uniprot	rat_uniprot
Q5VST9	A0A0G2K8N1
Q03001	D3ZC56
Q8WXG9	A0A096MK89
Q96RW7	F1M4Q3
Q5T4S7	Q2TL32
Q2LD37	A0A096MJT6
Q92736	B0LPN4
O95714	A0A0G2JUC9
Q15751	A0A0G2JTT6


In [None]:

def create_egio_ortholog_file():
    """Convert the human-rat UniProt ortholog pairs into an Ensembl gene-pair file.

    Reads three tab-separated files from the current working directory:
      * mart_export_human.txt and mart_export_rat.txt: BioMart exports that
        must contain the columns 'UniProtKB/Swiss-Prot ID' and 'Gene stable ID'.
      * uniprot_pairs.txt: columns 'human_uniprot' and 'rat_uniprot'.

    Writes homogene.txt (header 'hsa<TAB>rno', followed by the sorted,
    de-duplicated Ensembl gene pairs) and prints mapping statistics.
    Returns None.
    """
    # Load the mapping files from BioMart.
    human_mapping = pd.read_csv('mart_export_human.txt', sep='\t')
    # The rat export must be bound to `rat_mapping`, the name used below;
    # it was previously read from the Arabidopsis file into an unused variable.
    rat_mapping = pd.read_csv('mart_export_rat.txt', sep='\t')

    # Load the UniProt ortholog pairs
    uniprot_pairs = pd.read_csv('uniprot_pairs.txt', sep='\t')

    # Clean column names (remove any extra spaces)
    human_mapping.columns = human_mapping.columns.str.strip()
    rat_mapping.columns = rat_mapping.columns.str.strip()
    uniprot_pairs.columns = uniprot_pairs.columns.str.strip()

    # Create dictionaries for faster lookup.
    # Duplicated UniProt IDs are resolved by keeping the first row.
    # NOTE(review): only the 'UniProtKB/Swiss-Prot ID' column is consulted, so
    # accessions that the export lists under another column are reported as
    # missing -- confirm this is intended.
    human_uniprot_to_ensembl = (
        human_mapping.drop_duplicates('UniProtKB/Swiss-Prot ID')
        .set_index('UniProtKB/Swiss-Prot ID')['Gene stable ID']
        .to_dict()
    )
    rat_uniprot_to_ensembl = (
        rat_mapping.drop_duplicates('UniProtKB/Swiss-Prot ID')
        .set_index('UniProtKB/Swiss-Prot ID')['Gene stable ID']
        .to_dict()
    )

    print(f"Human mapping: {len(human_uniprot_to_ensembl)} UniProt -> Ensembl")
    print(f"Rat mapping: {len(rat_uniprot_to_ensembl)} UniProt -> Ensembl")
    print(f"UniProt pairs: {len(uniprot_pairs)} pairs")

    # Map UniProt pairs to Ensembl pairs
    ensembl_pairs = []
    missing_human = []
    missing_rat = []

    for human_uniprot, rat_uniprot in zip(uniprot_pairs['human_uniprot'],
                                          uniprot_pairs['rat_uniprot']):
        # Look up Ensembl IDs (None when the UniProt ID has no mapping)
        human_ensembl = human_uniprot_to_ensembl.get(human_uniprot)
        rat_ensembl = rat_uniprot_to_ensembl.get(rat_uniprot)

        if human_ensembl and rat_ensembl:
            ensembl_pairs.append((human_ensembl, rat_ensembl))
        else:
            if not human_ensembl:
                missing_human.append(human_uniprot)
            if not rat_ensembl:
                missing_rat.append(rat_uniprot)

    # Remove duplicates and fix the output order once
    ensembl_pairs = sorted(set(ensembl_pairs))

    print(f"\nSuccessfully mapped: {len(ensembl_pairs)} ortholog pairs")
    print(f"Missing human mappings: {len(set(missing_human))}")
    print(f"Missing rat mappings: {len(set(missing_rat))}")

    # Write the final EGIO input file
    with open('homogene.txt', 'w') as f:
        # Header: hsa = human, rno = rat (was 'sce', a yeast label).
        # NOTE(review): keep in sync with the species labels passed to EGIO.
        f.write('hsa\trno\n')
        for human_gene, rat_gene in ensembl_pairs:
            f.write(f'{human_gene}\t{rat_gene}\n')

    print(f"\nCreated homogene.txt with {len(ensembl_pairs)} ortholog pairs")

    # Print some examples
    print("\nFirst 5 ortholog pairs:")
    for h, r in ensembl_pairs[:5]:
        print(f"{h}\t{r}")

    # Print missing IDs summary (first few, in encounter order, may repeat)
    if missing_human:
        print(f"\nSome missing human UniProt IDs: {missing_human[:5]}")
    if missing_rat:
        print(f"Some missing rat UniProt IDs: {missing_rat[:5]}")


In [16]:
# Build homogene.txt; the BioMart exports and uniprot_pairs.txt must already be in the working directory
create_egio_ortholog_file()

Human mapping: 16409 UniProt -> Ensembl
Rat mapping: 4632 UniProt -> Ensembl
UniProt pairs: 16698 pairs

Successfully mapped: 4624 ortholog pairs
Missing human mappings: 170
Missing rat mappings: 11947

Created homogene.txt with 4624 ortholog pairs

First 5 ortholog pairs:
ENSG00000000005	ENSRNOG00000060970
ENSG00000000460	ENSRNOG00000059276
ENSG00000000938	ENSRNOG00000009912
ENSG00000001036	ENSRNOG00000015551
ENSG00000001084	ENSRNOG00000006302

Some missing human UniProt IDs: ['A2VEC9', 'Q9Y4D8', 'Q9UMZ3', 'Q7Z2Y8', 'P24928']
Some missing rat UniProt IDs: ['A0A0G2K8N1', 'D3ZC56', 'A0A096MK89', 'F1M4Q3', 'Q2TL32']


In [17]:
!ls -l

total 4936
-rw-r--r-- 1 bbf3630 7073630  161848 Jun  4 16:00 homogene.txt
-rw-r--r-- 1 bbf3630 7073630  116373 Jun  4 15:37 human_uniprot_ids.txt
-rw-r--r-- 1 bbf3630 7073630 1017152 Jun  4 15:58 mart_export_human.txt
-rw-r--r-- 1 bbf3630 7073630  136811 Jun  4 15:58 mart_export_rat.txt
-rw-r--r-- 1 bbf3630 7073630   23173 Jun  4 15:33 prepare_orthologs.ipynb
-rw-r--r-- 1 bbf3630 7073630  121081 Jun  4 15:37 rat_uniprot_ids.txt
-rw-r--r-- 1 bbf3630 7073630 1158427 Jun  4 15:32 SQLtable.9606.fa-10116.fa
-rw-r--r-- 1 bbf3630 7073630  239330 Jun  4 15:41 uniprot_pairs.txt


In [20]:
!wc -l homogene.txt

4625 homogene.txt


In [3]:
import pandas as pd

def create_egio_ortholog_file(
    human_mapping_path='mart_export_human.txt',
    species_mapping_path='mart_export_AT.txt',
    pairs_path='uniprot_pairs.txt',
    output_path='homogene.txt',
    species_column='arabidopsis_uniprot',
    species_code='ath',
    species_label='Arabidopsis',
):
    """Convert UniProt ortholog pairs into an Ensembl gene-pair file for EGIO.

    With the default arguments this performs the human-Arabidopsis conversion;
    pass the other arguments to reuse it for another species instead of
    copying the function.

    Args:
        human_mapping_path: Tab-separated BioMart export for human; must
            contain 'UniProtKB/Swiss-Prot ID' and 'Gene stable ID' columns.
        species_mapping_path: Same kind of export for the second species.
        pairs_path: Tab-separated UniProt pair table; must contain the columns
            'human_uniprot' and `species_column`.
        output_path: File to write; gets the header 'hsa<TAB>{species_code}'
            followed by the sorted, de-duplicated Ensembl gene pairs.
        species_column: Column of `pairs_path` holding the second species' IDs.
        species_code: Header label written for the second species.
        species_label: Species name used in the printed statistics.

    Returns:
        None. Mapping statistics and a few examples are printed.
    """
    # Load the mapping files from BioMart
    human_mapping = pd.read_csv(human_mapping_path, sep='\t')
    species_mapping = pd.read_csv(species_mapping_path, sep='\t')

    # Load the UniProt ortholog pairs
    uniprot_pairs = pd.read_csv(pairs_path, sep='\t')

    # Clean column names
    human_mapping.columns = human_mapping.columns.str.strip()
    species_mapping.columns = species_mapping.columns.str.strip()
    uniprot_pairs.columns = uniprot_pairs.columns.str.strip()

    # Create UniProt -> Ensembl dictionaries; duplicated UniProt IDs are
    # resolved by keeping the first row.
    human_uniprot_to_ensembl = (
        human_mapping.drop_duplicates('UniProtKB/Swiss-Prot ID')
        .set_index('UniProtKB/Swiss-Prot ID')['Gene stable ID']
        .to_dict()
    )
    species_uniprot_to_ensembl = (
        species_mapping.drop_duplicates('UniProtKB/Swiss-Prot ID')
        .set_index('UniProtKB/Swiss-Prot ID')['Gene stable ID']
        .to_dict()
    )

    print(f"Human mapping: {len(human_uniprot_to_ensembl)} UniProt -> Ensembl")
    print(f"{species_label} mapping: {len(species_uniprot_to_ensembl)} UniProt -> Ensembl")
    print(f"UniProt pairs: {len(uniprot_pairs)} pairs")

    # Map UniProt pairs to Ensembl pairs
    ensembl_pairs = []
    missing_human = []
    missing_species = []

    for human_uniprot, species_uniprot in zip(uniprot_pairs['human_uniprot'],
                                              uniprot_pairs[species_column]):
        # .get() yields None when the UniProt ID has no mapping
        human_ensembl = human_uniprot_to_ensembl.get(human_uniprot)
        species_ensembl = species_uniprot_to_ensembl.get(species_uniprot)

        if human_ensembl and species_ensembl:
            ensembl_pairs.append((human_ensembl, species_ensembl))
        else:
            if not human_ensembl:
                missing_human.append(human_uniprot)
            if not species_ensembl:
                missing_species.append(species_uniprot)

    # Remove duplicates and fix the output order once
    ensembl_pairs = sorted(set(ensembl_pairs))

    print(f"\nSuccessfully mapped: {len(ensembl_pairs)} ortholog pairs")
    print(f"Missing human mappings: {len(set(missing_human))}")
    print(f"Missing {species_label} mappings: {len(set(missing_species))}")

    # Write EGIO input file
    with open(output_path, 'w') as f:
        f.write(f'hsa\t{species_code}\n')  # hsa = human; default ath = arabidopsis
        for h_gene, s_gene in ensembl_pairs:
            f.write(f'{h_gene}\t{s_gene}\n')

    print(f"\nCreated {output_path} with {len(ensembl_pairs)} ortholog pairs")

    # Print examples
    print("\nFirst 5 ortholog pairs:")
    for h, s in ensembl_pairs[:5]:
        print(f"{h}\t{s}")

    # First few unmapped IDs, in encounter order (may repeat)
    if missing_human:
        print(f"\nSome missing human UniProt IDs: {missing_human[:5]}")
    if missing_species:
        print(f"Some missing {species_label} UniProt IDs: {missing_species[:5]}")


In [1]:
import pandas as pd

# Load the tab-separated ortholog table for species 3702.fa vs 9606.fa
df = pd.read_csv('SQLtable.3702.fa-9606.fa', sep='\t',
                 names=['group_id', 'cluster_id', 'species', 'score', 'uniprot_id', 'bootstrap'])

# Keep only main orthologs (score == 1.0)
main_orthologs = df[df['score'] == 1.0]

# Separate species
human_orthologs = main_orthologs[main_orthologs['species'] == '9606.fa']
ath_orthologs   = main_orthologs[main_orthologs['species'] == '3702.fa']

# Collect the UniProt IDs of each group once per species, instead of
# re-filtering both frames for every group_id (one full scan per group).
human_by_group = human_orthologs.groupby('group_id', sort=False)['uniprot_id'].agg(list).to_dict()
ath_by_group = ath_orthologs.groupby('group_id', sort=False)['uniprot_id'].agg(list).to_dict()

# Create ortholog pairs (handles 1:many): every human ID is paired with every
# Arabidopsis ID of the same group. Groups are visited in order of first
# appearance, so the pair order is unchanged.
ortholog_pairs = [
    (h, a)
    for group_id in main_orthologs['group_id'].unique()
    for h in human_by_group.get(group_id, [])
    for a in ath_by_group.get(group_id, [])
]

# Save UniProt pairs
with open('uniprot_pairs.txt', 'w') as f:
    f.write('human_uniprot\tarabidopsis_uniprot\n')
    f.writelines(f'{h}\t{a}\n' for h, a in ortholog_pairs)

print(f"Created uniprot_pairs.txt with {len(ortholog_pairs)} pairs")


Created uniprot_pairs.txt with 3573 pairs


In [4]:
# Build homogene.txt; the BioMart exports and uniprot_pairs.txt must already be in the working directory
create_egio_ortholog_file()

Human mapping: 2000 UniProt -> Ensembl
Arabidopsis mapping: 15765 UniProt -> Ensembl
UniProt pairs: 3573 pairs

Successfully mapped: 1246 ortholog pairs
Missing human mappings: 1648
Missing Arabidopsis mappings: 897

Created homogene.txt with 1246 ortholog pairs

First 5 ortholog pairs:
ENSG00000000419	AT1G20575
ENSG00000001630	AT1G11680
ENSG00000004455	AT5G63400
ENSG00000004779	AT5G27200
ENSG00000004897	AT2G20000

Some missing human UniProt IDs: ['P24928', 'Q13085', 'O60306', 'Q5T4S7', 'Q15029']
Some missing Arabidopsis UniProt IDs: ['Q9FMF9', 'F4IPJ1', 'Q8L5Y4', 'F4HVV6', 'Q9FLH2']
