In [3]:
# Extract and save IDs to files for BioMart upload
import pandas as pd

# Read the tab-separated ortholog table. Column names are supplied here, so
# every line of the file is parsed as a data row.
table_columns = [
    'group_id',
    'cluster_id',
    'species',
    'score',
    'uniprot_id',
    'bootstrap',
]
df = pd.read_csv(
    '../Saccharomyces/SQLtable.9606.fa-559292.fa',
    sep='\t',
    names=table_columns,
)
df.head()

Unnamed: 0,group_id,cluster_id,species,score,uniprot_id,bootstrap
0,1,3010,559292.fa,1.0,P33334,1.0
1,1,3010,9606.fa,1.0,Q6P2Q9,1.0
2,2,2084,559292.fa,1.0,P07259,1.0
3,2,2084,9606.fa,1.0,P27708,1.0
4,3,1909,559292.fa,1.0,P36022,1.0


In [4]:

# Main orthologs are the rows whose score is exactly 1.0
main_orthologs = df.loc[df['score'] == 1.0]

# Human side: distinct UniProt IDs, one per line
human_mask = main_orthologs['species'] == '9606.fa'
human_ids = main_orthologs.loc[human_mask, 'uniprot_id'].unique()
with open('human_uniprot_ids.txt', 'w') as f:
    f.writelines(f'{uid}\n' for uid in human_ids)

# Yeast side: same layout
yeast_mask = main_orthologs['species'] == '559292.fa'
yeast_ids = main_orthologs.loc[yeast_mask, 'uniprot_id'].unique()
with open('yeast_uniprot_ids.txt', 'w') as f:
    f.writelines(f'{uid}\n' for uid in yeast_ids)

# Report how many distinct IDs went into each file
print(f"Created human_uniprot_ids.txt with {len(human_ids)} IDs")
print(f"Created yeast_uniprot_ids.txt with {len(yeast_ids)} IDs")

Created human_uniprot_ids.txt with 2013 IDs
Created yeast_uniprot_ids.txt with 2014 IDs


In [2]:
import pandas as pd

# Read the InParanoid table from the working directory. It is tab-separated
# and the column names are supplied here, so every line is parsed as data.
# NOTE(review): the 'cluster_id' values in the preview (3010, 2084, 1909, ...)
# decrease with group_id and look like a bit score rather than an identifier
# -- confirm against the InParanoid output format before relying on the name.
inparanoid_columns = [
    'group_id',
    'cluster_id',
    'species',
    'score',
    'uniprot_id',
    'bootstrap',
]
inparanoid_data = pd.read_csv(
    'SQLtable.9606.fa-559292.fa',
    sep='\t',
    names=inparanoid_columns,
)

inparanoid_data.head()

Unnamed: 0,group_id,cluster_id,species,score,uniprot_id,bootstrap
0,1,3010,559292.fa,1.0,P33334,1.0
1,1,3010,9606.fa,1.0,Q6P2Q9,1.0
2,2,2084,559292.fa,1.0,P07259,1.0
3,2,2084,9606.fa,1.0,P27708,1.0
4,3,1909,559292.fa,1.0,P36022,1.0


In [None]:
# Main orthologs are the rows with score exactly 1.0
is_main = inparanoid_data['score'] == 1.0
main_orthologs = inparanoid_data.loc[is_main]

# Split the main orthologs into one frame per species label
species_col = main_orthologs['species']
human_orthologs = main_orthologs.loc[species_col == '9606.fa']
yeast_orthologs = main_orthologs.loc[species_col == '559292.fa']


In [4]:
# Preview the first rows of the yeast (species '559292.fa') main orthologs
yeast_orthologs.head()

Unnamed: 0,group_id,cluster_id,species,score,uniprot_id,bootstrap
0,1,3010,559292.fa,1.0,P33334,1.0
2,2,2084,559292.fa,1.0,P07259,1.0
4,3,1909,559292.fa,1.0,P36022,1.0
6,4,1855,559292.fa,1.0,Q00955,1.0
10,5,1818,559292.fa,1.0,P32600,1.0


In [5]:
# Preview the first rows of the human (species '9606.fa') main orthologs
human_orthologs.head()


Unnamed: 0,group_id,cluster_id,species,score,uniprot_id,bootstrap
1,1,3010,9606.fa,1.0,Q6P2Q9,1.0
3,2,2084,9606.fa,1.0,P27708,1.0
5,3,1909,9606.fa,1.0,Q14204,1.0
8,4,1855,9606.fa,1.0,O00763,1.0
12,5,1818,9606.fa,1.0,P42345,1.0


In [6]:

# Find ortholog pairs
# Index each species' UniProt IDs by group once, instead of re-filtering both
# frames for every group. sort=False plus list aggregation keeps the IDs in
# their original row order within each group.
human_by_group = (
    human_orthologs.groupby('group_id', sort=False)['uniprot_id'].agg(list).to_dict()
)
yeast_by_group = (
    yeast_orthologs.groupby('group_id', sort=False)['uniprot_id'].agg(list).to_dict()
)

# Create pairs (handling 1:many relationships): every human ID in a group is
# paired with every yeast ID in the same group. Groups are visited in order of
# first appearance; a group missing from either species contributes no pairs.
ortholog_pairs = [
    (h_gene, y_gene)
    for group_id in main_orthologs['group_id'].unique()
    for h_gene in human_by_group.get(group_id, [])
    for y_gene in yeast_by_group.get(group_id, [])
]

# Save UniProt pairs for conversion (tab-separated, with a header row)
with open('uniprot_pairs.txt', 'w') as f:
    f.write('human_uniprot\tyeast_uniprot\n')
    f.writelines(f'{h}\t{y}\n' for h, y in ortholog_pairs)

In [7]:
!ls -l

total 1264
-rw-r--r-- 1 bbf3630 7073630  14095 Jun  4 12:56 human_uniprot_ids.txt
-rw-r--r-- 1 bbf3630 7073630 123185 Jun  4 13:07 mart_export_human.txt
-rw-r--r-- 1 bbf3630 7073630  30834 Jun  4 13:07 mart_export_saccharomyces.txt
-rw-r--r-- 1 bbf3630 7073630   4975 Jun  4 12:57 prepare_orthologs.ipynb
-rw-r--r-- 1 bbf3630 7073630 171239 Jun  4 12:52 SQLtable.9606.fa-559292.fa
-rw-r--r-- 1 bbf3630 7073630  29418 Jun  4 13:14 uniprot_pairs.txt
-rw-r--r-- 1 bbf3630 7073630  14098 Jun  4 12:56 yeast_uniprot_ids.txt


In [8]:
!head uniprot_pairs.txt

human_uniprot	yeast_uniprot
Q6P2Q9	P33334
P27708	P07259
Q14204	P36022
O00763	Q00955
P42345	P32600
Q9NU22	Q12019
Q00610	P22137
Q07864	P21951
Q8N3C0	P53327


In [9]:

def create_egio_ortholog_file(
    human_mapping_path='mart_export_human.txt',
    yeast_mapping_path='mart_export_saccharomyces.txt',
    uniprot_pairs_path='uniprot_pairs.txt',
    output_path='homogene.txt',
):
    """Translate UniProt ortholog pairs into gene-ID pairs and write them out.

    Reads two tab-separated BioMart exports (each must contain the columns
    'Gene stable ID' and 'UniProtKB/Swiss-Prot ID') and a tab-separated pair
    file with the columns 'human_uniprot' and 'yeast_uniprot'. Each pair whose
    two UniProt IDs both have a gene ID is written to ``output_path`` as a
    tab-separated line under the header ``hsa<TAB>sce``. Pairs are
    de-duplicated and sorted. A short summary is printed.

    Parameters
    ----------
    human_mapping_path : str
        BioMart export used for the human UniProt -> gene lookup.
    yeast_mapping_path : str
        BioMart export used for the yeast UniProt -> gene lookup.
    uniprot_pairs_path : str
        File holding the UniProt ortholog pairs.
    output_path : str
        Destination of the resulting pair file.

    Returns
    -------
    None
        Results are written to ``output_path`` and summarised on stdout.

    Notes
    -----
    Mapping rows with a blank UniProt ID or a blank gene ID are ignored. A
    blank gene ID would otherwise be read as NaN, pass a truthiness check and
    break the sort of the final pairs; a blank UniProt ID would only inflate
    the reported mapping size. When a UniProt ID maps to several genes, the
    first usable row wins.
    """
    uniprot_col = 'UniProtKB/Swiss-Prot ID'
    gene_col = 'Gene stable ID'

    def _load_mapping(path):
        """Build a {UniProt ID: gene stable ID} dict from one BioMart export."""
        mapping = pd.read_csv(path, sep='\t')
        # Clean column names (remove any extra spaces)
        mapping.columns = mapping.columns.str.strip()
        # Drop rows that cannot provide a usable UniProt -> gene entry
        mapping = mapping.dropna(subset=[uniprot_col, gene_col])
        # Handle potential duplicates by taking the first match
        return (
            mapping.drop_duplicates(uniprot_col)
            .set_index(uniprot_col)[gene_col]
            .to_dict()
        )

    human_uniprot_to_ensembl = _load_mapping(human_mapping_path)
    yeast_uniprot_to_ensembl = _load_mapping(yeast_mapping_path)

    # Load the UniProt ortholog pairs
    uniprot_pairs = pd.read_csv(uniprot_pairs_path, sep='\t')
    uniprot_pairs.columns = uniprot_pairs.columns.str.strip()

    print(f"Human mapping: {len(human_uniprot_to_ensembl)} UniProt -> Ensembl")
    print(f"Yeast mapping: {len(yeast_uniprot_to_ensembl)} UniProt -> Ensembl")
    print(f"UniProt pairs: {len(uniprot_pairs)} pairs")

    # Map UniProt pairs to gene-ID pairs. A set removes duplicate pairs; the
    # dicts act as insertion-ordered sets so each missing ID is listed once.
    mapped_pairs = set()
    missing_human = {}
    missing_yeast = {}

    for human_uniprot, yeast_uniprot in zip(
        uniprot_pairs['human_uniprot'], uniprot_pairs['yeast_uniprot']
    ):
        human_ensembl = human_uniprot_to_ensembl.get(human_uniprot)
        yeast_ensembl = yeast_uniprot_to_ensembl.get(yeast_uniprot)

        if human_ensembl is not None and yeast_ensembl is not None:
            mapped_pairs.add((human_ensembl, yeast_ensembl))
            continue

        if human_ensembl is None:
            missing_human.setdefault(human_uniprot)
        if yeast_ensembl is None:
            missing_yeast.setdefault(yeast_uniprot)

    # Sort once; the same order is used for the file and the preview below
    ensembl_pairs = sorted(mapped_pairs)

    print(f"\nSuccessfully mapped: {len(ensembl_pairs)} ortholog pairs")
    print(f"Missing human mappings: {len(missing_human)}")
    print(f"Missing yeast mappings: {len(missing_yeast)}")

    # Write the final EGIO input file
    with open(output_path, 'w') as f:
        f.write('hsa\tsce\n')  # Header
        f.writelines(
            f'{human_gene}\t{yeast_gene}\n' for human_gene, yeast_gene in ensembl_pairs
        )

    print(f"\nCreated {output_path} with {len(ensembl_pairs)} ortholog pairs")

    # Print some examples
    print("\nFirst 5 ortholog pairs:")
    for h, y in ensembl_pairs[:5]:
        print(f"{h}\t{y}")

    # Print missing IDs summary (first few distinct IDs, in order of appearance)
    if missing_human:
        print(f"\nSome missing human UniProt IDs: {list(missing_human)[:5]}")
    if missing_yeast:
        print(f"Some missing yeast UniProt IDs: {list(missing_yeast)[:5]}")


In [10]:
# Run the conversion: reads the two BioMart exports and uniprot_pairs.txt from
# the working directory, writes homogene.txt and prints a mapping summary
create_egio_ortholog_file()

Human mapping: 2000 UniProt -> Ensembl
Yeast mapping: 2006 UniProt -> Ensembl
UniProt pairs: 2099 pairs

Successfully mapped: 2059 ortholog pairs
Missing human mappings: 13
Missing yeast mappings: 8

Created homogene.txt with 2059 ortholog pairs

First 5 ortholog pairs:
ENSG00000000419	YPR183W
ENSG00000001084	YJL101C
ENSG00000001630	YHR007C
ENSG00000003987	YJR110W
ENSG00000003989	YDL210W

Some missing human UniProt IDs: ['P24928', 'Q8NI36', 'A8MPP1', 'Q96GQ7', 'Q9H270']
Some missing yeast UniProt IDs: ['P40825', 'P38088', 'Q01532', 'P0CX24', 'P0CX23']


In [11]:
!ls -l

total 1456
-rw-r--r-- 1 bbf3630 7073630  49472 Jun  4 13:22 homogene.txt
-rw-r--r-- 1 bbf3630 7073630  14095 Jun  4 12:56 human_uniprot_ids.txt
-rw-r--r-- 1 bbf3630 7073630 123185 Jun  4 13:07 mart_export_human.txt
-rw-r--r-- 1 bbf3630 7073630  30834 Jun  4 13:07 mart_export_saccharomyces.txt
-rw-r--r-- 1 bbf3630 7073630   4975 Jun  4 12:57 prepare_orthologs.ipynb
-rw-r--r-- 1 bbf3630 7073630 171239 Jun  4 12:52 SQLtable.9606.fa-559292.fa
-rw-r--r-- 1 bbf3630 7073630  29418 Jun  4 13:14 uniprot_pairs.txt
-rw-r--r-- 1 bbf3630 7073630  14098 Jun  4 12:56 yeast_uniprot_ids.txt


In [12]:
!head homogene.txt

hsa	sce
ENSG00000000419	YPR183W
ENSG00000001084	YJL101C
ENSG00000001630	YHR007C
ENSG00000003987	YJR110W
ENSG00000003989	YDL210W
ENSG00000004455	YDR226W
ENSG00000004779	YKL192C
ENSG00000004897	YBL084C
ENSG00000004939	YNL275W
