In [24]:
#!/usr/bin/env python
#This script is used to generate training and testing datasets.
import click as ck
import numpy as np
import pandas as pd
from collections import Counter
from utils import Ontology, FUNC_DICT
import logging

# Configure the root logger at INFO so the logging.info() progress messages
# emitted by main() are shown in the notebook output.
logging.basicConfig(level=logging.INFO)


In [21]:


def _read_protein_to_gene(id_mapping_file):
    """Read the tab-separated id mapping file into a ``{protein: gene}`` dict.

    The first line is skipped as a header. On every other line the third
    column becomes the key and the first column the value. Lines with fewer
    than three columns are ignored instead of raising ``IndexError``.
    """
    prot2gene = {}
    with open(id_mapping_file) as f:
        next(f, None)  # skip the header; tolerate an empty file
        for line in f:
            it = line.strip().split('\t')
            if len(it) < 3:
                continue
            prot2gene[it[2]] = it[0]
    return prot2gene


def _read_hp_annotations(hp_annots_file, hp):
    """Read gene -> HPO annotations and expand them through the ontology.

    The first line is skipped as a header; column 1 is the gene id and column
    4 the HPO term id. Every gene seen in the file gets an entry, even when
    none of its terms are known to ``hp``. The root term ``HP:0000001`` is
    removed from every annotation set.
    """
    hp_annots = {}
    with open(hp_annots_file) as f:
        next(f, None)  # skip the header; tolerate an empty file
        for line in f:
            it = line.strip().split('\t')
            if len(it) < 4:
                continue
            gene_id, hp_id = it[0], it[3]
            phenos = hp_annots.setdefault(gene_id, set())
            if hp.has_term(hp_id):
                # presumably the term itself plus all of its ancestors --
                # TODO confirm against utils.Ontology.get_anchestors
                phenos |= hp.get_anchestors(hp_id)
    for phenos in hp_annots.values():
        phenos.discard('HP:0000001')
    return hp_annots


def _read_deepgo_annotations(deepgo_annots_file, prot2gene):
    """Read DeepGO predictions into ``{gene: {go_id: score}}``.

    Each line is ``protein<TAB>GO_ID|score<TAB>...``. Proteins missing from
    ``prot2gene`` are skipped. When several lines map to the same gene the
    highest score per GO term is kept.
    """
    dg_annots = {}
    with open(deepgo_annots_file) as f:
        for line in f:
            it = line.strip().split('\t')
            if it[0] not in prot2gene:
                continue
            annots = dg_annots.setdefault(prot2gene[it[0]], {})
            for item in it[1:]:
                go_id, score = item.split('|')
                score = float(score)
                annots[go_id] = max(score, annots.get(go_id, 0))
    return dg_annots


def main(go_file, hp_file, hp_annots_file, deepgo_annots_file, id_mapping_file, data_file,
         string_mapping_file, out_data_file, out_terms_file, min_count,
         out_gos_file='data/new/gos.pkl', out_all_terms_file=None):
    """Build the gene-level dataset and the HPO term lists and pickle them.

    Args:
        go_file: Path passed to ``Ontology`` for the Gene Ontology.
        hp_file: Path passed to ``Ontology`` for the Human Phenotype Ontology.
        hp_annots_file: Tab-separated gene -> HPO annotation file with a header
            line (gene id in column 1, HPO id in column 4).
        deepgo_annots_file: Tab-separated DeepGO predictions, one protein per
            line followed by ``GO_ID|score`` items.
        id_mapping_file: Tab-separated file with a header line (gene id in
            column 1, protein id in column 3).
        data_file: Pickled DataFrame with ``proteins``, ``exp_annotations``,
            ``iea_annotations`` and ``sequences`` columns.
        string_mapping_file: Unused; kept so existing positional calls work.
        out_data_file: Where the assembled per-gene DataFrame is pickled.
        out_terms_file: Where the terms annotated to at least ``min_count``
            genes are pickled, with their labels.
        min_count: Minimum number of annotated genes for a term to be kept.
        out_gos_file: Where the list of GO ids seen in the DeepGO predictions
            is pickled. Defaults to the path that used to be hard-coded.
        out_all_terms_file: Where the unfiltered list of annotated HPO terms is
            pickled. Defaults to ``out_terms_file`` with ``_all`` inserted
            before the extension. It used to be written to ``out_terms_file``
            itself, overwriting the filtered term table.
    """
    import os

    # GO is loaded as before although nothing below reads it.
    Ontology(go_file, with_rels=True)
    print('GO loaded')
    hp = Ontology(hp_file, with_rels=True)
    print('HP loaded')
    print('Load Gene2prot mapping')
    # NOTE(review): read_pickle can execute code embedded in the file; only
    # load pickles from a trusted source.
    df = pd.read_pickle(data_file)
    prot2gene = _read_protein_to_gene(id_mapping_file)

    print('Loading HP annotations')
    hp_annots = _read_hp_annotations(hp_annots_file, hp)
    total_annots = sum(len(phenos) for phenos in hp_annots.values())
    # Guard the average against an empty annotation file.
    mean_annots = total_annots / len(hp_annots) if hp_annots else 0.0
    print('HP Annotations', len(hp_annots), total_annots, mean_annots)

    dg_annots = _read_deepgo_annotations(deepgo_annots_file, prot2gene)
    print('DeepGO Annotations', len(dg_annots))
    if not dg_annots:
        logging.warning(
            'No protein in %s could be mapped to a gene; check the column '
            'layout of %s', deepgo_annots_file, id_mapping_file)
    deepgo_annots = {
        g_id: [go_id + '|' + str(score) for go_id, score in annots.items()]
        for g_id, annots in dg_annots.items()
    }
    gos = {go_id for annots in dg_annots.values() for go_id in annots}
    print('Number of GOs', len(gos))
    # Sorted so the saved list does not depend on set iteration order.
    pd.DataFrame({'gos': sorted(gos)}).to_pickle(out_gos_file)

    go_annots = {}
    iea_annots = {}
    seqs = {}
    columns = zip(df['proteins'], df['exp_annotations'],
                  df['iea_annotations'], df['sequences'])
    for protein, exp_annotations, iea_annotations, sequence in columns:
        if protein not in prot2gene:
            continue
        g_id = prot2gene[protein]
        go_annots.setdefault(g_id, set()).update(exp_annotations)
        iea_annots.setdefault(g_id, set()).update(iea_annotations)
        # When several proteins map to one gene the last sequence wins.
        seqs[g_id] = sequence

    print('GO Annotations', len(go_annots))
    logging.info('Processing annotations')

    # Term frequencies are counted over every annotated gene, not only the
    # genes that end up in the output dataset.
    cnt = Counter()
    for phenos in hp_annots.values():
        cnt.update(phenos)

    deepgo_annotations = []
    go_annotations = []
    iea_annotations = []
    hpos = []
    genes = []
    sequences = []
    missing = 0
    for g_id, phenos in hp_annots.items():
        if g_id not in dg_annots:
            continue
        if g_id not in seqs:
            # The gene has predictions but no row in data_file; skip it
            # rather than fail with a bare KeyError.
            missing += 1
            continue
        genes.append(g_id)
        hpos.append(phenos)
        go_annotations.append(go_annots[g_id])
        iea_annotations.append(iea_annots[g_id])
        deepgo_annotations.append(deepgo_annots[g_id])
        sequences.append(seqs[g_id])
    if missing:
        logging.warning(
            '%d genes with HP and DeepGO annotations were skipped because '
            'they are missing from %s', missing, data_file)

    df = pd.DataFrame(
        {'genes': genes, 'hp_annotations': hpos,
         'go_annotations': go_annotations, 'iea_annotations': iea_annotations,
         'deepgo_annotations': deepgo_annotations,
         'sequences': sequences})
    df.to_pickle(out_data_file)
    print(f'Number of proteins {len(df)}')

    # Keep the terms annotated to at least min_count genes.
    all_terms = sorted(t_id for t_id in cnt if t_id != 'HP:0000001')
    terms_set = {t_id for t_id in all_terms if cnt[t_id] >= min_count}
    terms = []
    labels = []
    for t_id in hp.get_ordered_terms():
        if t_id in terms_set:
            terms.append(t_id)
            labels.append(hp.get_term(t_id)['name'])

    logging.info(f'Number of terms {len(terms)}')
    # Save the filtered terms and the full term list to separate files so the
    # second write no longer overwrites the first.
    pd.DataFrame({'terms': terms, 'labels': labels}).to_pickle(out_terms_file)
    if out_all_terms_file is None:
        root, ext = os.path.splitext(out_terms_file)
        out_all_terms_file = f'{root}_all{ext}'
    pd.DataFrame({'terms': all_terms}).to_pickle(out_all_terms_file)
                



In [22]:
# import tensorflow as tf; print(tf.__version__)
# ALL_SOURCES_ALL_FREQUENCIES_genes_to_phenotype

In [23]:
# Build the dataset and the HPO term list. Arguments are passed by keyword so
# each path is visibly tied to the parameter it feeds.
# NOTE(review): the recorded run below reports 0 DeepGO annotations and
# 0 proteins, which suggests the ids in id_mapping_file do not match the
# protein ids in the other inputs -- verify the mapping file's columns.
main(
    go_file='data/go.obo',
    hp_file='data/hp.obo',
    hp_annots_file='data/ALL_SOURCES_ALL_FREQUENCIES_genes_to_phenotype.txt',
    deepgo_annots_file='data/human.res',
    id_mapping_file='data/gene_annotations_all.tab',
    data_file='data/new/swissprot_version_2.pkl',
    string_mapping_file='data/string2uni.tab',
    out_data_file='data/test/human_2.pkl',
    out_terms_file='data/test/terms_2.pkl',
    min_count=10,
)

GO loaded
HP loaded
Load Gene2prot mapping
Loading HP annotations
HP Annotations 4073 529475 129.99631721090105
DeepGO Annotations 0
Number of GOs 0


INFO:root:Processing annotations
INFO:root:Number of terms 3783


GO Annotations 0
Number of proteins 0


GPU Available:  True


GPU Available:  True


In [19]:
# Load the pickled table from data/terms.pkl and display it as the cell output.
f = pd.read_pickle('data/terms.pkl')
f

Unnamed: 0,terms,labels
0,HP:0002898,Embryonal neoplasm
1,HP:0010614,Fibroma
2,HP:0030448,Soft tissue sarcoma
3,HP:0100242,Sarcoma
4,HP:0002861,Melanoma
...,...,...
1634,HP:0011420,Age of death
1635,HP:0040006,Mortality/Aging
1636,HP:0003674,Onset
1637,HP:0031797,Clinical course


In [18]:
# Load the term table that main() wrote to data/test/terms_2.pkl and display it.
f = pd.read_pickle('data/test/terms_2.pkl')
f

Unnamed: 0,terms,labels
0,HP:0003828,Variable expressivity
1,HP:0003829,Incomplete penetrance
2,HP:0003812,Phenotypic variability
3,HP:0012823,Clinical modifier
4,HP:0003593,Infantile onset
...,...,...
3778,HP:0001620,High pitched voice
3779,HP:0031434,Abnormal speech prosody
3780,HP:0001608,Abnormality of the voice
3781,HP:0000118,Phenotypic abnormality


In [10]:
# Load the pickled table from data/new/human.pkl and display it as the cell output.
f = pd.read_pickle('data/new/human.pkl')
f

Unnamed: 0,genes,hp_annotations,go_annotations,iea_annotations,deepgo_annotations,sequences
0,8192,"{HP:0000359, HP:0000407, HP:0000002, HP:001184...","{GO:0044265, GO:0004252, GO:0005575, GO:190156...","{GO:0044265, GO:0004252, GO:0005575, GO:190156...","[GO:0000502|0.052, GO:0001539|0.035, GO:000367...",MWPGILVGGARVASCRYPALGPRLAAHFPAQRPPQRTLQNGLALQR...
1,2,"{HP:0000001, HP:0000005, HP:0000006}","{GO:0048584, GO:0140029, GO:0032940, GO:000425...","{GO:0048584, GO:0140029, GO:0032940, GO:000425...","[GO:0000003|0.189, GO:0000165|0.051, GO:000018...",MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...
2,8195,"{HP:0000002, HP:0000811, HP:0000069, HP:000003...","{GO:0022414, GO:0005929, GO:0007275, GO:000754...","{GO:0048729, GO:0032102, GO:0033365, GO:000552...","[GO:0000003|0.595, GO:0000226|0.861, GO:000110...",MSRLEAKKPSLCKSEPLTTERVRTTLSVLKRIVTSCYGPSGRLKQL...
3,8200,"{HP:0001382, HP:0010161, HP:0008081, HP:000116...","{GO:0045597, GO:0048584, GO:0007275, GO:006039...","{GO:0045597, GO:0048584, GO:0007275, GO:000646...","[GO:0000003|0.143, GO:0000122|0.069, GO:000016...",MRLPKLLTFLLWYLAWLDLEFICTVLGAPDLGQRPQGTRPGLAKAE...
4,90121,"{HP:0000002, HP:0002488, HP:0001507, HP:000082...",{},"{GO:0006725, GO:0042274, GO:0022613, GO:004423...","[GO:0000462|0.5, GO:0006139|0.545, GO:0006364|...",MAGAAEDARALFRAGVCAALEAWPALQIAVENGFGGVHSQEKAKWL...
...,...,...,...,...,...,...
3939,147409,"{HP:0000005, HP:0001006, HP:0000478, HP:000070...","{GO:1903575, GO:0007275, GO:0005575, GO:000381...","{GO:0007275, GO:0022405, GO:0003824, GO:007136...","[GO:0000003|0.046, GO:0000165|0.01, GO:0000226...",MDWLFFRNICLLIILMVVMEVNSEFIVEVKEFDIENGTTKWQTVRR...
3940,8148,"{HP:0011793, HP:0025425, HP:0007373, HP:003181...","{GO:0031325, GO:0048583, GO:0046483, GO:003197...","{GO:0031325, GO:0048583, GO:0046483, GO:003197...","[GO:0000018|0.062, GO:0000375|0.321, GO:000037...",MSDSGSYGQSGGEQQSYSTYGNPGSQGYGQASQSYSGYGQTTDSSY...
3941,344018,"{HP:0000008, HP:0000001, HP:0000005, HP:000013...","{GO:0005623, GO:0005575, GO:0008134, GO:000551...","{GO:0022414, GO:0031325, GO:0007275, GO:000727...","[GO:0000122|0.059, GO:0000228|0.256, GO:000078...",MDPAPGVLDPRAAPPALLGTPQAEVLEDVLREQFGPLPQLAAVCRL...
3942,81887,"{HP:0000002, HP:0000750, HP:0000436, HP:000081...","{GO:0006725, GO:0005575, GO:0022613, GO:000565...","{GO:0006725, GO:0005575, GO:0022613, GO:003068...","[GO:0005575|0.973, GO:0005622|0.973, GO:000562...",MSWESGAGPGLGSQGMDLVWSAWYGKCVKGKGSLPLSAHGIVVAWL...
