In [92]:
# load in the list of genes

# gene_forward maps a gene name (the first whitespace-separated token on a
# line) to the 1-based position of that line in the gene list file
gene_forward = {}
with open('../data/networks/yeast/yeast_string_genes.txt') as gene_file:
    # the context manager closes the file even if a line fails to parse
    for idx, line in enumerate(gene_file, start=1):
        gene = line.split()[0]
        gene_forward[gene] = idx

In [93]:
# reads the database of annotations

# each element of entries is the list of tab-separated fields of one
# annotation line, minus the final field
entries = []
with open('../data/yeast_annotation_db.gaf') as annotation_file:
    # the context manager closes the file even if reading is interrupted
    for line in annotation_file:
        # skip the meta data
        if line.startswith('!'):
            continue
        # split on tabs and drop the last field, which carries the trailing
        # newline; the filtering step unpacks exactly 16 fields per entry
        entries.append(line.split('\t')[:-1])

In [94]:
# filters the annotations

# evidence codes accepted as experimental support
experimental_codes = {'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP',
                      'HTP', 'HDP', 'HDA', 'HMP', 'HGI', 'HEP'}
# annotations carrying any of these qualifiers are discarded
excluded_qualifiers = {'NOT', 'colocalizes_with'}

relevant_entries = []
for entry in entries:
    # the unpacking doubles as a check that the entry has exactly 16 fields
    [db, db_obj_id, db_obj_sym, qual, go_id,
     db_ref, ev_code, with_from, aspect,
     db_obj_name, db_obj_syn, db_obj_type,
     taxon, date, ass_by, anno_ext] = entry

    # check form of evidence - experimental only
    if ev_code not in experimental_codes:
        continue

    # check if gene name is in the string gene set: take the first of the
    # object id / synonyms that is a key of gene_forward, the same mapping
    # the adjacency writer indexes, so every kept name has a gene index
    candidates = [db_obj_id] + db_obj_syn.split('|')
    name = next((n for n in candidates if n in gene_forward), None)
    if name is None:
        continue

    # check the qualifier; the field may hold several pipe-separated values
    # (e.g. 'NOT|contributes_to'), so test each one rather than the raw string
    if excluded_qualifiers.intersection(qual.split('|')):
        continue

    # aspect is one of P (for bp) F (for mf) or C (for cc)
    relevant_entries.append([name, go_id, aspect])

In [98]:
# keeps only the first occurrence of every (gene, term, aspect) triple

included = set()
unique_entries = []
for entry in relevant_entries:
    name, go_id, aspect = entry
    # string key identifying the annotation
    uq_id = f'{name}-{go_id}-{aspect}'
    if uq_id not in included:
        included.add(uq_id)
        unique_entries.append(entry)

In [99]:
# splits the unique annotations by ontology aspect, keeping [gene, term] pairs

bp_entries = [[gene, term] for gene, term, aspect in unique_entries if aspect == 'P']
mf_entries = [[gene, term] for gene, term, aspect in unique_entries if aspect == 'F']
cc_entries = [[gene, term] for gene, term, aspect in unique_entries if aspect == 'C']

In [100]:
# creates term maps to efficiently go back and forth between terms and indices

def create_maps(ont_entries):
    """Build term <-> index lookups for one ontology.

    Args:
        ont_entries: iterable of records whose element at position 1 is the
            term identifier.

    Returns:
        A two-element list ``[forward, reverse]``. ``forward`` maps each
        distinct term to a 1-based index assigned in order of first
        appearance. ``reverse`` is a list in which ``reverse[i]`` is the term
        with index ``i``; ``reverse[0]`` is a ``None`` placeholder.
    """
    term_to_idx = {}
    idx_to_term = [None]  # placeholder at 0 so indices start at 1 (matlab's 1-indexing)
    for record in ont_entries:
        go_term = record[1]
        if go_term not in term_to_idx:
            # the current list length is the next unused 1-based index
            term_to_idx[go_term] = len(idx_to_term)
            idx_to_term.append(go_term)
    return [term_to_idx, idx_to_term]

# one forward/reverse map pair per ontology
bp_forward, bp_reverse = create_maps(bp_entries)
mf_forward, mf_reverse = create_maps(mf_entries)
cc_forward, cc_reverse = create_maps(cc_entries)

In [101]:
# creates the term files

def create_term_file(ont, reverse, out_dir='../data/annotations/yeast'):
    """Write the terms of one ontology to a text file, one term per line.

    The file is ``<out_dir>/go_yeast_ref_<ont>_terms.txt``; line ``i``
    (1-based) holds ``reverse[i]``.

    Args:
        ont: ontology tag used in the file name (the notebook passes
            'bp', 'mf' or 'cc').
        reverse: index-to-term list whose element 0 is a placeholder and is
            not written.
        out_dir: directory to write into; defaults to the notebook's
            annotation directory.
    """
    # the context manager guarantees the file is flushed and closed even if
    # a write fails part-way through
    with open(f'{out_dir}/go_yeast_ref_{ont}_terms.txt', 'w') as term_file:
        term_file.writelines(f'{term}\n' for term in reverse[1:])

# write one term file per ontology
create_term_file(ont='bp', reverse=bp_reverse)
create_term_file(ont='mf', reverse=mf_reverse)
create_term_file(ont='cc', reverse=cc_reverse)

In [102]:
# creates the adjacency files

def create_adj_file(ont, ont_forward, gene_forward, entries,
                    out_dir='../data/annotations/yeast'):
    """Write the gene-to-term adjacency list of one ontology.

    The file is ``<out_dir>/go_yeast_ref_<ont>_adjacency.txt``; each line is
    ``<gene index>\\t<term index>`` for one entry, in the order given.

    Args:
        ont: ontology tag used in the file name (the notebook passes
            'bp', 'mf' or 'cc').
        ont_forward: mapping from term to its index.
        gene_forward: mapping from gene name to its index.
        entries: iterable of ``[gene, go_term]`` pairs.
        out_dir: directory to write into; defaults to the notebook's
            annotation directory.

    Raises:
        KeyError: if a gene or term of an entry is missing from its mapping.
    """
    # the context manager closes the file even when a lookup raises KeyError
    with open(f'{out_dir}/go_yeast_ref_{ont}_adjacency.txt', 'w') as adj_file:
        for gene, go_term in entries:
            gene_idx = gene_forward[gene]
            go_idx = ont_forward[go_term]
            adj_file.write(f'{gene_idx}\t{go_idx}\n')
    
# write one adjacency file per ontology
create_adj_file(ont='bp', ont_forward=bp_forward, gene_forward=gene_forward, entries=bp_entries)
create_adj_file(ont='mf', ont_forward=mf_forward, gene_forward=gene_forward, entries=mf_entries)
create_adj_file(ont='cc', ont_forward=cc_forward, gene_forward=gene_forward, entries=cc_entries)

In [106]:
# now we test completeness on bp

# uses maps each term index (kept as the string read from the second column
# of the adjacency file) to the number of lines that reference it
uses = {}
with open('../data/annotations/yeast/go_yeast_ref_bp_adjacency.txt') as bp_adj:
    # the context manager closes the file even if a line is malformed
    for line in bp_adj:
        pair = line.split()
        term = pair[1]
        uses[term] = uses.get(term, 0) + 1

In [108]:
# per-term usage counts, in the dictionary's iteration order
dist = [*uses.values()]

In [110]:
import matplotlib.pyplot as plt