### Gene ontology

In [2]:
import pandas as pd
from pathlib import Path
import random

# Directory holding the raw input files (relative to the working directory).
path = Path("raw_data")

# Seed the stdlib RNG so the "randomly" selected GO term below is the same
# on every Restart & Run All.
random.seed(42)

In [3]:
# Root GO term identifier of each sub-ontology, keyed by its short code:
# BPO = biological process, CCO = cellular component, MFO = molecular function.
subontology_roots = dict(
    BPO='GO:0008150',
    CCO='GO:0005575',
    MFO='GO:0003674',
)

In [4]:
import networkx
import obonet

# Parse the GO OBO file into a networkx graph (one node per GO term).
graph = obonet.read_obo(path / 'Train/go-basic.obo')

# Size of the ontology, displayed as (number of nodes, number of edges).
graph.number_of_nodes(), graph.number_of_edges()

(43248, 84805)

In [5]:
# Lookup tables between GO term identifiers and their human-readable names.
# `id_to_name` has an entry for every node (None when a node carries no
# 'name' attribute); `name_to_id` only covers nodes that do have a name.
id_to_name = {
    term_id: attrs.get('name')
    for term_id, attrs in graph.nodes(data=True)
}
name_to_id = {
    attrs['name']: term_id
    for term_id, attrs in graph.nodes(data=True)
    if 'name' in attrs
}

In [6]:
# Pick one GO term at random (reproducible thanks to the seed set earlier)
# and display its identifier together with its name.
random_node = random.choice(list(graph.nodes()))
random_node, id_to_name[random_node]

('GO:1990820', 'response to mitotic DNA integrity checkpoint signaling')

In [7]:
# List the direct superterms of the selected term: each outgoing edge runs
# from the term to one of its parents, and the edge key is the relation type.
for term, superterm, relation in graph.out_edges(random_node, keys=True):
    print(f'- {id_to_name[term]} -> {relation} -> {id_to_name[superterm]}')

- response to mitotic DNA integrity checkpoint signaling -> is_a -> response to DNA integrity checkpoint signaling
- response to mitotic DNA integrity checkpoint signaling -> is_a -> response to mitotic cell cycle checkpoint signaling


In [9]:
# Find edges to children terms.
# A randomly selected term may be a leaf with no subterms at all, so this
# demonstration uses a fixed term that is looked up by name instead.
node = name_to_id['pilus']

# Edges point from a term to its superterm, so every edge arriving at `node`
# starts at one of its direct subterms: in_edges yields (subterm, node, key).
for subterm, term, relation in graph.in_edges(node, keys=True):
    print(f'- {id_to_name[term]} <- {relation} <- {id_to_name[subterm]}')

In [10]:
# Collect every superterm of the selected term. Edges run from a term to its
# parent, so the nodes reachable from it ("descendants" in networkx terms)
# are its superterms.
superterm_names = [
    id_to_name[term_id]
    for term_id in networkx.descendants(graph, random_node)
]
sorted(superterm_names)

['biological_process',
 'cellular process',
 'cellular response to biotic stimulus',
 'cellular response to endogenous stimulus',
 'cellular response to stimulus',
 'response to DNA integrity checkpoint signaling',
 'response to biotic stimulus',
 'response to cell cycle checkpoint signaling',
 'response to endogenous stimulus',
 'response to mitotic cell cycle checkpoint signaling',
 'response to stimulus']

In [11]:
# Collect every subterm of the selected term. Edges run from a term to its
# parent, so the nodes that can reach it ("ancestors" in networkx terms)
# are its subterms. An empty list means the term is a leaf.
subterm_names = [
    id_to_name[term_id]
    for term_id in networkx.ancestors(graph, random_node)
]
sorted(subterm_names)

[]

In [12]:
# Enumerate every simple path from the selected term up to the
# 'biological_process' term and print each one as a chain of term names.
root_id = name_to_id['biological_process']
paths = networkx.all_simple_paths(graph, source=random_node, target=root_id)
for pth in paths:
    term_names = [id_to_name[node] for node in pth]
    print('•', ' ⟶ '.join(term_names))

• response to mitotic DNA integrity checkpoint signaling ⟶ response to DNA integrity checkpoint signaling ⟶ response to cell cycle checkpoint signaling ⟶ cellular response to biotic stimulus ⟶ response to biotic stimulus ⟶ response to stimulus ⟶ biological_process
• response to mitotic DNA integrity checkpoint signaling ⟶ response to DNA integrity checkpoint signaling ⟶ response to cell cycle checkpoint signaling ⟶ cellular response to biotic stimulus ⟶ cellular response to stimulus ⟶ cellular process ⟶ biological_process
• response to mitotic DNA integrity checkpoint signaling ⟶ response to DNA integrity checkpoint signaling ⟶ response to cell cycle checkpoint signaling ⟶ cellular response to biotic stimulus ⟶ cellular response to stimulus ⟶ response to stimulus ⟶ biological_process
• response to mitotic DNA integrity checkpoint signaling ⟶ response to DNA integrity checkpoint signaling ⟶ response to cell cycle checkpoint signaling ⟶ cellular response to endogenous stimulus ⟶ response

### Train set

train_sequences.fasta contains the protein sequences for the training dataset.

In [None]:
from Bio import SeqIO

filename = "raw_data/Train/train_sequences.fasta"

# Only preview a handful of records: printing every sequence of the training
# set would flood the cell output and bloat the saved notebook.
max_preview_records = 5

# Open the file in a context manager so the handle is always closed, even
# though the loop stops before the parser is exhausted.
with open(filename) as handle:
    fasta_sequences = SeqIO.parse(handle, 'fasta')
    for index, fasta in enumerate(fasta_sequences):
        if index >= max_preview_records:
            break
        # Record identifier and the sequence as a plain string.
        name, sequence = fasta.id, str(fasta.seq)
        print(name, sequence)