# Data overview

In [5]:
# Import dependencies for the complete notebook

import os
import config
from config import INTERACTOMES_PATH
from queries import QUERY_GET_ALL_PROTEOFORMS
from lib.graph_database_access import get_pathways, get_query_result, make_proteoform_string
from lib.dictionaries import read_dictionary_one_to_set
from lib.networks import get_json_filename, create_pathway_interaction_network, read_graph, get_interactomes
from pathlib import Path

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

config.set_root_wd()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Initial working directory: C:\git\ProteoformNetworks\src\Python
New working directory: c:\git\ProteoformNetworks


In [6]:
# Calculate number of genes, proteins and proteoforms we can get from Reactome
proteoforms = get_query_result(QUERY_GET_ALL_PROTEOFORMS)
proteoforms['Id'] = proteoforms.apply(lambda x: make_proteoform_string(x.Id), axis=1)
print(f"There are {len(proteoforms)} proteoforms.")
proteoforms

Unnamed: 0,Id
0,A0A075B6P5;
1,A0A075B6S6;
2,A0A096LP49;
3,A0A0A6YYK7;
4,A0A0C4DH25;
...,...
14392,Q9Y6X9;
14393,Q9Y6Y8;
14394,Q9Y6Y9;
14395,"Q9Y6Y9;00160:26,00160:114"


In [2]:
# Create interactomes to make sure mapping files genes-->proteins and proteins-->proteoforms exist
interactomes = get_interactomes(config.DATA_REACTOME_PATH, INTERACTOMES_PATH)

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'networks/interactomes/'

In [None]:
map_proteins_to_proteoforms = read_dictionary_one_to_set(INTERACTOMES_PATH, "mapping_proteins_to_proteoforms.tsv",
                                                         col_indices=(0, 1))
pathways = get_pathways()["stId"]
print(f"There are {len(pathways)} pathways.")

In [None]:
print(f"There are {len(map_proteins_to_proteoforms.keys())} proteins.")
selected_proteins = []
for protein, proteoforms in map_proteins_to_proteoforms.items():
    if len(proteoforms) > 1:
        selected_proteins.append(protein)

print(f"Only {len(selected_proteins)} have multiple proteoforms.")

In [None]:
selected_pathways = []
for pathway in pathways:
    filename = get_json_filename(config.proteins, config.no_sm, config.PATHWAY_GRAPHS_PATH, pathway)
    if not Path(filename).exists():
        create_pathway_interaction_network(pathway, config.proteins, config.no_sm, config.PATHWAY_GRAPHS_PATH)
    G = read_graph(filename)
    if any(protein in selected_proteins for protein in list(G.nodes)):
        selected_pathways.append(pathway)

print(f"There are {len(selected_pathways)} pathways that contain proteoforms.")

In [None]:
# Examples of genes with multiple protein products

map_genes_to_proteins = read_dictionary_one_to_set(INTERACTOMES_PATH, "mapping_proteins_to_genes.tsv", col_indices=(1, 0))
map_proteins_to_proteoforms = read_dictionary_one_to_set(INTERACTOMES_PATH, "mapping_proteins_to_proteoforms.tsv", col_indices=(0, 1))

map_proteins_to_proteoforms

In [None]:

#  Examples when gene products participate in a different set of reactions.

In [None]:
- Calculate set of reactions where each gene product participates
- Select genes where it's protein products have a different set of reactions
- Quantify the difference overall:
    * How often the multiple protein products participate in the same reaction
    * Quantify the intersection: 