In [1]:
import pickle
import re
from collections import defaultdict

import networkx as nx
import numpy as np
import pandas as pd

import feature_preprocessing.complexes_pathways
from utils import yeast_name_resolver
# Shared resolver instance; the cells below call its get_unified_name /
# get_unified_names methods on gene identifiers before comparing them.
res = yeast_name_resolver.NameResolver()

# get phosphorylations

In [2]:
def extract_relations(gpath, file_path):
    """Parse (source, target) relations from a text file, keeping in-network pairs.

    Each line of ``file_path`` is expected to start with a source identifier
    (everything up to the first whitespace character) and to contain a
    bracketed, ``', '``-separated list of target identifiers. Lines missing
    either part are skipped. Identifiers are mapped through the module-level
    ``res`` resolver; targets are lower-cased first, the source is passed as-is.

    Parameters
    ----------
    gpath : str
        Path to a pickled graph; only its node set is used, as a filter.
    file_path : str
        Path to the relations text file.

    Returns
    -------
    list of tuple
        ``(source, target)`` pairs whose two members are both nodes of the
        graph. The counts before and after filtering are printed.
    """
    # nx.read_gpickle was removed in NetworkX 3.0; fall back to plain pickle there.
    # NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
    if hasattr(nx, 'read_gpickle'):
        G = nx.read_gpickle(gpath)
    else:
        with open(gpath, 'rb') as graph_file:
            G = pickle.load(graph_file)
    nodes = sorted(G.nodes())
    # Only used for membership tests below; the index values are not read.
    node_ix = dict(zip(nodes, np.arange(len(nodes))))

    relations = []
    with open(file_path, 'r') as f:
        for line in f:
            source_match = re.match(r'^(.+?)\s', line)
            if not source_match:
                continue
            source = res.get_unified_name(source_match.group(1))

            targets_match = re.search(r'\[(.+?)\]', line)
            if not targets_match:
                continue
            targets = res.get_unified_names(
                [t.lower() for t in targets_match.group(1).split(', ')])
            relations.extend((source, target) for target in targets)

    print("# relations: %d" % len(relations))
    relations = [rel for rel in relations if rel[0] in node_ix and rel[1] in node_ix]
    print("# relations in network: %d" % len(relations))

    return relations

# Both relation files are filtered against the node set of the same pickled graph.
p_relations = extract_relations(
    gpath='../generated-data/ppc_yeast',
    file_path='../data-sources/yeast/kinase.txt',
)
dp_relations = extract_relations(
    gpath='../generated-data/ppc_yeast',
    file_path='../data-sources/yeast/phosphotase.txt',
)

# relations: 1347
# relations in network: 1326
# relations: 255
# relations in network: 255


# get transcriptions

In [3]:
# Load the pickled graph and index its nodes; `node_ix` is used below only for
# membership tests (is this name a node of the network?).
# nx.read_gpickle was removed in NetworkX 3.0; fall back to plain pickle there.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
if hasattr(nx, 'read_gpickle'):
    G = nx.read_gpickle('../generated-data/ppc_yeast')
else:
    with open('../generated-data/ppc_yeast', 'rb') as graph_file:
        G = pickle.load(graph_file)
nodes = sorted(G.nodes())
node_ix = dict(zip(nodes, np.arange(len(nodes))))

df = pd.read_excel('../data-sources/yeast/transcript_25c.xlsx')

# Every column from the third onward is treated as a factor; its header is
# replaced by the resolver's unified name so it can be compared with graph nodes.
factors = [res.get_unified_name(f) for f in df.columns[2:]]

all_columns = np.array(df.columns[:])
all_columns[2:] = factors
df.columns = all_columns

# Keep only the factors that are nodes of the graph.
factors = [f for f in factors if f in node_ix]

In [4]:
# Minimum cell value for a (factor, target) pair to be recorded as a relation.
TF_THRESHOLD = 1.0

# Walk the transcript matrix row by row: the 'Factor' column names the target,
# and every in-network factor column at or above the threshold yields a pair.
# NOTE(review): the original comment here read "0 = In, 1 = Out"; the encoding
# it refers to is not evident from this cell.
t_relations = set()
for _, row in df.iterrows():
    utarget = res.get_unified_name(row['Factor'])
    if utarget in node_ix:
        t_relations.update(
            (factor, utarget)
            for factor in factors
            if row[factor] >= TF_THRESHOLD
        )
len(t_relations)

106940

# get ppc

In [5]:
# Materialise the edges of the graph G loaded earlier as a list of node pairs;
# the trailing expression displays how many edges there are.
ppc_relations = list(G.edges())
len(ppc_relations)

59377

# get complexes and pathways

In [6]:
def invert(mapping):
    """Flip a key -> iterable-of-values mapping into value -> set-of-keys.

    Returns a plain ``dict``; every value seen under any key becomes a key
    whose set holds all original keys that listed it.
    """
    flipped = {}
    for source_key, members in mapping.items():
        for member in members:
            flipped.setdefault(member, set()).add(source_key)
    return flipped

def make_relations(mapping):
    """Return every ordered pair of distinct members that share a group.

    ``mapping`` maps a group to an iterable of members. Both ``(a, b)`` and
    ``(b, a)`` are produced; pairs of equal members are left out.
    """
    return {
        (first, second)
        for members in mapping.values()
        for first in members
        for second in members
        if first != second
    }

In [7]:
# Parse the complex membership mapping with the project parser, then flip it
# with invert() so each value of the parsed mapping becomes a key.
# NOTE(review): presumably gene -> iterable of complexes, as the names suggest;
# confirm against parse_yeast_complexes.
genes_to_complexes = feature_preprocessing.complexes_pathways.parse_yeast_complexes()
complexes_to_genes = invert(genes_to_complexes)

In [8]:
# Ordered pairs of distinct genes that share a complex (both directions are
# produced by make_relations); the trailing expression displays the pair count.
complex_relations = make_relations(complexes_to_genes)
len(complex_relations)

22510

In [9]:
# Same pipeline as for complexes, applied to the pathway mapping: parse, flip
# with invert(), then build ordered pairs of distinct members with make_relations().
# NOTE(review): presumably gene -> iterable of pathways; confirm against parse_kegg_pathways.
genes_to_pathways = feature_preprocessing.complexes_pathways.parse_kegg_pathways()
pathways_to_genes = invert(genes_to_pathways)
pathway_relations = make_relations(pathways_to_genes)
len(pathway_relations)

831146

# cleanup relations

In [10]:
def sort_rels(rels):
    """Canonicalise relations into a set of sorted tuples.

    Each relation's members are sorted so that ``(a, b)`` and ``(b, a)``
    collapse to the same tuple; duplicates are removed by the set.
    The collection itself is not sorted first: the result is an unordered
    set, so ordering the tuples beforehand would be wasted work.
    """
    return {tuple(sorted(rel)) for rel in rels}

# Canonicalise every relation collection with sort_rels so that pair order no
# longer matters. The two lists returned by extract_relations are concatenated
# into a single phosphorylation set.
phosph_rels = sort_rels(p_relations + dp_relations)
t_rels = sort_rels(t_relations)
ppc_rels = sort_rels(ppc_relations)
complex_rels = sort_rels(complex_relations)
pathway_rels = sort_rels(pathway_relations)

# load up dataset

In [30]:
# Rebinds `df` (until now the transcript spreadsheet) to the task dataset.
# NOTE: a later cell writes the augmented frame back to this same path.
df = pd.read_csv("../generated-data/task_yeast_gi_hybrid")

In [39]:
# One sorted (a, b) tuple per dataset row, matching the canonical form of the relation sets.
pair = pd.Series([tuple(sorted(ab)) for ab in zip(df['a'], df['b'])])

In [44]:
# Flag every dataset pair by membership in each relation set. Each membership
# mask is computed once and reused for both the positive (`rel_*`) and the
# complementary (`rel_not_*`) column, instead of running isin() twice per set.
rel_masks = {
    'phospho': pair.isin(phosph_rels),
    'trans': pair.isin(t_rels),
    'ppc': pair.isin(ppc_rels),
    'complex': pair.isin(complex_rels),
    'pathway': pair.isin(pathway_rels),
}

# Two passes keep the column order: all rel_* columns first, then all rel_not_*.
for rel_name, rel_mask in rel_masks.items():
    df['rel_' + rel_name] = rel_mask.astype(int)

for rel_name, rel_mask in rel_masks.items():
    df['rel_not_' + rel_name] = (~rel_mask).astype(int)

In [46]:
# Number of dataset pairs flagged as a phosphorylation relation.
df['rel_phospho'].sum()

806

In [47]:
# Writes the frame, now carrying the rel_* / rel_not_* columns, back over the
# same file it was read from; the row index is not written.
df.to_csv("../generated-data/task_yeast_gi_hybrid", index=False)

In [34]:
# Mask of rows whose `bin` value is anything other than 1, then per-column
# counts of flagged pairs among those rows.
ix = df['bin'] != 1
df.loc[ix, ['rel_phospho', 'rel_trans', 'rel_ppc', 'rel_complex', 'rel_pathway']].sum(axis=0)

rel_phospho      50
rel_trans       461
rel_ppc        1592
rel_complex     519
rel_pathway    1881
dtype: int64

In [35]:
# Per-column counts of flagged pairs among the rows excluded by `ix` (bin == 1).
df.loc[~ix, ['rel_phospho', 'rel_trans', 'rel_ppc', 'rel_complex', 'rel_pathway']].sum(axis=0)

rel_phospho       756
rel_trans       53524
rel_ppc         24782
rel_complex      2991
rel_pathway    149753
dtype: int64

In [21]:
# Number of rows excluded by `ix`.
(~ix).sum()

6897562

In [22]:
# Total number of set flags, summed over all five relation columns, among the rows excluded by `ix`.
df.loc[~ix, ['rel_phospho', 'rel_trans', 'rel_ppc', 'rel_complex', 'rel_pathway']].sum(axis=0).sum()

231806

In [25]:
# Pairwise correlation matrix (DataFrame.corr defaults) between the
# `is_interacting` column of the loaded dataset and the five relation flags.
df[['is_interacting', 'rel_phospho', 'rel_trans', 'rel_ppc', 'rel_complex', 'rel_pathway']].corr()

Unnamed: 0,is_interacting,rel_phospho,rel_trans,rel_ppc,rel_complex,rel_pathway
is_interacting,1.0,0.011485,0.008738,0.06395,0.058868,0.025192
rel_phospho,0.011485,1.0,0.000108,0.174492,0.028898,0.007531
rel_trans,0.008738,0.000108,1.0,0.007603,0.000847,-0.00928
rel_ppc,0.06395,0.174492,0.007603,1.0,0.134222,0.021674
rel_complex,0.058868,0.028898,0.000847,0.134222,1.0,0.104868
rel_pathway,0.025192,0.007531,-0.00928,0.021674,0.104868,1.0


In [26]:
# Combine the five relation flags of each row into a single 5-tuple, in the
# order (phospho, trans, ppc, complex, pathway).
df['rel_pattern'] = list(zip(
    df['rel_phospho'],
    df['rel_trans'],
    df['rel_ppc'],
    df['rel_complex'],
    df['rel_pathway'],
))



In [28]:
# Tally how many rows carry each distinct rel_pattern tuple; a missing key
# starts at 0 thanks to defaultdict(int).
pattern_counts = defaultdict(int)
for p in df['rel_pattern']:
    pattern_counts[p]+=1


defaultdict(int,
            {(False, False, False, False, False): 6689396,
             (False, False, False, False, True): 147402,
             (False, False, True, False, True): 1342,
             (False, True, False, False, False): 53155,
             (False, False, True, False, False): 22524,
             (False, False, False, True, False): 240,
             (False, False, True, True, True): 472,
             (False, False, True, True, False): 746,
             (True, False, True, True, False): 24,
             (False, True, True, False, False): 438,
             (True, False, True, False, False): 677,
             (True, False, True, False, True): 73,
             (False, False, False, True, True): 1964,
             (False, True, False, False, True): 335,
             (True, False, True, True, True): 25,
             (False, True, True, True, False): 29,
             (False, True, True, True, True): 6,
             (False, True, True, False, True): 11,
             (True, True, 

In [29]:
# Number of distinct flag combinations that actually occur in the dataset.
len(pattern_counts)

22