In [51]:
import json
import pickle
import string
from collections import defaultdict

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import numpy.random as rng
import pandas as pd
import scipy.stats as stats
import seaborn as sns

import analysis.grouped_analysis as ga
import utils.eval_funcs as eval_funcs

In [5]:
# Notebook-wide matplotlib font settings.
plt.rcParams.update({
    "font.family": "Liberation Serif",
    "font.weight": "bold",
})

# Shared sizes and limits for figures. Nothing in the visible cells reads this
# dict; presumably it is consumed by plotting cells elsewhere -- TODO confirm.
plot_cfg = dict(
    tick_label_size=50,
    xlabel_size=60,
    ylabel_size=60,
    border_size=6,
    bar_border_size=2.5,
    bar_label_size=32,
    stars_label_size=48,
    annot_size=72,
    max_cm_classes=4,
    legend_size=42,
    max_bars=4,
)


In [6]:
# Load the two gene -> groups lookups from the project's grouped_analysis module.
# Later cells call .items() on these results and treat every value as a sized
# collection of group identifiers (len(v), list(v)[0], list.extend(v)).
# NOTE(review): the exact container type of the values is not visible here.
genes_to_complexes = ga.parse_yeast_complexes()
genes_to_pathways =  ga.parse_kegg_pathways()

In [7]:
# Combined lookup: gene -> {"complexes": ..., "pathways": ...}.
# A gene missing from one source keeps the empty-set default for that field.
genes_to_cp = defaultdict(lambda: {"complexes": set(), "pathways": set()})

# Complexes are filled in first, then pathways. Each assignment stores the
# parser's own collection object (no copy is made).
for field, source in (("complexes", genes_to_complexes), ("pathways", genes_to_pathways)):
    for gene_name, memberships in source.items():
        genes_to_cp[gene_name][field] = memberships

In [19]:
# Grouping used by the cells that follow: 'complexes' selects the complex
# lookup, any other value selects the pathway lookup.
group = 'complexes'

if group == 'complexes':
    genes_to_groups = genes_to_complexes
else:
    genes_to_groups = genes_to_pathways

# Flat list with one entry per (gene, group) association; a group appears once
# for every gene that belongs to it.
all_groups = [
    group_id
    for memberships in genes_to_groups.values()
    for group_id in memberships
]

In [20]:
# Genes that belong to exactly one group, mapped to that single group.
# Genes with several memberships are left out of this lookup.
genes_to_group = {}
for gene_name, memberships in genes_to_groups.items():
    if len(memberships) == 1:
        (only_group,) = memberships
        genes_to_group[gene_name] = only_group

# Summary counts (names say "pathway" but apply to whichever grouping is selected).
n_groups = len(set(all_groups))
n_assoc = len(genes_to_groups)
n_one_pathway = len(genes_to_group)
n_multiple_pathways = n_assoc - n_one_pathway

print("No. groups: ", n_groups)
print("No. Genes associated with a group: ", n_assoc)
print("No. Genes associated with only one group: ", n_one_pathway)
print("No. Genes removed due to involvement in multiple groups: ", n_multiple_pathways)

No. groups:  408
No. Genes associated with a group:  1627
No. Genes associated with only one group:  1416
No. Genes removed due to involvement in multiple groups:  211


In [65]:
# Build group -> member genes, restricted to genes that are nodes of the graph.
group_to_genes = defaultdict(set)

# nx.read_gpickle was removed in NetworkX 3.0; it was a thin wrapper around
# pickle.load, so loading the file directly works on old and new versions.
# SECURITY: unpickling can execute arbitrary code -- only load this file if it
# was generated by a trusted pipeline.
with open('../generated-data/ppc_yeast', 'rb') as graph_file:
    G = pickle.load(graph_file)

# Stable integer id per node: position in the sorted node list.
node_ix = {node: ix for ix, node in enumerate(sorted(G.nodes()))}

# The loop variable is deliberately not called `group`, so the notebook-level
# grouping selector is not overwritten by this cell.
for gene, gene_group in genes_to_group.items():
    if gene in node_ix:
        group_to_genes[gene_group].add(gene)

# Single-group genes that are present in the graph. Sorted so the list order
# (and therefore any seeded sampling from it) does not depend on hash ordering.
eligible_genes = sorted(g for g in genes_to_group if g in node_ix)

In [70]:
import random  # not used in this cell; kept in case a cell outside this view relies on it

# Number of distinct triplets to draw for each class.
N_SAMPLES = 5000

# Only groups with at least three member genes can produce a triplet.
groups = [g for g in group_to_genes if len(group_to_genes[g]) >= 3]

# Sort each member set once up front: set iteration order can differ between
# interpreter runs, which would defeat the fixed seed below.
members_by_group = {g: sorted(group_to_genes[g]) for g in groups}

# Guard against an endless loop: the sampler can never collect more distinct
# triplets than exist, i.e. the sum over groups of C(n, 3).
n_possible = sum(n * (n - 1) * (n - 2) // 6 for n in map(len, members_by_group.values()))
if n_possible < N_SAMPLES:
    raise ValueError(
        "Only %d distinct same-group triplets exist, cannot sample %d"
        % (n_possible, N_SAMPLES)
    )

# Dedicated seeded generator so re-running the cell reproduces the same sample.
same_sampler = rng.RandomState(0)

triplets_in_same_complex = set()
while len(triplets_in_same_complex) < N_SAMPLES:

    # pick a random complex (uniform over groups, not over genes)
    chosen_group = groups[same_sampler.randint(len(groups))]

    # sample three distinct genes; drawing indices keeps the original gene objects
    members = members_by_group[chosen_group]
    picked = same_sampler.choice(len(members), size=3, replace=False)
    triplet = tuple(sorted(members[i] for i in picked))

    # the set silently drops triplets that were already drawn
    triplets_in_same_complex.add(triplet)
    

In [71]:
# Sorted copy so the seeded draws below do not depend on the incoming list order.
candidates = sorted(eligible_genes)

# Count candidates per group to work out how many qualifying triplets exist.
candidate_group_sizes = defaultdict(int)
for candidate in candidates:
    candidate_group_sizes[genes_to_group[candidate]] += 1


def _n_triplets(n):
    """Return C(n, 3), the number of unordered triplets among n items."""
    return n * (n - 1) * (n - 2) // 6


# Qualifying triplets = all triplets minus those lying entirely in one group.
# Without this guard the loop below would never terminate when too few exist.
n_mixed = _n_triplets(len(candidates)) - sum(
    _n_triplets(n) for n in candidate_group_sizes.values()
)
if n_mixed < N_SAMPLES:
    raise ValueError(
        "Only %d distinct mixed-group triplets exist, cannot sample %d"
        % (n_mixed, N_SAMPLES)
    )

# Dedicated seeded generator so re-running the cell reproduces the same sample.
diff_sampler = rng.RandomState(1)

triplets_in_diff_complexes = set()
while len(triplets_in_diff_complexes) < N_SAMPLES:

    # sample three distinct genes; drawing indices keeps the original gene objects
    picked = diff_sampler.choice(len(candidates), size=3, replace=False)
    triplet = tuple(sorted(candidates[i] for i in picked))

    # Keep the triplet unless all three genes share one group. This matches the
    # original three-way `or` test: a triplet with two genes in the same group
    # and a third elsewhere is still accepted.
    # NOTE(review): confirm that "not all in the same group" is the intended
    # negative class, rather than "all three in different groups".
    if len({genes_to_group[gene] for gene in triplet}) > 1:
        triplets_in_diff_complexes.add(triplet)

In [73]:

def _triplet_rows(triplets, bin_label):
    """Turn a collection of gene triplets into row dicts for the output table.

    Parameters
    ----------
    triplets : iterable of (a, b, c) gene tuples; every gene must be a key of node_ix.
    bin_label : value stored in the "bin" column of every produced row.

    Returns
    -------
    list of dict with keys a, b, c, bin, a_id, b_id, c_id, ordered by triplet.
    """
    # Iterate in sorted order: the inputs are sets, whose iteration order can
    # change between interpreter runs and would make the CSV row order unstable.
    return [
        {
            "a": a,
            "b": b,
            "c": c,
            "bin": bin_label,
            "a_id": node_ix[a],
            "b_id": node_ix[b],
            "c_id": node_ix[c],
        }
        for a, b, c in sorted(triplets)
    ]


# bin 0 = triplets drawn within one group, bin 1 = triplets not all in the same group.
rows = _triplet_rows(triplets_in_same_complex, 0) + _triplet_rows(triplets_in_diff_complexes, 1)
df = pd.DataFrame(rows)
df.to_csv('../generated-data/task_ppc_yeast_pseudo_triplets', index=False)