In [32]:
import numpy as np
import pandas as pd
import os
from collections import defaultdict
import utils.yeast_name_resolver

# Notebook-scope instance of the project's NameResolver (utils.yeast_name_resolver).
# NOTE(review): no cell visible in this notebook reads this global — confirm it is still needed.
res = utils.yeast_name_resolver.NameResolver()

In [33]:
# Directory that holds the BioGRID MITAB exports listed in PATHS.
BIOGRID_PATH = "../data-sources/biogrid/"
# (file stem, condition label) pairs. The loading loop appends '.txt' to each
# stem before reading it. Several experimental systems share a label, so their
# records are pooled under that label when publications are collected per pair.
# NOTE(review): label 1 never appears here; the hybrid dataset used later treats
# `bin == 1` as the class to exclude — presumably these labels are the same bins,
# confirm against the dataset definition.
PATHS = [
    ("BIOGRID-SYSTEM-Synthetic_Lethality-3.4.156.mitab", 0),
    ("BIOGRID-SYSTEM-Synthetic_Growth_Defect-3.4.156.mitab", 0),
    ("BIOGRID-SYSTEM-Negative_Genetic-3.4.156.mitab", 0),
    ("BIOGRID-SYSTEM-Dosage_Growth_Defect-3.4.156.mitab", 0),
    ("BIOGRID-SYSTEM-Dosage_Lethality-3.4.156.mitab", 0),
    ("BIOGRID-SYSTEM-Phenotypic_Enhancement-3.4.156.mitab", 0),
    ("BIOGRID-SYSTEM-Phenotypic_Suppression-3.4.156.mitab", 3),
    ("BIOGRID-SYSTEM-Dosage_Rescue-3.4.156.mitab", 3),
    ("BIOGRID-SYSTEM-Synthetic_Rescue-3.4.156.mitab", 3),
    ("BIOGRID-SYSTEM-Positive_Genetic-3.4.156.mitab", 2)
]
def extract_names_taxid559292(df, resolver=None):
    """Resolve a unified name for both interactors of every row.

    For each entry of the 'Alt IDs Interactor A' and 'Alt IDs Interactor B'
    columns, the last '|'-separated alternative ID is taken, the
    'entrez gene/locuslink:' prefix is stripped, the remainder is lower-cased
    and handed to ``resolver.get_unified_name``.

    Parameters
    ----------
    df : mapping from column name to an iterable of str
        Typically a BioGRID MITAB DataFrame; only the two 'Alt IDs' columns
        are read.
    resolver : object exposing ``get_unified_name(str)``, optional
        Resolver to reuse across calls. When omitted, a fresh
        ``utils.yeast_name_resolver.NameResolver`` is built for this call,
        which is the original behaviour.

    Returns
    -------
    tuple of (list, list)
        Resolved names for interactor A and interactor B, in row order.
    """
    # Building a resolver per call is kept as the default, but callers that
    # process many files can now pass one in and avoid repeated construction.
    if resolver is None:
        resolver = utils.yeast_name_resolver.NameResolver()

    a_names = list(df['Alt IDs Interactor A'])
    b_names = list(df['Alt IDs Interactor B'])

    def extract_locus(e):
        # Only the last alternative ID of the '|'-separated list is used.
        last_alt_id = e.split('|')[-1]
        locus = last_alt_id.replace('entrez gene/locuslink:', '').lower()
        return resolver.get_unified_name(locus)

    return [extract_locus(e) for e in a_names], [extract_locus(e) for e in b_names]

# Taxonomy ID used to filter interactions: a row is kept only when both
# interactors carry "taxid:<this value>".
# NOTE(review): presumably the S. cerevisiae S288C taxon, given the yeast name
# resolver used above — confirm.
taxid = 559292

In [35]:
# Collect, for every unordered gene pair, the publications that report it
# under each condition label.
taxid_str = "taxid:%d" % taxid
name_extraction_func = extract_names_taxid559292
# Minimum number of records the dominant condition needs for a pair to be kept.
pub_threshold = 3

# pair -> condition label -> list of 'Publication 1st Author' strings
# (one entry per record, so the same publication can appear more than once).
pairs_to_conditions = defaultdict(lambda: defaultdict(list))

for path, condition in PATHS:
    df = pd.read_csv(os.path.join(BIOGRID_PATH, path + '.txt'), sep='\t')

    # Keep only rows where both interactors belong to the target taxon.
    ix = (df['Taxid Interactor A'] == taxid_str) & (df['Taxid Interactor B'] == taxid_str)
    df = df[ix]

    a, b = name_extraction_func(df)

    # Walk the publication column once, in row order, alongside the resolved
    # names. The previous df.iloc[i][...] lookup built a full row Series for
    # every record, which is needlessly slow on large files.
    for name_a, name_b, pub in zip(a, b, df['Publication 1st Author']):
        # Sort so that (x, y) and (y, x) map to the same key.
        key = tuple(sorted((name_a, name_b)))
        pairs_to_conditions[key][condition].append(pub)

# Assign each pair to its single most frequently reported condition.
rows = []
conditions_to_pairs = defaultdict(set)
ignored_overlaps = 0
for pair, conditions in pairs_to_conditions.items():

    # A pair may be listed under several conditions. That is acceptable as
    # long as one condition strictly out-numbers all others: two records in
    # bin #2 and two in bin #3 means the pair is dropped, whereas one in #2
    # and two in #3 means it is assigned to #3.
    ranked = sorted(conditions.items(), key=lambda item: len(item[1]), reverse=True)
    top_is_tied = len(ranked) > 1 and len(ranked[0][1]) == len(ranked[1][1])
    if top_is_tied:
        ignored_overlaps += 1
        continue

    winner, winner_pubs = ranked[0]

    # The winning condition must also reach the record-count threshold.
    if len(winner_pubs) >= pub_threshold:
        conditions_to_pairs[winner].add(pair)

print("Ignored %d pairs with overlaps" % ignored_overlaps)

Ignored 4606 pairs with overlaps


In [100]:
# For every condition, report how many accepted pairs have at least one
# supporting record whose author string contains 'costanzo', and accumulate
# the number of pairs that have none.
diff = 0
for c, pairs in conditions_to_pairs.items():

    # The inner list is materialised on purpose so every author string is
    # inspected, exactly as before.
    pairs_with_costanzo_pub = sum(
        any(['costanzo' in p.lower() for p in pairs_to_conditions[pair][c]])
        for pair in pairs
    )

    diff += len(pairs) - pairs_with_costanzo_pub
    print("%s %d (those with costanzo pub: %d)" % (c, len(pairs), pairs_with_costanzo_pub))
print("Total diff: %d" % diff)

0 20746 (those with costanzo pub: 18328)
3 664 (those with costanzo pub: 4)
2 1098 (those with costanzo pub: 850)
Total diff: 3326


In [62]:
# Number of records per publication, largest first.
# NOTE(review): `df` here is whatever the loading loop left behind, i.e. only
# the last file it read — confirm that this is the intended input.
author_groups = df.groupby('Publication 1st Author')
gdf = author_groups.size()
gdf.sort_values(ascending=False)

Publication 1st Author
Costanzo M (2016)         47191
Costanzo M (2010)          6472
Hoppins S (2011)           2943
Collins SR (2007)          2813
Wilmes GM (2008)           2654
Szappanos B (2011)         1990
Srivas R (2016)            1882
Schuldiner M (2005)        1430
Fiedler D (2009)           1409
Surma MA (2013)             810
Bandyopadhyay S (2010)      793
Stirling PC (2011)          733
Makrantoni V (2017)         428
Zheng J (2010)              395
Vembar SS (2010)            332
Breslow DK (2008)           321
Kyriakou D (2016)           312
Aguilar PS (2010)           274
Sharifpoor S (2012)         191
Addinall SG (2011)          187
Garay E (2014)              178
Leung GP (2014)             163
Steunou AL (2016)           141
Louie RJ (2012)             132
Libuda DE (2010)            106
Nguyen HD (2013)             88
Hannum G (2009)              81
Gallina I (2015)             62
Boettner DR (2011)           53
Michelot A (2010)            50
St Onge RP (2007)

# Check GI Overlap Between Two Datasets

In [78]:
# Load the Costanzo table and add an order-independent key for each (a, b) row.
costanzo_df = pd.read_csv("../generated-data/costanzo_gi")
costanzo_df['pair'] = [
    tuple(sorted(gene_pair))
    for gene_pair in zip(costanzo_df['a'], costanzo_df['b'])
]

In [66]:
# Load the hybrid table and add an order-independent key for each (a, b) row.
hybrid_df = pd.read_csv("../generated-data/task_yeast_gi_hybrid")
hybrid_df['pair'] = [
    tuple(sorted(gene_pair))
    for gene_pair in zip(hybrid_df['a'], hybrid_df['b'])
]

In [79]:
# Display the Costanzo frame, including the derived 'pair' column.
costanzo_df

Unnamed: 0,a,b,cs,std,gi,a_smf,b_smf,p_value,a_essential,b_essential,temp,atype,qtype,pair
0,ybl023c mcm2,yal001c tfc3,0.7319,0.0102,-0.0348,0.9254,0.8285,0.005042,1,1,30,tsa,tsq,"(yal001c tfc3, ybl023c mcm2)"
1,ybl026w lsm2,yal001c tfc3,0.4266,0.0790,-0.3529,0.9408,0.8285,0.000004,1,1,30,tsa,tsq,"(yal001c tfc3, ybl026w lsm2)"
2,ybl034c stu1,yal001c tfc3,0.7520,0.1338,0.0126,0.8925,0.8285,0.462500,1,1,30,tsa,tsq,"(yal001c tfc3, ybl034c stu1)"
3,ybl034c stu1,yal001c tfc3,0.6661,0.0831,0.0043,0.7988,0.8285,0.499800,1,1,30,tsa,tsq,"(yal001c tfc3, ybl034c stu1)"
4,ybl034c stu1,yal001c tfc3,0.4764,0.1395,-0.1601,0.7683,0.8285,0.051400,1,1,30,tsa,tsq,"(yal001c tfc3, ybl034c stu1)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18042396,ypl264c ypl264c,ypr201w arr3,1.0217,0.0405,0.0453,0.9768,0.9995,0.178600,0,0,30,dma,sn,"(ypl264c ypl264c, ypr201w arr3)"
18042397,ypl265w dip5,ypr201w arr3,1.0205,0.0143,-0.0020,1.0230,0.9995,0.463700,0,0,30,dma,sn,"(ypl265w dip5, ypr201w arr3)"
18042398,ypl267w acm1,ypr201w arr3,0.9446,0.0331,-0.0539,0.9990,0.9995,0.107200,0,0,30,dma,sn,"(ypl267w acm1, ypr201w arr3)"
18042399,ypl272c ypl272c,ypr201w arr3,1.0013,0.0385,-0.0137,1.0155,0.9995,0.385300,0,0,30,dma,sn,"(ypl272c ypl272c, ypr201w arr3)"


In [90]:
# Boolean masks over the Costanzo rows: gi below -0.08, gi above 0.08,
# and either of the two.
gi_scores = costanzo_df['gi']
neg_gi_ix = gi_scores.lt(-0.08)
pos_gi_ix = gi_scores.gt(0.08)
all_ix = neg_gi_ix | pos_gi_ix

In [91]:
# Row selections to compare: Costanzo rows passing either gi cut, and
# hybrid rows whose bin is anything other than 1.
costanzo_gi_ix = all_ix
hybrid_gi_ix = hybrid_df['bin'].ne(1)

In [96]:
# Distinct pair keys among the selected rows of each table.
costanzo_pairs = set(costanzo_df.loc[costanzo_gi_ix, 'pair'])
hybrid_pairs = set(hybrid_df.loc[hybrid_gi_ix, 'pair'])

In [97]:
# Number of distinct Costanzo pairs with gi < -0.08 or gi > 0.08.
len(costanzo_pairs)

1164605

In [98]:
# Number of distinct hybrid pairs whose bin is not 1.
len(hybrid_pairs)

21308

In [99]:
# Number of selected hybrid pairs that are absent from the selected Costanzo pairs.
len(hybrid_pairs - costanzo_pairs)

3975