In [1]:
import numpy as np
import pandas as pd
import os
from collections import defaultdict
import utils.yeast_name_resolver

# Module-level name resolver instance.
# NOTE(review): extract_names_taxid559292 below constructs its own NameResolver,
# so no visible cell reads this instance -- confirm whether it is still needed.
res = utils.yeast_name_resolver.NameResolver()

In [2]:
# Directory the BioGRID export files are read from.
BIOGRID_PATH = "../data-sources/biogrid/"

# Release tag shared by every export file name.
_BIOGRID_RELEASE = "3.4.156"

# (experimental system, bin) -- the bin is the condition label under which the
# pairs found in that file are recorded.
# presumably 0 = negative-type, 2 = positive, 3 = suppression/rescue
# interactions (inferred from the file names only) -- TODO confirm
_SYSTEM_TO_BIN = (
    ("Synthetic_Lethality", 0),
    ("Synthetic_Growth_Defect", 0),
    ("Negative_Genetic", 0),
    ("Dosage_Growth_Defect", 0),
    ("Dosage_Lethality", 0),
    ("Phenotypic_Enhancement", 0),
    ("Phenotypic_Suppression", 3),
    ("Dosage_Rescue", 3),
    ("Synthetic_Rescue", 3),
    ("Positive_Genetic", 2),
)

# (file stem, bin) pairs, in loading order.
PATHS = [
    ("BIOGRID-SYSTEM-%s-%s.mitab" % (system, _BIOGRID_RELEASE), bin_id)
    for system, bin_id in _SYSTEM_TO_BIN
]
def extract_names_taxid559292(df, resolver=None):
    """Return unified names for interactors A and B of every row in ``df``.

    For each value in the 'Alt IDs Interactor A' / 'Alt IDs Interactor B'
    columns, the last '|'-separated entry is taken, the
    'entrez gene/locuslink:' prefix is removed, the remainder is lower-cased
    and then passed to ``resolver.get_unified_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the two 'Alt IDs Interactor ...' columns holding strings.
    resolver : object, optional
        Anything exposing ``get_unified_name(name)``. When omitted, a new
        ``utils.yeast_name_resolver.NameResolver`` is created (the original
        behaviour); pass an existing instance to avoid rebuilding it on
        every call.

    Returns
    -------
    tuple of (list, list)
        Resolved names for interactor A and interactor B, in row order.
    """
    if resolver is None:
        resolver = utils.yeast_name_resolver.NameResolver()

    def extract_locus(alt_ids):
        # Only the last alternative id of the '|'-separated list is used.
        last_id = alt_ids.split('|')[-1]
        locus = last_id.replace('entrez gene/locuslink:', '').lower()
        return resolver.get_unified_name(locus)

    a_names = [extract_locus(e) for e in df['Alt IDs Interactor A']]
    b_names = [extract_locus(e) for e in df['Alt IDs Interactor B']]
    return a_names, b_names

# Taxonomy id used to filter interactions: a row is kept only when both
# interactors carry "taxid:<this value>".
# presumably the NCBI taxid of the yeast strain of interest -- TODO confirm
taxid = 559292

In [3]:
# Value expected in the 'Taxid Interactor A/B' columns for rows to be kept.
taxid_str = "taxid:%d" % taxid
name_extraction_func = extract_names_taxid559292
# Minimum number of entries a pair needs under its dominant condition.
pub_threshold = 3

# (sorted gene pair) -> condition -> list of 'Publication 1st Author' values,
# one list entry per matching row.
pairs_to_conditions = defaultdict(lambda: defaultdict(list))

for path, condition in PATHS:
    df = pd.read_csv(os.path.join(BIOGRID_PATH, path + '.txt'), sep='\t')

    # Keep only rows where both interactors belong to the requested taxon.
    ix = (df['Taxid Interactor A'] == taxid_str) & (df['Taxid Interactor B'] == taxid_str)
    df = df[ix]

    a, b = name_extraction_func(df)

    # Pull the column out once instead of doing a positional row lookup
    # (df.iloc[i][...]) for every row, which builds a full row Series each time.
    pubs = list(df['Publication 1st Author'])

    for name_a, name_b, pub in zip(a, b, pubs):
        # Sort so that (x, y) and (y, x) map to the same key.
        key = tuple(sorted((name_a, name_b)))
        pairs_to_conditions[key][condition].append(pub)

# NOTE: `df` stays bound to the last filtered table after the loop; the later
# 'Publication 1st Author' tally cell reads it.

# For every gene pair, find the single condition it is reported under most
# often and keep the pair only if that condition has enough entries.
conditions_to_pairs = defaultdict(set)
ignored_overlaps = 0
for pair, conditions in pairs_to_conditions.items():

    # A pair can be listed under several conditions. That is fine as long as
    # one condition occurs strictly more often than every other one.
    # E.g. twice in bin #2 and twice in bin #3 -> discarded;
    #      once in bin #2 and twice in bin #3  -> bin #3 wins.
    counts = {c: len(pubs) for c, pubs in conditions.items()}
    top_count = max(counts.values())
    top_conditions = [c for c, n in counts.items() if n == top_count]
    if len(top_conditions) > 1:
        ignored_overlaps += 1
        continue

    # Only accept the pair if the winning condition passes the threshold.
    # NOTE(review): the count is the number of list entries (rows), so the same
    # publication appearing several times is counted several times -- confirm
    # this is the intended meaning of `pub_threshold`.
    if top_count >= pub_threshold:
        conditions_to_pairs[top_conditions[0]].add(pair)

print("Ignored %d pairs with overlaps" % ignored_overlaps)

Ignored 4606 pairs with overlaps


In [21]:
# Per condition, count the accepted pairs that have at least one
# 'costanzo m (2016)' entry; `diff` accumulates the pairs that have none.
diff = 0
for c, pairs in conditions_to_pairs.items():

    pairs_with_costanzo_pub = 0
    for pair in pairs:
        n_costanzo = sum(
            1 for p in pairs_to_conditions[pair][c]
            if 'costanzo m (2016)' in p.lower()
        )
        if n_costanzo > 0:
            pairs_with_costanzo_pub += 1

    n_pairs = len(pairs)
    diff += n_pairs - pairs_with_costanzo_pub
    print("%s %d (those with costanzo pub: %d)" % (c, n_pairs, pairs_with_costanzo_pub))
print("Total diff: %d" % diff)

0 20746 (those with costanzo pub: 17556)
3 664 (those with costanzo pub: 0)
2 1098 (those with costanzo pub: 781)
Total diff: 4171


In [23]:
# Share of all accepted pairs that have at least one Costanzo (2016) entry.
# Computed from `diff` and `conditions_to_pairs` instead of numbers copied by
# hand from the printed summary, so it cannot go stale when the data changes.
1 - diff / sum(len(pairs) for pairs in conditions_to_pairs.values())

0.8146881108939044

In [25]:
# Fraction of accepted pairs in condition 2 that have a Costanzo (2016) entry.
# Both numbers are copied by hand from the summary printed above
# (781 of 1098); update them if that summary changes.
781/1098

0.7112932604735883

In [5]:
# Number of rows per 'Publication 1st Author', largest first.
# NOTE: `df` is whatever the loading loop left behind, i.e. the filtered table
# of the last entry in PATHS.
gdf = df.groupby('Publication 1st Author').size()
gdf.sort_values(ascending=False)

Publication 1st Author
Costanzo M (2016)         47191
Costanzo M (2010)          6472
Hoppins S (2011)           2943
Collins SR (2007)          2813
Wilmes GM (2008)           2654
Szappanos B (2011)         1990
Srivas R (2016)            1882
Schuldiner M (2005)        1430
Fiedler D (2009)           1409
Surma MA (2013)             810
Bandyopadhyay S (2010)      793
Stirling PC (2011)          733
Makrantoni V (2017)         428
Zheng J (2010)              395
Vembar SS (2010)            332
Breslow DK (2008)           321
Kyriakou D (2016)           312
Aguilar PS (2010)           274
Sharifpoor S (2012)         191
Addinall SG (2011)          187
Garay E (2014)              178
Leung GP (2014)             163
Steunou AL (2016)           141
Louie RJ (2012)             132
Libuda DE (2010)            106
Nguyen HD (2013)             88
Hannum G (2009)              81
Gallina I (2015)             62
Boettner DR (2011)           53
Michelot A (2010)            50
St Onge RP (2007)

# Check GI Overlap Between Two Datasets

In [6]:
# Load the Costanzo table and add an order-independent key for each (a, b) row.
costanzo_df = pd.read_csv("../generated-data/task_yeast_gi_costanzo")
costanzo_df['pair'] = [
    tuple(sorted(names))
    for names in zip(costanzo_df['a'], costanzo_df['b'])
]

In [7]:
# Load the hybrid table and add an order-independent key for each (a, b) row.
hybrid_df = pd.read_csv("../generated-data/task_yeast_gi_hybrid")
hybrid_df['pair'] = [
    tuple(sorted(names))
    for names in zip(hybrid_df['a'], hybrid_df['b'])
]

In [8]:
# Preview the loaded Costanzo table (the rendering is truncated to the first
# and last rows).
costanzo_df

Unnamed: 0,a,b,a_id,b_id,bin,gi,cs,std,a_cs,b_cs,p_value,a_essential,b_essential,temp,pair
0,ybr142w mak5,yal025c mak16,309,38,0,-0.1636,0.3203,0.0892,0.7052,0.6862,1.112000e-03,1,1,26,"(yal025c mak16, ybr142w mak5)"
1,ydl132w cdc53,yal025c mak16,681,38,0,-0.1697,0.1564,0.0327,0.4752,0.6862,2.885000e-06,1,1,26,"(yal025c mak16, ydl132w cdc53)"
2,ydr081c pdc2,yal025c mak16,849,38,0,-0.0979,0.1482,0.0328,0.3586,0.6862,4.467000e-03,1,1,26,"(yal025c mak16, ydr081c pdc2)"
3,ydr356w spc110,yal025c mak16,1088,38,0,-0.0917,0.4586,0.0062,0.8019,0.6862,1.041000e-27,1,1,26,"(yal025c mak16, ydr356w spc110)"
4,yer133w glc7,yal025c mak16,1430,38,0,-0.0900,0.4456,0.0526,0.7806,0.6862,2.885000e-02,1,1,26,"(yal025c mak16, yer133w glc7)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2068317,ypl256c cln2,ypr081c grs2,4889,4975,1,-0.0140,0.9046,0.0197,1.0275,0.8940,3.055000e-01,0,0,26,"(ypl256c cln2, ypr081c grs2)"
2068318,ypl262w fum1,ypr081c grs2,4895,4975,1,-0.0098,0.7215,0.0076,0.8180,0.8940,2.073000e-01,0,0,26,"(ypl262w fum1, ypr081c grs2)"
2068319,ypl265w dip5,ypr081c grs2,4897,4975,1,-0.0150,0.8995,0.0377,1.0230,0.8940,3.515000e-01,0,0,26,"(ypl265w dip5, ypr081c grs2)"
2068320,ypl269w kar9,ypr081c grs2,4901,4975,1,-0.0009,0.9204,0.0242,1.0305,0.8940,4.881000e-01,0,0,26,"(ypl269w kar9, ypr081c grs2)"


In [15]:
# Alternative selection that thresholds the `gi` column directly
# (disabled, kept for reference):
# neg_gi_ix = costanzo_df['gi'] < -0.08
# pos_gi_ix = costanzo_df['gi'] > 0.08
# all_ix = neg_gi_ix | pos_gi_ix
# Active selection: every row whose `bin` is not 1.
all_ix = costanzo_df['bin'] != 1

In [16]:
# Boolean row masks: rows whose `bin` differs from 1, one mask per table.
hybrid_gi_ix = hybrid_df['bin'].ne(1)
costanzo_gi_ix = all_ix

In [17]:
# Distinct pair keys among the selected rows of each table.
costanzo_pairs = set(costanzo_df.loc[costanzo_gi_ix, 'pair'])
hybrid_pairs = set(hybrid_df.loc[hybrid_gi_ix, 'pair'])

In [18]:
# Number of distinct selected pairs in the Costanzo table.
len(costanzo_pairs)

132298

In [19]:
# Number of distinct selected pairs in the hybrid table.
len(hybrid_pairs)

21308

In [20]:
# Selected hybrid pairs that do not appear among the selected Costanzo pairs.
len(hybrid_pairs - costanzo_pairs)

18685