In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import itertools
from scipy.stats import hypergeom

import warnings
warnings.filterwarnings("ignore")

In [2]:
def dframe_stack_list(ins,col,typ='float32'):
    """Expand a column of list-likes into one row per list element.

    Parameters
    ----------
    ins : pandas.DataFrame
        Input frame. It is copied, never modified.
    col : str
        Name of the column whose cells are iterables (e.g. lists).
    typ : str or dtype, default 'float32'
        dtype the expanded elements are cast to.

    Returns
    -------
    pandas.DataFrame
        Copy of ``ins`` in which every row is repeated once per element of
        its ``col`` cell. ``col`` is moved to the last column position and
        holds the individual elements. Rows whose cell is empty are kept
        once with NaN in ``col``; missing elements inside a cell are dropped.
        The original index labels are repeated alongside their rows.
    """
    df = ins.copy(deep=True)
    # Work on a positional index so that duplicate labels in ``ins`` (e.g. a
    # frame produced by pd.concat) cannot cross-multiply rows in the join
    # below; the original labels are restored at the end.
    labels = df.index
    df = df.reset_index(drop=True)
    # One element per row; dropna removes the NaN placeholder that explode
    # emits for empty cells as well as missing elements inside a cell.
    s = df[col].map(lambda x: list(x)).explode().dropna().astype(typ)
    s.name = col
    del df[col]
    df = df.join(s)
    df.index = labels.take(df.index.to_numpy())
    return df

def fix_pvals(x, floor=1e-16):
    """Clamp a p-value from below so it never falls under ``floor``.

    Parameters
    ----------
    x : float
        A single (scalar) p-value.
    floor : float, default 1e-16
        Smallest value that is returned. Inputs below it are replaced by it.

    Returns
    -------
    float
        ``floor`` if ``x < floor``, otherwise ``x`` unchanged. NaN compares
        False against everything and is therefore returned as is.
    """
    if x < floor:
        return floor
    return x

In [3]:
# read in all the data
# append from different sheets to one df

# Labels attached to the workbook sheets. Sheets are read by position, so
# the order here must match the sheet order in the Excel file.
sheet_names = ['acetate',
          'furfural',
          'HMF',
          'formic acid',
          'phenolics',
          'hydrolysates']

frames = []
for i, sheet_label in enumerate(sheet_names):
    frame = pd.read_excel('../data/210310_List target genes_EC.xlsx',sheet_name=i)
    frame['sheet'] = sheet_label
    if i==0:
        # Only the first sheet has the row labelled 0 removed.
        # NOTE(review): presumably an extra sub-header row - confirm against the workbook.
        frame = frame.drop(0)
    frames.append(frame)
# NOTE: the per-sheet row labels are kept, so index labels repeat across sheets.
df = pd.concat(frames)
# fix col names
df.columns = pd.Series(df.columns).str.lower().str.replace(' ','_')
print(df.shape)

print(df.gene.unique().shape)

# # combine groups
# df['sheet'] = df.sheet.map({'acetate':'ac_for',
#                   'formic acid':'ac_for',
#                   'HMF':'fur_HMF',
#                   'furfural':'fur_HMF',
#                   'hydrolysates':'hydrolysates',
#                   'phenolics':'phenolics'
#                  })
# Number of distinct strains and conditions per sheet. The column selection
# must be a list: tuple-style selection on a groupby raises in pandas >= 2.0.
df.groupby('sheet')[['strain','conditions']].nunique()

(7020, 13)
(4411,)


Unnamed: 0_level_0,strain,conditions
sheet,Unnamed: 1_level_1,Unnamed: 2_level_1
HMF,12,16
acetate,13,30
formic acid,7,9
furfural,1,2
hydrolysates,17,19
phenolics,6,16


In [4]:
# Pairwise over-representation test between inhibitor gene sets.
# For every unordered pair of sheets and each growth-effect direction, the
# overlap of the two gene sets is scored with the hypergeometric survival
# function, i.e. P(overlap >= x).
# NOTE(review): the p-values written out are not corrected for multiple testing.

# Population size: every distinct gene in the combined table.
M = len(df.gene.unique())

hyper_columns = ['inhibitor1','inhibitor2',
                 'growth_effect',
                 'population_size',
                 'inh1_subset',
                 'inh2_subset',
                 'overlap',
                 'pvalue']

inhibitor_pairs = itertools.combinations(df.sheet.unique(),2)
hyper_records = []
# product() keeps the pair as the outer and the effect as the inner loop level.
for (inh1,inh2),eff in itertools.product(inhibitor_pairs,['+','-']):
    sel1 = df.query('sheet==@inh1').query('growth_effect==@eff')
    sel2 = df.query('sheet==@inh2').query('growth_effect==@eff')
    n = len(sel1.gene.unique())
    N = len(sel2.gene.unique())
    # distinct genes of the first set that also occur in the second set
    shared = sel1[sel1.gene.isin(sel2.gene)]
    x = len(shared.gene.unique())
    # sf(x-1) is P(X > x-1) = P(X >= x)
    pval = hypergeom.sf(x-1, M, n, N)
    hyper_records.append({'inhibitor1':inh1,
                          'inhibitor2':inh2,
                          'growth_effect':eff,
                          'population_size':M,
                          'inh1_subset':n,
                          'inh2_subset':N,
                          'overlap':x,
                          'pvalue':fix_pvals(pval)})

test_hyper = pd.DataFrame(hyper_records,columns=hyper_columns)

# sanity: an overlap can never exceed either of the two sets it comes from
for subset_col in ('inh1_subset','inh2_subset'):
    print(all(test_hyper['overlap']<=test_hyper[subset_col]))

# number of tests below the nominal 0.05 threshold
print((test_hyper['pvalue']<0.05).sum())
test_hyper.to_csv('hypergeom_test.tsv',sep='\t',index=False)
test_hyper

True
True
13


Unnamed: 0,inhibitor1,inhibitor2,growth_effect,population_size,inh1_subset,inh2_subset,overlap,pvalue
0,acetate,furfural,+,4411,2220,41,18,0.8373509
1,acetate,furfural,-,4411,427,174,19,0.3229921
2,acetate,HMF,+,4411,2220,24,7,0.989757
3,acetate,HMF,-,4411,427,20,0,1.0
4,acetate,formic acid,+,4411,2220,10,2,0.9898896
5,acetate,formic acid,-,4411,427,19,0,1.0
6,acetate,phenolics,+,4411,2220,20,11,0.4233456
7,acetate,phenolics,-,4411,427,152,17,0.2999286
8,acetate,hydrolysates,+,4411,2220,423,152,1.0
9,acetate,hydrolysates,-,4411,427,2381,108,1.0
