In [1]:
import math
import numpy as np
import pandas as pd
from scipy.stats import norm
import scipy
import seaborn as sn
import matplotlib.pyplot as plt
from math import log
from collections import Counter

In [2]:
# Columns read from the screen summary table.
sub = ['control', 'test', 'difference', 'bca_low', 'bca_high']
data = pd.read_csv(
    '/Users/Emily/Documents/S1/S1_3.csv',
    usecols=sub,
).reset_index(drop=True)

In [3]:
def CItoP(row):
    """Convert a confidence interval and point estimate into a two-sided p-value.

    Parameters
    ----------
    row : mapping-like
        Must provide 'bca_low', 'bca_high' (interval bounds) and 'difference'
        (the estimate).

    Returns
    -------
    Two-sided p-value under a normal approximation.

    The interval width is divided by 2 * 1.96 to recover a standard error,
    i.e. the bounds are treated as a symmetric 95% normal interval.
    """
    upper = row['bca_high']
    lower = row['bca_low']
    estimate = row['difference']
    # Standard error recovered from the width of the confidence interval.
    std_err = (upper - lower) / (2 * 1.96)
    # z-score: the estimated difference in units of its standard error.
    z_score = estimate / std_err
    # Upper-tail area beyond |z| (norm.sf is the survival function 1 - CDF;
    # scipy.special.ndtr gives the CDF itself), doubled for a two-sided test.
    return norm.sf(abs(z_score)) * 2

In [4]:

# CItoP only uses elementwise arithmetic and norm.sf, so it can be evaluated on
# whole columns at once instead of assigning one row at a time via iterrows().
data['p_value'] = CItoP(data)


In [5]:
def BH_critical(i, m, Q):
    """Return the Benjamini-Hochberg critical value (i / m) * Q.

    Parameters
    ----------
    i : p-value rank
    m : total number of tests
    Q : false discovery rate
    """
    return (i / m) * Q

In [48]:
def pranks(df, fdr, m=41):
    """Add a column of Benjamini-Hochberg critical values to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'rank' column holding each p-value's rank.
    fdr : float
        False discovery rate; also used as the suffix of the new column name.
    m : int, optional
        Total number of tests. Defaults to 41, the value that was previously
        hard-coded here; pass the real size of the test family when it differs.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place, with the added column
        'BH_critical_<fdr>'.
    """
    # The column is named 'BH_critical_<fdr>' because the filtering step in
    # this notebook selects 'BH_critical_0.05'; the earlier 'BH_correction_'
    # prefix made that lookup fail with a KeyError.
    hed = 'BH_critical_' + str(fdr)
    # Same formula as BH_critical, (rank / m) * fdr, applied to the whole column.
    df[hed] = (df['rank'] / m) * fdr
    return df

In [7]:
# Rank the p-values and attach BH critical values separately for each control.
controls = ['DMSO', 'H2O']
per_control = []

for c in controls:
    h = data.loc[data['control']==c]
    ranked = h.sort_values(by=['p_value']).reset_index(drop=True)
    # Rank 1 is the smallest p-value within this control group.
    ranks=np.arange(1,len(ranked)+1)
    ranked['rank'] = ranks
    # NOTE(review): the saved output for this cell matches m = 96
    # (rank 95 -> 0.049479 = 95/96 * 0.05); confirm that pranks is using the
    # intended number of tests.
    BH = pranks(ranked, .05)
    per_control.append(BH)
# DataFrame.append was removed in pandas 2.0; collect the pieces and concatenate
# once. Each group's 0..n-1 index is kept, as it was with append.
vals96 = pd.concat(per_control)
vals96.tail()

Unnamed: 0,control,test,difference,bca_low,bca_high,p_value,rank,BH_critical_0.05
90,H2O,Eucalyptol,0.131015,-1.52033,1.710998,0.873718,91,0.047396
91,H2O,Carnosic acid,0.116052,-1.502566,1.752305,0.888844,92,0.047917
92,H2O,Rosmarinic acid,-0.100297,-1.653787,1.484949,0.900316,93,0.048438
93,H2O,Trans-Anethole,-0.072106,-1.680317,1.456152,0.928192,94,0.048958
94,H2O,Skatole,0.017496,-1.500906,1.571867,0.982193,95,0.049479


In [8]:
# Keep the rows whose p-value falls below their own BH critical value.
below_critical = vals96['p_value'] < vals96['BH_critical_0.05']
filtered_BH = vals96.loc[below_critical]
filtered_BH

Unnamed: 0,control,test,difference,bca_low,bca_high,p_value,rank,BH_critical_0.05
0,DMSO,Isoamyl alcohol,14.166999,12.820407,15.511796,1.353903e-94,1,0.000521
1,DMSO,2-Methyl-1-butanol,11.712247,9.891178,13.409462,6.392657e-39,2,0.001042
2,DMSO,Thiophene,9.759898,8.202217,11.263300,7.612012e-36,3,0.001563
3,DMSO,"2,3-Dihydrobenzofuran",9.194467,7.629789,10.673937,2.428362e-32,4,0.002083
4,DMSO,Diacetyl,8.018781,6.589946,9.361027,7.992905e-30,5,0.002604
...,...,...,...,...,...,...,...,...
35,H2O,Oleanolic Acid,-2.178525,-3.833409,-0.583985,8.586267e-03,36,0.018750
36,H2O,Ethyl palmitate,2.155766,0.531007,3.781640,9.331073e-03,37,0.019271
37,H2O,Piperonyl Alcohol,1.872080,0.430105,3.296847,1.047040e-02,38,0.019792
38,H2O,p-Tolualdehyde,1.987114,0.397433,3.483390,1.159710e-02,39,0.020313


In [9]:
#filtered_BH.to_csv('/Users/Emily/Desktop/ReviewerResponses/filteredBH_.05.csv')
#vals96.to_csv('/Users/Emily/Desktop/ReviewerResponses/96_CItoPvals_.3.csv')

In [10]:
# Compare the "significant" hits between the DMSO and H2O controls: collect the
# compounds that occur more than once in filtered_BH (presumably once per control).
test = filtered_BH['test']
counts = Counter(test)
output = [compound for compound, n_hits in counts.items() if n_hits >= 2]

In [45]:
# Which compounds would have been omitted from the mutant screens had the
# BH correction been applied? Take the follow-up compounds that are absent from
# the list of compounds found more than once among the BH-filtered hits.
followups = pd.read_csv('/Users/Emily/Documents/S1/inner3.csv', index_col=0)
comps = list(followups['test'].unique())
set1 = set(comps)
set2 = set(output)
missing = sorted(set1.difference(set2))
missing

NameError: name 'output' is not defined

#### Creating a clean and sorted Supplemental Table 2

In [41]:
# DMSO rows ordered by effect size, largest difference first.
dmso = vals96.loc[vals96['control']=='DMSO']
dmso = dmso.sort_values(by='difference', ascending=False)
cord = list(dmso['test'])

# H2O rows rearranged to follow the same compound order as the DMSO rows.
h2o = vals96.loc[vals96['control'] == 'H2O']
h2o = h2o.set_index('test').reindex(cord).reset_index()
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way (columns aligned by name, DMSO column order first).
reordered = pd.concat([dmso, h2o])
reordered = reordered.drop(columns=['rank'])

In [44]:
# Write the reordered table (DMSO rows followed by H2O rows) to disk.
reordered.to_csv(path_or_buf='/Users/Emily/Documents/S1/reorderedS2.csv')

#### Creating a clean and sorted Supplemental Table 3

In [52]:
cols = ['control', 'test', 'difference', 'bca_low', 'bca_high', 'Strain']
# NOTE(review): the displayed head() has no 'control' column, which suggests
# index_col=0 takes 'control' as the index and reset_index(drop=True) then
# discards it - confirm this is intended.
mut_data = pd.read_csv(
    '/Users/Emily/Documents/S1F2/all_strains_mdiff_dmso.csv',
    usecols=cols,
    index_col=0,
).reset_index(drop=True)
#mut_data=mut_data.rename(columns={"index": "rank"})
mut_data.head()

Unnamed: 0,test,difference,bca_low,bca_high,Strain
0,(-)-Huperzine A,2.506552,1.196282,3.777346,PR678
1,1-octanol,-3.953032,-5.293354,-2.707069,PR678
2,"2,3-Dihydrobenzofuran",6.791056,5.381876,8.232949,PR678
3,"2,5-Dihydroxybenzoic acid",1.256955,0.014839,2.370033,PR678
4,2-Methyl-1-butanol,1.722631,0.324622,3.058874,PR678


In [53]:
# CItoP only uses elementwise arithmetic and norm.sf, so it can be evaluated on
# whole columns at once instead of assigning one row at a time via iterrows().
mut_data['p_value'] = CItoP(mut_data)
mut_data.head()


Unnamed: 0,test,difference,bca_low,bca_high,Strain,p_value
0,(-)-Huperzine A,2.506552,1.196282,3.777346,PR678,0.0001407567
1,1-octanol,-3.953032,-5.293354,-2.707069,PR678,2.078376e-09
2,"2,3-Dihydrobenzofuran",6.791056,5.381876,8.232949,PR678,9.894876999999999e-21
3,"2,5-Dihydroxybenzoic acid",1.256955,0.014839,2.370033,PR678,0.03643103
4,2-Methyl-1-butanol,1.722631,0.324622,3.058874,PR678,0.01352355


In [65]:
# NOTE(review): `n2` is only assigned in a later cell of this notebook, so on a
# fresh Restart & Run All this line raises NameError. The saved output is also
# not in the descending-'difference' order that the later cell produces, so it
# presumably came from an earlier kernel state - confirm and move or remove.
n2

Unnamed: 0,test,difference,bca_low,bca_high,Strain,p_value
161,α-Phellandrene,5.367571,3.800868,6.835874,N2,4.127964e-12
151,p-Tolualdehyde,2.405531,0.921225,3.838737,N2,0.001228856
136,Ursolic acid,-2.229335,-3.7373,-0.736521,N2,0.003588451
165,Thiophene,9.759898,8.202217,11.2633,N2,7.612012e-36
133,Spinosad,-2.79694,-4.280256,-1.379054,N2,0.0001573796
150,Solasodine,3.820701,2.257312,5.414399,N2,2.095617e-06
141,Sinomenine hydrochloride,2.010176,0.48123,3.530882,N2,0.009770001
132,Salvinorin A Propionate,-3.313405,-4.793185,-1.846934,N2,1.040894e-05
140,Sabinene,2.165602,0.573782,3.713575,N2,0.006856526
144,Piperonyl Alcohol,2.290497,0.900558,3.673603,N2,0.001204274


In [67]:
# N2 rows ordered by effect size, largest difference first; the resulting
# compound order is reused to arrange the other strains.
n2 = (
    mut_data[mut_data['Strain'] == 'N2']
    .sort_values(by='difference', ascending=False)
)
n2ord = n2['test'].tolist()

In [68]:
#all_vals.to_csv('/Users/Emily/Desktop/ReviewerResponses/mutants_CItoPvals_' + str(fdr) +'.csv')

In [69]:
# Strains to be arranged in the N2 compound order, and the (initially empty)
# frame that accumulates them.
strns = [
    'GN1077',
    'CX10',
    'PR678',
]
reorderedmuts = pd.DataFrame()
def reorder(s, df, ordr):
    """Return the rows of ``df`` for strain ``s`` arranged in the order ``ordr``.

    Parameters
    ----------
    s : value matched against the 'Strain' column
    df : pandas.DataFrame with 'Strain' and 'test' columns
    ordr : sequence of 'test' values giving the desired row order

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``ordr``, with 'test' as the first column.
        Entries of ``ordr`` that the strain lacks become rows of NaN.
    """
    strain_rows = df[df['Strain'] == s]
    by_compound = strain_rows.set_index('test')
    return by_compound.reindex(ordr).reset_index()

In [70]:
# DataFrame.append was removed in pandas 2.0. Build every strain's reordered
# frame first and concatenate once; this also makes the cell idempotent, since
# re-running it no longer stacks duplicate rows onto reorderedmuts.
reorderedmuts = pd.concat([reorder(s, mut_data, n2ord) for s in strns])

In [72]:
# Write the strain-by-strain reordered table to disk.
reorderedmuts.to_csv(path_or_buf='/Users/Emily/Desktop/NPScreen_figs/Final versions/TableS3.csv')