### The following notebook was used to unblind and compile all of the S1 data

In [2]:
# Importing the packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pathlib as plb
import seaborn as sns
from scipy import stats
import statistics
import dabest as db

#### Reading in and cleaning up blinded metadata and compound key

In [3]:
# Blinded plate metadata. 'utf-8-sig' drops a leading byte-order mark if the file has one.
metadata_csv = 'C:/Users/Emily/Documents/S1/metadata/S1_metadata.csv'
# Key that maps each plate number / compound well to a compound name.
compound_key_csv = 'C:/Users/Emily/Documents/S1/metadata/S1_randomized_compounds.csv'

md = pd.read_csv(metadata_csv, sep=',', encoding='utf-8-sig')
compound_key = pd.read_csv(compound_key_csv, index_col=0)

In [4]:
# Reduce both plate labels to their last character so the key's 'Plate Number'
# can be compared directly with the metadata's 'Compound library ID'.
# NOTE(review): this assumes plate numbers are a single character -- confirm.
compound_key['Plate Number'] = compound_key['Plate Number'].str.strip().str.get(-1)
md['Compound library ID'] = md['Compound library ID'].str.strip().str.get(-1)

# Remove the columns at these positions, then the first row of the sheet.
dropped_columns = md.columns[[3, 4, 12, 13, 16, 17, 18, 19, 20, 21]]
md = md.drop(columns=dropped_columns)
md = md.drop(index=md.index[0])
md.head()

Unnamed: 0,Date:,Recorder,Plate ID,Compound library ID,Compound Well A,Compound Well B,Compound Well C,Compound Well D,Temp (C),Humidity,Image ID,Scanner Slot:
1,8/23/21,Hodan,S1_R1_1,1,B2,B3,B4,B5,22.0,40.0,S1_001,1
2,8/23/21,Hodan,S1_R1_2,1,C2,C3,C4,C5,22.0,40.0,S1_001,2
3,8/23/21,Hodan,S1_R1_3,1,D2,D3,D4,D5,22.0,40.0,S1_001,3
4,8/23/21,Hodan,S1_R1_4,1,E2,E3,E4,E5,22.0,40.0,S1_001,4
5,8/23/21,Hodan,S1_R1_5,1,F2,F3,F4,F5,22.0,40.0,S1_002,1


#### Generating functions to unblind the compounds in the metadata

In [5]:
def _lookup_compound(row, compound_map, well_column):
    """Return the compound found in the key for one well of a metadata row.

    The key row is the one whose 'Plate Number' equals the metadata row's
    'Compound library ID' and whose 'Compound Well' equals ``row[well_column]``.
    If several key rows match, the first one is used.

    Args:
        row: one metadata row (anything indexable by column name).
        compound_map: DataFrame with 'Plate Number', 'Compound Well' and
            'Compound' columns.
        well_column: name of the metadata column holding the well to look up.

    Returns:
        The value of the 'Compound' column for the matching key row.

    Raises:
        IndexError: if no key row matches; the message names the plate and well
            so the offending metadata row can be found.
    """
    plate = row['Compound library ID']
    well = row[well_column]
    matches = compound_map.loc[
        (compound_map['Plate Number'] == plate) &
        (compound_map['Compound Well'] == well), 'Compound']
    if matches.empty:
        raise IndexError(
            f"No compound found for plate {plate!r}, well {well!r} "
            f"(metadata column {well_column!r})")
    return matches.values[0]


def add_compoundA(row, compound_map ):
    """Return the compound in the well listed under 'Compound Well A'."""
    return _lookup_compound(row, compound_map, 'Compound Well A')


def add_compoundB(row, compound_map ):
    """Return the compound in the well listed under 'Compound Well B'."""
    return _lookup_compound(row, compound_map, 'Compound Well B')


def add_compoundC(row, compound_map ):
    """Return the compound in the well listed under 'Compound Well C'."""
    return _lookup_compound(row, compound_map, 'Compound Well C')


def add_compoundD(row, compound_map ):
    """Return the compound in the well listed under 'Compound Well D'."""
    return _lookup_compound(row, compound_map, 'Compound Well D')

In [6]:
# One new metadata column per compound well, each filled by its lookup function.
unblinders = {
    'Compound A': add_compoundA,
    'Compound B': add_compoundB,
    'Compound C': add_compoundC,
    'Compound D': add_compoundD,
}
for unblinded_column, unblind in unblinders.items():
    md[unblinded_column] = md.apply(unblind, axis=1, args=(compound_key,))

# Store the scanner slot as text: add_PlateID compares it with a character
# taken from the 'WellNo' string.
md['Scanner Slot:'] = md['Scanner Slot:'].map(str)

#### Reading in the Image Analysis summary file. 
<p> The metadata sheet will be used to fill in missing fields on the Image Analysis summary file including Compound, Strain and Plate ID </p>

In [7]:
# Image-analysis summary; the 'Large Object' column is discarded right away.
image_summary_csv = 'C:/Users/Emily/Documents/S1/image_analysis/All_reps.csv'
ia_data = pd.read_csv(image_summary_csv, index_col=0).drop(columns=['Large Object'])

#### Generating a function to add the plate ID and compound name to the summary data

In [8]:
def add_PlateID(row, metadata):
    """Return the 'Plate ID' of the plate a summary row was measured on.

    The first character of ``row['WellNo']`` is taken as the scanner slot. The
    plate is the metadata row whose 'Image ID' equals ``row['File Name']`` and
    whose 'Scanner Slot:' equals that character; the first match is returned.
    """
    scanner_slot = row['WellNo'][0]
    same_image = metadata['Image ID'] == row['File Name']
    same_slot = metadata['Scanner Slot:'] == scanner_slot
    plate_ids = metadata.loc[same_image & same_slot, 'Plate ID']
    return plate_ids.values[0]

In [9]:
# Attach the plate ID to every row of the image-analysis summary.
ia_data['Plate ID'] = ia_data.apply(add_PlateID, axis=1, args=(md,))

In [10]:
def add_Compound(row, metadata):
    """Return the unblinded compound for one row of the image-analysis summary.

    The second character of ``row['WellNo']`` (A, B, C or D) selects which of
    the metadata columns 'Compound A' .. 'Compound D' to read; the metadata row
    is the one whose 'Plate ID' equals ``row['Plate ID']``. If several metadata
    rows match, the first one is used.

    Raises:
        ValueError: if the well letter is not one of A, B, C or D.
        IndexError: if no metadata row has the requested 'Plate ID'.
    """
    well_letter = row['WellNo'][1]
    if well_letter not in ('A', 'B', 'C', 'D'):
        # Without this check an unexpected letter left the result unbound and
        # surfaced as an UnboundLocalError with no hint about the cause.
        raise ValueError(
            f"Unexpected well letter {well_letter!r} in WellNo {row['WellNo']!r}; "
            "expected one of A, B, C, D")
    matches = metadata.loc[
        metadata['Plate ID'] == row['Plate ID'], f'Compound {well_letter}']
    if matches.empty:
        raise IndexError(f"No metadata row found for Plate ID {row['Plate ID']!r}")
    return matches.values[0]

In [11]:
# Attach the unblinded compound name to every row of the summary.
ia_data['Compound'] = ia_data.apply(add_Compound, axis=1, args=(md,))

#### Filtering the dataset to only include replicates with 150 worms or more

In [12]:
# Minimum 'Total Worms' a replicate needs in order to be kept.
MIN_TOTAL_WORMS = 150
has_enough_worms = ia_data['Total Worms'].ge(MIN_TOTAL_WORMS)
ia_data = ia_data.loc[has_enough_worms]

#### Reading in all of the files that contain worm positions based on the exclusion criteria above

In [14]:
# Folder that get_worm_locs reads the per-well 'loc_<File Name>_<WellNo>.csv' files from.
wrm_locs_fldr = plb.Path('C:/Users/Emily/Documents/S1/image_analysis/')

In [15]:
def get_worm_locs(row, wrms, result_dict):
    """Add one well's worm x-positions to the per-compound pool.

    Reads ``loc_<File Name>_<WellNo>.csv`` from the folder ``wrms`` and takes
    its 'X' column. The values are stored under ``row['Compound']`` in
    ``result_dict``; if that compound already has values, the new ones are
    appended and the pooled Series is re-indexed from 0.

    Args:
        row: summary row providing 'File Name', 'WellNo' and 'Compound'.
        wrms: pathlib.Path of the folder holding the location files.
        result_dict: dict mapping compound name to a Series of x-positions;
            it is modified in place.

    Returns:
        The same ``result_dict`` object, after the update.
    """
    fname = row['File Name']
    wellnum = row['WellNo']
    loc_fname = wrms.joinpath(f'loc_{fname}_{wellnum}.csv')
    xs = pd.read_csv(loc_fname)['X']
    compound = row['Compound']

    if compound in result_dict:
        # Series.append was removed in pandas 2.0; concat with ignore_index
        # gives the same pooled, 0-based Series on every pandas version.
        result_dict[compound] = pd.concat(
            [result_dict[compound], xs], ignore_index=True)
    else:
        result_dict[compound] = xs

    return result_dict

In [16]:
# Need to create an empty dictionary to hold the values
results_dict = {}
for index, row in ia_data.iterrows():
    pooled = get_worm_locs(row, wrm_locs_fldr, results_dict)
    
#Remember that Dabest requires a dataframe. Casting dict to df
pooled_df = pd.DataFrame.from_dict(pooled)

#### Converting the worm locations from pixels (scanned at 1200 dots per inch) to millimeters

In [17]:
# 1 inch = 25.4mm
mm = 25.4
# 1200 pixels per 25.4mm
px_mm = 1200/mm

#The following transforms the worm location data so that it is respective to the start zone
# The start zone is at the center of the image; Start Zone = 0mm
#Worms with positive values are closer to the compound, negative values are away from the compound
mm_df = pooled_df.apply(lambda x: -(x/px_mm)+32.5)
#mm_df.to_csv('D:/_2021_08_screen/analysis/S1_xs.csv')

#### Creating an ordered list of compounds to pass to Dabest to calculate confidence intervals
<p> The control condition always needs to be the first item in the list. </p>

In [18]:
# Per-compound mean of the numeric summary columns. numeric_only=True is needed
# on pandas >= 2.0, where averaging the text columns ('File Name', 'WellNo',
# 'Plate ID') raises a TypeError instead of silently skipping them.
sums = ia_data.groupby('Compound').mean(numeric_only=True).reset_index()
ia_sort = sums.copy()
condition = (ia_sort.Compound=='DMSO') | (ia_sort.Compound=='H2O')

# The two controls go first. groupby sorts its keys, so DMSO comes before H2O
# and is therefore the first item of the list handed to dabest.
excluded = ia_sort[condition].copy()
included = ia_sort[~condition]
# Named `included_sorted` rather than `sorted` so the builtin is not shadowed.
included_sorted = included.sort_values(by='Chemotaxis', ascending=True)
ia_sort = pd.concat([excluded, included_sorted])

cmpd_ordr = ia_sort['Compound'].to_list()

#### Generating confidence intervals using DMSO as the control group

In [19]:
# Load the pooled positions into dabest once; the first compound in cmpd_ordr
# is the control group.
pooled_mm_obj = db.load(mm_df, idx=(cmpd_ordr))
# The original cell ran the identical load twice. `db_obj` is kept as an alias
# so the name stays defined.
db_obj = pooled_mm_obj
results_df_mm = pooled_mm_obj.mean_diff.results

In [76]:
#results_df_mm.to_csv('D:/_2021_08_screen/analysis/S1mdiff_150_DMSO.csv')

#### Filtering the DMSO analysis for compounds whose confidence interval for the mean difference in worm position versus DMSO does not span zero

In [20]:
# `same`: comparisons whose confidence interval (bca_low, bca_high) contains zero.
spans_zero = results_df_mm['bca_low'].lt(0) & results_df_mm['bca_high'].gt(0)
same = results_df_mm.loc[spans_zero]
same_comps = same['test'].tolist()

# `diff`: every remaining comparison.
diff = results_df_mm.loc[~results_df_mm['test'].isin(same_comps)]
diff

Unnamed: 0,control,test,control_N,test_N,effect_size,is_paired,difference,ci,bca_low,bca_high,...,resamples,random_seed,pvalue_permutation,permutation_count,pvalue_welch,statistic_welch,pvalue_students_t,statistic_students_t,pvalue_mann_whitney,statistic_mann_whitney
1,DMSO,1-octanol,1068,652,mean difference,False,-7.446258,95,-8.944625,-5.916856,...,5000,12345,0.0,5000,1.049841e-21,9.717362,2.703217e-20,9.347132,1.296941e-18,436169.5
2,DMSO,Phytol,1068,625,mean difference,False,-6.248916,95,-7.80029,-4.622622,...,5000,12345,0.0,5000,5.494421e-14,7.599271,1.175689e-13,7.48141,1.81106e-13,405215.0
3,DMSO,Ellagic acid,1068,1101,mean difference,False,-4.205057,95,-5.603935,-2.882264,...,5000,12345,0.0,5000,1.247352e-09,6.101265,1.162802e-09,6.112091,1.380842e-08,670698.5
4,DMSO,2-nonanone,1068,867,mean difference,False,-4.575976,95,-6.02,-3.096992,...,5000,12345,0.0,5000,1.936083e-09,6.0328,2.22725e-09,6.008853,3.639988e-09,535088.0
5,DMSO,Methyl palmitate,1068,1121,mean difference,False,-2.98869,95,-4.367339,-1.575574,...,5000,12345,0.0,5000,2.418873e-05,4.231438,2.360316e-05,4.236897,5.023816e-05,658546.5
6,DMSO,Salvinorin A Propionate,1068,926,mean difference,False,-3.313405,95,-4.793185,-1.846934,...,5000,12345,0.0,5000,1.228296e-05,4.383759,1.248376e-05,4.380002,1.484407e-05,550018.5
7,DMSO,Spinosad,1068,1071,mean difference,False,-3.234352,95,-4.740211,-1.870059,...,5000,12345,0.0002,5000,8.428385e-06,4.464892,8.422537e-06,4.465034,1.12028e-05,634650.0
8,DMSO,Camphor,1068,1186,mean difference,False,-2.690187,95,-4.075054,-1.31914,...,5000,12345,0.0,5000,0.0001345027,3.824805,0.0001307632,3.831664,0.0001954113,690792.0
9,DMSO,Oleanolic Acid,1068,730,mean difference,False,-1.760109,95,-3.393989,-0.206566,...,5000,12345,0.0324,5000,0.03055372,2.164798,0.03056667,2.164379,0.02923899,413391.5
10,DMSO,Ursolic acid,1068,884,mean difference,False,-2.229335,95,-3.7373,-0.736521,...,5000,12345,0.0036,5000,0.003954668,2.885328,0.003934319,2.886827,0.003827441,507905.5


#### Swapping the list order so that water is the control group

In [21]:
def swap_rows(df, i1, i2):
    """Return a copy of ``df`` with the rows at positions ``i1`` and ``i2`` swapped.

    Only the row contents change places; the index labels stay where they were.
    The input frame is left untouched, so re-running a cell that calls this
    function gives the same result every time.

    Args:
        df: the DataFrame to read from.
        i1: integer position of the first row.
        i2: integer position of the second row.

    Returns:
        A new DataFrame with the two rows exchanged.
    """
    order = list(range(len(df)))
    # The original always read rows 0 and 1, whatever i1 and i2 were.
    order[i1], order[i2] = order[i2], order[i1]
    # Reordering whole rows by position keeps every column's dtype.
    swapped = df.iloc[order].copy()
    swapped.index = df.index
    return swapped

# Exchange the two control rows so that water comes first.
swapped = swap_rows(excluded, 0, 1)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows.
h20_analysis = pd.concat([swapped, included])
h20_analysis.tail()

Unnamed: 0,Compound,Total Worms,Chemotaxis,Strain,Well width
91,p-Anisic acid,341.333333,-0.076448,,3046.333333
92,p-Tolualdehyde,291.0,0.189825,,3039.666667
93,trans-Cinnamaldehyde,293.0,0.019462,,3051.666667
94,α-Phellandrene,259.0,0.341553,,3045.333333
95,β-Citronellol,260.75,0.122218,,3038.0


#### Calculating confidence intervals using water as the control group

In [22]:
# Compound order for the water analysis, then the mean-difference results
# dabest computes from it.
h20_ordr = h20_analysis['Compound'].tolist()
h20_obj = db.load(mm_df, idx=h20_ordr)
h20_md_results = h20_obj.mean_diff.results
h20_md_results

Unnamed: 0,control,test,control_N,test_N,effect_size,is_paired,difference,ci,bca_low,bca_high,...,resamples,random_seed,pvalue_permutation,permutation_count,pvalue_welch,statistic_welch,pvalue_students_t,statistic_students_t,pvalue_mann_whitney,statistic_mann_whitney
0,H2O,DMSO,915,1068,mean difference,False,-0.418416,95,-1.835136,1.079114,...,5000,12345,0.5740,5000,5.824984e-01,0.549831,5.826499e-01,0.549608,5.920424e-01,495422.0
1,H2O,(+)-Carvone,915,1052,mean difference,False,0.703972,95,-0.733226,2.151434,...,5000,12345,0.3566,5000,3.498650e-01,-0.935080,3.489966e-01,-0.936761,3.573942e-01,469726.0
2,H2O,(-)-Borneol,915,867,mean difference,False,-1.210954,95,-2.762577,0.381913,...,5000,12345,0.1404,5000,1.380134e-01,1.483902,1.375771e-01,1.485543,1.374698e-01,412778.5
3,H2O,(-)-Cedrene,915,1036,mean difference,False,1.584452,95,-0.021242,3.080056,...,5000,12345,0.0408,5000,4.176785e-02,-2.037181,4.222333e-02,-2.032649,3.555616e-02,447868.0
4,H2O,(-)-Huperzine A,915,1111,mean difference,False,2.660609,95,1.188367,4.006993,...,5000,12345,0.0002,5000,3.563070e-04,-3.576862,3.441448e-04,-3.585698,4.885588e-04,462589.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,H2O,p-Anisic acid,915,1024,mean difference,False,-1.586643,95,-3.142784,-0.052098,...,5000,12345,0.0438,5000,4.107570e-02,2.044140,4.136976e-02,2.041164,3.869320e-02,493925.5
91,H2O,p-Tolualdehyde,915,873,mean difference,False,1.987114,95,0.397433,3.483390,...,5000,12345,0.0108,5000,1.113149e-02,-2.541192,1.120495e-02,-2.538883,1.454251e-02,372731.0
92,H2O,trans-Cinnamaldehyde,915,879,mean difference,False,-0.375126,95,-1.989382,1.223275,...,5000,12345,0.6526,5000,6.412641e-01,0.466011,6.411169e-01,0.466217,6.493164e-01,407130.5
93,H2O,α-Phellandrene,915,777,mean difference,False,4.949155,95,3.271456,6.519793,...,5000,12345,0.0000,5000,1.560789e-09,-6.072308,1.631275e-09,-6.064321,1.582380e-09,295028.0


In [77]:
#h20_md_results.to_csv('D:/_2021_08_screen/analysis/S1mdiff_150_h2o.csv')

#### Filtering the water analysis for compounds whose confidence interval for the mean difference in worm position versus water does not span zero

In [23]:
# `h2o_same`: comparisons whose confidence interval (bca_low, bca_high) contains zero.
h2o_spans_zero = h20_md_results['bca_low'].lt(0) & h20_md_results['bca_high'].gt(0)
h2o_same = h20_md_results.loc[h2o_spans_zero]
h2o_same_comps = h2o_same['test'].tolist()

# `h2o_diff`: every remaining comparison.
h2o_diff = h20_md_results.loc[~h20_md_results['test'].isin(h2o_same_comps)]
h2o_diff

Unnamed: 0,control,test,control_N,test_N,effect_size,is_paired,difference,ci,bca_low,bca_high,...,resamples,random_seed,pvalue_permutation,permutation_count,pvalue_welch,statistic_welch,pvalue_students_t,statistic_students_t,pvalue_mann_whitney,statistic_mann_whitney
4,H2O,(-)-Huperzine A,915,1111,mean difference,False,2.660609,95,1.188367,4.006993,...,5000,12345,0.0002,5000,0.000356307,-3.576862,0.0003441448,-3.585698,0.0004885588,462589.0
5,H2O,1-octanol,915,652,mean difference,False,-7.864674,95,-9.412386,-6.307701,...,5000,12345,0.0,5000,1.6652080000000002e-22,9.917635,1.713584e-21,9.662233,4.284519e-20,379347.5
6,H2O,"2,3-Dihydrobenzofuran",915,705,mean difference,False,8.776051,95,7.16418,10.309801,...,5000,12345,0.0,5000,2.4429379999999998e-26,-10.814839,6.385411999999999e-26,-10.713114,1.587266e-25,225059.0
7,H2O,"2,5-Dihydroxybenzoic acid",915,971,mean difference,False,2.044578,95,0.513804,3.517039,...,5000,12345,0.0084,5000,0.007660099,-2.669591,0.007608091,-2.671865,0.008738044,413238.5
8,H2O,2-Methyl-1-butanol,915,381,mean difference,False,11.293831,95,9.436491,12.980729,...,5000,12345,0.0,5000,2.33743e-32,-12.358826,3.183088e-29,-11.502871,2.081073e-28,106449.5
10,H2O,2-nonanone,915,867,mean difference,False,-4.994392,95,-6.570856,-3.472582,...,5000,12345,0.0,5000,2.58695e-10,6.358271,2.681458e-10,6.352614,3.667791e-10,464699.0
13,H2O,Acetophenone,915,755,mean difference,False,4.819987,95,3.179669,6.422917,...,5000,12345,0.0,5000,6.005481e-09,-5.848135,6.163933e-09,-5.842685,4.273249e-09,287807.0
18,H2O,Anisole,915,858,mean difference,False,2.598339,95,0.981452,4.142118,...,5000,12345,0.001,5000,0.001417803,-3.19609,0.001403249,-3.19904,0.001176104,357580.0
25,H2O,Camphor,915,1186,mean difference,False,-3.108603,95,-4.514609,-1.656158,...,5000,12345,0.0,5000,2.29333e-05,4.244598,2.129921e-05,4.260491,2.151191e-05,601172.0
27,H2O,Carnosol,915,930,mean difference,False,2.102122,95,0.484797,3.611089,...,5000,12345,0.0094,5000,0.007673362,-2.66905,0.007677007,-2.668889,0.006448328,394307.5


#### Merging the water and DMSO analysis to reflect only compounds that are found in both analyses
<p> Compounds found in the resulting dataset will be retested using the tax-4 null, osm-9 null and tax-4::osm-9 double null mutant strains. </p>

In [24]:
# Keep only the compounds present in both filtered tables. Water columns get
# the '_x' suffix and DMSO columns the '_y' suffix.
inner = h2o_diff.merge(diff, how='inner', on='test')
#inner.to_csv('D:/_2021_08_screen/analysis/S1_inner_join150.csv')

In [25]:
# Display the merged table of compounds kept by both the water and the DMSO filter.
inner

Unnamed: 0,control_x,test,control_N_x,test_N_x,effect_size_x,is_paired_x,difference_x,ci_x,bca_low_x,bca_high_x,...,resamples_y,random_seed_y,pvalue_permutation_y,permutation_count_y,pvalue_welch_y,statistic_welch_y,pvalue_students_t_y,statistic_students_t_y,pvalue_mann_whitney_y,statistic_mann_whitney_y
0,H2O,(-)-Huperzine A,915,1111,mean difference,False,2.660609,95,1.188367,4.006993,...,5000,12345,0.0,5000,1.747528e-05,-4.304582,1.726821e-05,-4.307196,1.706859e-05,530139.5
1,H2O,1-octanol,915,652,mean difference,False,-7.864674,95,-9.412386,-6.307701,...,5000,12345,0.0,5000,1.049841e-21,9.717362,2.703217e-20,9.347132,1.296941e-18,436169.5
2,H2O,"2,3-Dihydrobenzofuran",915,705,mean difference,False,8.776051,95,7.16418,10.309801,...,5000,12345,0.0,5000,2.015177e-30,-11.706788,1.1458100000000001e-29,-11.521013,1.172357e-28,259297.0
3,H2O,"2,5-Dihydroxybenzoic acid",915,971,mean difference,False,2.044578,95,0.513804,3.517039,...,5000,12345,0.0008,5000,0.0008634819,-3.336573,0.0008801916,-3.331193,0.0009182209,474505.5
4,H2O,2-Methyl-1-butanol,915,381,mean difference,False,11.293831,95,9.436491,12.980729,...,5000,12345,0.0,5000,7.148108e-36,-13.148846,4.614423e-32,-12.076666,1.447923e-30,122872.0
5,H2O,2-nonanone,915,867,mean difference,False,-4.994392,95,-6.570856,-3.472582,...,5000,12345,0.0,5000,1.936083e-09,6.0328,2.22725e-09,6.008853,3.639988e-09,535088.0
6,H2O,Acetophenone,915,755,mean difference,False,4.819987,95,3.179669,6.422917,...,5000,12345,0.0,5000,7.189151e-11,-6.560129,7.800292e-11,-6.543272,1.161934e-10,331826.5
7,H2O,Anisole,915,858,mean difference,False,2.598339,95,0.981452,4.142118,...,5000,12345,0.0,5000,0.000130616,-3.833538,0.0001253071,-3.843348,0.000106887,411173.0
8,H2O,Camphor,915,1186,mean difference,False,-3.108603,95,-4.514609,-1.656158,...,5000,12345,0.0,5000,0.0001345027,3.824805,0.0001307632,3.831664,0.0001954113,690792.0
9,H2O,Carnosol,915,930,mean difference,False,2.102122,95,0.484797,3.611089,...,5000,12345,0.0008,5000,0.0009381473,-3.313503,0.0009361918,-3.313991,0.0008830817,453845.5


In [28]:
# Count the shared compounds whose mean difference versus water is positive,
# then those whose mean difference is negative.
positive_vs_water = inner['difference_x'] > 0
negative_vs_water = inner['difference_x'] < 0
print(int(positive_vs_water.sum()))
print(int(negative_vs_water.sum()))

29
13
