### The following notebook was used to unblind and compile all of the S1 data

In [1]:
# Importing the packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pathlib as plb
import seaborn as sns
from scipy import stats
import statistics
import dabest as db

#### Reading in the Image Analysis summary file. 
<p> The metadata sheet will be used to fill in missing fields on the Image Analysis summary file including Compound, Strain and Plate ID </p>

In [2]:
ia_data = pd.read_csv('C:/Users/Emily/Desktop/screens/CB4856_screen/Unblinded Metadata.csv', index_col=0)
ia_data.head()

Unnamed: 0,WellNo,area,Total Worms,Chemotaxis,Compound,Strain,File Name,Well width,Plate ID,Passes QC,UB_Compound
14,4C,2373031,123,-0.078261,D10,CB4856,S1CB_002,3052,S1CB_R1_9,N,HuperzineA
2,1C,2335394,222,0.239583,D10,CB4856,S1CB_006,3022,S1CB_R2_9,Y,HuperzineA
14,4C,2336458,175,0.251701,D10,CB4856,S1CB_007,3052,S1CB_R3_4,Y,HuperzineA
2,1C,2342363,109,-0.010989,D10,,S1CB_011,3040,S1CB_R4_9,N,HuperzineA
1,1B,2331195,195,0.01676,D10,,S1CB_012,3049,S1CB_R4_12,Y,HuperzineA


#### Filtering the dataset to only include replicates with 150 worms or more
<p> We also want to identify any compounds with fewer than 3 biological replicates with over 150 worms<p>

In [3]:
ia_data = ia_data.loc[ia_data['Total Worms']>= 150] 
ia_data['comp_count'] = ia_data.groupby('Compound')['Compound'].transform('count')
ia_data.loc[ia_data['comp_count']<3] 

Unnamed: 0,WellNo,area,Total Worms,Chemotaxis,Compound,Strain,File Name,Well width,Plate ID,Passes QC,UB_Compound,comp_count
7,2D,2332527,156,0.220588,F11,CB4856,S1CB_007,3052,S1CB_R3_2,Y,Piperitenone,2
11,3D,2339563,261,0.228814,F11,,S1CB_011,3052,S1CB_R4_11,Y,Piperitenone,2


#### Reading in all of the files that contain worm positions based on the exclusion criteria above

In [4]:
wrm_locs_fldr = plb.Path('C:/Users/Emily/Desktop/screens/CB4856_screen/ia/')

In [5]:
def get_worm_locs(row, wrms, result_dict): 

    fname = row['File Name']
    wellnum = row['WellNo']
    loc_fname =  wrms.joinpath('loc_' + fname + '_' + wellnum + '.csv')
    temp = pd.read_csv(loc_fname)
    compound = row['UB_Compound']
    xs = temp['X']
    mean_pos = xs.mean()
    #xs = list(temp['centroid-1'])
    if compound in result_dict:
        result_dict[compound] = result_dict[compound].append(xs)
        result_dict[compound].reset_index(inplace=True, drop=True)
        
        #result_dict[compound] = result_dict[compound]+xs
    else:
        result_dict[compound]=xs
    
    return result_dict, mean_pos

In [6]:
ia_data = ia_data.sort_values(['UB_Compound', 'File Name'], ascending=[True, True])


#### To perform statistical analyses we need to pool all of the worm positiions from each biological replicate for each condition.

<p> We also want to capture the first 3 biological replicates performed for each condition. Some conditions were captured 4 times due to errors made during the screening process <p>

In [7]:
cols = list(ia_data.columns)
cols.append('MeanPos')
means_df = pd.DataFrame(columns=cols)

In [14]:
# Need to create an empty dictionary to hold the values
results_dict = {}
compound = ''
i=0
idx=0

for index, row in ia_data.iterrows():
    if row['UB_Compound'] == compound:
        i += 1
        compound = row['UB_Compound']
        if i < 3:
            pooled, m = get_worm_locs(row, wrm_locs_fldr, results_dict)
            row['mean'] = m
            
            idx =+ 1
        else:
            continue
    else:
        i = 0
        compound = row['UB_Compound']
        pooled, m = get_worm_locs(row, wrm_locs_fldr, results_dict)
        row['mean'] = m

    
#Remember that Dabest requires a dataframe. Casting dict to df
pooled_df = pd.DataFrame.from_dict(pooled)

Unnamed: 0,WellNo,area,Total Worms,Chemotaxis,Compound,Strain,File Name,Well width,Plate ID,Passes QC,UB_Compound,comp_count
7,2D,2358352,169,0.467532,E11,CB4856,S1CB_003,3044,S1CB_R1_10,Y,1-octanol,3
7,2D,2277502,283,-0.447471,E11,CB4856,S1CB_006,3046,S1CB_R2_10,Y,1-octanol,3
7,2D,2331200,226,-0.263682,E11,,S1CB_011,3050,S1CB_R4_10,Y,1-octanol,3
8,3A,2330855,222,0.115789,B8,Strain column not matched,S1CB_005,3054,S1CB_R2_7,Y,"2,3-Dihydrobenzofuran",3
2,1C,2287166,207,0.602151,B8,CB4856,S1CB_008,3030,S1CB_R3_5,Y,"2,3-Dihydrobenzofuran",3
...,...,...,...,...,...,...,...,...,...,...,...,...
2,1C,2347221,158,0.410072,B4,CB4856,S1CB_004,3040,S1CB_R2_1,Y,p-Tolualdehyde,3
2,1C,2380336,201,0.392265,B4,,S1CB_009,3024,S1CB_R4_1,Y,p-Tolualdehyde,3
3,1D,2338182,195,-0.261146,F5,CB4856,S1CB_005,3060,S1CB_R2_5,Y,α-Phellandrene,3
5,2B,2261405,369,0.095385,F5,CB4856,S1CB_008,3015,S1CB_R3_6,Y,α-Phellandrene,3


#### Converting the worm locations from measurements in dots per inch to millimeters

In [11]:
middle = ia_data['Well width'].mean()/2

# 1 inch = 25.4mm
mm = 25.4
# 1200 pixels per 25.4mm
px_mm = 1200/mm

middle_mm = middle/px_mm

#The following transforms the worm location data so that it is respective to the start zone
# The start zone is at the center of the image; Start Zone = 0mm
#Worms with positive values are closer to the compound, negative values are away from the compound
mm_df = pooled_df.apply(lambda x: -(x/px_mm)+middle_mm)
means_df['MeanPos'] = means_df['MeanPos'].apply(lambda x: -(int(x)/px_mm)+32.5)

means_df.to_csv('/Users/Emily/Desktop/screens/CB4856_screen/CB_means.csv')
#mm_df.to_csv('C:/Users/Emily/Documents/S1/S1_xs3.csv')

In [10]:
mm_df.to_csv('C:/Users/Emily/Desktop/CB4856_screen/CBMM.csv')

In [11]:
sub = mm_df.columns.tolist()
sub.remove('Empty:Empty')

In [12]:
n2 = pd.read_csv('C:/Users/Emily/Documents/S1/S1_xs3.csv', index_col=0, usecols=sub)

In [13]:
print(print(len(sub)))

43
None


In [14]:
print(len(n2.columns))

42


#### Creating an ordered list of compounds to pass to Dabest to calculate confidence intervals
<p> The control condition always needs to be the first item in the list

In [15]:
ia_data.head()

Unnamed: 0,WellNo,area,Total Worms,Chemotaxis,Compound,Strain,File Name,Well width,Plate ID,Passes QC,UB_Compound,comp_count
7,2D,2358352,169,0.467532,E11,CB4856,S1CB_003,3044,S1CB_R1_10,Y,1-octanol,3
7,2D,2277502,283,-0.447471,E11,CB4856,S1CB_006,3046,S1CB_R2_10,Y,1-octanol,3
7,2D,2331200,226,-0.263682,E11,,S1CB_011,3050,S1CB_R4_10,Y,1-octanol,3
8,3A,2330855,222,0.115789,B8,Strain column not matched,S1CB_005,3054,S1CB_R2_7,Y,"2,3-Dihydrobenzofuran",3
2,1C,2287166,207,0.602151,B8,CB4856,S1CB_008,3030,S1CB_R3_5,Y,"2,3-Dihydrobenzofuran",3


In [16]:
sums = ia_data.groupby('UB_Compound').mean().reset_index()
ia_sort = sums.copy()
condition = (ia_sort.UB_Compound=='DMSO')
excluded = ia_sort[condition]
included = ia_sort[~condition]
sortd = included.sort_values(by='Chemotaxis',ascending=True)
ia_sort = pd.concat([excluded, sortd])

cmpd_ordr = ia_sort['UB_Compound'].to_list()

In [17]:
cmpd_ordr

['DMSO',
 'Phytol',
 'Ethyl palmitate',
 'Guaiazulene',
 'Ethyl p-methoxycinnamate',
 'Ellagic acid',
 '2-nonanone',
 'Piperonyl Alcohol',
 '1-octanol',
 'α-Phellandrene',
 'Camphor',
 'Daucosterol',
 'Acetophenone',
 'Ursolic acid',
 '2,5-Dihydroxybenzoic acid',
 'Oleanolic Acid',
 'Cinnamyl Alcohol',
 'Salvinorin A Propionate',
 'Methyl palmitate',
 'Empty:Empty',
 'H2O',
 'Isoquinoline',
 'Spinosad',
 'Solasodine',
 'L-Mimosine',
 'Sinomenine hydrochloride',
 'Thiophene',
 'Limonin',
 'Sabinene',
 'HuperzineA',
 'Lapachol',
 'Carnosol',
 'Leonurine',
 'Piperitenone',
 'Phenylacetylene',
 'Furfural',
 'Paeoniflorin',
 '2,3-Dihydrobenzofuran',
 'p-Tolualdehyde',
 'Isoamyl alcohol',
 'Coumaran',
 'Anisole',
 '2-Methyl-1-butanol',
 'Diacetyl']

#### Generating confidence intervals using DMSO as the control group

In [18]:
db_obj = db.load(mm_df, idx=(cmpd_ordr))
pooled_mm_obj = db.load(mm_df, idx=(cmpd_ordr))
results_df_mm = pooled_mm_obj.mean_diff.results

In [19]:
results_df_mm.to_csv('C:/Users/Emily/Desktop/CB4856_screen/cb_confint.csv')

PermissionError: [Errno 13] Permission denied: 'C:/Users/Emily/Desktop/CB4856_screen/cb_confint.csv'

In [None]:
comps = results_df_mm['test']
comps = comps.tolist()
print(len(comps))

In [None]:
fig, ax = plt.subplots(figsize=(10,10)) 

positions = list(range(0, len(comps)))

lines = []

for index, row in results_df_mm.iterrows():
    line = ((row['bca_low'], index), (row['bca_high'], index))
    lines.append(line)


    ytick_labels = comps

    mdiffs = list(results_df_mm['difference'])

    hln_coll = mpl.collections.LineCollection(lines, colors='black', linewidths = .7)

    #ax[r,c] = plt.gca()


    x_positions = list(np.arange(-15, 20, 5))

    plt.yticks(positions, ytick_labels )
    plt.xticks(x_positions, x_positions)

    ax.axvline(0, ls='--', c='gray', zorder=1)
    #ax.fill_betweenx(positions, sub['bca_low'], sub['bca_high'], alpha=.25, joinstyle='round')
    ax.add_collection(hln_coll)
    ax.scatter(mdiffs, positions, s=8, c='black')
    ax.tick_params(axis='both', tickdir='in')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    ax.set_xlim(-12,10)    
    ax.set_ylim(-1, len(comps))


    plt.tight_layout()
    plt.rcParams['svg.fonttype'] = 'none'
#plt.savefig('C:/Users/Emily/Desktop/DS/LRCI.svg')