# 1. Imports and File selection 

In [3]:
import io
import ipywidgets as widgets
import math
import numpy
import psycopg
import pandas as pd
import requests
import sqlite3
import sys
import tqdm
import warnings

from config import load_config
from ipyfilechooser import FileChooser
from scipy import stats
from scipy.stats import ttest_ind
from sqlalchemy import create_engine
from sqlalchemy.dialects.postgresql import insert
from sqlite3 import Error
from sqlite3 import IntegrityError

## Select Baseline .csv File

In [4]:
# Open the file browser at the mounted-volumes root so the baseline .csv can be picked.
starting_directory = '/Volumes'
baseline_chooser = FileChooser(path=starting_directory)
display(baseline_chooser)

FileChooser(path='/Volumes', filename='', title='', show_hidden=False, select_desc='Select', change_desc='Chan…

## Select Tap .csv File

In [5]:
# Separate file browser for the tap-response .csv, rooted at the mounted volumes.
tap_chooser = FileChooser(path='/Volumes')
display(tap_chooser)

FileChooser(path='/Volumes', filename='', title='', show_hidden=False, select_desc='Select', change_desc='Chan…

## Select Post Stimulus Arousal .csv File

In [6]:
# Separate file browser for the post-stimulus-arousal (psa) .csv, rooted at the mounted volumes.
psa_chooser = FileChooser(path='/Volumes')
display(psa_chooser)

FileChooser(path='/Volumes', filename='', title='', show_hidden=False, select_desc='Select', change_desc='Chan…

In [7]:
# Screen names offered in the selector; the first entry is pre-selected.
screens = [
    'PD_Screen',
    'ASD_Screen',
    'G-Proteins_Screen',
    'Glia_Genes_Screen',
    'Neuron_Genes_Screen',
    'PD_GWAS_Locus71_Screen',
    'ASD_WGS_Screen',
    'Miscellaneous',
]

screen_chooser = widgets.Select(
    description='Screen:',
    options=screens,
    value=screens[0],
)
display(screen_chooser)

Select(description='Screen:', options=('PD_Screen', 'ASD_Screen', 'G-Proteins_Screen', 'Glia_Genes_Screen', 'N…

In [8]:
# Freeze the widget selections into plain variables for the cells below.
# `Screen` is read as a global inside calculate_MSD and the habituation cell.
Screen = screen_chooser.value
folder_path = baseline_chooser.selected_path
print(folder_path)

/Volumes/RankinLabMehak_SSD/PD_Uncertain_Loci_Locus71


## Read baseline, tap and post stimulus arousal (psa) data

In [9]:
# Load the baseline measurements; the first CSV column is used as the row index.
baseline_output = pd.read_csv(
    baseline_chooser.selected,
    index_col=0,
)

print(f"\nShape of the baseline .csv file: {baseline_output.shape}")

# Preview the first five rows
baseline_output.head()


Shape of the baseline .csv file: (266327, 21)


Unnamed: 0,Time,n,Number,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,...,Kink,Curve,Crab,Pathlength,Plate_id,Date,Screen,dataset,Gene,Allele
11624,490.036,58,41,0.0362,0.0374,-0.024,0.1008,1.1194,0.137088,2.1,...,39.3,26.0,0.0041,2.297,20240717_031256_A0717aa,20240717,PD_GWAS_Locus71_Screen,N2,N2,N2
11625,490.079,58,41,0.0378,0.0384,0.0,0.1007,1.1201,0.136839,2.1,...,39.2,26.2,0.005,2.297,20240717_031256_A0717aa,20240717,PD_GWAS_Locus71_Screen,N2,N2,N2
11626,490.122,58,41,0.0422,0.042,0.0,0.1004,1.1186,0.13659,2.5,...,40.3,26.5,0.0064,2.297,20240717_031256_A0717aa,20240717,PD_GWAS_Locus71_Screen,N2,N2,N2
11627,490.163,58,41,0.0402,0.0459,0.0,0.1001,1.1138,0.136092,2.6,...,40.4,26.4,0.0057,2.297,20240717_031256_A0717aa,20240717,PD_GWAS_Locus71_Screen,N2,N2,N2
11628,490.203,58,41,0.0406,0.0459,0.0,0.1029,1.112,0.136199,2.3,...,43.3,26.7,0.0057,2.297,20240717_031256_A0717aa,20240717,PD_GWAS_Locus71_Screen,N2,N2,N2


In [10]:
# Load the tap-response measurements; the first CSV column is used as the row index.
tap_output = pd.read_csv(tap_chooser.selected, index_col=0)

# Label corrected: this cell reports on the tap file (it previously said "psa").
print(f"\nShape of the tap .csv file: {tap_output.shape}")

# Preview the first five rows
tap_output.head()


Shape of the psa .csv file: (4400, 13)


Unnamed: 0,time,dura,dist,prob,speed,plate,Date,Plate_id,Screen,taps,dataset,Gene,Allele
0,599.971,3.21,0.786,0.891892,0.24486,1,20241002,20241002_034746_bb,PD_GWAS_Locus71_Screen,1.0,F56C9.1_tm5851,F56C9.1,tm5851
1,609.968,2.32,0.637,0.918919,0.274569,1,20241002,20241002_034746_bb,PD_GWAS_Locus71_Screen,2.0,F56C9.1_tm5851,F56C9.1,tm5851
2,619.984,2.29,0.709,0.883721,0.309607,1,20241002,20241002_034746_bb,PD_GWAS_Locus71_Screen,3.0,F56C9.1_tm5851,F56C9.1,tm5851
3,629.96,1.68,0.568,0.84,0.338095,1,20241002,20241002_034746_bb,PD_GWAS_Locus71_Screen,4.0,F56C9.1_tm5851,F56C9.1,tm5851
4,639.961,1.66,0.451,0.87037,0.271687,1,20241002,20241002_034746_bb,PD_GWAS_Locus71_Screen,5.0,F56C9.1_tm5851,F56C9.1,tm5851


In [11]:
# Load the post-stimulus-arousal (psa) measurements; the first CSV column is used as the row index.
psa_output = pd.read_csv(psa_chooser.selected, index_col=0)

# Prefix every behavioural metric column with "PSA " in a single rename.
# Labels that are absent from the file are skipped (rename ignores missing labels by default).
psa_output = psa_output.rename(columns={
    col: f"PSA {col}"
    for col in ['Instantaneous Speed', 'Interval Speed',
                'Bias', 'Morphwidth', 'Midline', 'Area', 'Angular Speed',
                'Aspect Ratio', 'Kink', 'Curve', 'Crab', 'Pathlength']
})

# Label corrected: this cell reports on the psa file (it previously said "tap").
print(f"\nShape of the psa .csv file: {psa_output.shape}")

# Preview the first five rows
psa_output.head()


Shape of the tap .csv file: (4433, 24)


Unnamed: 0,Experiment,Screen,Date,Plate_id,Gene,Allele,dataset,taps,Time,n,...,Tap,PSA Morphwidth,PSA Midline,PSA Area,PSA Angular Speed,PSA Aspect Ratio,PSA Kink,PSA Curve,PSA Crab,PSA Pathlength
0,1,PD_GWAS_Locus71_Screen,20240717,20240717_031256_A0717aa,N2,N2,N2,1.0,607.029,67.065217,...,0.0,0.113857,1.098531,0.139064,7.689131,0.466565,73.64565,39.419567,0.014424,2.390739
1,1,PD_GWAS_Locus71_Screen,20240717,20240717_031256_A0717aa,N2,N2,N2,2.0,617.023,66.896552,...,0.0,0.100129,1.124203,0.13848,12.524138,0.344966,54.24655,34.47586,0.023114,2.504552
2,1,PD_GWAS_Locus71_Screen,20240717,20240717_031256_A0717aa,N2,N2,N2,3.0,627.023,66.678571,...,0.0,0.101554,1.098889,0.132173,11.882143,0.3165,47.714287,33.060715,0.023891,3.211428
3,1,PD_GWAS_Locus71_Screen,20240717,20240717_031256_A0717aa,N2,N2,N2,4.0,637.037,64.190476,...,0.0,0.096886,1.11255,0.13139,12.773809,0.303762,47.07143,32.52381,0.02601,4.067333
4,1,PD_GWAS_Locus71_Screen,20240717,20240717_031256_A0717aa,N2,N2,N2,5.0,647.032,66.088235,...,0.0,0.095765,1.109976,0.130992,10.367647,0.275471,41.8,31.529411,0.022841,5.233


### Merge PSA with Tap response

In [12]:
# Outer-join the tap responses with the PSA rows on plate identity + tap number,
# keep the columns of interest, and give the tap metrics readable names.
tap_psa_output = (
    pd.merge(
        tap_output,
        psa_output.drop(columns=['Experiment', 'Time', 'Tap', 'PSA Morphwidth',
                                 'PSA Midline', 'PSA Area', 'PSA Angular Speed']),
        on=['dataset', 'Gene', 'Allele', 'Date', 'Plate_id', 'Screen', 'taps'],
        how='outer',
    )
    .loc[:, ['dataset', 'Gene', 'Allele', 'Date', 'Plate_id', 'plate',
             'Screen', 'taps', 'time', 'dura', 'dist', 'prob', 'speed',
             'PSA Instantaneous Speed', 'PSA Interval Speed', 'PSA Bias',
             'PSA Aspect Ratio', 'PSA Kink', 'PSA Curve', 'PSA Crab']]
    .rename(columns={'prob': 'Probability', 'dura': 'Duration', 'speed': 'Speed'})
)

print(f"Shape of the dataframe: {tap_psa_output.shape}")

tap_psa_output.head()

Shape of the dataframe: (4433, 20)


Unnamed: 0,dataset,Gene,Allele,Date,Plate_id,plate,Screen,taps,time,Duration,dist,Probability,Speed,PSA Instantaneous Speed,PSA Interval Speed,PSA Bias,PSA Aspect Ratio,PSA Kink,PSA Curve,PSA Crab
0,F56C9.1_tm5851,F56C9.1,tm5851,20241002,20241002_034746_bb,1.0,PD_GWAS_Locus71_Screen,1.0,599.971,3.21,0.786,0.891892,0.24486,0.098753,0.098847,0.402842,0.404395,65.323685,36.576317,0.015576
1,F56C9.1_tm5851,F56C9.1,tm5851,20241002,20241002_034746_bb,1.0,PD_GWAS_Locus71_Screen,2.0,609.968,2.32,0.637,0.918919,0.274569,0.192097,0.126659,0.682719,0.305594,52.35625,32.453125,0.019709
2,F56C9.1_tm5851,F56C9.1,tm5851,20241002,20241002_034746_bb,1.0,PD_GWAS_Locus71_Screen,3.0,619.984,2.29,0.709,0.883721,0.309607,0.248714,0.110986,0.902107,0.261536,41.864285,31.189285,0.021886
3,F56C9.1_tm5851,F56C9.1,tm5851,20241002,20241002_034746_bb,1.0,PD_GWAS_Locus71_Screen,4.0,629.96,1.68,0.568,0.84,0.338095,0.237523,0.114471,0.881019,0.256212,38.01154,29.651924,0.020067
4,F56C9.1_tm5851,F56C9.1,tm5851,20241002,20241002_034746_bb,1.0,PD_GWAS_Locus71_Screen,5.0,639.961,1.66,0.451,0.87037,0.271687,0.217495,0.101515,0.919805,0.231561,41.685364,28.712194,0.014661


In [13]:
# tap_psa_output.to_csv("tap_psa_output.csv")

# 2. DataFrame preparation

### 2.1. Tap Data

In [14]:
# Dataframe for the first tap (taps == 1): the initial response of each plate.
# reset_index(drop=True) discards the old row labels directly; the previous
# reset_index().drop(columns="index") failed when the index carried a name and
# would have removed a genuine column called "index".
PD_first_tap = (
    tap_output.loc[tap_output.taps == 1]
    .reset_index(drop=True)
    .rename(columns={"dura": "init_dura", "prob": "init_prob", "speed": "init_speed"}, errors="raise")
)

PD_first_tap.head()

Unnamed: 0,time,init_dura,dist,init_prob,init_speed,plate,Date,Plate_id,Screen,taps,dataset,Gene,Allele
0,599.971,3.21,0.786,0.891892,0.24486,1,20241002,20241002_034746_bb,PD_GWAS_Locus71_Screen,1.0,F56C9.1_tm5851,F56C9.1,tm5851
1,599.937,3.45,0.812,0.702703,0.235362,2,20241002,20241002_050423_be,PD_GWAS_Locus71_Screen,1.0,F56C9.1_tm5851,F56C9.1,tm5851
2,599.99,3.34,0.811,0.892857,0.242814,3,20241002,20241002_102947_B1002ba,PD_GWAS_Locus71_Screen,1.0,F56C9.1_tm5851,F56C9.1,tm5851
3,599.994,2.54,0.562,0.87234,0.22126,4,20241002,20241002_112933_C1002bc,PD_GWAS_Locus71_Screen,1.0,F56C9.1_tm5851,F56C9.1,tm5851
4,599.98,3.1,0.684,0.794872,0.220645,5,20241002,20241002_114941_B1002bd,PD_GWAS_Locus71_Screen,1.0,F56C9.1_tm5851,F56C9.1,tm5851


In [15]:
# Dataframe for the recovery tap (taps == 31).
# reset_index(drop=True) discards the old row labels directly; the previous
# reset_index().drop(columns="index") failed when the index carried a name and
# would have removed a genuine column called "index".
# errors="raise" matches the first-tap and final-tap cells, so a missing source
# column fails here instead of later when recov_* columns are used.
PD_recov_taps = (
    tap_output.loc[tap_output.taps == 31]
    .reset_index(drop=True)
    .rename(columns={"dura": "recov_dura", "prob": "recov_prob", "speed": "recov_speed"}, errors="raise")
)

PD_recov_taps.head()

Unnamed: 0,time,recov_dura,dist,recov_prob,recov_speed,plate,Date,Plate_id,Screen,taps,dataset,Gene,Allele
0,1189.989,1.82,0.397,0.814815,0.218132,1,20241002,20241002_034746_bb,PD_GWAS_Locus71_Screen,31.0,F56C9.1_tm5851,F56C9.1,tm5851
1,1189.977,2.44,0.595,0.857143,0.243852,2,20241002,20241002_050423_be,PD_GWAS_Locus71_Screen,31.0,F56C9.1_tm5851,F56C9.1,tm5851
2,1189.988,2.59,0.575,0.870968,0.222008,3,20241002,20241002_102947_B1002ba,PD_GWAS_Locus71_Screen,31.0,F56C9.1_tm5851,F56C9.1,tm5851
3,1189.976,2.79,0.648,0.931034,0.232258,4,20241002,20241002_112933_C1002bc,PD_GWAS_Locus71_Screen,31.0,F56C9.1_tm5851,F56C9.1,tm5851
4,1189.976,2.73,0.628,0.891892,0.230037,5,20241002,20241002_114941_B1002bd,PD_GWAS_Locus71_Screen,31.0,F56C9.1_tm5851,F56C9.1,tm5851


In [16]:
# Dataframe for taps 28-30 (both ends inclusive), averaged per plate.
PD_final_taps = (
    tap_output.loc[tap_output.taps.between(28, 30)]
    .groupby(["dataset", "Date", "Plate_id", "Screen", "Gene", "Allele", "plate"])
    .mean()
    .reset_index()
    .rename(
        columns={"dura": "final_dura", "prob": "final_prob", "speed": "final_speed"},
        errors="raise",
    )
)

PD_final_taps.head()

Unnamed: 0,dataset,Date,Plate_id,Screen,Gene,Allele,plate,time,final_dura,dist,final_prob,final_speed,taps
0,F56C9.1_tm5851,20241002,20241002_034746_bb,PD_GWAS_Locus71_Screen,F56C9.1,tm5851,1,879.99,0.57,0.118333,0.664508,0.208919,29.0
1,F56C9.1_tm5851,20241002,20241002_050423_be,PD_GWAS_Locus71_Screen,F56C9.1,tm5851,2,879.989333,0.913333,0.212,0.616558,0.229307,29.0
2,F56C9.1_tm5851,20241002,20241002_102947_B1002ba,PD_GWAS_Locus71_Screen,F56C9.1,tm5851,3,879.911,0.963333,0.222333,0.447405,0.228549,29.0
3,F56C9.1_tm5851,20241002,20241002_112933_C1002bc,PD_GWAS_Locus71_Screen,F56C9.1,tm5851,4,879.952667,1.023333,0.208333,0.426684,0.203101,29.0
4,F56C9.1_tm5851,20241002,20241002_114941_B1002bd,PD_GWAS_Locus71_Screen,F56C9.1,tm5851,5,879.923333,1.21,0.236667,0.510495,0.195392,29.0


In [17]:
# Habituation levels: pair each plate's first tap with its averaged final taps,
# discard the duplicated bookkeeping columns and incomplete rows, then take
# initial-minus-final for every response measure.
PD_habit_levels = (
    pd.merge(
        PD_first_tap,
        PD_final_taps,
        how='left',
        on=['dataset', 'plate', "Plate_id", "Screen", "Gene", "Allele", "Date"],
    )
    .drop(columns=['time_x', 'time_y', 'dist_x', 'dist_y', 'taps_x', 'taps_y'])
    .dropna()
    .assign(
        habit_dura=lambda d: d['init_dura'] - d['final_dura'],
        habit_prob=lambda d: d['init_prob'] - d['final_prob'],
        habit_speed=lambda d: d['init_speed'] - d['final_speed'],
    )
)

In [18]:
# Attach the recovery tap to the habituation levels. When there are no recovery
# rows at all the join is 'outer'; otherwise it is a 'left' join.
PD_habituation = pd.merge(
    PD_habit_levels,
    PD_recov_taps,
    on=['dataset', 'plate', "Plate_id", "Screen", "Gene", "Allele", "Date"],
    how='outer' if PD_recov_taps.empty else 'left',
)

# For these two screens rows with missing values are kept at this stage.
if Screen not in ['Neuron_Genes_Screen', 'G-Proteins_Screen']:
    PD_habituation = PD_habituation.dropna()

PD_habituation = PD_habituation.assign(
    # recovery_*: percent change of the recovery response relative to the initial response
    recovery_dura=lambda d: (d.recov_dura - d.init_dura) / d.init_dura * 100,
    recovery_prob=lambda d: (d.recov_prob - d.init_prob) / d.init_prob * 100,
    recovery_speed=lambda d: (d.recov_speed - d.init_speed) / d.init_speed * 100,
    # memory_retention_*: recovery response minus the averaged final response
    memory_retention_dura=lambda d: d.recov_dura - d.final_dura,
    memory_retention_prob=lambda d: d.recov_prob - d.final_prob,
    memory_retention_speed=lambda d: d.recov_speed - d.final_speed,
)

# `tap_data` is the final table: for the two screens above only the
# initial/final/habituation columns must be complete; otherwise every column must be.
if Screen in ['Neuron_Genes_Screen', 'G-Proteins_Screen']:
    tap_data = PD_habituation.dropna(subset=[
        'init_dura', 'init_prob', 'init_speed', 'plate', 'Date', 'Plate_id',
        'Screen', 'dataset', 'Gene', 'Allele', 'final_dura', 'final_prob',
        'final_speed', 'habit_dura', 'habit_prob', 'habit_speed',
    ])
else:
    tap_data = PD_habituation.dropna()


# Display final dataframe
tap_data.head()


Unnamed: 0,init_dura,init_prob,init_speed,plate,Date,Plate_id,Screen,dataset,Gene,Allele,...,dist,recov_prob,recov_speed,taps,recovery_dura,recovery_prob,recovery_speed,memory_retention_dura,memory_retention_prob,memory_retention_speed
0,3.21,0.891892,0.24486,1,20241002,20241002_034746_bb,PD_GWAS_Locus71_Screen,F56C9.1_tm5851,F56C9.1,tm5851,...,0.397,0.814815,0.218132,31.0,-43.302181,-8.641975,-10.915611,1.25,0.150307,0.009213
1,3.45,0.702703,0.235362,2,20241002,20241002_050423_be,PD_GWAS_Locus71_Screen,F56C9.1_tm5851,F56C9.1,tm5851,...,0.595,0.857143,0.243852,31.0,-29.275362,21.978022,3.607264,1.526667,0.240585,0.014546
2,3.34,0.892857,0.242814,3,20241002,20241002_102947_B1002ba,PD_GWAS_Locus71_Screen,F56C9.1_tm5851,F56C9.1,tm5851,...,0.575,0.870968,0.222008,31.0,-22.45509,-2.451613,-8.568953,1.626667,0.423563,-0.006542
3,2.54,0.87234,0.22126,4,20241002,20241002_112933_C1002bc,PD_GWAS_Locus71_Screen,F56C9.1_tm5851,F56C9.1,tm5851,...,0.648,0.931034,0.232258,31.0,9.84252,6.728343,4.970727,1.766667,0.504351,0.029157
4,3.1,0.794872,0.220645,5,20241002,20241002_114941_B1002bd,PD_GWAS_Locus71_Screen,F56C9.1_tm5851,F56C9.1,tm5851,...,0.628,0.891892,0.230037,31.0,-11.935484,12.205754,4.256367,1.52,0.381397,0.034644


### 2.2. PSA data

In [19]:
# Function to calculate Initial, Recovery, Peak, etc. values for a specified column (metric)

def summary_metrics(df, metric = 'Instantaneous Speed'):
    """
    Summarise one metric column of a group of rows.

    Parameters:
        df (pd.DataFrame): rows of one group, in the order they should be read;
            must contain `metric` and a 'taps' column
        metric (str): name of the column to summarise

    Returns:
        pd.Series with, for `metric`: the first value, the last value, the maximum,
        the 'taps' value at the maximum, the mean from the first row up to and
        including the peak row, the mean from the peak row to the last row, and
        the overall mean.
    """
    values = df[metric]

    # Position of the peak. NaNs are ignored so the position agrees with
    # `values.max()`, which also skips NaN (a plain argmax returns the position
    # of the first NaN instead of the maximum).
    if values.notna().any():
        peak_id = int(numpy.nanargmax(values.to_numpy(dtype=float, na_value=numpy.nan)))
    else:
        # All-NaN group: position 0 is what the plain argmax reported as well.
        peak_id = 0

    return pd.Series({
        f'Initial {metric}': values.iloc[0],
        f'Recovery {metric}': values.iloc[-1],
        f'Peak {metric}': values.max(),
        f'Peak Tap Number {metric}': df['taps'].iloc[peak_id],
        # both slices include the peak row itself
        f'Initial_to_peak {metric}': values.iloc[: peak_id + 1].mean(),
        f'Peak_to_recovery {metric}': values.iloc[peak_id:].mean(),
        f'Average {metric}': values.mean()
        })

In [20]:
# NOTE: this silences every warning from here on, not only those raised in this cell.
warnings.filterwarnings('ignore')

# PSA columns that get a per-plate summary
metrics_to_summarize = [
    'PSA Instantaneous Speed',
    'PSA Bias',
    'PSA Angular Speed',
    'PSA Aspect Ratio',
    'PSA Kink',
    'PSA Curve',
    'PSA Crab',
]

# Columns that identify one plate
group_cols = ['Experiment', 'Plate_id', 'Date', 'Screen', 'dataset', 'Gene', 'Allele']

# Start from one row per plate, then left-join the summary of each metric
# (computed by `summary_metrics` on that plate's rows) onto it.
psa_data = psa_output[group_cols].drop_duplicates()
for metric in metrics_to_summarize:
    summary = (
        psa_output
        .groupby(group_cols)
        .apply(lambda plate_rows: summary_metrics(plate_rows, metric))
        .reset_index()
    )
    psa_data = psa_data.merge(summary, on=group_cols, how='left')

In [21]:
# Preview the first rows of the per-plate PSA summary table
psa_data.head()

Unnamed: 0,Experiment,Plate_id,Date,Screen,dataset,Gene,Allele,Initial PSA Instantaneous Speed,Recovery PSA Instantaneous Speed,Peak PSA Instantaneous Speed,...,Initial_to_peak PSA Curve,Peak_to_recovery PSA Curve,Average PSA Curve,Initial PSA Crab,Recovery PSA Crab,Peak PSA Crab,Peak Tap Number PSA Crab,Initial_to_peak PSA Crab,Peak_to_recovery PSA Crab,Average PSA Crab
0,1,20240717_031256_A0717aa,20240717,PD_GWAS_Locus71_Screen,N2,N2,N2,0.081913,0.088761,0.245082,...,39.419567,30.828508,30.828508,0.014424,0.010394,0.02601,4.0,0.02186,0.015312,0.015811
1,2,20240717_043608_A0717ad,20240717,PD_GWAS_Locus71_Screen,N2,N2,N2,0.092412,0.151708,0.274664,...,40.54407,30.863218,30.863218,0.013585,0.017172,0.027013,3.0,0.021999,0.016589,0.016776
2,3,20240717_105144_B0717ab,20240717,PD_GWAS_Locus71_Screen,N2,N2,N2,0.095345,0.1005,0.2634,...,37.012123,30.774809,30.774809,0.014,0.01269,0.026975,2.0,0.020487,0.016356,0.01628
3,4,20240717_112334_C0717ac,20240717,PD_GWAS_Locus71_Screen,N2,N2,N2,0.110297,0.110721,0.239817,...,30.606978,35.99474,30.606978,0.014845,0.0121,0.027056,3.0,0.02194,0.015281,0.015545
4,5,20240717_121311_B0717ae,20240717,PD_GWAS_Locus71_Screen,N2,N2,N2,0.096595,0.103742,0.28022,...,38.663635,29.815205,29.815205,0.015555,0.013211,0.025865,3.0,0.022334,0.015424,0.015756


In [22]:
# (rows, columns) of the per-plate PSA summary table
psa_data.shape

(143, 56)

# 3. Run Statistics (T-Test and mean sample distance) on Data

## 3.1 Generate dataframes conditioned by `baseline` ("true"/"false"/"psa") and `allele` (True/False)

In [23]:
def get_output_byplate(output, baseline=["true", "false", "psa"], allele = [False, True]):
    """
    Average a table per plate and keep a single label column plus the measurements.

    Rows are grouped by 'Plate_id', 'Date', 'Screen', 'dataset', 'Gene' and 'Allele',
    every remaining column is averaged, and the bookkeeping columns for the chosen
    table type are then dropped.

    Parameters:
        output (pd.DataFrame): table to aggregate (baseline, tap or psa data)
        baseline (str): "true" for baseline data, "false" for tap-response data;
            any other value (including the default) is treated as psa data
        allele (bool): if truthy (the default is truthy) the result is labelled by
            'dataset' and 'Gene' is dropped; if falsy it is labelled by 'Gene'
            and 'dataset' is dropped

    Returns:
        A DataFrame with one row per plate holding the plate-level averages
    """
    if baseline == "true":
        # bookkeeping columns of the baseline table
        unwanted = ['Plate_id', 'n', 'Number', 'Time', 'Screen', 'Date', 'Allele']
    elif baseline == "false":
        # bookkeeping and recovery-tap columns of the tap table
        unwanted = ['Plate_id', 'Screen', 'Date', 'Allele', 'dist', 'plate', 'time',
                    'taps', 'recov_dura', 'recov_prob', 'recov_speed']
    else:
        # bookkeeping columns of the psa table
        unwanted = ['Experiment', 'Plate_id', 'Date', 'Screen', 'Allele']

    # Keep exactly one label column: 'dataset' when grouping by allele, else 'Gene'.
    unwanted.append('Gene' if allele else 'dataset')

    plate_means = output.groupby(
        by=['Plate_id', 'Date', 'Screen', 'dataset', 'Gene', 'Allele'],
        as_index=False,
    ).mean()

    return plate_means.drop(columns=unwanted)

#### 3.1.1 `baseline` = True, `allele` = False

In [24]:
# Plate-level baseline averages, labelled by gene.
baseline_output_byplate = get_output_byplate(
    baseline_output,
    baseline="true",
    allele=False,
)

print(f"Shape: {baseline_output_byplate.shape}")

baseline_output_byplate.head()

Shape: (143, 13)


Unnamed: 0,Gene,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
0,N2,0.037623,0.046815,0.02589,0.103025,1.106846,0.136614,2.652896,0.2638,43.818739,28.742547,0.006616,2.559248
1,tlk-1,0.046806,0.054001,0.224306,0.109251,0.961308,0.123101,3.332453,0.293602,49.047317,29.077087,0.007296,8.393271
2,N2,0.052749,0.060365,0.178015,0.107982,1.166192,0.148761,2.823113,0.264263,45.193378,29.452678,0.007588,12.283984
3,tlk-1,0.066274,0.065152,0.340591,0.101147,0.939145,0.116957,4.630818,0.274451,43.769097,26.835439,0.008497,5.857463
4,tlk-1,0.057581,0.062705,0.31205,0.105636,0.895827,0.113698,3.392918,0.256364,38.721597,24.534756,0.007549,9.269532


#### 3.1.2 `baseline` = False, `allele` = False

In [25]:
# Plate-level tap-response averages, labelled by gene.
tap_data_byplate = get_output_byplate(
    tap_data,
    baseline="false",
    allele=False,
)

print(f"Shape: {tap_data_byplate.shape}")

tap_data_byplate.head()

Shape: (140, 16)


Unnamed: 0,Gene,init_dura,init_prob,init_speed,final_dura,final_prob,final_speed,habit_dura,habit_prob,habit_speed,recovery_dura,recovery_prob,recovery_speed,memory_retention_dura,memory_retention_prob,memory_retention_speed
0,N2,2.48,0.947368,0.202016,0.85,0.328378,0.191174,1.63,0.61899,0.010843,-36.290323,-4.659498,20.61952,0.73,0.574848,0.052497
1,tlk-1,1.62,0.766667,0.206173,0.996667,0.237044,0.166373,0.623333,0.529622,0.0398,-9.259259,-14.933837,9.544177,0.473333,0.41513,0.059478
2,N2,2.54,0.846154,0.2,0.683333,0.397547,0.201687,1.856667,0.448607,-0.001687,-37.401575,-14.049587,18.553459,0.906667,0.329726,0.03542
3,tlk-1,2.15,0.888889,0.237209,0.856667,0.279337,0.177961,1.293333,0.609552,0.059249,4.186047,-16.847826,-6.464461,1.383333,0.459793,0.043914
4,tlk-1,1.84,0.65625,0.213043,1.456667,0.303096,0.17899,0.383333,0.353154,0.034053,30.434783,-21.182266,2.287415,0.943333,0.214146,0.038926


#### 3.1.3 `baseline` = True, `allele` = True

In [26]:
# Plate-level baseline averages, labelled by dataset (gene + allele).
baseline_output_allele_byplate = get_output_byplate(
    baseline_output,
    baseline="true",
    allele=True,
)

print(f"Shape: {baseline_output_allele_byplate.shape}")

baseline_output_allele_byplate.head()

Shape: (143, 13)


Unnamed: 0,dataset,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
0,N2,0.037623,0.046815,0.02589,0.103025,1.106846,0.136614,2.652896,0.2638,43.818739,28.742547,0.006616,2.559248
1,tlk-1_tm2395,0.046806,0.054001,0.224306,0.109251,0.961308,0.123101,3.332453,0.293602,49.047317,29.077087,0.007296,8.393271
2,N2,0.052749,0.060365,0.178015,0.107982,1.166192,0.148761,2.823113,0.264263,45.193378,29.452678,0.007588,12.283984
3,tlk-1_tm2395,0.066274,0.065152,0.340591,0.101147,0.939145,0.116957,4.630818,0.274451,43.769097,26.835439,0.008497,5.857463
4,tlk-1_tm2395,0.057581,0.062705,0.31205,0.105636,0.895827,0.113698,3.392918,0.256364,38.721597,24.534756,0.007549,9.269532


#### 3.1.4 `baseline` = False, `allele` = True

In [27]:
# Plate-level tap-response averages, labelled by dataset (gene + allele).
tap_data_allele_byplate = get_output_byplate(
    tap_data,
    baseline="false",
    allele=True,
)

print(f"Shape: {tap_data_allele_byplate.shape}")

tap_data_allele_byplate.head()

Shape: (140, 16)


Unnamed: 0,dataset,init_dura,init_prob,init_speed,final_dura,final_prob,final_speed,habit_dura,habit_prob,habit_speed,recovery_dura,recovery_prob,recovery_speed,memory_retention_dura,memory_retention_prob,memory_retention_speed
0,N2,2.48,0.947368,0.202016,0.85,0.328378,0.191174,1.63,0.61899,0.010843,-36.290323,-4.659498,20.61952,0.73,0.574848,0.052497
1,tlk-1_tm2395,1.62,0.766667,0.206173,0.996667,0.237044,0.166373,0.623333,0.529622,0.0398,-9.259259,-14.933837,9.544177,0.473333,0.41513,0.059478
2,N2,2.54,0.846154,0.2,0.683333,0.397547,0.201687,1.856667,0.448607,-0.001687,-37.401575,-14.049587,18.553459,0.906667,0.329726,0.03542
3,tlk-1_tm2395,2.15,0.888889,0.237209,0.856667,0.279337,0.177961,1.293333,0.609552,0.059249,4.186047,-16.847826,-6.464461,1.383333,0.459793,0.043914
4,tlk-1_tm2395,1.84,0.65625,0.213043,1.456667,0.303096,0.17899,0.383333,0.353154,0.034053,30.434783,-21.182266,2.287415,0.943333,0.214146,0.038926


In [28]:
# tap_data_allele_byplate[tap_data_allele_byplate.dataset=='N2_XJ1']

#### 3.1.5 `baseline` = "psa" , `allele` = False

In [29]:
# Plate-level PSA summary averages, labelled by gene.
psa_data_byplate = get_output_byplate(
    psa_data,
    baseline="psa",
    allele=False,
)

print(f"Shape: {psa_data_byplate.shape}")

psa_data_byplate.head()

Shape: (143, 50)


Unnamed: 0,Gene,Initial PSA Instantaneous Speed,Recovery PSA Instantaneous Speed,Peak PSA Instantaneous Speed,Peak Tap Number PSA Instantaneous Speed,Initial_to_peak PSA Instantaneous Speed,Peak_to_recovery PSA Instantaneous Speed,Average PSA Instantaneous Speed,Initial PSA Bias,Recovery PSA Bias,...,Initial_to_peak PSA Curve,Peak_to_recovery PSA Curve,Average PSA Curve,Initial PSA Crab,Recovery PSA Crab,Peak PSA Crab,Peak Tap Number PSA Crab,Initial_to_peak PSA Crab,Peak_to_recovery PSA Crab,Average PSA Crab
0,N2,0.081913,0.088761,0.245082,5.0,0.195989,0.170619,0.172309,0.341196,0.364722,...,39.419567,30.828508,30.828508,0.014424,0.010394,0.02601,4.0,0.02186,0.015312,0.015811
1,tlk-1,0.071283,0.073303,0.146562,5.0,0.123265,0.117932,0.117868,0.439017,0.4685,...,30.204097,26.819351,26.916039,0.009778,0.009531,0.01751,5.0,0.014105,0.011417,0.011654
2,N2,0.092412,0.151708,0.274664,5.0,0.215709,0.192295,0.193415,0.329983,0.86225,...,40.54407,30.863218,30.863218,0.013585,0.017172,0.027013,3.0,0.021999,0.016589,0.016776
3,tlk-1,0.058442,0.069992,0.15619,7.0,0.126196,0.129233,0.127677,0.339542,0.348974,...,32.34915,27.14458,27.14458,0.010664,0.010084,0.020658,14.0,0.013855,0.012181,0.012664
4,tlk-1,0.077169,0.087164,0.163942,6.0,0.140181,0.139857,0.139143,0.383466,0.526472,...,32.91724,25.098708,25.098708,0.011209,0.011072,0.015605,6.0,0.014058,0.01155,0.011905


#### 3.1.6 `baseline` = "psa" , `allele` = True

In [30]:
# Plate-level PSA summary averages, labelled by dataset (gene + allele).
psa_data_allele_byplate = get_output_byplate(
    psa_data,
    baseline="psa",
    allele=True,
)

print(f"Shape: {psa_data_allele_byplate.shape}")

psa_data_allele_byplate.head()

Shape: (143, 50)


Unnamed: 0,dataset,Initial PSA Instantaneous Speed,Recovery PSA Instantaneous Speed,Peak PSA Instantaneous Speed,Peak Tap Number PSA Instantaneous Speed,Initial_to_peak PSA Instantaneous Speed,Peak_to_recovery PSA Instantaneous Speed,Average PSA Instantaneous Speed,Initial PSA Bias,Recovery PSA Bias,...,Initial_to_peak PSA Curve,Peak_to_recovery PSA Curve,Average PSA Curve,Initial PSA Crab,Recovery PSA Crab,Peak PSA Crab,Peak Tap Number PSA Crab,Initial_to_peak PSA Crab,Peak_to_recovery PSA Crab,Average PSA Crab
0,N2,0.081913,0.088761,0.245082,5.0,0.195989,0.170619,0.172309,0.341196,0.364722,...,39.419567,30.828508,30.828508,0.014424,0.010394,0.02601,4.0,0.02186,0.015312,0.015811
1,tlk-1_tm2395,0.071283,0.073303,0.146562,5.0,0.123265,0.117932,0.117868,0.439017,0.4685,...,30.204097,26.819351,26.916039,0.009778,0.009531,0.01751,5.0,0.014105,0.011417,0.011654
2,N2,0.092412,0.151708,0.274664,5.0,0.215709,0.192295,0.193415,0.329983,0.86225,...,40.54407,30.863218,30.863218,0.013585,0.017172,0.027013,3.0,0.021999,0.016589,0.016776
3,tlk-1_tm2395,0.058442,0.069992,0.15619,7.0,0.126196,0.129233,0.127677,0.339542,0.348974,...,32.34915,27.14458,27.14458,0.010664,0.010084,0.020658,14.0,0.013855,0.012181,0.012664
4,tlk-1_tm2395,0.077169,0.087164,0.163942,6.0,0.140181,0.139857,0.139143,0.383466,0.526472,...,32.91724,25.098708,25.098708,0.011209,0.011072,0.015605,6.0,0.014058,0.01155,0.011905


## 3.2 Calculate Mean Distances and CIs

In [31]:

def extract_phenotypes(df):
    '''
    Break a wide table into one two-column table per phenotype.

    input:
        df (pd.DataFrame): first column is the label column, every following
            column is a phenotype

    returns:
        list of DataFrames, one per phenotype column and in the same order, each
        an independent copy holding the label column and that phenotype
    '''
    label_col = df.columns[0]
    return [df[[label_col, phenotype]].copy() for phenotype in df.columns[1:]]



def ci95(df):
    """
    Add the bounds of a 95% Student-t confidence interval to a summary table.

    input: DataFrame with two-level columns (metric, statistic) that holds, for
        every metric, the statistics 'mean', 'count' and 'sem'. A top-level
        entry named 'Gene' is skipped.

    returns: the same DataFrame object (modified in place) with two extra columns
        per metric, (metric, 'ci95_hi') and (metric, 'ci95_lo'), computed with
        count - 1 degrees of freedom, centred on the mean and scaled by the
        standard error of the mean.
    """
    for metric in df.columns.levels[0]:
        if metric == 'Gene':
            continue

        block = df[metric]
        # One vectorised call per metric gives both bounds for every row.
        lower, upper = stats.t.interval(
            confidence=0.95,
            df=block['count'].to_numpy() - 1,
            loc=block['mean'].to_numpy(),
            scale=block['sem'].to_numpy(),
        )
        df[metric, 'ci95_hi'] = upper
        df[metric, 'ci95_lo'] = lower

    return df



def calculate_MSD(list_of_dfs, by):
    """
    For each phenotype table, compute per-group mean, count, SEM and 95% CI,
    then express mean and CI bounds relative to the N2 control mean.

    NOTE(review): a function with this same name and the same body is defined
    again in the following cell; on a top-to-bottom run that later definition
    replaces this one. Consider keeping only one of them.

    Parameters:
        list_of_dfs: iterable of DataFrames whose second column is the phenotype
        by: label handed to `groupby`; compared against "Gene" when the global
            `Screen` is "Neuron_Genes_Screen"

    Returns:
        list of DataFrames, one per input table, indexed by group
    """
    new_list_of_dfs = []
    
    for df in list_of_dfs:
        # Get phenotype column name (assuming 2nd column is the metric)
        pheno_col = df.columns[1]
        
        # Calculate statistics
        # (this local `stats` hides the scipy `stats` module inside this function only)
        stats = df.groupby(by)[df.columns[1]].agg(['mean', 'count', 'sem'])

        
        # Convert to MultiIndex if needed (more robust version)
        if not isinstance(stats.columns, pd.MultiIndex):
            stats.columns = pd.MultiIndex.from_tuples([(pheno_col, col) for col in stats.columns])
        
        # Calculate CI (appends ci95_hi and ci95_lo after mean, count, sem)
        stats_2 = ci95(stats)
        
        # Get N2 control data; reads the notebook-level `Screen` variable
        if Screen == "Neuron_Genes_Screen":
            N2_mask = stats_2.index == 'N2' if by == "Gene" else stats_2.index.isin(['N2_XJ1','N2_N2'])
        else:
            N2_mask = stats_2.index == 'N2'
            
        N2_data = stats_2[N2_mask]
        
        # Subtract N2 values: the mean of the first matching control row is removed
        # from columns 0, 3 and 4; raises IndexError when no control row matched
        stats_2.iloc[:, 0] -= N2_data.iloc[0, 0]  # mean
        stats_2.iloc[:, 3] -= N2_data.iloc[0, 0]  # ci95_hi
        stats_2.iloc[:, 4] -= N2_data.iloc[0, 0]  # ci95_low
        
        new_list_of_dfs.append(stats_2)
    
    return new_list_of_dfs

In [32]:
def calculate_MSD(list_of_dfs, by):
    """
    For each phenotype table, compute per-group mean, count, SEM and 95% CI,
    then express mean and CI bounds relative to the N2 control mean.

    Parameters:
        list_of_dfs: iterable of DataFrames whose second column is the phenotype
        by: label handed to `groupby`; compared against "Gene" when the global
            `Screen` is "Neuron_Genes_Screen"

    Returns:
        list of DataFrames, one per input table, indexed by group, with columns
        (phenotype, 'mean' | 'count' | 'sem' | 'ci95_hi' | 'ci95_lo'); 'mean',
        'ci95_hi' and 'ci95_lo' have the control mean subtracted.

    Raises:
        IndexError: if no control group is present for a phenotype.

    Reads the notebook-level `Screen` variable to decide which groups count as
    the N2 control.
    """
    new_list_of_dfs = []

    for df in list_of_dfs:
        # Phenotype column name (the 2nd column is taken to be the metric)
        pheno_col = df.columns[1]

        # Named `group_stats` rather than `stats` so the scipy `stats` module
        # name is not shadowed inside this function.
        group_stats = df.groupby(by)[pheno_col].agg(['mean', 'count', 'sem'])

        # ci95 expects two-level (phenotype, statistic) columns
        if not isinstance(group_stats.columns, pd.MultiIndex):
            group_stats.columns = pd.MultiIndex.from_tuples(
                [(pheno_col, col) for col in group_stats.columns]
            )

        # Appends ci95_hi and ci95_lo after mean, count, sem
        stats_2 = ci95(group_stats)

        # Select the N2 control rows
        if Screen == "Neuron_Genes_Screen":
            if by == "Gene":
                N2_mask = stats_2.index == 'N2'
            else:
                N2_mask = stats_2.index.isin(['N2_XJ1', 'N2_N2'])
        else:
            N2_mask = stats_2.index == 'N2'

        N2_data = stats_2[N2_mask]
        if N2_data.empty:
            # IndexError is kept so the failure type matches the positional
            # lookup that used to fail here; the message now names the cause.
            raise IndexError(
                f"No N2 control group found for phenotype {pheno_col!r} when grouping by {by!r}"
            )

        # Capture the control mean once, before any column is shifted.
        # Only the first matching control row is used.
        n2_mean = N2_data.iloc[0, 0]

        stats_2.iloc[:, 0] -= n2_mean  # mean
        stats_2.iloc[:, 3] -= n2_mean  # ci95_hi
        stats_2.iloc[:, 4] -= n2_mean  # ci95_lo

        new_list_of_dfs.append(stats_2)

    return new_list_of_dfs

In [33]:
def get_MSD(list_MSD):
    '''
    Combine per-phenotype MSD tables into one wide table.

    input: non-empty list of dataframes with two-level columns, each
        representing a phenotype with calculated MSD.

    returns: single dataframe that starts from the first table and index-joins
        every following table onto it. A table whose top-level column labels
        equal those of the first table restarts the result from that table.

    raises: ValueError if `list_MSD` is empty.
    '''
    if not list_MSD:
        raise ValueError("get_MSD needs at least one phenotype dataframe")

    # Compare the labels as plain lists: `Index == Index` is element-wise and its
    # truth value is ambiguous (or raises) once more than one label is involved.
    first_labels = list(list_MSD[0].columns.levels[0])

    MSD = None
    for frame in list_MSD:
        if MSD is None or list(frame.columns.levels[0]) == first_labels:
            MSD = frame
        else:
            MSD = MSD.join(frame)
    return MSD

In [34]:
def get_combined_MSD(baseline_byplate,tap_byplate, psa_byplate, by=['Gene','dataset']):
    """
    Build one wide MSD dataframe from the baseline, tap and PSA per-plate data.

    input:
        - baseline_byplate: baseline data by plate
        - tap_byplate: tap data by plate
        - psa_byplate: post stimulus arousal (PSA) data by plate
        - by: what to group by, "Gene" or "dataset" (pass a single string)
    returns:
        - combined MSD dataframe with a `by` column, flattened
          "<phenotype>-<statistic>" columns and a 'Screen' column holding the
          notebook-level Screen value
    """
    plate_tables = (baseline_byplate, tap_byplate, psa_byplate)

    # Per-phenotype MSD lists first, then one joined frame per assay.
    per_phenotype = [calculate_MSD(extract_phenotypes(table), by=by) for table in plate_tables]
    baseline_MSD, tap_MSD, psa_MSD = [get_MSD(frames) for frames in per_phenotype]

    # Outer merges keep groups that are missing from one of the assays.
    baseline_and_tap = pd.merge(baseline_MSD, tap_MSD, on=by, how='outer')
    combined_MSD = pd.merge(baseline_and_tap, psa_MSD, on=by, how='outer')

    # Short tap metric keys -> display names.
    tap_display_names = {
        "habit_dura": "Habituation of Response Duration",
        "habit_prob": "Habituation of Respones Probability",
        "habit_speed": "Habituation of Response Speed",
        "init_dura": "Initial Response Duration",
        "init_prob": "Initial Response Probability",
        "init_speed": "Initial Response Speed",
        "final_dura": "Final Response Duration",
        "final_prob": "Final Response Probability",
        "final_speed": "Final Response Speed",
        "recovery_dura": "Spontaneous Recovery of Response Duration",
        "recovery_prob": "Spontaneous Recovery of Response Probability",
        "recovery_speed": "Spontaneous Recovery of Response Speed",
        "memory_retention_dura": "Memory Retention of Response Duration",
        "memory_retention_prob": "Memory Retention of Response Probability",
        "memory_retention_speed": "Memory Retention of Response Speed",
    }
    combined_MSD = combined_MSD.rename(columns=tap_display_names)

    # Flatten the (phenotype, statistic) columns; the grouping column comes out
    # as "<by>-" and is renamed back to plain `by`.
    combined_MSD = combined_MSD.reset_index()
    combined_MSD.columns = combined_MSD.columns.to_flat_index().str.join('-')
    combined_MSD = combined_MSD.rename(columns={by+"-": by})

    combined_MSD['Screen'] = Screen

    return combined_MSD

### 3.2.1 Gene-level SMD

In [35]:
# Gene-level MSD table: the baseline, tap and PSA per-plate data are
# summarised per 'Gene' and merged into one wide dataframe.
combined_MSD=get_combined_MSD(baseline_output_byplate,
                              tap_data_byplate, 
                              psa_data_byplate,
                              by='Gene')

# Preview the first rows of the combined table.
combined_MSD.head()

Unnamed: 0,Gene,Instantaneous Speed-mean,Instantaneous Speed-count,Instantaneous Speed-sem,Instantaneous Speed-ci95_hi,Instantaneous Speed-ci95_lo,Interval Speed-mean,Interval Speed-count,Interval Speed-sem,Interval Speed-ci95_hi,...,Peak_to_recovery PSA Crab-count,Peak_to_recovery PSA Crab-sem,Peak_to_recovery PSA Crab-ci95_hi,Peak_to_recovery PSA Crab-ci95_lo,Average PSA Crab-mean,Average PSA Crab-count,Average PSA Crab-sem,Average PSA Crab-ci95_hi,Average PSA Crab-ci95_lo,Screen
0,F56C9.1,-0.019745,25,0.001985,-0.015648,-0.023841,-0.02086,25,0.002091,-0.016545,...,25,0.000339,-0.004556,-0.005957,-0.005217,25,0.000333,-0.004531,-0.005904,PD_GWAS_Locus71_Screen
1,N2,0.0,66,0.002052,0.004099,-0.004099,0.0,66,0.001602,0.0032,...,66,0.000328,0.000656,-0.000656,0.0,66,0.000291,0.000582,-0.000582,PD_GWAS_Locus71_Screen
2,clec-51,-0.033603,5,0.002462,-0.026766,-0.040439,-0.041034,5,0.001903,-0.035749,...,5,0.000665,-0.005314,-0.009004,-0.007226,5,0.000687,-0.00532,-0.009132,PD_GWAS_Locus71_Screen
3,clec-52,-0.028551,12,0.001989,-0.024174,-0.032928,-0.032089,12,0.003238,-0.024962,...,12,0.000899,-0.003634,-0.007591,-0.006442,12,0.000418,-0.005522,-0.007361,PD_GWAS_Locus71_Screen
4,dog-1,-0.012725,10,0.00346,-0.004897,-0.020554,-0.014338,10,0.002925,-0.007722,...,10,0.000643,0.002464,-0.000446,0.000873,10,0.000621,0.002277,-0.000531,PD_GWAS_Locus71_Screen


### 3.2.2 Allele-level SMD

In [36]:
# Allele-level MSD table: same pipeline as the gene-level table, but using the
# allele per-plate data and grouping on the 'dataset' column.
allele_combined_MSD=get_combined_MSD(baseline_output_allele_byplate,
                                     tap_data_allele_byplate, 
                                     psa_data_allele_byplate,
                                     by='dataset')

# Preview the first rows of the combined table.
allele_combined_MSD.head()

Unnamed: 0,dataset,Instantaneous Speed-mean,Instantaneous Speed-count,Instantaneous Speed-sem,Instantaneous Speed-ci95_hi,Instantaneous Speed-ci95_lo,Interval Speed-mean,Interval Speed-count,Interval Speed-sem,Interval Speed-ci95_hi,...,Peak_to_recovery PSA Crab-count,Peak_to_recovery PSA Crab-sem,Peak_to_recovery PSA Crab-ci95_hi,Peak_to_recovery PSA Crab-ci95_lo,Average PSA Crab-mean,Average PSA Crab-count,Average PSA Crab-sem,Average PSA Crab-ci95_hi,Average PSA Crab-ci95_lo,Screen
0,F56C9.1_tm5851,-0.023229,10,0.003546,-0.015208,-0.03125,-0.022186,10,0.004205,-0.012672,...,10,0.000365,-0.003469,-0.00512,-0.004316,10,0.000373,-0.003473,-0.005159,PD_GWAS_Locus71_Screen
1,F56C9.1_tm6018,-0.017422,15,0.002214,-0.012672,-0.022171,-0.019977,15,0.002184,-0.015292,...,15,0.000446,-0.004942,-0.006854,-0.005818,15,0.000438,-0.004878,-0.006758,PD_GWAS_Locus71_Screen
2,N2,0.0,66,0.002052,0.004099,-0.004099,0.0,66,0.001602,0.0032,...,66,0.000328,0.000656,-0.000656,0.0,66,0.000291,0.000582,-0.000582,PD_GWAS_Locus71_Screen
3,clec-51_tm6692,-0.033603,5,0.002462,-0.026766,-0.040439,-0.041034,5,0.001903,-0.035749,...,5,0.000665,-0.005314,-0.009004,-0.007226,5,0.000687,-0.00532,-0.009132,PD_GWAS_Locus71_Screen
4,clec-52_tm8126,-0.028551,12,0.001989,-0.024174,-0.032928,-0.032089,12,0.003238,-0.024962,...,12,0.000899,-0.003634,-0.007591,-0.006442,12,0.000418,-0.005522,-0.007361,PD_GWAS_Locus71_Screen


## 3.3 T-Stat analysis

In [37]:
def baseline_metrics(by=["Gene","dataset"]):
    """
    Prepare the empty result tables and metric names for the baseline t-tests.

    input:
        by: column name the results are keyed on, "Gene" or "dataset"

    returns:
        list_baseline_Tstats: list of empty dataframes, one per metric, each with
                              the columns [by, <metric name>]
        list_baseline_metrics: list of metric names (strings), in the same order
    """
    list_baseline_metrics = [
        "Instantaneous Speed",
        "Interval Speed",
        "Bias",
        "Morphwidth",
        "Midline",
        "Area",
        "Angular Speed",
        "Aspect Ratio",
        "Kink",
        "Curve",
        "Crab",
        "Pathlength",
    ]

    # For baseline data the result column carries the metric's own name.
    list_baseline_Tstats = [
        pd.DataFrame(columns=[by, metric]) for metric in list_baseline_metrics
    ]

    return list_baseline_Tstats, list_baseline_metrics

In [38]:
def tap_metrics(by=["Gene","dataset"]):
    """
    Prepare the empty result tables and metric names for the tap-response t-tests.

    input:
        by: column name the results are keyed on, "Gene" or "dataset"

    returns:
        list_tap_Tstats: list of empty dataframes, one per metric, each with the
                         columns [by, <display label>]
        list_tap_metrics: list of short metric names (strings), in the same order
    """
    # (short metric name, display label used as the result column)
    metric_labels = [
        ("recovery_dura", "Recovery Duration"),
        ("recovery_prob", "Recovery Probability"),
        ("recovery_speed", "Recovery Speed"),
        ("memory_retention_dura", "Memory Retention Duration"),
        ("memory_retention_prob", "Memory Retention Probability"),
        ("memory_retention_speed", "Memory Retention Speed"),
        ("init_dura", "Initial Duration"),
        ("init_prob", "Initial Probability"),
        ("init_speed", "Initial Speed"),
        ("final_dura", "Final Duration"),
        ("final_prob", "Final Probability"),
        ("final_speed", "Final Speed"),
        ("habit_dura", "Habituation of Duration"),
        ("habit_prob", "Habituation of Probability"),
        ("habit_speed", "Habituation of Speed"),
    ]

    list_tap_Tstats = [pd.DataFrame(columns=[by, label]) for _, label in metric_labels]
    list_tap_metrics = [metric for metric, _ in metric_labels]

    return list_tap_Tstats, list_tap_metrics

In [39]:
def psa_metrics(by=["Gene", "dataset"]):
    """
    Create a list of empty dataframes and list of metric names for PSA summary analysis.

    Every metric is named "<phase> PSA <measurement>". Names are generated from
    one phase list and one measurement list so that each result column is
    spelled exactly like its metric (this removes the duplicated "PSA" in
    "Average PSA PSA Instantaneous Speed").

    input:
        by: column name the results are keyed on ("Gene" or "dataset")

    returns:
        list_psa_Tstats: list of empty DataFrames for t-statistics, one per
                         metric, each with the columns [by, <metric name>]
        list_psa_metrics: list of metric names (strings), in the same order
    """
    phases = [
        "Initial",
        "Recovery",
        "Peak",
        "Initial_to_peak",
        "Peak_to_recovery",
        "Average",
    ]
    measurements = [
        "Instantaneous Speed",
        "Bias",
        "Angular Speed",
        "Aspect Ratio",
        "Kink",
        "Curve",
        "Crab",
    ]

    # Measurement-major order: all six phases of one measurement, then the next.
    list_psa_metrics = [
        f"{phase} PSA {measurement}"
        for measurement in measurements
        for phase in phases
    ]

    list_psa_Tstats = [pd.DataFrame(columns=[by, metric]) for metric in list_psa_metrics]

    return list_psa_Tstats, list_psa_metrics


In [40]:
def TTest(Type, DF_ref, output, by=["Gene", "dataset"]):
    """
    Append one two-sample t-statistic per group of DF_ref to output.

    For every unique value in DF_ref[by], the values of column `Type` for that
    group are compared against the control rows with a t-test that does not
    assume equal variances (equal_var=False). Only the comparison matching
    `by` is computed.

    input:
        - Type: name of the column in DF_ref that holds the values to test
        - DF_ref: reference dataframe; needs `Type` plus "Gene" for gene-level
                  tests, or "dataset" and "Allele" for dataset-level tests
        - output: output df to store results in; one [group, t-statistic] row
                  is appended per group (modified in place)
        - by: what to group by "Gene" or "dataset"

    returns: None
    """
    groups = DF_ref[by].unique()
    if len(groups) == 0:
        return  # nothing to compare

    if by == "Gene":
        group_column = DF_ref["Gene"]
        control_mask = group_column == "N2"
    else:
        group_column = DF_ref["dataset"]
        # Dataset-level controls are identified through their allele label.
        control_mask = DF_ref["Allele"].isin(["XJ1", "N2"])

    # The control sample is the same for every group, so select it once.
    control_values = DF_ref[control_mask][Type]

    for a in groups:
        Tstat = ttest_ind(DF_ref[group_column == a][Type], control_values, equal_var=False)[0]
        output.loc[len(output)] = [a, Tstat]

def do_TTest(by=["Gene", "dataset"], baseline=["true", "false", "psa"]):
    """
    Run TTest for every non-control group and average the t-statistics per group.

    The data source is picked with `baseline`: "true" uses baseline_output with
    the baseline metrics, "false" uses tap_data with the tap metrics, and any
    other value uses psa_data with the PSA metrics.

    For each non-control value of `by`, the data is narrowed to the dates on
    which that group appears and to that group plus the controls, and TTest
    appends its results to the per-metric tables.

    input:
        - by: what to group by "Gene" or "dataset"
        - baseline: "true", "false" or "psa" (see above)

    returns: dataframe indexed by `by` with one column of mean t-statistics per metric
    """
    if baseline == "true":
        list_Tstats, list_metrics = baseline_metrics(by)
        data = baseline_output
    elif baseline == "false":
        list_Tstats, list_metrics = tap_metrics(by)
        data = tap_data
    else:
        list_Tstats, list_metrics = psa_metrics(by)
        data = psa_data

    for x in data[by].unique():
        # The allele-level neuron screen labels its controls differently.
        if Screen == "Neuron_Genes_Screen" and by != "Gene":
            controls = ["N2_XJ1", "N2_N2"]
        else:
            controls = ["N2"]

        if x in controls:
            continue  # controls are only used as the reference sample

        # Keep only the days on which this group was recorded ...
        recorded_dates = data[data[by] == x]['Date'].unique()
        same_day_rows = data[data['Date'].isin(recorded_dates)]
        # ... and on those days only the group itself and the controls.
        gene_data_final = same_day_rows[same_day_rows[by].isin(controls + [x])]

        for metric, table in zip(list_metrics, list_Tstats):
            TTest(metric, gene_data_final, table, by)

    # Average repeated rows per group and line the metrics up side by side.
    first_label = list_Tstats[0].columns.values[1]
    PD_Tstats = pd.DataFrame()
    for table in list_Tstats:
        averaged = table.groupby([by], as_index=False).mean()
        if averaged.columns.values[1] == first_label:
            PD_Tstats = averaged
        else:
            # as_index=False leaves a default integer index, so rows align by position.
            PD_Tstats = PD_Tstats.join(averaged.iloc[:, 1])

    return PD_Tstats.set_index(by)
            

### T-stat on Baseline data:

### 3.3.1 Allele-level T-stat analysis of baseline data

In [41]:
# Suppress all Python warnings from here on (the filter applies process-wide);
# presumably to quiet warnings raised during the t-tests - TODO confirm.
warnings.filterwarnings('ignore')

PD_baseline_Tstats_allele = do_TTest("dataset", baseline="true") # allele-level t-statistics for the baseline metrics, indexed by dataset

# Disabled: index-sorted copy of the table above.
# PD_baseline_Tstats_allele_sorted=PD_baseline_Tstats_allele.sort_index()

PD_baseline_Tstats_allele.head()

Unnamed: 0_level_0,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
F56C9.1_tm5851,-94.512488,-72.441971,-41.170625,8.05481,36.860836,30.820266,-77.34865,-60.110006,-33.955937,-101.424504,-79.189063,-89.698107
F56C9.1_tm6018,-80.18503,-77.663922,-57.462244,-82.649188,-21.5011,-91.043504,-21.715668,53.09934,63.226435,65.955876,-47.683132,-58.214243
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
clec-51_tm6692,-139.587254,-162.586652,-78.182874,-25.310107,-93.578528,-76.514859,-86.943016,-62.988493,-43.588897,-149.542786,-84.438878,-90.526029
clec-52_tm8126,-217.061236,-173.387577,-151.211361,-23.488354,-90.997012,-75.944231,-51.348749,-121.950271,-54.510945,-252.654895,-84.687979,-72.324818


### 3.3.2 Gene-level T-stat analysis of baseline data

In [42]:
# Suppress all Python warnings from here on (the filter applies process-wide);
# presumably to quiet warnings raised during the t-tests - TODO confirm.
warnings.filterwarnings('ignore')

PD_baseline_Tstats=do_TTest("Gene", baseline="true") # gene-level t-statistics for the baseline metrics, indexed by Gene

# Disabled: index-sorted copy of the table above.
# PD_baseline_Tstats_sorted=PD_baseline_Tstats.sort_index()

PD_baseline_Tstats.head()

Unnamed: 0_level_0,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
F56C9.1,-112.603363,-101.88103,-75.976433,-38.845508,-13.076698,-47.840224,-54.617887,10.21763,27.213582,-17.764956,-77.101475,-75.381803
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
clec-51,-139.587254,-162.586652,-78.182874,-25.310107,-93.578528,-76.514859,-86.943016,-62.988493,-43.588897,-149.542786,-84.438878,-90.526029
clec-52,-217.061236,-173.387577,-151.211361,-23.488354,-90.997012,-75.944231,-51.348749,-121.950271,-54.510945,-252.654895,-84.687979,-72.324818
dog-1,-112.308852,-123.625969,-76.398087,-53.288002,-201.744282,-238.224215,11.977647,107.753998,67.205884,-21.78188,-51.656532,-125.598001


### T-stat analysis for tap-response data:

### 3.3.3 Allele-level T-stat analysis of tap response data

In [43]:
# Suppress all Python warnings from here on (the filter applies process-wide);
# presumably to quiet warnings raised during the t-tests - TODO confirm.
warnings.filterwarnings('ignore')

PD_habituation_Tstats_allele = do_TTest("dataset", baseline="false") # allele-level t-statistics for the tap-response metrics, indexed by dataset

# Disabled: index-sorted copy of the table above.
# PD_habituation_Tstats_allele_sorted=PD_habituation_Tstats_allele.sort_index()

PD_habituation_Tstats_allele.head()

Unnamed: 0_level_0,Recovery Duration,Recovery Probability,Recovery Speed,Memory Retention Duration,Memory Retention Probability,Memory Retention Speed,Initial Duration,Initial Probability,Initial Speed,Final Duration,Final Probability,Final Speed,Habituation of Duration,Habituation of Probability,Habituation of Speed
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
F56C9.1_tm5851,1.562496,1.806791,0.745197,1.822607,-0.379658,0.794743,0.927175,-1.454146,-1.278095,0.167307,1.091798,-1.248326,0.752419,-1.889412,0.202683
F56C9.1_tm6018,1.182782,-0.611597,-0.39174,0.156158,-1.398255,-2.204798,-0.520369,-1.540321,-4.933987,1.571692,0.166934,-2.280011,-1.408146,-1.196294,-1.622635
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
clec-51_tm6692,0.151098,-0.563808,0.153514,-1.62983,-0.587814,-1.525058,-3.888037,-0.93775,-5.255277,0.210934,-0.854299,-3.503848,-3.553155,0.012227,-2.024546
clec-52_tm8126,2.178969,0.410134,-2.254286,0.96998,-2.605174,-4.118857,0.250589,1.013609,-5.03632,1.515229,3.488184,-1.951698,-0.978014,-2.572366,-2.722402


### 3.3.4 Gene-level T-stat analysis of tap response data

In [44]:
# Suppress all Python warnings from here on (the filter applies process-wide);
# presumably to quiet warnings raised during the t-tests - TODO confirm.
warnings.filterwarnings('ignore')

PD_habituation_Tstats = do_TTest("Gene", baseline="false") # gene-level t-statistics for the tap-response metrics, indexed by Gene

# Index-sorted version of the gene-level table (the preview below shows the unsorted one).
PD_habituation_Tstats_sorted=PD_habituation_Tstats.sort_index()

PD_habituation_Tstats.head()

Unnamed: 0_level_0,Recovery Duration,Recovery Probability,Recovery Speed,Memory Retention Duration,Memory Retention Probability,Memory Retention Speed,Initial Duration,Initial Probability,Initial Speed,Final Duration,Final Probability,Final Speed,Habituation of Duration,Habituation of Probability,Habituation of Speed
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
F56C9.1,1.812612,0.925482,0.404655,1.394891,-1.814905,-1.205062,0.170908,-1.882222,-3.824903,0.885861,1.753433,-1.831667,-0.30185,-2.791379,-1.526497
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
clec-51,0.151098,-0.563808,0.153514,-1.62983,-0.587814,-1.525058,-3.888037,-0.93775,-5.255277,0.210934,-0.854299,-3.503848,-3.553155,0.012227,-2.024546
clec-52,2.178969,0.410134,-2.254286,0.96998,-2.605174,-4.118857,0.250589,1.013609,-5.03632,1.515229,3.488184,-1.951698,-0.978014,-2.572366,-2.722402
dog-1,0.268212,2.304192,0.066691,0.350123,-0.688631,-1.940253,2.977489,-2.065312,-3.055357,1.929321,3.861904,-0.572488,0.960233,-4.372235,-2.139752


### T-stat analysis for PSA data:

### 3.3.5 Allele-level T-stat analysis of PSA data

In [45]:
# Suppress all Python warnings from here on (the filter applies process-wide);
# presumably to quiet warnings raised during the t-tests - TODO confirm.
warnings.filterwarnings('ignore')

psa_tstats_allele = do_TTest("dataset", baseline="psa") # allele-level t-statistics for the PSA metrics, indexed by dataset

psa_tstats_allele.head()

Unnamed: 0_level_0,Initial PSA Instantaneous Speed,Recovery PSA Instantaneous Speed,Peak PSA Instantaneous Speed,Initial_to_peak PSA Instantaneous Speed,Peak_to_recovery PSA Instantaneous Speed,Average PSA PSA Instantaneous Speed,Initial PSA Bias,Recovery PSA Bias,Peak PSA Bias,Initial_to_peak PSA Bias,...,Peak PSA Curve,Initial_to_peak PSA Curve,Peak_to_recovery PSA Curve,Average PSA Curve,Initial PSA Crab,Recovery PSA Crab,Peak PSA Crab,Initial_to_peak PSA Crab,Peak_to_recovery PSA Crab,Average PSA Crab
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
F56C9.1_tm5851,-0.609729,-4.938433,-2.590816,-1.502569,-0.888033,-0.967814,0.254555,-0.998765,-1.794039,0.003236,...,-3.794678,-1.311131,-5.396075,-5.73602,-2.32797,-4.850804,-8.976147,-6.486676,-7.53596,-8.016539
F56C9.1_tm6018,-0.880398,-5.212406,-5.257932,-3.715624,-3.971637,-4.095328,-0.763159,-0.890475,-0.453581,0.532155,...,-1.576173,-0.571197,-2.723809,-6.388987,-1.463426,-5.409188,-8.45899,-7.153893,-8.398339,-8.451174
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
clec-51_tm6692,-3.90742,-5.50252,-7.93618,-7.525981,-6.283363,-6.539515,-1.056317,-3.520281,-1.603832,-2.305804,...,-3.358618,-5.118708,-2.031571,-1.961594,-4.258969,-3.71473,-5.063592,-7.190523,-5.446863,-5.931258
clec-52_tm8126,-3.50394,-8.331753,-7.33902,-9.21523,-8.649053,-9.784737,-1.466636,-3.898861,-3.622715,-4.51058,...,-2.088432,-2.80343,-3.039187,-6.357054,-1.449581,-5.110772,-2.899321,-4.504114,-3.404091,-6.070975


### 3.3.6 Gene-level T-stat analysis of PSA data

In [46]:
# Suppress all Python warnings from here on (the filter applies process-wide);
# presumably to quiet warnings raised during the t-tests - TODO confirm.
warnings.filterwarnings('ignore')

psa_tstats = do_TTest("Gene", baseline="psa") # gene-level t-statistics for the PSA metrics, indexed by Gene

psa_tstats.head()

Unnamed: 0_level_0,Initial PSA Instantaneous Speed,Recovery PSA Instantaneous Speed,Peak PSA Instantaneous Speed,Initial_to_peak PSA Instantaneous Speed,Peak_to_recovery PSA Instantaneous Speed,Average PSA PSA Instantaneous Speed,Initial PSA Bias,Recovery PSA Bias,Peak PSA Bias,Initial_to_peak PSA Bias,...,Peak PSA Curve,Initial_to_peak PSA Curve,Peak_to_recovery PSA Curve,Average PSA Curve,Initial PSA Crab,Recovery PSA Crab,Peak PSA Crab,Initial_to_peak PSA Crab,Peak_to_recovery PSA Crab,Average PSA Crab
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
F56C9.1,-0.76015,-6.535168,-4.926008,-3.116326,-2.983472,-3.074812,0.005178,-1.2843,-0.889973,0.645687,...,-3.650704,-1.937715,-5.117874,-8.243577,-2.518606,-6.44145,-10.477176,-8.941947,-9.7379,-9.979742
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
clec-51,-3.90742,-5.50252,-7.93618,-7.525981,-6.283363,-6.539515,-1.056317,-3.520281,-1.603832,-2.305804,...,-3.358618,-5.118708,-2.031571,-1.961594,-4.258969,-3.71473,-5.063592,-7.190523,-5.446863,-5.931258
clec-52,-3.50394,-8.331753,-7.33902,-9.21523,-8.649053,-9.784737,-1.466636,-3.898861,-3.622715,-4.51058,...,-2.088432,-2.80343,-3.039187,-6.357054,-1.449581,-5.110772,-2.899321,-4.504114,-3.404091,-6.070975
dog-1,-5.874897,-3.051216,-3.055333,-2.537735,-2.260906,-2.254899,-3.546643,-3.236413,-0.663375,-1.709242,...,-1.512794,-1.283366,-2.18676,-2.221198,-3.581154,-1.203409,-1.932733,-2.935055,-0.476679,-0.355651


# 4. Merging t-stat data into one dataset

In [47]:
def pop_cols(combined):
    """
    Move four baseline columns to fixed positions, modifying `combined` in place.

    "Area", "Midline" and "Morphwidth" are each moved to the front in that
    order (so "Morphwidth" ends up first), then "Angular Speed" is moved to
    positional index 5.

    input:
        combined: dataframe with columns to be reordered

    returns:
        None (the dataframe is changed in place)
    """
    # (column name, target position), applied strictly in this order.
    moves = (
        ("Area", 0),
        ("Midline", 0),
        ("Morphwidth", 0),
        ("Angular Speed", 5),
    )
    for column_name, position in moves:
        moved_values = combined.pop(column_name)
        combined.insert(position, column_name, moved_values)

def pop_last(combined):
    """
    Move the six recovery / memory-retention columns, modifying `combined` in place.

    Each column is removed and re-inserted at positional index 26, one after
    another, in the order listed below.

    input:
        combined: dataframe with columns to be reordered

    returns:
        None (the dataframe is changed in place)
    """
    target_position = 26
    columns_to_move = (
        "Spontaneous Recovery of Response Duration",
        "Spontaneous Recovery of Response Probability",
        "Spontaneous Recovery of Response Speed",
        "Memory Retention of Response Duration",
        "Memory Retention of Response Probability",
        "Memory Retention of Response Speed",
    )
    for column_name in columns_to_move:
        moved_values = combined.pop(column_name)
        combined.insert(target_position, column_name, moved_values)

def rename_columns(df):
    '''
    Replace the short tap-response column labels with their full display names.

    input:
        df: dataframe whose columns should be renamed; columns that are not in
            the mapping are left untouched
    returns:
        a renamed copy of the dataframe (the input is not modified)
    '''
    # (short label, full display name)
    label_pairs = (
        ("Habituation of Duration", "Habituation of Response Duration"),
        ("Habituation of Probability", "Habituation of Respones Probability"),
        ("Habituation of Speed", "Habituation of Response Speed"),
        ("Initial Duration", "Initial Response Duration"),
        ("Initial Probability", "Initial Response Probability"),
        ("Initial Speed", "Initial Response Speed"),
        ("Final Duration", "Final Response Duration"),
        ("Final Probability", "Final Response Probability"),
        ("Final Speed", "Final Response Speed"),
        ("Recovery Duration", "Spontaneous Recovery of Response Duration"),
        ("Recovery Probability", "Spontaneous Recovery of Response Probability"),
        ("Recovery Speed", "Spontaneous Recovery of Response Speed"),
        ("Memory Retention Duration", "Memory Retention of Response Duration"),
        ("Memory Retention Probability", "Memory Retention of Response Probability"),
        ("Memory Retention Speed", "Memory Retention of Response Speed"),
    )
    renamed = df.rename(columns=dict(label_pairs))
    return renamed

def merge_Tstats(baseline, habituation, by=["Gene", "dataset"], Screen=None, psa=False):
    """
    Left-merge a baseline T-stat dataframe with a tap-response (or PSA) T-stat
    dataframe, reorder the columns (and, for tap data, rename them), and tag
    the result with the screen name.

    Normalisation and melting/pivoting are NOT done here any more; both steps
    were moved to the dashboard.

    input:
        - baseline: baseline dataframe (left side of the merge, all rows kept)
        - habituation: tap-response or PSA dataframe merged onto the baseline
        - by: key(s) to merge on, e.g. "Gene" or "dataset"
              (the default list is only read, never modified)
        - Screen: value written to the 'Screen' column. When None (default),
              the notebook-level `Screen` variable is looked up at call time.
              A `Screen=Screen` default would be evaluated once, when this
              cell is run, and would keep an outdated screen name if the
              selection changes afterwards.
        - psa: pass True when `habituation` holds PSA data; the renaming step
              and the second reordering pass are then skipped
    returns:
        the merged dataframe with an added 'Screen' column
    """
    if Screen is None:
        # The parameter shadows the notebook variable of the same name, so the
        # current selection has to be fetched through globals().
        Screen = globals()["Screen"]

    # merge baseline and habituation data
    combined_Tstats = pd.merge(baseline, habituation, on=by, how='left')
    combined_Tstats = combined_Tstats.sort_index()  # sort by index

    # reorder columns (helper defined earlier in the notebook; its result is
    # not reassigned here, so it is relied on for its effect on the dataframe)
    pop_cols(combined_Tstats)

    # Skip this step if data = psa
    if not psa:
        # rename the tap-response columns, then reorder again now that the
        # columns carry their full names
        combined_Tstats = rename_columns(combined_Tstats)
        pop_cols(combined_Tstats)  # reorder columns
        pop_last(combined_Tstats)  # reorder columns

    # add Screen column to df
    combined_Tstats['Screen'] = Screen

    return combined_Tstats



## 4.1 Gene-level

- Pass Tap and baseline through merge_Tstats() as df1
- Pass PSA and baseline through merge_Tstats() as df2
- pd.merge df1 and df2 using all columns of baseline

In [48]:
# Gene-level merge of the baseline and tap-response T-stat tables
combined_Tstats = merge_Tstats(
    PD_baseline_Tstats, PD_habituation_Tstats, by="Gene"
)

In [49]:
# Gene-level merge of the baseline and post-stimulus-arousal (PSA) T-stat tables;
# psa=True makes merge_Tstats skip the tap-specific column renaming
combined_Tstats_psa = merge_Tstats(PD_baseline_Tstats, psa_tstats, by="Gene", psa=True)

In [50]:
# Baseline + Tap + PSA
# Join on every baseline metric plus the identifying columns.
# The key list is built explicitly: `list.append(...)` returns None, so passing
# its result as `on=` made pd.merge silently fall back to "all columns the two
# frames happen to share". dict.fromkeys() drops duplicates while keeping order
# in case 'Gene' or 'Screen' is already among the baseline columns.
gene_merge_keys = list(dict.fromkeys([*PD_baseline_Tstats.columns, 'Gene', 'Screen']))
final_tstat = pd.merge(
    combined_Tstats.reset_index(),
    combined_Tstats_psa.reset_index(),
    on=gene_merge_keys,
    how='inner',
)

final_tstat.head()

Unnamed: 0,Gene,Morphwidth,Midline,Area,Instantaneous Speed,Interval Speed,Angular Speed,Bias,Aspect Ratio,Kink,...,Peak PSA Curve,Initial_to_peak PSA Curve,Peak_to_recovery PSA Curve,Average PSA Curve,Initial PSA Crab,Recovery PSA Crab,Peak PSA Crab,Initial_to_peak PSA Crab,Peak_to_recovery PSA Crab,Average PSA Crab
0,F56C9.1,-38.845508,-13.076698,-47.840224,-112.603363,-101.88103,-54.617887,-75.976433,10.21763,27.213582,...,-3.650704,-1.937715,-5.117874,-8.243577,-2.518606,-6.44145,-10.477176,-8.941947,-9.7379,-9.979742
1,N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,clec-51,-25.310107,-93.578528,-76.514859,-139.587254,-162.586652,-86.943016,-78.182874,-62.988493,-43.588897,...,-3.358618,-5.118708,-2.031571,-1.961594,-4.258969,-3.71473,-5.063592,-7.190523,-5.446863,-5.931258
3,clec-52,-23.488354,-90.997012,-75.944231,-217.061236,-173.387577,-51.348749,-151.211361,-121.950271,-54.510945,...,-2.088432,-2.80343,-3.039187,-6.357054,-1.449581,-5.110772,-2.899321,-4.504114,-3.404091,-6.070975
4,dog-1,-53.288002,-201.744282,-238.224215,-112.308852,-123.625969,11.977647,-76.398087,107.753998,67.205884,...,-1.512794,-1.283366,-2.18676,-2.221198,-3.581154,-1.203409,-1.932733,-2.935055,-0.476679,-0.355651


In [51]:
# # Baseline + Tap + PSA melted
# final_tstat_melted = pd.concat([combined_Tstats_normalized_melted, combined_Tstats_psa_melted]).drop_duplicates()

# final_tstat_melted.head()

## 4.2 Allele level 


- Pass Tap and baseline through merge_Tstats() as df3
- Pass PSA and baseline through merge_Tstats() as df4
- pd.merge df3 and df4 using all columns of baseline

In [52]:
# Allele-level merge of the baseline and tap-response T-stat tables
combined_Tstats_allele = merge_Tstats(
    PD_baseline_Tstats_allele, PD_habituation_Tstats_allele, by="dataset"
)

In [53]:
# Allele-level merge of the baseline and post-stimulus-arousal (PSA) T-stat tables;
# psa=True makes merge_Tstats skip the tap-specific column renaming
combined_Tstats_psa_allele = merge_Tstats(PD_baseline_Tstats_allele, psa_tstats_allele, by="dataset", psa=True)

In [54]:
# Baseline + Tap + PSA
# Join on every baseline metric plus the identifying columns.
# The key list is built explicitly: `list.append(...)` returns None, so passing
# its result as `on=` made pd.merge silently fall back to "all columns the two
# frames happen to share". dict.fromkeys() drops duplicates while keeping order
# in case 'dataset' or 'Screen' is already among the baseline columns.
allele_merge_keys = list(dict.fromkeys([*PD_baseline_Tstats_allele.columns, 'dataset', 'Screen']))
final_tstat_allele = pd.merge(
    combined_Tstats_allele.reset_index(),
    combined_Tstats_psa_allele.reset_index(),
    on=allele_merge_keys,
    how='outer',
)

final_tstat_allele.head()

Unnamed: 0,dataset,Morphwidth,Midline,Area,Instantaneous Speed,Interval Speed,Angular Speed,Bias,Aspect Ratio,Kink,...,Peak PSA Curve,Initial_to_peak PSA Curve,Peak_to_recovery PSA Curve,Average PSA Curve,Initial PSA Crab,Recovery PSA Crab,Peak PSA Crab,Initial_to_peak PSA Crab,Peak_to_recovery PSA Crab,Average PSA Crab
0,F56C9.1_tm5851,8.05481,36.860836,30.820266,-94.512488,-72.441971,-77.34865,-41.170625,-60.110006,-33.955937,...,-3.794678,-1.311131,-5.396075,-5.73602,-2.32797,-4.850804,-8.976147,-6.486676,-7.53596,-8.016539
1,F56C9.1_tm6018,-82.649188,-21.5011,-91.043504,-80.18503,-77.663922,-21.715668,-57.462244,53.09934,63.226435,...,-1.576173,-0.571197,-2.723809,-6.388987,-1.463426,-5.409188,-8.45899,-7.153893,-8.398339,-8.451174
2,N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,clec-51_tm6692,-25.310107,-93.578528,-76.514859,-139.587254,-162.586652,-86.943016,-78.182874,-62.988493,-43.588897,...,-3.358618,-5.118708,-2.031571,-1.961594,-4.258969,-3.71473,-5.063592,-7.190523,-5.446863,-5.931258
4,clec-52_tm8126,-23.488354,-90.997012,-75.944231,-217.061236,-173.387577,-51.348749,-151.211361,-121.950271,-54.510945,...,-2.088432,-2.80343,-3.039187,-6.357054,-1.449581,-5.110772,-2.899321,-4.504114,-3.404091,-6.070975


In [55]:
# Shape (rows, columns) of the gene-level merged table.
# NOTE(review): this cell sits in the allele-level section — confirm that
# final_tstat_allele.shape was not the intended check.
final_tstat.shape

(8, 71)

In [56]:
# # Baseline + Tap + PSA melted
# final_tstat_melted_allele = pd.concat([combined_Tstats_normalized_melted_allele, combined_Tstats_psa_melted_allele]).drop_duplicates()

# final_tstat_melted_allele.head()

# 5. Save data to database (PostgreSQL; the older sqlite3 cell is kept commented out at the end)

#### A janky way to add data and update the sql 

1. Read table to pd.DataFrame
2. Add new data to pd.DataFrame
3. Replace old table with newly updated pd.DataFrame

# Primary Keys For Each SQL Table:

####  -- Gene_Allele_WormBaseID:
WBGene, WBAllele
#### -- alleleMSD:
dataset, Screen
#### -- gene_MSD:
Gene, Screen
#### -- allele_profile_data:
dataset, Metric, Screen
#### -- gene_profile_data:
Gene, Metric, Screen
#### -- tap_baseline_data:
Time, Plate_id, Date, Screen, dataset
#### -- tap_response_data:
plate, Date, Plate_id, Screen, taps, dataset, Gene, Allele
#### -- tstat_allele_data:
dataset, Screen
#### -- tstat_gene_data:
Gene, Screen
#### -- psa_summarized_data:
Plate_id, Date, Screen, dataset, Gene, Allele

In [57]:
# Stamp every raw output table with the screen selected at the top of the notebook.
# Item assignment (df["Screen"] = ...) is used on purpose: attribute assignment
# (df.Screen = ...) only updates a column that already exists; if the column is
# missing it just sets a Python attribute on the object and no column is created.
tap_output["Screen"] = Screen
tap_psa_output["Screen"] = Screen
baseline_output["Screen"] = Screen

In [58]:
# Sanity check: list the allele rows whose baseline "Morphwidth" T-stat is missing
# (an empty table means no row lacks it).
# To inspect rows with a NaN in any column instead:
#   final_tstat_allele[final_tstat_allele.isna().any(axis=1)]
final_tstat_allele.loc[lambda frame: frame["Morphwidth"].isna()]

Unnamed: 0,dataset,Morphwidth,Midline,Area,Instantaneous Speed,Interval Speed,Angular Speed,Bias,Aspect Ratio,Kink,...,Peak PSA Curve,Initial_to_peak PSA Curve,Peak_to_recovery PSA Curve,Average PSA Curve,Initial PSA Crab,Recovery PSA Crab,Peak PSA Crab,Initial_to_peak PSA Crab,Peak_to_recovery PSA Crab,Average PSA Crab


In [59]:
# final_tstat_allele[final_tstat_allele['dataset'] == "unknown_CZ11000"]

In [None]:

### This code will connect to PostgreSQL database and write non-duplicate data into the database tables.

# Local import: URL is only needed to assemble the connection URL in this cell.
from sqlalchemy.engine import URL

# Loads database config values from database.ini file and validates that user and password are set.
config = load_config()
if (config['user'] == "" or config['password'] == ""):
    print("Please set your user and password in the database.ini file.")
    sys.exit(1)

# Creates a connection pool to PostgreSQL database using SQLAlchemy.
# The URL is assembled from its parts rather than with an f-string so that
# special characters in the user name or password (e.g. '@', ':', '/', '%')
# are escaped instead of corrupting the connection string.
db_url = URL.create(
    drivername="postgresql+psycopg",
    username=config['user'],
    password=config['password'],
    host=config['host'],
    port=config['port'] or None,  # an empty port falls back to the driver default
    database=config['database'],
)
engine = create_engine(db_url)

# Insert callback for DataFrame.to_sql(method=...): rows that conflict with an
# existing key in the PostgreSQL table are skipped instead of raising.
def postgres_skip_on_duplicate(pd_table, conn, keys, data_iter):
    """
    Insert a batch of rows with INSERT ... ON CONFLICT DO NOTHING.

    input:
        pd_table: table wrapper supplied by pandas; its `.table` attribute is
                  the target passed to insert()
        conn: connection supplied by pandas, used to execute the statement
        keys: column names, in the same order as the values of each row
        data_iter: iterable of row value sequences
    returns:
        None
    """
    rows = []
    for values in data_iter:
        rows.append(dict(zip(keys, values)))
    statement = insert(pd_table.table).on_conflict_do_nothing()
    conn.execute(statement, rows)

# --------- Write the dataframes to PostgreSQL tables -----------
# Every table is appended to, and rows that already exist are skipped by
# postgres_skip_on_duplicate. To overwrite a table instead of appending, use
#   frame.to_sql(table_name, engine, if_exists='replace', index=False, method=None)
# The melted gene_profile_data / allele_profile_data tables are no longer
# written from this notebook.

def _append_skipping_duplicates(progress_message, frame, table_name):
    """Print the progress message, then append `frame` to `table_name`."""
    print(progress_message)
    frame.to_sql(table_name, engine, if_exists='append', index=False, method=postgres_skip_on_duplicate)


# Complete tap response data
_append_skipping_duplicates("working on tap_psa_output:", tap_psa_output, 'tap_response_data')

# Complete baseline data
_append_skipping_duplicates("working on tap_baseline_data:", baseline_output, 'tap_baseline_data')

# Baseline + Tap + PSA combined tstat data by Gene
# (dropna(thresh=10) keeps only rows with at least 10 non-missing values)
_append_skipping_duplicates("working on tstat_gene_data", final_tstat.dropna(thresh=10).reset_index(), 'tstat_gene_data')

# Baseline + Tap + PSA combined tstat data by Allele
_append_skipping_duplicates("working on tstat_allele_data", final_tstat_allele.dropna(thresh=10).reset_index(), 'tstat_allele_data')

# MSD Baseline + Tap + PSA by Gene
_append_skipping_duplicates("working on gene_MSD", combined_MSD, 'gene_MSD')

# MSD Baseline + Tap + PSA by Allele
_append_skipping_duplicates("working on allele_MSD", allele_combined_MSD, 'allele_MSD')

# Summarised PSA data (speed, kink, curve, etc.)
_append_skipping_duplicates("working on psa_data:", psa_data, 'psa_summarised_data')


print("---------- DONE ----------")

working on tap_psa_output:
working on tap_baseline_data:
working on tstat_gene_data
working on tstat_allele_data
working on gene_MSD
working on allele_MSD
working on psa_data:


### Use the below cell to just replace/update one table:

In [None]:
# Local import: URL is only needed to assemble the connection URL in this cell.
from sqlalchemy.engine import URL

# Loads database config values from database.ini file and validates that user and password are set.
config = load_config()
if (config['user'] == "" or config['password'] == ""):
    print("Please set your user and password in the database.ini file.")
    sys.exit(1)

# Creates a connection pool to PostgreSQL database using SQLAlchemy.
# The URL is assembled from its parts rather than with an f-string so that
# special characters in the user name or password (e.g. '@', ':', '/', '%')
# are escaped instead of corrupting the connection string.
db_url = URL.create(
    drivername="postgresql+psycopg",
    username=config['user'],
    password=config['password'],
    host=config['host'],
    port=config['port'] or None,  # an empty port falls back to the driver default
    database=config['database'],
)
engine = create_engine(db_url)

# Function to insert data into PostgreSQL table, skipping duplicates based on primary keys.
# It is defined here so the cell can be run on its own; the replace below
# passes method=None and therefore does not use it.
def postgres_skip_on_duplicate(pd_table, conn, keys, data_iter):
    """Insert the rows from data_iter, ignoring rows that conflict with existing ones."""
    data = [dict(zip(keys,row)) for row in data_iter]
    conn.execute(insert(pd_table.table).on_conflict_do_nothing(), data)


# Complete tap response data
# WARNING: if_exists='replace' drops the existing table before inserting, so
# everything previously stored in tap_response_data is replaced by the rows of
# tap_psa_output.
# NOTE(review): the recreated table presumably loses its primary-key
# definition — verify before relying on duplicate skipping afterwards.
print("working on tap_output:") 
tap_psa_output.to_sql('tap_response_data', engine, if_exists='replace', index=False, method=None)
print("Done")

In [None]:
# # USE THIS CELL TO UPDATE ALL THE NEEDED TABLES (also have baseline_output on the second line)

# conn=sqlite3.connect('/Users/lavanya/Desktop/Lavanya_Test/data_updated2.db')

# tap_output.to_sql('tap_response_data', conn, if_exists='append', index=False)

# baseline_output.to_sql('tap_baseline_data', conn, if_exists='append', index=False)

# combined_Tstats_normalize_2.reset_index().to_sql('tstat_gene_data', conn, if_exists='append', index=False)

# combined_Tstats_normalize_allele_2.reset_index().to_sql('tstat_allele_data', conn, if_exists='append', index=False)

# combined_Tstats_normalized_melted.to_sql('gene_profile_data', conn, if_exists='append', index=False)

# combined_Tstats_normalized_melted_allele.to_sql('allele_profile_data', conn, if_exists='append', index=False)

# combined_MSD.to_sql('gene_MSD', conn, if_exists='append', index=False)

# allele_combined_MSD.to_sql('allele_MSD', conn, if_exists='append', index=False)

# # combined_Tstats_melted_sorted.to_sql('allele_phenotype_data', conn, if_exists='replace', index=False)

# print(conn.total_changes)

# conn.close()


# # Want to test edge cases of pd.to_sql functionality#############