# 1. Imports and File selection 

In [2]:
import io
import ipywidgets as widgets
import math
import numpy
import psycopg
import pandas as pd
import requests
import sqlite3
import sys
import tqdm
import warnings

from config import load_config
from ipyfilechooser import FileChooser
from scipy import stats
from scipy.stats import ttest_ind
from sqlalchemy import create_engine
from sqlalchemy.dialects.postgresql import insert
from sqlite3 import Error
from sqlite3 import IntegrityError

## Select Baseline .csv File

In [3]:
# Directory in which the file pickers open (machine-specific absolute path;
# edit this when running the notebook on another computer).
starting_directory = '/Users/gurmehak/Documents/RankinLab/Test_Datasets/'

# Widget for picking the baseline .csv file.
baseline_chooser = FileChooser(path=starting_directory)
display(baseline_chooser)

FileChooser(path='/Users/gurmehak/Documents/RankinLab/Test_Datasets', filename='', title='', show_hidden=False…

## Select Tap .csv File

In [4]:
# Widget for picking the tap .csv file. Opens in `starting_directory`, which is
# defined in the baseline file-selection cell, so the path lives in one place.
tap_chooser = FileChooser(starting_directory)
display(tap_chooser)

FileChooser(path='/Users/gurmehak/Documents/RankinLab/Test_Datasets', filename='', title='', show_hidden=False…

## Select Post Stimulus Arousal .csv File

In [5]:
# Widget for picking the post stimulus arousal (psa) .csv file. Opens in
# `starting_directory`, which is defined in the baseline file-selection cell,
# so the path lives in one place.
psa_chooser = FileChooser(starting_directory)
display(psa_chooser)

FileChooser(path='/Users/gurmehak/Documents/RankinLab/Test_Datasets', filename='', title='', show_hidden=False…

In [6]:
# Screens that can be analysed; the first entry is pre-selected in the widget.
screens = [
    'PD_Screen',
    'ASD_Screen',
    'G-Proteins_Screen',
    'Glia_Genes_Screen',
    'Neuron_Genes_Screen',
    'PD_GWAS_Locus71_Screen',
    'ASD_WGS_Screen',
]

screen_chooser = widgets.Select(
    description='Screen:',
    options=screens,
    value=screens[0],
)
display(screen_chooser)

Select(description='Screen:', options=('PD_Screen', 'ASD_Screen', 'G-Proteins_Screen', 'Glia_Genes_Screen', 'N…

In [7]:
# Capture the widget selections as plain variables. `Screen` is read as a
# notebook-level variable by later cells and functions.
Screen = screen_chooser.value
folder_path = baseline_chooser.selected_path

print(folder_path)

/Users/gurmehak/Documents/RankinLab/Test_Datasets/Glia_Genes_Screen_2025


## Read baseline, tap and post stimulus arousal (psa) data

In [8]:
# Load the baseline file picked above; the first CSV column becomes the row index.
baseline_output = pd.read_csv(
    filepath_or_buffer=baseline_chooser.selected,
    index_col=0,
)

print(f"\nShape of the baseline .csv file: {baseline_output.shape}")

# Preview the first five rows
baseline_output.head()


Shape of the baseline .csv file: (707599, 22)


Unnamed: 0,Time,n,Number,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,...,Curve,Crab,Pathlength,Plate_id,Date,Screen,plate,dataset,Gene,Allele
12182,490.041,14,12,0.027,0.0186,0.083,0.0653,0.7677,0.069255,4.3,...,30.1,0.0034,3.168,20241024_171133_B1024,20241024,Glia_Genes_Screen,0,N2,N2,N2
12183,490.082,14,12,0.0359,0.0241,0.083,0.0665,0.7662,0.070288,6.0,...,29.7,0.0102,3.169,20241024_171133_B1024,20241024,Glia_Genes_Screen,0,N2,N2,N2
12184,490.119,14,12,0.0323,0.0246,0.083,0.0665,0.7822,0.071503,5.5,...,29.6,0.0089,3.169,20241024_171133_B1024,20241024,Glia_Genes_Screen,0,N2,N2,N2
12185,490.138,14,12,0.0318,0.0241,0.083,0.0684,0.7796,0.072475,5.1,...,29.4,0.007,3.169,20241024_171133_B1024,20241024,Glia_Genes_Screen,0,N2,N2,N2
12186,490.183,14,12,0.0281,0.0176,0.083,0.0677,0.7779,0.071381,4.6,...,29.9,0.0071,3.17,20241024_171133_B1024,20241024,Glia_Genes_Screen,0,N2,N2,N2


In [11]:
# Read the tap file (the first CSV column becomes the row index)
tap_output = pd.read_csv(tap_chooser.selected, index_col=0)

# The label names the file actually loaded in this cell: the tap file.
print(f"\nShape of the tap .csv file: {tap_output.shape}")

# Print the first five rows of the file
tap_output.head()


Shape of the psa .csv file: (9695, 13)


Unnamed: 0,time,dura,dist,prob,speed,plate,Date,Plate_id,Screen,taps,dataset,Gene,Allele
0,599.983,2.22,0.562,1.0,0.253153,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,1.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
1,609.979,1.45,0.371,0.545455,0.255862,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,2.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
2,619.996,2.11,0.669,0.52,0.317062,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,3.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
3,629.971,1.59,0.422,0.809524,0.265409,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,4.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
4,639.968,1.5,0.378,0.895833,0.252,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,5.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109


In [10]:
# Read the psa file (the first CSV column becomes the row index)
psa_output = pd.read_csv(psa_chooser.selected, index_col=0)

# The label names the file actually loaded in this cell: the psa file.
print(f"\nShape of the psa .csv file: {psa_output.shape}")

# Print the first five rows of the file
psa_output.head()


Shape of the tap .csv file: (9857, 24)


Unnamed: 0,Experiment,Screen,Date,Plate_id,Gene,Allele,dataset,taps,Time,n,...,Tap,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
0,1,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,1.0,607.017,15.0,...,0.0,0.063419,0.774055,0.071953,5.437097,0.323339,66.69032,34.41774,0.007847,2.674403
1,1,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,2.0,617.025,16.193548,...,0.0,0.069426,0.794218,0.077992,13.196775,0.394581,70.537094,37.046772,0.013332,2.905065
2,1,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,3.0,627.02,17.0,...,0.0,0.074361,0.819671,0.083514,9.058064,0.339935,72.60807,30.751612,0.012123,2.678081
3,1,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,4.0,637.02,20.0,...,0.0,0.080366,0.819053,0.085719,8.482259,0.295935,49.640324,31.625807,0.012297,2.752871
4,1,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,5.0,647.075,20.62963,...,0.0,0.077781,0.849924,0.088974,4.809259,0.26113,50.25926,29.918518,0.007341,3.041685


# Merge tap and post stimulus arousal (psa) data

In [12]:
# Outer-join the tap responses with the psa measurements on the plate/strain
# identifiers plus the tap number; rows present in only one table are kept.
# `Experiment` is dropped from the psa side before joining.
tap_psa_output = tap_output.merge(
    psa_output.drop(columns='Experiment'),
    on=['Date', 'Plate_id', 'Screen', 'dataset', 'Gene', 'Allele', "taps"],
    how='outer',
)

tap_psa_output

Unnamed: 0,time,dura,dist,prob,speed,plate,Date,Plate_id,Screen,taps,...,Tap,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
0,599.983,2.22,0.562,1.000000,0.253153,1.0,20240724,20240724_023625_A0724,Glia_Genes_Screen,1.0,...,0.0,0.124076,1.118605,0.153218,12.529310,0.463328,83.258620,41.187930,0.017031,0.095621
1,609.979,1.45,0.371,0.545455,0.255862,1.0,20240724,20240724_023625_A0724,Glia_Genes_Screen,2.0,...,0.0,0.112041,1.128514,0.145828,14.563793,0.398190,65.177590,36.906900,0.024748,0.667638
2,619.996,2.11,0.669,0.520000,0.317062,1.0,20240724,20240724_023625_A0724,Glia_Genes_Screen,3.0,...,0.0,0.101146,1.121352,0.133918,16.111110,0.351426,56.307407,34.988888,0.026074,1.117426
3,629.971,1.59,0.422,0.809524,0.265409,1.0,20240724,20240724_023625_A0724,Glia_Genes_Screen,4.0,...,0.0,0.094000,1.128707,0.129577,14.318966,0.324241,51.543102,34.353447,0.023929,1.816603
4,639.968,1.50,0.378,0.895833,0.252000,1.0,20240724,20240724_023625_A0724,Glia_Genes_Screen,5.0,...,0.0,0.092280,1.138600,0.129543,11.340000,0.284367,43.355000,31.720000,0.021453,2.679917
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9853,859.932,0.78,0.166,0.437500,0.212821,75.0,20250319,20250319_174606_C0319,Glia_Genes_Screen,27.0,...,0.0,0.114903,1.118912,0.150983,4.084848,0.237121,37.060608,26.875760,0.011070,7.177030
9854,869.932,1.14,0.231,0.320000,0.202632,75.0,20250319,20250319_174606_C0319,Glia_Genes_Screen,28.0,...,0.0,0.116145,1.114200,0.150267,5.015151,0.238879,36.896970,26.942423,0.011942,7.058454
9855,879.987,1.39,0.325,0.250000,0.233813,75.0,20250319,20250319_174606_C0319,Glia_Genes_Screen,29.0,...,0.0,0.113608,1.109426,0.148670,4.742105,0.228158,34.723682,26.602633,0.011876,7.053052
9856,889.930,0.57,0.095,0.313725,0.166667,75.0,20250319,20250319_174606_C0319,Glia_Genes_Screen,30.0,...,0.0,0.122656,1.115550,0.151009,4.808824,0.237735,36.814705,28.602942,0.011588,7.537529


In [13]:
# Write the merged tap + psa table to a file named "tap_psa_output" (relative path).
# NOTE(review): the file name has no ".csv" extension — confirm this is intended.
tap_psa_output.to_csv("tap_psa_output")

# 2. DataFrame preparation

### 2.1. Tap Data

In [14]:
# Dataframe for the first tap (taps == 1); response columns are renamed to init_*.
PD_first_tap = (
    tap_output[tap_output.taps == 1]
    # Renumber rows from 0 and discard the old labels directly, rather than
    # materialising them as an "index" column only to drop it again.
    .reset_index(drop=True)
    .rename(columns={"dura": "init_dura", "prob": "init_prob", "speed": "init_speed"}, errors="raise")
)

PD_first_tap.head()

Unnamed: 0,time,init_dura,dist,init_prob,init_speed,plate,Date,Plate_id,Screen,taps,dataset,Gene,Allele
0,599.983,2.22,0.562,1.0,0.253153,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,1.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
1,599.997,1.82,0.408,0.666667,0.224176,2,20240724,20240724_032303_A0724,Glia_Genes_Screen,1.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
2,599.965,2.56,0.457,0.844444,0.178516,3,20240724,20240724_101025_B0724,Glia_Genes_Screen,1.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
3,599.94,2.95,0.491,0.76087,0.166441,4,20240724,20240724_101818_C0724,Glia_Genes_Screen,1.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
4,599.982,2.5,0.421,0.764706,0.1684,5,20240724,20240724_110244_B0724,Glia_Genes_Screen,1.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109


In [15]:
# Dataframe for the recovery tap (taps == 31); response columns are renamed to recov_*.
PD_recov_taps = (
    tap_output[tap_output.taps == 31]
    # Renumber rows from 0 and discard the old labels directly, rather than
    # materialising them as an "index" column only to drop it again.
    .reset_index(drop=True)
    # errors="raise" matches the first-tap and final-taps cells: a missing
    # source column fails loudly instead of being silently skipped.
    .rename(columns={"dura": "recov_dura", "prob": "recov_prob", "speed": "recov_speed"}, errors="raise")
)

PD_recov_taps.head()

Unnamed: 0,time,recov_dura,dist,recov_prob,recov_speed,plate,Date,Plate_id,Screen,taps,dataset,Gene,Allele
0,1189.985,1.51,0.263,0.828571,0.174172,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,31.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
1,1189.939,1.05,0.19,0.630435,0.180952,2,20240724,20240724_032303_A0724,Glia_Genes_Screen,31.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
2,1189.93,2.11,0.271,0.886364,0.128436,3,20240724,20240724_101025_B0724,Glia_Genes_Screen,31.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
3,1189.965,1.69,0.24,0.822222,0.142012,4,20240724,20240724_101818_C0724,Glia_Genes_Screen,31.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
4,1189.967,1.95,0.265,0.973684,0.135897,5,20240724,20240724_110244_B0724,Glia_Genes_Screen,31.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109


In [16]:
# Dataframe for the last three taps (taps 28 to 30 inclusive): every remaining
# column is averaged per plate, then the response columns are renamed to final_*.
PD_final_taps = (
    tap_output[tap_output.taps.between(28, 30)]
    .groupby(["dataset", "Date", "Plate_id", "Screen", "Gene", "Allele", "plate"], as_index=False)
    .mean()
    .rename(columns={"dura": "final_dura", "prob": "final_prob", "speed": "final_speed"}, errors="raise")
)

PD_final_taps.head()

Unnamed: 0,dataset,Date,Plate_id,Screen,Gene,Allele,plate,time,final_dura,dist,final_prob,final_speed,taps
0,AMshABLATE_nsIs109,20240724,20240724_023625_A0724,Glia_Genes_Screen,AMshABLATE,nsIs109,1,879.970333,0.866667,0.144333,0.41978,0.167793,29.0
1,AMshABLATE_nsIs109,20240724,20240724_032303_A0724,Glia_Genes_Screen,AMshABLATE,nsIs109,2,879.969333,0.906667,0.144667,0.336761,0.160319,29.0
2,AMshABLATE_nsIs109,20240724,20240724_101025_B0724,Glia_Genes_Screen,AMshABLATE,nsIs109,3,879.936,1.48,0.215667,0.611189,0.145128,29.0
3,AMshABLATE_nsIs109,20240724,20240724_101818_C0724,Glia_Genes_Screen,AMshABLATE,nsIs109,4,879.949,1.53,0.208,0.567925,0.136765,29.0
4,AMshABLATE_nsIs109,20240724,20240724_110244_B0724,Glia_Genes_Screen,AMshABLATE,nsIs109,5,879.947667,1.466667,0.217333,0.503704,0.148535,29.0


In [17]:
# Habituation levels: attach the final-tap averages to each first-tap row, drop
# the duplicated time/dist/taps columns and incomplete rows, then compute the
# drop from initial to final response (init_* minus final_*).
PD_habit_levels = (
    PD_first_tap
    .merge(
        PD_final_taps,
        how='left',
        on=['dataset', 'plate', "Plate_id", "Screen", "Gene", "Allele", "Date"],
    )
    .drop(columns=['time_x', 'time_y', 'dist_x', 'dist_y', 'taps_x', 'taps_y'])
    .dropna()
    .assign(
        habit_dura=lambda merged: merged['init_dura'] - merged['final_dura'],
        habit_prob=lambda merged: merged['init_prob'] - merged['final_prob'],
        habit_speed=lambda merged: merged['init_speed'] - merged['final_speed'],
    )
)

In [18]:
# Attach the recovery-tap columns to the habituation levels. An outer join is
# used only when there are no recovery rows at all; otherwise a left join.
PD_habituation = pd.merge(
    PD_habit_levels,
    PD_recov_taps,
    on=['dataset', 'plate', "Plate_id", "Screen", "Gene", "Allele", "Date"],
    how='outer' if PD_recov_taps.empty else 'left',
)

# For every screen except the two listed, rows with any missing value are removed
# before the derived columns are computed.
if Screen not in ['Neuron_Genes_Screen', 'G-Proteins_Screen']:
    PD_habituation = PD_habituation.dropna()

# Derived columns, added in this order:
#   recovery_*         : percent change of the recovery response relative to the initial one
#   memory_retention_* : recovery response minus the final (habituated) response
PD_habituation = PD_habituation.assign(
    **{
        f'recovery_{kind}': (
            (PD_habituation[f'recov_{kind}'] - PD_habituation[f'init_{kind}'])
            / PD_habituation[f'init_{kind}'] * 100
        )
        for kind in ('dura', 'prob', 'speed')
    },
    **{
        f'memory_retention_{kind}': PD_habituation[f'recov_{kind}'] - PD_habituation[f'final_{kind}']
        for kind in ('dura', 'prob', 'speed')
    },
)

# `tap_data` is the final table. For the two listed screens only the listed
# columns must be complete (recovery columns may be missing); for every other
# screen any missing value removes the row (subset=None checks all columns).
tap_data = PD_habituation.dropna(
    subset=(
        ['init_dura', 'init_prob', 'init_speed', 'plate', 'Date', 'Plate_id',
         'Screen', 'dataset', 'Gene', 'Allele', 'final_dura', 'final_prob',
         'final_speed', 'habit_dura', 'habit_prob', 'habit_speed']
        if Screen in ['Neuron_Genes_Screen', 'G-Proteins_Screen']
        else None
    )
)

# Display final dataframe
tap_data.head()


Unnamed: 0,init_dura,init_prob,init_speed,plate,Date,Plate_id,Screen,dataset,Gene,Allele,...,dist,recov_prob,recov_speed,taps,recovery_dura,recovery_prob,recovery_speed,memory_retention_dura,memory_retention_prob,memory_retention_speed
0,2.22,1.0,0.253153,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,...,0.263,0.828571,0.174172,31.0,-31.981982,-17.142857,-31.198888,0.643333,0.408791,0.006379
1,1.82,0.666667,0.224176,2,20240724,20240724_032303_A0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,...,0.19,0.630435,0.180952,31.0,-42.307692,-5.434783,-19.281046,0.143333,0.293673,0.020633
2,2.56,0.844444,0.178516,3,20240724,20240724_101025_B0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,...,0.271,0.886364,0.128436,31.0,-17.578125,4.964115,-28.053346,0.63,0.275175,-0.016692
3,2.95,0.76087,0.166441,4,20240724,20240724_101818_C0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,...,0.24,0.822222,0.142012,31.0,-42.711864,8.063492,-14.677207,0.16,0.254298,0.005247
4,2.5,0.764706,0.1684,5,20240724,20240724_110244_B0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,...,0.265,0.973684,0.135897,31.0,-22.0,27.327935,-19.30081,0.483333,0.469981,-0.012638


### 2.2. PSA data

In [19]:
# Function to calculate the initial, recovery, peak, etc. values for the specified column (metric)

def summary_metrics(df, metric = 'Instantaneous Speed'):
    """
    Summarise one metric column for a single group of rows.

    Rows are used in the order given: "initial" is the first row and "recovery"
    is the last row, so the caller must pass rows in the intended order.

    Parameters:
        df (pd.DataFrame): rows of one group; must contain `metric` and at least one row
        metric (str): name of the column to summarise

    Returns:
        pd.Series with six entries labelled 'PSA Initial {metric}',
        'PSA Recovery {metric}', 'PSA Peak {metric}', 'PSA Initial_to_peak {metric}',
        'PSA Peak_to_recovery {metric}' and 'PSA Average {metric}'.
    """
    values = df[metric]

    initial = values.iloc[0]
    recovery = values.iloc[-1]
    peak = values.max()
    mean = values.mean()

    # Position of the peak, ignoring missing values the same way `.max()` does.
    # A plain numpy argmax would return the position of the first NaN instead of
    # the real peak. An all-NaN column falls back to position 0, which leaves
    # every summary value NaN.
    peak_id = int(values.fillna(float('-inf')).to_numpy().argmax())

    # Both partial means include the peak row itself.
    initial_to_peak = values.iloc[: peak_id + 1].mean()
    peak_to_recovery = values.iloc[peak_id:].mean()

    return pd.Series({
        f'PSA Initial {metric}': initial,
        f'PSA Recovery {metric}': recovery,
        f'PSA Peak {metric}': peak,
        f'PSA Initial_to_peak {metric}': initial_to_peak,
        f'PSA Peak_to_recovery {metric}': peak_to_recovery,
        f'PSA Average {metric}': mean
        })

In [28]:
# Silences every warning for the rest of the session (note: this also hides
# warnings unrelated to this cell).
warnings.filterwarnings('ignore')

# Columns to summarise
metrics_to_summarize = [
    'Instantaneous Speed', 'Bias', 'Angular Speed', 'Aspect Ratio',
    'Kink', 'Curve', 'Crab',
]

# Columns that identify one plate/experiment
group_cols = ['Experiment', 'Plate_id', 'Date', 'Screen', 'dataset', 'Gene', 'Allele']

# Start from one row per group, then left-join the six `summary_metrics` values
# of each metric onto it, one metric at a time.
psa_data = psa_output[group_cols].drop_duplicates()
for metric in metrics_to_summarize:
    summary = (
        psa_output
        .groupby(group_cols)
        .apply(summary_metrics, metric=metric)
        .reset_index()
    )
    psa_data = psa_data.merge(summary, on=group_cols, how='left')

In [31]:
# Preview the PSA summary table built in the previous cell.
psa_data.head()

Unnamed: 0,Experiment,Plate_id,Date,Screen,dataset,Gene,Allele,PSA Initial Instantaneous Speed,PSA Recovery Instantaneous Speed,PSA Peak Instantaneous Speed,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
0,1,20241024_171133_B1024,20241024,Glia_Genes_Screen,N2,N2,N2,0.058189,0.035267,0.111542,...,37.046772,35.732256,28.879593,29.058243,0.007847,0.007175,0.013332,0.01059,0.008201,0.008189
1,2,20240724_025822_A0724,20240724,Glia_Genes_Screen,N2,N2,N2,0.070634,0.086083,0.201181,...,37.37931,37.37931,30.650897,30.650897,0.01149,0.009569,0.020244,0.017161,0.012907,0.013082
2,3,20240724_035049_A0724,20240724,Glia_Genes_Screen,N2,N2,N2,0.078074,0.06256,0.235954,...,38.19434,38.19434,30.402042,30.402042,0.01413,0.009323,0.025125,0.020426,0.014596,0.014821
3,4,20240724_094826_B0724,20240724,Glia_Genes_Screen,N2,N2,N2,0.127673,0.125018,0.227077,...,36.342426,36.342426,28.917527,28.917527,0.019697,0.012982,0.021259,0.020478,0.013067,0.013281
4,5,20240724_095505_C0724,20240724,Glia_Genes_Screen,N2,N2,N2,0.097451,0.113376,0.235098,...,40.177143,40.177143,29.27945,29.27945,0.018106,0.013045,0.02145,0.019778,0.013717,0.013858


In [32]:
# (rows, columns) of the PSA summary table.
psa_data.shape

(318, 49)

# 3. Run Statistics (T-Test and mean sample distance) on Data

## 3.1 Generate dataframes conditioned by `baseline` ("true"/"false"/"psa") and `allele` (True/False)

In [33]:
def get_output_byplate(output, baseline=["true", "false", "psa"], allele = [False, True]):
    """
    Average the data per plate and keep a single identifier column.

    Rows are grouped by 'Plate_id', 'Date', 'Screen', 'dataset', 'Gene' and 'Allele'
    and averaged; the bookkeeping columns for the chosen data type are then removed.

    Parameters:
        output (pd.DataFrame): table to aggregate (baseline, tap or psa data)
        baseline (str): which column set to remove: "true" for baseline data,
            "false" for tap-response data; any other value, including the
            default list, is handled as psa data
        allele (bool): truthy keeps 'dataset' and removes 'Gene'; falsy keeps
            'Gene' and removes 'dataset' (the default list is truthy)

    Returns:
        A DataFrame with plate-level averages
    """
    # Bookkeeping columns to remove after averaging, per data type.
    if baseline == "true":
        unwanted = ['Plate_id','n','Number','Time','Screen','Date','Allele']
    elif baseline == "false":
        unwanted = ['Plate_id','Screen','Date','Allele','dist','plate','time',
                    'taps','recov_dura','recov_prob','recov_speed']
    else:
        unwanted = ['Experiment', 'Plate_id', 'Date', 'Screen', 'Allele']

    # Only one of the two strain labels is kept in the result.
    label_to_remove = 'Gene' if allele else 'dataset'

    plate_means = output.groupby(
        ['Plate_id','Date','Screen','dataset','Gene','Allele'],
        as_index=False,
    ).mean()

    return plate_means.drop(columns=unwanted + [label_to_remove])

#### 3.1.1 `baseline` = True, `allele` = False

In [34]:
# Plate-level baseline averages, labelled by gene.
baseline_output_byplate = get_output_byplate(
    output=baseline_output,
    baseline="true",
    allele=False,
)

print(f"Shape: {baseline_output_byplate.shape}")

baseline_output_byplate.head()

Shape: (318, 14)


Unnamed: 0,Gene,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength,plate
0,AMshABLATE,0.028947,0.025567,-0.00887,0.145966,1.239813,0.190805,2.056068,0.330311,57.238641,29.283398,0.006057,1.904028,0.0
1,N2,0.060303,0.0723,0.157983,0.099035,1.213483,0.145692,2.757155,0.25276,41.775481,29.871081,0.007687,7.330147,62.0
2,AMshABLATE,0.030928,0.025825,0.000614,0.092516,1.115931,0.127868,2.036695,0.256568,47.684237,28.306229,0.005724,2.600575,58.0
3,N2,0.042891,0.048333,0.037748,0.102591,1.21481,0.148884,2.70357,0.271092,49.11343,30.837654,0.007713,4.78246,120.0
4,N2,0.113829,0.094346,0.653023,0.11996,1.146936,0.155877,4.156619,0.245972,40.113238,28.564274,0.009363,3.751665,173.0


#### 3.1.2 `baseline` = False, `allele` = False

In [35]:
# Plate-level tap-response averages, labelled by gene.
tap_data_byplate = get_output_byplate(
    output=tap_data,
    baseline="false",
    allele=False,
)

print(f"Shape: {tap_data_byplate.shape}")

tap_data_byplate.head()

Shape: (312, 16)


Unnamed: 0,Gene,init_dura,init_prob,init_speed,final_dura,final_prob,final_speed,habit_dura,habit_prob,habit_speed,recovery_dura,recovery_prob,recovery_speed,memory_retention_dura,memory_retention_prob,memory_retention_speed
0,AMshABLATE,2.22,1.0,0.253153,0.866667,0.41978,0.167793,1.353333,0.58022,0.08536,-31.981982,-17.142857,-31.198888,0.643333,0.408791,0.006379
1,N2,2.64,0.780488,0.233712,0.766667,0.291875,0.208448,1.873333,0.488612,0.025264,-26.893939,-13.327206,-7.330305,1.163333,0.384595,0.008132
2,AMshABLATE,1.82,0.666667,0.224176,0.906667,0.336761,0.160319,0.913333,0.329905,0.063857,-42.307692,-5.434783,-19.281046,0.143333,0.293673,0.020633
3,N2,2.25,0.84,0.223556,0.756667,0.196452,0.211548,1.493333,0.643548,0.012008,-20.0,4.166667,3.379722,1.043333,0.678548,0.019563
4,N2,3.62,0.933333,0.269337,0.563333,0.21819,0.160423,3.056667,0.715144,0.108914,-25.414365,-23.076923,-10.892308,2.136667,0.499759,0.079577


#### 3.1.3 `baseline` = True, `allele` = True

In [36]:
# Plate-level baseline averages, labelled by dataset (allele level).
baseline_output_allele_byplate = get_output_byplate(
    output=baseline_output,
    baseline="true",
    allele=True,
)

print(f"Shape: {baseline_output_allele_byplate.shape}")

baseline_output_allele_byplate.head()

Shape: (318, 14)


Unnamed: 0,dataset,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength,plate
0,AMshABLATE_nsIs109,0.028947,0.025567,-0.00887,0.145966,1.239813,0.190805,2.056068,0.330311,57.238641,29.283398,0.006057,1.904028,0.0
1,N2,0.060303,0.0723,0.157983,0.099035,1.213483,0.145692,2.757155,0.25276,41.775481,29.871081,0.007687,7.330147,62.0
2,AMshABLATE_nsIs109,0.030928,0.025825,0.000614,0.092516,1.115931,0.127868,2.036695,0.256568,47.684237,28.306229,0.005724,2.600575,58.0
3,N2,0.042891,0.048333,0.037748,0.102591,1.21481,0.148884,2.70357,0.271092,49.11343,30.837654,0.007713,4.78246,120.0
4,N2,0.113829,0.094346,0.653023,0.11996,1.146936,0.155877,4.156619,0.245972,40.113238,28.564274,0.009363,3.751665,173.0


#### 3.1.4 `baseline` = False, `allele` = True

In [37]:
# Plate-level tap-response averages, labelled by dataset (allele level).
tap_data_allele_byplate = get_output_byplate(
    output=tap_data,
    baseline="false",
    allele=True,
)

print(f"Shape: {tap_data_allele_byplate.shape}")

tap_data_allele_byplate.head()

Shape: (312, 16)


Unnamed: 0,dataset,init_dura,init_prob,init_speed,final_dura,final_prob,final_speed,habit_dura,habit_prob,habit_speed,recovery_dura,recovery_prob,recovery_speed,memory_retention_dura,memory_retention_prob,memory_retention_speed
0,AMshABLATE_nsIs109,2.22,1.0,0.253153,0.866667,0.41978,0.167793,1.353333,0.58022,0.08536,-31.981982,-17.142857,-31.198888,0.643333,0.408791,0.006379
1,N2,2.64,0.780488,0.233712,0.766667,0.291875,0.208448,1.873333,0.488612,0.025264,-26.893939,-13.327206,-7.330305,1.163333,0.384595,0.008132
2,AMshABLATE_nsIs109,1.82,0.666667,0.224176,0.906667,0.336761,0.160319,0.913333,0.329905,0.063857,-42.307692,-5.434783,-19.281046,0.143333,0.293673,0.020633
3,N2,2.25,0.84,0.223556,0.756667,0.196452,0.211548,1.493333,0.643548,0.012008,-20.0,4.166667,3.379722,1.043333,0.678548,0.019563
4,N2,3.62,0.933333,0.269337,0.563333,0.21819,0.160423,3.056667,0.715144,0.108914,-25.414365,-23.076923,-10.892308,2.136667,0.499759,0.079577


In [38]:
# tap_data_allele_byplate[tap_data_allele_byplate.dataset=='N2_XJ1']

#### 3.1.5 `baseline` = "psa" , `allele` = False

In [39]:
# Plate-level psa summary averages, labelled by gene.
psa_data_byplate = get_output_byplate(
    output=psa_data,
    baseline="psa",
    allele=False,
)

print(f"Shape: {psa_data_byplate.shape}")

psa_data_byplate.head()

Shape: (318, 43)


Unnamed: 0,Gene,PSA Initial Instantaneous Speed,PSA Recovery Instantaneous Speed,PSA Peak Instantaneous Speed,PSA Initial_to_peak Instantaneous Speed,PSA Peak_to_recovery Instantaneous Speed,PSA Average Instantaneous Speed,PSA Initial Bias,PSA Recovery Bias,PSA Peak Bias,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
0,AMshABLATE,0.084048,0.048078,0.213855,0.173633,0.11619,0.122304,0.213983,0.042478,0.936383,...,41.18793,41.18793,31.538197,31.538197,0.017031,0.009287,0.026074,0.022618,0.013657,0.014124
1,N2,0.070634,0.086083,0.201181,0.170146,0.147193,0.150634,0.241138,0.268167,0.909474,...,37.37931,37.37931,30.650897,30.650897,0.01149,0.009569,0.020244,0.017161,0.012907,0.013082
2,AMshABLATE,0.054834,0.034445,0.180244,0.140297,0.101329,0.106326,0.100021,0.034,0.749591,...,37.464584,37.045059,31.154585,31.331067,0.011026,0.007,0.021827,0.017477,0.012628,0.0128
3,N2,0.078074,0.06256,0.235954,0.182166,0.159633,0.160805,0.163377,0.329,0.955111,...,38.19434,38.19434,30.402042,30.402042,0.01413,0.009323,0.025125,0.020426,0.014596,0.014821
4,N2,0.127673,0.125018,0.227077,0.197505,0.167496,0.171382,0.654212,0.677227,0.939548,...,36.342426,36.342426,28.917527,28.917527,0.019697,0.012982,0.021259,0.020478,0.013067,0.013281


#### 3.1.6 `baseline` = "psa" , `allele` = True

In [40]:
# Plate-level psa summary averages, labelled by dataset (allele level).
psa_data_allele_byplate = get_output_byplate(
    output=psa_data,
    baseline="psa",
    allele=True,
)

print(f"Shape: {psa_data_allele_byplate.shape}")

psa_data_allele_byplate.head()

Shape: (318, 43)


Unnamed: 0,dataset,PSA Initial Instantaneous Speed,PSA Recovery Instantaneous Speed,PSA Peak Instantaneous Speed,PSA Initial_to_peak Instantaneous Speed,PSA Peak_to_recovery Instantaneous Speed,PSA Average Instantaneous Speed,PSA Initial Bias,PSA Recovery Bias,PSA Peak Bias,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
0,AMshABLATE_nsIs109,0.084048,0.048078,0.213855,0.173633,0.11619,0.122304,0.213983,0.042478,0.936383,...,41.18793,41.18793,31.538197,31.538197,0.017031,0.009287,0.026074,0.022618,0.013657,0.014124
1,N2,0.070634,0.086083,0.201181,0.170146,0.147193,0.150634,0.241138,0.268167,0.909474,...,37.37931,37.37931,30.650897,30.650897,0.01149,0.009569,0.020244,0.017161,0.012907,0.013082
2,AMshABLATE_nsIs109,0.054834,0.034445,0.180244,0.140297,0.101329,0.106326,0.100021,0.034,0.749591,...,37.464584,37.045059,31.154585,31.331067,0.011026,0.007,0.021827,0.017477,0.012628,0.0128
3,N2,0.078074,0.06256,0.235954,0.182166,0.159633,0.160805,0.163377,0.329,0.955111,...,38.19434,38.19434,30.402042,30.402042,0.01413,0.009323,0.025125,0.020426,0.014596,0.014821
4,N2,0.127673,0.125018,0.227077,0.197505,0.167496,0.171382,0.654212,0.677227,0.939548,...,36.342426,36.342426,28.917527,28.917527,0.019697,0.012982,0.021259,0.020478,0.013067,0.013281


## 3.2 Calculate Mean Distances and CIs

In [41]:

def extract_phenotypes(df):
    '''
    Break a wide table into one small table per phenotype.

    input:
        df (pd.DataFrame): first column is the identifier, every other column is a phenotype

    returns:
        list of DataFrames, one per phenotype column in the original order; each is
        an independent copy holding the identifier column and that one phenotype
    '''
    id_col = df.columns[0]
    return [df[[id_col, pheno_col]].copy() for pheno_col in df.columns[1:]]



def ci95(df):
    """
    Add 95% confidence-interval bounds (Student's t) for every metric in `df`.

    input: df with two-level columns (metric, statistic); each metric must
        provide the statistics 'mean', 'count' and 'sem' (standard error of the
        mean). A top-level column named 'Gene' is skipped.

    returns: the same df, modified in place, with (metric, 'ci95_hi') and
        (metric, 'ci95_lo') columns added for each metric. The interval uses
        count - 1 degrees of freedom, is centred on the mean and scaled by the sem.

    """
    for metric in df.columns.levels[0]:
        if metric == 'Gene':
            continue

        block = df[metric]
        # One vectorised call yields both bounds for every row at once.
        lower, upper = stats.t.interval(
            confidence=0.95,
            df=block['count'].to_numpy() - 1,
            loc=block['mean'].to_numpy(),
            scale=block['sem'].to_numpy(),
        )
        df[metric, 'ci95_hi'] = upper
        df[metric, 'ci95_lo'] = lower

    return df



def calculate_MSD(list_of_dfs, by):
    """
    Per-group mean, count, SEM and 95% CI of each phenotype, expressed relative
    to the N2 control mean.

    NOTE: this notebook defines `calculate_MSD` in two consecutive cells; the
    later definition is the one in effect after "Run All". Keep the two in sync
    or delete one of them.

    Parameters:
        list_of_dfs (list of pd.DataFrame): one dataframe per phenotype; each must
            contain the grouping column `by`, and the phenotype is read from the
            2nd column
        by (str): column to group on ("Gene" or "dataset")

    Returns:
        list of pd.DataFrame, one per phenotype, indexed by `by`, with columns
        (phenotype, 'mean' | 'count' | 'sem' | 'ci95_hi' | 'ci95_lo'). The mean
        and both CI bounds have the control mean subtracted.

    Raises:
        IndexError: if a phenotype has no control row.

    Relies on the notebook-level `Screen` variable and on the `ci95` helper.
    """
    new_list_of_dfs = []

    for df in list_of_dfs:
        # Phenotype column name (the 2nd column is the metric)
        pheno_col = df.columns[1]

        # Named `summary` rather than `stats` so the imported scipy `stats`
        # module is not shadowed inside this function.
        summary = df.groupby(by)[pheno_col].agg(['mean', 'count', 'sem'])

        # Put the phenotype name on the top column level, as `ci95` expects
        if not isinstance(summary.columns, pd.MultiIndex):
            summary.columns = pd.MultiIndex.from_tuples([(pheno_col, col) for col in summary.columns])

        summary = ci95(summary)

        # Locate the control row(s). Only the Neuron_Genes_Screen grouped by
        # something other than "Gene" uses the N2_XJ1 / N2_N2 labels.
        if Screen == "Neuron_Genes_Screen" and by != "Gene":
            control_mask = summary.index.isin(['N2_XJ1', 'N2_N2'])
        else:
            control_mask = summary.index == 'N2'

        if not control_mask.any():
            raise IndexError(
                f"No N2 control group found for phenotype {pheno_col!r} when grouping by {by!r}"
            )

        # The first matching control row supplies the reference mean.
        control_mean = summary.loc[control_mask, (pheno_col, 'mean')].iloc[0]

        # Shift the mean and both CI bounds by the control mean; columns are
        # addressed by label rather than by position.
        for stat_name in ('mean', 'ci95_hi', 'ci95_lo'):
            summary[(pheno_col, stat_name)] = summary[(pheno_col, stat_name)] - control_mean

        new_list_of_dfs.append(summary)

    return new_list_of_dfs

In [42]:
def calculate_MSD(list_of_dfs, by):
    """
    Per-group mean, count, SEM and 95% CI of each phenotype, expressed relative
    to the N2 control mean.

    NOTE: this notebook defines `calculate_MSD` in two consecutive cells; this
    later definition is the one in effect after "Run All". Keep the two in sync
    or delete one of them.

    Parameters:
        list_of_dfs (list of pd.DataFrame): one dataframe per phenotype; each must
            contain the grouping column `by`, and the phenotype is read from the
            2nd column
        by (str): column to group on ("Gene" or "dataset")

    Returns:
        list of pd.DataFrame, one per phenotype, indexed by `by`, with columns
        (phenotype, 'mean' | 'count' | 'sem' | 'ci95_hi' | 'ci95_lo'). The mean
        and both CI bounds have the control mean subtracted.

    Raises:
        IndexError: if a phenotype has no control row.

    Relies on the notebook-level `Screen` variable and on the `ci95` helper.
    """
    new_list_of_dfs = []

    for df in list_of_dfs:
        # Phenotype column name (the 2nd column is the metric)
        pheno_col = df.columns[1]

        # Named `summary` rather than `stats` so the imported scipy `stats`
        # module is not shadowed inside this function.
        summary = df.groupby(by)[pheno_col].agg(['mean', 'count', 'sem'])

        # Put the phenotype name on the top column level, as `ci95` expects
        if not isinstance(summary.columns, pd.MultiIndex):
            summary.columns = pd.MultiIndex.from_tuples([(pheno_col, col) for col in summary.columns])

        summary = ci95(summary)

        # Locate the control row(s). Only the Neuron_Genes_Screen grouped by
        # something other than "Gene" uses the N2_XJ1 / N2_N2 labels.
        if Screen == "Neuron_Genes_Screen" and by != "Gene":
            control_mask = summary.index.isin(['N2_XJ1', 'N2_N2'])
        else:
            control_mask = summary.index == 'N2'

        if not control_mask.any():
            raise IndexError(
                f"No N2 control group found for phenotype {pheno_col!r} when grouping by {by!r}"
            )

        # The first matching control row supplies the reference mean.
        control_mean = summary.loc[control_mask, (pheno_col, 'mean')].iloc[0]

        # Shift the mean and both CI bounds by the control mean; columns are
        # addressed by label rather than by position.
        for stat_name in ('mean', 'ci95_hi', 'ci95_lo'):
            summary[(pheno_col, stat_name)] = summary[(pheno_col, stat_name)] - control_mean

        new_list_of_dfs.append(summary)

    return new_list_of_dfs

In [43]:
def get_MSD(list_MSD):
    '''
    Combine the per-phenotype MSD dataframes into one wide dataframe.

    input: Non-empty list of dataframes, each representing a phenotype with
        calculated MSD. All are expected to share the same kind of index.

    returns: Single combined dataframe: the first dataframe with every other one
        joined onto its index, in list order. A one-element list returns that
        dataframe unchanged.

    raises: ValueError if the list is empty.
    '''
    if len(list_MSD) == 0:
        raise ValueError("get_MSD needs at least one phenotype dataframe")

    # Start from the first phenotype and join the rest positionally, instead of
    # detecting "the first one" by comparing column labels.
    MSD = list_MSD[0]
    for phenotype_df in list_MSD[1:]:
        MSD = MSD.join(phenotype_df)
    return MSD

In [44]:
def get_combined_MSD(baseline_byplate,tap_byplate, psa_byplate, by=['Gene','dataset']):
    """
    Build one flat table of MSD statistics from the baseline, tap and PSA
    plate-level data.

    input:
        - baseline_byplate: baseline data by plate
        - tap_byplate: tap data by plate
        - psa_byplate: post stimulus arousal (PSA) data by plate
        - by: what to group by, "Gene" or "dataset" (a single string: the
          final rename builds the key by + "-")
    returns:
        - combined MSD dataframe with flattened "<phenotype>-<statistic>"
          column names, the grouping column and a 'Screen' column
    """
    plate_tables = (baseline_byplate, tap_byplate, psa_byplate)

    # Per-phenotype MSD frames for each assay, then one wide frame per assay.
    per_assay_frames = [
        calculate_MSD(extract_phenotypes(plates), by=by) for plates in plate_tables
    ]
    baseline_MSD, tap_MSD, psa_MSD = [get_MSD(frames) for frames in per_assay_frames]

    # Outer merges keep groups that are missing from one of the assays.
    baseline_and_tap = pd.merge(baseline_MSD, tap_MSD, on=by, how='outer')
    combined_MSD = pd.merge(baseline_and_tap, psa_MSD, on=by, how='outer')

    # Short tap-metric keys -> display labels (spelling reproduced exactly).
    display_labels = {
        "habit_dura": "Habituation of Response Duration",
        "habit_prob": "Habituation of Respones Probability",
        "habit_speed": "Habituation of Response Speed",
        "init_dura": "Initial Response Duration",
        "init_prob": "Initial Response Probability",
        "init_speed": "Initial Response Speed",
        "final_dura": "Final Response Duration",
        "final_prob": "Final Response Probability",
        "final_speed": "Final Response Speed",
        "recovery_dura": "Spontaneous Recovery of Response Duration",
        "recovery_prob": "Spontaneous Recovery of Response Probability",
        "recovery_speed": "Spontaneous Recovery of Response Speed",
        "memory_retention_dura": "Memory Retention of Response Duration",
        "memory_retention_prob": "Memory Retention of Response Probability",
        "memory_retention_speed": "Memory Retention of Response Speed",
    }
    combined_MSD = combined_MSD.rename(columns=display_labels)

    # Flatten the two-level column labels to "<level0>-<level1>".
    combined_MSD = combined_MSD.reset_index()
    flat_labels = combined_MSD.columns.to_flat_index().str.join('-')
    combined_MSD.columns = flat_labels

    # The grouping column comes out as "<by>-"; strip the trailing dash.
    combined_MSD = combined_MSD.rename(columns={by+"-": by})
    combined_MSD['Screen'] = Screen

    return combined_MSD

### 3.2.1 Gene-level SMD

In [45]:
# Gene-level MSD table: baseline, tap and PSA plate data grouped by 'Gene'.
combined_MSD=get_combined_MSD(baseline_output_byplate,
                              tap_data_byplate, 
                              psa_data_byplate,
                              by='Gene')

# Preview the first rows only.
combined_MSD.head()

Unnamed: 0,Gene,Instantaneous Speed-mean,Instantaneous Speed-count,Instantaneous Speed-sem,Instantaneous Speed-ci95_hi,Instantaneous Speed-ci95_lo,Interval Speed-mean,Interval Speed-count,Interval Speed-sem,Interval Speed-ci95_hi,...,PSA Peak_to_recovery Crab-count,PSA Peak_to_recovery Crab-sem,PSA Peak_to_recovery Crab-ci95_hi,PSA Peak_to_recovery Crab-ci95_lo,PSA Average Crab-mean,PSA Average Crab-count,PSA Average Crab-sem,PSA Average Crab-ci95_hi,PSA Average Crab-ci95_lo,Screen
0,AMshABLATE,-0.041748,5,0.001839,-0.036643,-0.046853,-0.046065,5,0.001349,-0.04232,...,5,0.000373,-0.000889,-0.002957,-0.001843,5,0.000412,-0.000698,-0.002987,Glia_Genes_Screen
1,N2,0.0,75,0.002216,0.004416,-0.004416,0.0,75,0.001907,0.0038,...,75,0.000247,0.000492,-0.000492,0.0,75,0.000251,0.0005,-0.0005,Glia_Genes_Screen
2,ced-10,-0.032393,12,0.002271,-0.027395,-0.037392,-0.039954,12,0.00234,-0.034804,...,12,0.000519,6.7e-05,-0.002219,-0.00099,12,0.000637,0.000411,-0.002391,Glia_Genes_Screen
3,ced-5,-0.029867,9,0.006192,-0.015588,-0.044147,-0.041997,9,0.005558,-0.02918,...,9,0.000781,-0.003997,-0.007598,-0.0065,9,0.000804,-0.004647,-0.008353,Glia_Genes_Screen
4,delm-1,-0.001439,10,0.006662,0.013633,-0.016511,-0.008167,10,0.011252,0.017287,...,10,0.000711,0.003089,-0.000128,0.001008,10,0.000594,0.00235,-0.000335,Glia_Genes_Screen


### 3.2.2 Allele-level SMD

In [46]:
# Allele-level MSD table: the allele plate data grouped by 'dataset'.
allele_combined_MSD=get_combined_MSD(baseline_output_allele_byplate,
                                     tap_data_allele_byplate, 
                                     psa_data_allele_byplate,
                                     by='dataset')

# Preview the first rows only.
allele_combined_MSD.head()

Unnamed: 0,dataset,Instantaneous Speed-mean,Instantaneous Speed-count,Instantaneous Speed-sem,Instantaneous Speed-ci95_hi,Instantaneous Speed-ci95_lo,Interval Speed-mean,Interval Speed-count,Interval Speed-sem,Interval Speed-ci95_hi,...,PSA Peak_to_recovery Crab-count,PSA Peak_to_recovery Crab-sem,PSA Peak_to_recovery Crab-ci95_hi,PSA Peak_to_recovery Crab-ci95_lo,PSA Average Crab-mean,PSA Average Crab-count,PSA Average Crab-sem,PSA Average Crab-ci95_hi,PSA Average Crab-ci95_lo,Screen
0,AMshABLATE_nsIs109,-0.041748,5,0.001839,-0.036643,-0.046853,-0.046065,5,0.001349,-0.04232,...,5,0.000373,-0.000889,-0.002957,-0.001843,5,0.000412,-0.000698,-0.002987,Glia_Genes_Screen
1,N2,0.0,75,0.002216,0.004416,-0.004416,0.0,75,0.001907,0.0038,...,75,0.000247,0.000492,-0.000492,0.0,75,0.000251,0.0005,-0.0005,Glia_Genes_Screen
2,ced-10_n3246,-0.032393,12,0.002271,-0.027395,-0.037392,-0.039954,12,0.00234,-0.034804,...,12,0.000519,6.7e-05,-0.002219,-0.00099,12,0.000637,0.000411,-0.002391,Glia_Genes_Screen
3,ced-5_n2002,-0.029867,9,0.006192,-0.015588,-0.044147,-0.041997,9,0.005558,-0.02918,...,9,0.000781,-0.003997,-0.007598,-0.0065,9,0.000804,-0.004647,-0.008353,Glia_Genes_Screen
4,delm-1_ok1226,-0.001439,10,0.006662,0.013633,-0.016511,-0.008167,10,0.011252,0.017287,...,10,0.000711,0.003089,-0.000128,0.001008,10,0.000594,0.00235,-0.000335,Glia_Genes_Screen


## 3.3 T-Stat analysis

In [47]:
def baseline_metrics(by=["Gene","dataset"]):
    """
    Build the empty result frames and the metric names for the baseline analysis.

    input:
        by: what to group by, "Gene" or "dataset"; used as the first column
            label of every result frame

    returns:
        list_baseline_Tstats: one empty dataframe per metric, with columns
            [by, <metric name>], to store t-statistics
        list_baseline_metrics: the metric names, in the same order
    """
    list_baseline_metrics = [
        "Instantaneous Speed",
        "Interval Speed",
        "Bias",
        "Morphwidth",
        "Midline",
        "Area",
        "Angular Speed",
        "Aspect Ratio",
        "Kink",
        "Curve",
        "Crab",
        "Pathlength",
    ]

    # For baseline data the result column is named after the metric itself.
    list_baseline_Tstats = [
        pd.DataFrame(columns=[by, metric]) for metric in list_baseline_metrics
    ]

    return list_baseline_Tstats, list_baseline_metrics

In [48]:
def tap_metrics(by=["Gene","dataset"]):
    """
    Build the empty result frames and the metric names for the tap analysis.

    input:
        by: what to group by, "Gene" or "dataset"; used as the first column
            label of every result frame

    returns:
        list_tap_Tstats: one empty dataframe per metric, with columns
            [by, <display label>], to store t-statistics
        list_tap_metrics: the short metric names, in the same order
    """
    # (short metric name, display label used as the result column name)
    metric_labels = [
        ("recovery_dura", "Recovery Duration"),
        ("recovery_prob", "Recovery Probability"),
        ("recovery_speed", "Recovery Speed"),
        ("memory_retention_dura", "Memory Retention Duration"),
        ("memory_retention_prob", "Memory Retention Probability"),
        ("memory_retention_speed", "Memory Retention Speed"),
        ("init_dura", "Initial Duration"),
        ("init_prob", "Initial Probability"),
        ("init_speed", "Initial Speed"),
        ("final_dura", "Final Duration"),
        ("final_prob", "Final Probability"),
        ("final_speed", "Final Speed"),
        ("habit_dura", "Habituation of Duration"),
        ("habit_prob", "Habituation of Probability"),
        ("habit_speed", "Habituation of Speed"),
    ]

    list_tap_Tstats = [pd.DataFrame(columns=[by, label]) for _, label in metric_labels]
    list_tap_metrics = [metric for metric, _ in metric_labels]

    return list_tap_Tstats, list_tap_metrics

In [49]:
def psa_metrics(by=["Gene", "dataset"]):
    """
    Build the empty result frames and the metric names for the PSA summary analysis.

    input:
        by: what to group by, "Gene" or "dataset"; used as the first column
            label of every result frame

    returns:
        list_psa_Tstats: one empty dataframe per metric, with columns
            [by, <metric name>], to store t-statistics
        list_psa_metrics: the metric names, in the same order
    """
    measures = [
        "Instantaneous Speed",
        "Bias",
        "Angular Speed",
        "Aspect Ratio",
        "Kink",
        "Curve",
        "Crab",
    ]
    phases = [
        "Initial",
        "Recovery",
        "Peak",
        "Initial_to_peak",
        "Peak_to_recovery",
        "Average",
    ]

    # Names are grouped by measure, with the six phases cycling inside each group.
    list_psa_metrics = [
        f"PSA {phase} {measure}" for measure in measures for phase in phases
    ]

    list_psa_Tstats = [
        pd.DataFrame(columns=[by, metric]) for metric in list_psa_metrics
    ]

    return list_psa_Tstats, list_psa_metrics


In [50]:
def TTest(Type, DF_ref, output, by=["Gene", "dataset"]):
    """
    Welch two-sample t-test of every group in DF_ref against the N2 control.

    For each unique value of DF_ref[by], one row [group, t-statistic] is
    appended to `output`. Nothing is returned.

    input:
        - Type: name of the column in DF_ref that holds the values to test
        - DF_ref: reference dataframe; needs the columns Type and by, plus
          'Gene' when by == "Gene", or 'dataset' and 'Allele' otherwise
        - output: output df to store results in (modified in place)
        - by: what to group by "Gene" or "dataset"
    """
    groups = DF_ref[by].unique()

    # Only the comparison that is reported is computed, and the control sample
    # is selected once because it does not change between groups.
    if by == "Gene":
        group_labels = DF_ref.Gene
        control_values = DF_ref[DF_ref.Gene == "N2"][Type]
    else:
        group_labels = DF_ref.dataset
        # Allele-level controls are the rows whose Allele is XJ1 or N2.
        control_values = DF_ref[DF_ref.Allele.isin(["XJ1", "N2"])][Type]

    for a in groups:
        # equal_var=False selects Welch's test; [0] is the t-statistic.
        Tstat = ttest_ind(DF_ref[group_labels == a][Type], control_values, equal_var=False)[0]
        output.loc[len(output)] = [a, Tstat]

def do_TTest(by=["Gene", "dataset"], baseline=["true", "false", "psa"]):
    """
    Run TTest for every non-control Gene/dataset and every metric, then
    average the collected t-statistics per group.

    input:
        - by: what to group by "Gene" or "dataset"
        - baseline: which data to use: "true" reads baseline_output, "false"
          reads tap_data, any other value reads psa_data

    returns: dataframe indexed by `by`, with one column of mean t-statistics
             per metric
    """
    # Pick the empty result frames, the metric columns and the source data.
    if baseline == "true":
        list_Tstats, list_metrics = baseline_metrics(by)
        data = baseline_output
    elif baseline == "false":
        list_Tstats, list_metrics = tap_metrics(by)
        data = tap_data
    else:
        list_Tstats, list_metrics = psa_metrics(by)
        data = psa_data

    strains = data[by].unique()

    # Allele-level rows of the neuron screen use two control labels; every
    # other case uses plain 'N2'.
    if Screen == "Neuron_Genes_Screen" and by != "Gene":
        control_labels = ["N2_XJ1", "N2_N2"]
    else:
        control_labels = ["N2"]

    for x in strains:
        if x in control_labels:
            continue

        # Restrict to the dates on which x was run, then keep x and the controls.
        dates_with_x = data[data[by] == x]['Date'].unique()
        same_dates = data[data['Date'].isin(dates_with_x)]
        gene_data_final = same_dates[same_dates[by].isin(control_labels + [x])]

        for metric, result_frame in zip(list_metrics, list_Tstats):
            TTest(metric, gene_data_final, result_frame, by)  # appends rows to result_frame

    # A group can have several rows per metric, so average them per group.
    averaged = [frame.groupby([by], as_index=False).mean() for frame in list_Tstats]

    # The first metric supplies the group column; later metrics add only their
    # value column.
    PD_Tstats = averaged[0]
    for metric_means in averaged[1:]:
        PD_Tstats = PD_Tstats.join(metric_means.iloc[:, 1])

    return PD_Tstats.set_index(by)
            

### T-stat on Baseline data:

### 3.3.1 Allele-level T-stat analysis of baseline data

In [51]:
# NOTE: this installs a global "ignore" filter; it stays active for later
# cells until the warning filters are changed again.
warnings.filterwarnings('ignore')

# Allele-level ("dataset") t-statistics for the baseline metrics.
PD_baseline_Tstats_allele = do_TTest("dataset", baseline="true") # T-statistics DataFrame indexed by dataset

# PD_baseline_Tstats_allele_sorted=PD_baseline_Tstats_allele.sort_index()

PD_baseline_Tstats_allele.head()

Unnamed: 0_level_0,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AMshABLATE_nsIs109,-170.402072,-185.50277,-129.051227,8.436649,-72.062483,-19.016885,-72.427009,67.186598,60.225199,-59.206564,-76.040905,-174.635472
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ced-10_n3246,-178.516631,-121.15098,-101.994376,-96.420955,-590.22632,-328.825337,86.467808,324.132657,276.5116,81.799335,7.8202,34.234078
ced-5_n2002,-124.360515,-132.839687,-87.457709,-162.535954,-311.278255,-347.893201,6.824703,-59.527752,-97.326696,-156.138037,-58.958304,-33.193366
delm-1_ok1226,-3.381671,-7.564952,-7.161008,-335.410406,-670.970972,-489.588565,92.161494,58.931913,9.494694,-2.329122,56.520463,106.468057


### 3.3.2 Gene-level T-stat analysis of baseline data

In [52]:
# NOTE: this installs a global "ignore" filter; it stays active for later
# cells until the warning filters are changed again.
warnings.filterwarnings('ignore')

# Gene-level t-statistics for the baseline metrics.
PD_baseline_Tstats=do_TTest("Gene", baseline="true") # T-statistics DataFrame indexed by Gene

# PD_baseline_Tstats_sorted=PD_baseline_Tstats.sort_index()

PD_baseline_Tstats.head()

Unnamed: 0_level_0,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AMshABLATE,-170.402072,-185.50277,-129.051227,8.436649,-72.062483,-19.016885,-72.427009,67.186598,60.225199,-59.206564,-76.040905,-174.635472
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ced-10,-178.516631,-121.15098,-101.994376,-96.420955,-590.22632,-328.825337,86.467808,324.132657,276.5116,81.799335,7.8202,34.234078
ced-5,-124.360515,-132.839687,-87.457709,-162.535954,-311.278255,-347.893201,6.824703,-59.527752,-97.326696,-156.138037,-58.958304,-33.193366
delm-1,-3.381671,-7.564952,-7.161008,-335.410406,-670.970972,-489.588565,92.161494,58.931913,9.494694,-2.329122,56.520463,106.468057


### T-stat analysis for tap-response data:

### 3.3.3 Allele-level T-stat analysis of tap response data

In [53]:
# NOTE: this installs a global "ignore" filter; it stays active for later
# cells until the warning filters are changed again.
warnings.filterwarnings('ignore')

# Allele-level ("dataset") t-statistics for the tap-response metrics
# (baseline="false" makes do_TTest read tap_data).
PD_habituation_Tstats_allele = do_TTest("dataset", baseline="false") # T-statistics DataFrame indexed by dataset

# PD_habituation_Tstats_allele_sorted=PD_habituation_Tstats_allele.sort_index()

PD_habituation_Tstats_allele.head()

Unnamed: 0_level_0,Recovery Duration,Recovery Probability,Recovery Speed,Memory Retention Duration,Memory Retention Probability,Memory Retention Speed,Initial Duration,Initial Probability,Initial Speed,Final Duration,Final Probability,Final Speed,Habituation of Duration,Habituation of Probability,Habituation of Speed
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AMshABLATE_nsIs109,-1.961762,1.918844,-5.224931,-5.484683,-3.125683,-3.300698,-1.895033,-0.967447,-2.824795,1.802522,5.264003,-4.91969,-3.987772,-4.36802,-0.235936
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ced-10_n3246,-0.010591,1.286679,-0.208076,-1.905773,-3.286405,-3.637684,-1.262823,-3.630366,-7.253569,-5.115169,1.645364,-6.711686,0.029125,-4.365596,-2.704125
ced-5_n2002,0.908764,-2.60661,-1.672565,-1.643956,-11.953176,-1.227305,-2.330504,-2.829142,-8.294158,0.691405,0.44202,-13.289988,-2.931369,-1.497765,-0.3966
delm-1_ok1226,1.479204,0.679554,-0.081651,0.352711,0.613993,-0.746649,-0.896926,-1.425577,-2.631308,-0.986245,-2.943531,-1.50898,-0.414799,0.264624,-0.414722


### 3.3.4 Gene-level T-stat analysis of tap response data

In [54]:
# NOTE: this installs a global "ignore" filter; it stays active for later
# cells until the warning filters are changed again.
warnings.filterwarnings('ignore')

# Gene-level t-statistics for the tap-response metrics
# (baseline="false" makes do_TTest read tap_data).
PD_habituation_Tstats = do_TTest("Gene", baseline="false") # T-statistics DataFrame indexed by Gene

# Index-sorted copy; the preview below shows the frame as returned by
# do_TTest, not this copy.
PD_habituation_Tstats_sorted=PD_habituation_Tstats.sort_index()

PD_habituation_Tstats.head()

Unnamed: 0_level_0,Recovery Duration,Recovery Probability,Recovery Speed,Memory Retention Duration,Memory Retention Probability,Memory Retention Speed,Initial Duration,Initial Probability,Initial Speed,Final Duration,Final Probability,Final Speed,Habituation of Duration,Habituation of Probability,Habituation of Speed
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AMshABLATE,-1.961762,1.918844,-5.224931,-5.484683,-3.125683,-3.300698,-1.895033,-0.967447,-2.824795,1.802522,5.264003,-4.91969,-3.987772,-4.36802,-0.235936
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ced-10,-0.010591,1.286679,-0.208076,-1.905773,-3.286405,-3.637684,-1.262823,-3.630366,-7.253569,-5.115169,1.645364,-6.711686,0.029125,-4.365596,-2.704125
ced-5,0.908764,-2.60661,-1.672565,-1.643956,-11.953176,-1.227305,-2.330504,-2.829142,-8.294158,0.691405,0.44202,-13.289988,-2.931369,-1.497765,-0.3966
delm-1,1.479204,0.679554,-0.081651,0.352711,0.613993,-0.746649,-0.896926,-1.425577,-2.631308,-0.986245,-2.943531,-1.50898,-0.414799,0.264624,-0.414722


### T-stat analysis for PSA data:

### 3.3.5 Allele-level T-stat analysis of PSA data

In [55]:
# NOTE: this installs a global "ignore" filter; it stays active for later
# cells until the warning filters are changed again.
warnings.filterwarnings('ignore')

# Allele-level ("dataset") t-statistics for the PSA metrics
# (baseline="psa" makes do_TTest read psa_data).
psa_tstats_allele = do_TTest("dataset", baseline="psa") # T-statistics DataFrame indexed by dataset

psa_tstats_allele.head()

Unnamed: 0_level_0,PSA Initial Instantaneous Speed,PSA Recovery Instantaneous Speed,PSA Peak Instantaneous Speed,PSA Initial_to_peak Instantaneous Speed,PSA Peak_to_recovery Instantaneous Speed,PSA Average Instantaneous Speed,PSA Initial Bias,PSA Recovery Bias,PSA Peak Bias,PSA Initial_to_peak Bias,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AMshABLATE_nsIs109,-2.879719,-6.511223,-2.413182,-3.684067,-7.751243,-8.056602,-3.321509,-5.647825,-3.109608,-4.551242,...,0.336616,0.976302,-0.135794,3.246578,-1.626474,-4.445465,1.866462,0.053034,-2.015918,-1.688506
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ced-10_n3246,-8.285814,-0.764735,-4.36643,-18.736589,-2.782467,-15.382731,-4.602111,-3.798713,-4.591808,-9.47544,...,1.748417,-3.354388,10.977187,9.935081,-5.665189,-0.024395,-0.997846,-4.56922,-0.494284,-0.423095
ced-5_n2002,-8.461492,-7.653565,-4.288245,-17.067475,-13.736343,-17.399081,-9.53171,-4.706598,-4.908107,-8.815168,...,-3.033007,-2.828781,-4.935898,-3.433947,-5.649231,-4.374404,-2.451194,-6.357402,-5.685529,-6.28097
delm-1_ok1226,-1.618097,-2.386163,-8.571276,-12.119191,-8.467582,-10.31466,-3.06293,-1.498044,-1.04032,-1.741669,...,0.291632,-1.460014,8.948989,8.352395,0.740383,0.866805,-0.41653,-0.01283,2.686459,2.197676


### 3.3.6 Gene-level T-stat analysis of PSA data

In [56]:
# NOTE: this installs a global "ignore" filter; it stays active for later
# cells until the warning filters are changed again.
warnings.filterwarnings('ignore')

# Gene-level t-statistics for the PSA metrics
# (baseline="psa" makes do_TTest read psa_data).
psa_tstats = do_TTest("Gene", baseline="psa") # T-statistics DataFrame indexed by Gene

psa_tstats.head()

Unnamed: 0_level_0,PSA Initial Instantaneous Speed,PSA Recovery Instantaneous Speed,PSA Peak Instantaneous Speed,PSA Initial_to_peak Instantaneous Speed,PSA Peak_to_recovery Instantaneous Speed,PSA Average Instantaneous Speed,PSA Initial Bias,PSA Recovery Bias,PSA Peak Bias,PSA Initial_to_peak Bias,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AMshABLATE,-2.879719,-6.511223,-2.413182,-3.684067,-7.751243,-8.056602,-3.321509,-5.647825,-3.109608,-4.551242,...,0.336616,0.976302,-0.135794,3.246578,-1.626474,-4.445465,1.866462,0.053034,-2.015918,-1.688506
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ced-10,-8.285814,-0.764735,-4.36643,-18.736589,-2.782467,-15.382731,-4.602111,-3.798713,-4.591808,-9.47544,...,1.748417,-3.354388,10.977187,9.935081,-5.665189,-0.024395,-0.997846,-4.56922,-0.494284,-0.423095
ced-5,-8.461492,-7.653565,-4.288245,-17.067475,-13.736343,-17.399081,-9.53171,-4.706598,-4.908107,-8.815168,...,-3.033007,-2.828781,-4.935898,-3.433947,-5.649231,-4.374404,-2.451194,-6.357402,-5.685529,-6.28097
delm-1,-1.618097,-2.386163,-8.571276,-12.119191,-8.467582,-10.31466,-3.06293,-1.498044,-1.04032,-1.741669,...,0.291632,-1.460014,8.948989,8.352395,0.740383,0.866805,-0.41653,-0.01283,2.686459,2.197676


# 4. Merging t-stat data into one dataset

In [57]:
def pop_cols(combined):
    """
    Reorder four columns of the combined dataframe in place.

    "Area", "Midline" and "Morphwidth" are moved, in that order, to position 0
    (so they end up as Morphwidth, Midline, Area), and "Angular Speed" is then
    moved to position 5.

    input:
        combined: dataframe with columns to be reordered (modified in place)

    returns:
        None
    """
    # (column label, position it is re-inserted at), applied in this order.
    moves = (
        ("Area", 0),
        ("Midline", 0),
        ("Morphwidth", 0),
        ("Angular Speed", 5),
    )
    for label, position in moves:
        # pop() runs first, so `position` refers to the frame without `label`.
        combined.insert(position, label, combined.pop(label))

def pop_last(combined):
    """
    Move the six recovery / memory-retention columns to position 26, in place.

    Each column is removed and re-inserted at position 26, one after another,
    in the order listed below.

    input:
        combined: dataframe with columns to be reordered (modified in place);
                  it needs at least 27 columns for position 26 to be valid

    returns:
        None
    """
    target_position = 26
    trailing_labels = (
        "Spontaneous Recovery of Response Duration",
        "Spontaneous Recovery of Response Probability",
        "Spontaneous Recovery of Response Speed",
        "Memory Retention of Response Duration",
        "Memory Retention of Response Probability",
        "Memory Retention of Response Speed",
    )
    for label in trailing_labels:
        # pop() runs first, so the position refers to the frame without `label`.
        combined.insert(target_position, label, combined.pop(label))

def rename_columns(df):
    '''
    Replace the short tap-metric column labels with their full display labels.

    input:
        df: dataframe with columns to be renamed; labels that are not listed
            below are left as they are
    returns:
        a renamed copy of the input dataframe (the input is not modified)
    '''
    # (current label, display label). The display labels are reproduced
    # exactly, including the spelling "Respones" that get_combined_MSD also uses.
    label_pairs = (
        ("Habituation of Duration", "Habituation of Response Duration"),
        ("Habituation of Probability", "Habituation of Respones Probability"),
        ("Habituation of Speed", "Habituation of Response Speed"),
        ("Initial Duration", "Initial Response Duration"),
        ("Initial Probability", "Initial Response Probability"),
        ("Initial Speed", "Initial Response Speed"),
        ("Final Duration", "Final Response Duration"),
        ("Final Probability", "Final Response Probability"),
        ("Final Speed", "Final Response Speed"),
        ("Recovery Duration", "Spontaneous Recovery of Response Duration"),
        ("Recovery Probability", "Spontaneous Recovery of Response Probability"),
        ("Recovery Speed", "Spontaneous Recovery of Response Speed"),
        ("Memory Retention Duration", "Memory Retention of Response Duration"),
        ("Memory Retention Probability", "Memory Retention of Response Probability"),
        ("Memory Retention Speed", "Memory Retention of Response Speed"),
    )
    return df.rename(columns=dict(label_pairs))

def merge_Tstats(baseline, habituation, by=["Gene", "dataset"], Screen=None, psa=False):
    """
    Left-merge a baseline t-stat dataframe with a response t-stat dataframe,
    reorder (and, for tap data, rename) the columns, and tag the result with
    the screen name.

    input:
        - baseline: baseline dataframe to merge (left side of the merge)
        - habituation: response dataframe to merge (tap habituation data, or
          PSA data when psa=True)
        - by: key(s) to merge on, "Gene" or "dataset"; the default list is
          never mutated here
        - Screen: value written to the 'Screen' column. When omitted, the
          notebook-level `Screen` variable is read at call time.
        - psa: set True for PSA data to skip the tap-specific renaming and
          the second round of column reordering

    returns:
        the merged dataframe with an added 'Screen' column

    raises:
        ValueError: if Screen is not passed and no notebook-level `Screen`
        variable is set
    """

    # Resolve the screen when the function is CALLED. A `Screen=Screen` default
    # is evaluated once, when the defining cell runs, so picking a different
    # screen afterwards would silently tag the output with the old value.
    if Screen is None:
        Screen = globals().get("Screen")
        if Screen is None:
            raise ValueError(
                "Screen is not set: pass Screen=... or run the screen selection cell first."
            )

    #merge baseline and habituation data
    combined_Tstats = pd.merge(baseline, habituation, on=by, how='left')
    combined_Tstats = combined_Tstats.sort_index() # sort by index

    # ------------ NORMALISATION STEPS TO BE MOVED TO DASHBOARD -------------------
    # # normalise combined dataframe by subtracting mean and div by sd
    # combined_Tstats_normalized = (combined_Tstats-combined_Tstats.mean())/combined_Tstats.std()

    # if by=="dataset" and Screen=="Neuron_Genes_Screen":
    #     combined_Tstats_normalized_2 = combined_Tstats-combined_Tstats[combined_Tstats.index=="N2_XJ1"].squeeze()
    # else :
    #     combined_Tstats_normalized_2 = combined_Tstats-combined_Tstats[combined_Tstats.index=="N2"].squeeze()  

    # pop_cols / pop_last are called for their in-place effect on the frame
    # (column reordering, per the original inline comments).
    pop_cols(combined_Tstats) # reorder columns

    # Skip this step if data = psa
    if not psa:
        #rename columns of combined df; rename_columns returns a new frame
        combined_Tstats = rename_columns(combined_Tstats)
        # combined_Tstats_normalized_2=rename_columns(combined_Tstats_normalized_2)
        pop_cols(combined_Tstats) # reorder columns
        pop_last(combined_Tstats) # reorder columns

    # -------------- PIVOTING STEPS TO BE MOVED TO DASHBOARD ---------------------
    # # Melt the combined dataframe
    # combined_Tstats_melted=combined_Tstats.reset_index()
    # combined_Tstats_melted=pd.melt(combined_Tstats_melted, id_vars=[by],
    #                             var_name='Metric',
    #                             value_name='T_score')
    
    # # Sort the melted dataframe by T_score
    # combined_Tstats_melted_sorted=combined_Tstats_melted.sort_values(by=['T_score'])

    # # Melt the normalized dataframe
    # combined_Tstats_normalized_melted=combined_Tstats.reset_index()
    # combined_Tstats_normalized_melted=pd.melt(combined_Tstats_normalized_melted, id_vars=[by],
    #                                                var_name='Metric',
    #                                                value_name='T_score')

    # add Screen column to df
    combined_Tstats['Screen']=Screen
    # combined_Tstats_normalized_melted['Screen']=Screen

    return combined_Tstats#, combined_Tstats_normalized_melted



## 4.1 Gene-level

- Pass Tap and baseline through merge_Tstats() as df1
- Pass PSA and baseline through merge_Tstats() as df2
- pd.merge df1 and df2 using all columns of baseline

In [58]:
# Baseline + Tap (gene level)
combined_Tstats = merge_Tstats(
    baseline=PD_baseline_Tstats,
    habituation=PD_habituation_Tstats,
    by="Gene",
)

In [59]:
# Baseline + PSA (gene level); psa=True skips the tap-specific column renaming
combined_Tstats_psa = merge_Tstats(
    baseline=PD_baseline_Tstats,
    habituation=psa_tstats,
    by="Gene",
    psa=True,
)

In [60]:
# Baseline + Tap + PSA
# Join the two gene-level frames on the columns they share: the identifier,
# the screen, and every baseline t-stat column. The key list is built with
# `+` because list.append() returns None, which made pd.merge fall back to
# "all common columns" instead of using the intended keys.
# 'Gene' comes first so row order stays keyed on the gene name;
# dict.fromkeys drops any repeated label while keeping order.
gene_merge_keys = list(dict.fromkeys(['Gene', 'Screen'] + PD_baseline_Tstats.columns.to_list()))

final_tstat = pd.merge(
    combined_Tstats.reset_index(),
    combined_Tstats_psa.reset_index(),
    on=gene_merge_keys,
    how='inner',
)

final_tstat.head()

Unnamed: 0,Gene,Morphwidth,Midline,Area,Instantaneous Speed,Interval Speed,Angular Speed,Bias,Aspect Ratio,Kink,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
0,AMshABLATE,8.436649,-72.062483,-19.016885,-170.402072,-185.50277,-72.427009,-129.051227,67.186598,60.225199,...,0.336616,0.976302,-0.135794,3.246578,-1.626474,-4.445465,1.866462,0.053034,-2.015918,-1.688506
1,N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ced-10,-96.420955,-590.22632,-328.825337,-178.516631,-121.15098,86.467808,-101.994376,324.132657,276.5116,...,1.748417,-3.354388,10.977187,9.935081,-5.665189,-0.024395,-0.997846,-4.56922,-0.494284,-0.423095
3,ced-5,-162.535954,-311.278255,-347.893201,-124.360515,-132.839687,6.824703,-87.457709,-59.527752,-97.326696,...,-3.033007,-2.828781,-4.935898,-3.433947,-5.649231,-4.374404,-2.451194,-6.357402,-5.685529,-6.28097
4,delm-1,-335.410406,-670.970972,-489.588565,-3.381671,-7.564952,92.161494,-7.161008,58.931913,9.494694,...,0.291632,-1.460014,8.948989,8.352395,0.740383,0.866805,-0.41653,-0.01283,2.686459,2.197676


In [61]:
# # Baseline + Tap + PSA melted
# final_tstat_melted = pd.concat([combined_Tstats_normalized_melted, combined_Tstats_psa_melted]).drop_duplicates()

# final_tstat_melted.head()

## 4.2 Allele level 


- Pass Tap and baseline through merge_Tstats() as df3
- Pass PSA and baseline through merge_Tstats() as df4
- pd.merge df3 and df4 using all columns of baseline

In [62]:
# Baseline + Tap (allele level)
combined_Tstats_allele = merge_Tstats(
    baseline=PD_baseline_Tstats_allele,
    habituation=PD_habituation_Tstats_allele,
    by="dataset",
)

In [63]:
# Baseline + PSA (allele level); psa=True skips the tap-specific column renaming
combined_Tstats_psa_allele = merge_Tstats(
    baseline=PD_baseline_Tstats_allele,
    habituation=psa_tstats_allele,
    by="dataset",
    psa=True,
)

In [64]:
# Baseline + Tap + PSA
# Join the two allele-level frames on the columns they share: the identifier,
# the screen, and every baseline t-stat column. The key list is built with
# `+` because list.append() returns None, which made pd.merge fall back to
# "all common columns" instead of using the intended keys.
# 'dataset' comes first so the outer join's key sort stays ordered by dataset;
# dict.fromkeys drops any repeated label while keeping order.
allele_merge_keys = list(
    dict.fromkeys(['dataset', 'Screen'] + PD_baseline_Tstats_allele.columns.to_list())
)

# NOTE(review): this uses how='outer' while the gene-level merge uses
# how='inner' - confirm the difference is intentional.
final_tstat_allele = pd.merge(
    combined_Tstats_allele.reset_index(),
    combined_Tstats_psa_allele.reset_index(),
    on=allele_merge_keys,
    how='outer',
)

final_tstat_allele.head()

Unnamed: 0,dataset,Morphwidth,Midline,Area,Instantaneous Speed,Interval Speed,Angular Speed,Bias,Aspect Ratio,Kink,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
0,AMshABLATE_nsIs109,8.436649,-72.062483,-19.016885,-170.402072,-185.50277,-72.427009,-129.051227,67.186598,60.225199,...,0.336616,0.976302,-0.135794,3.246578,-1.626474,-4.445465,1.866462,0.053034,-2.015918,-1.688506
1,N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ced-10_n3246,-96.420955,-590.22632,-328.825337,-178.516631,-121.15098,86.467808,-101.994376,324.132657,276.5116,...,1.748417,-3.354388,10.977187,9.935081,-5.665189,-0.024395,-0.997846,-4.56922,-0.494284,-0.423095
3,ced-5_n2002,-162.535954,-311.278255,-347.893201,-124.360515,-132.839687,6.824703,-87.457709,-59.527752,-97.326696,...,-3.033007,-2.828781,-4.935898,-3.433947,-5.649231,-4.374404,-2.451194,-6.357402,-5.685529,-6.28097
4,delm-1_ok1226,-335.410406,-670.970972,-489.588565,-3.381671,-7.564952,92.161494,-7.161008,58.931913,9.494694,...,0.291632,-1.460014,8.948989,8.352395,0.740383,0.866805,-0.41653,-0.01283,2.686459,2.197676


In [65]:
# # Baseline + Tap + PSA melted
# final_tstat_melted_allele = pd.concat([combined_Tstats_normalized_melted_allele, combined_Tstats_psa_melted_allele]).drop_duplicates()

# final_tstat_melted_allele.head()

# 5. Save data to database (PostgreSQL)

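#### A janky way to add data and update the SQL tables (legacy sqlite3 workflow; the PostgreSQL cell below appends rows and skips primary-key duplicates instead)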

1. Read table to pd.DataFrame
2. Add new data to pd.DataFrame
3. Replace old table with newly updated pd.DataFrame

# Primary Keys For Each SQL Table:

####  -- Gene_Allele_WormBaseID:
WBGene, WBAllele
#### -- allele_MSD:
dataset, Screen
#### -- gene_MSD:
Gene, Screen
#### -- allele_profile_data:
dataset, Metric, Screen
#### -- gene_profile_data:
Gene, Metric, Screen
#### -- tap_baseline_data:
Time, Plate_id, Date, Screen, dataset
#### -- tap_response_data:
plate, Date, Plate_id, Screen, taps, dataset, Gene, Allele
#### -- tstat_allele_data:
dataset, Screen
#### -- tstat_gene_data:
Gene, Screen
#### -- psa_summarised_data:
Plate_id, Date, Screen, dataset, Gene, Allele

In [66]:
# print(tap_output.head(5))
# print(baseline_output.head(5))

# Tag every raw row with the selected screen.
# Item assignment is used on purpose: attribute assignment (`df.Screen = ...`)
# only overwrites a column that already exists and cannot create a new one,
# so a frame without a 'Screen' column would be written untagged.
tap_output["Screen"] = Screen
baseline_output["Screen"] = Screen

# print(tap_output.head(5))
# print(baseline_output.head(5))

In [56]:

### This code will connect to PostgreSQL database and write non-duplicate data into the database tables.

# Loads database config values from database.ini file and validates that user and password are set.
# (URL is imported here so the cell stays runnable on its own; it can be moved to the imports cell.)
from sqlalchemy.engine import URL

config = load_config()
# Raise instead of sys.exit(): inside a notebook an exception stops the cell
# with a clear message. Empty and None values are both treated as "not set".
if not config['user'] or not config['password']:
    raise RuntimeError("Please set your user and password in the database.ini file.")

# Creates a connection pool to PostgreSQL database using SQLAlchemy.
# The URL is assembled from its components so that special characters in the
# password (e.g. '@', ':', '/', '%') are escaped instead of corrupting the URL.
engine = create_engine(
    URL.create(
        "postgresql+psycopg",
        username=config['user'],
        password=config['password'],
        host=config['host'],
        port=config['port'] or None,
        database=config['database'],
    )
)

def postgres_skip_on_duplicate(pd_table, conn, keys, data_iter):
    """
    Insert callback for DataFrame.to_sql(method=...): writes the rows with an
    INSERT ... ON CONFLICT DO NOTHING statement so conflicting rows are skipped.

    input:
        pd_table: pandas table wrapper; its `.table` attribute is the insert target
        conn: connection the statement is executed on
        keys: column names, in the same order as the values of each row
        data_iter: iterable of row value tuples
    """
    rows = []
    for values in data_iter:
        # pair each value with its column name
        rows.append(dict(zip(keys, values)))

    statement = insert(pd_table.table).on_conflict_do_nothing()
    conn.execute(statement, rows)

# --------- Write the dataframes to PostgreSQL tables -----------

# (progress message, dataframe, destination table), in load order.
# Building the list first means a missing dataframe raises before anything
# has been written to the database.
# NOTE(review): reset_index() on the merged t-stat frames adds an 'index'
# column that is written along with the data - confirm the tables expect it.
table_writes = [
    # Complete tap response data
    ("working on tap_output:", tap_output, 'tap_response_data'),
    # Complete baseline data
    ("working on tap_baseline_data:", baseline_output, 'tap_baseline_data'),
    # Baseline + Tap + PSA combined tstat data by Gene
    ("working on tstat_gene_data", final_tstat.reset_index(), 'tstat_gene_data'),
    # Baseline + Tap + PSA combined tstat data by Allele
    ("working on tstat_allele_data", final_tstat_allele.reset_index(), 'tstat_allele_data'),
    # MSD Baseline + Tap + PSA by Gene
    ("working on gene_MSD", combined_MSD, 'gene_MSD'),
    # MSD Baseline + Tap + PSA by Allele
    ("working on allele_MSD", allele_combined_MSD, 'allele_MSD'),
    # Summarised PSA data (speed, kink, curve, etc.)
    ("working on psa_data:", psa_data, 'psa_summarised_data'),
]

# One transaction for the whole load: if any table fails, every write from
# this cell is rolled back instead of leaving the screen partially loaded.
with engine.begin() as connection:
    for message, frame, table_name in table_writes:
        print(message)
        frame.to_sql(
            table_name,
            connection,
            if_exists='append',
            index=False,
            method=postgres_skip_on_duplicate,
        )

# # Melted Baseline + Tap + PSA combined tstat data by Gene
# print("working on gene_profile_data")
# final_tstat_melted.to_sql('gene_profile_data', engine, if_exists='append', index=False, method=postgres_skip_on_duplicate)

# # Melted Baseline + Tap + PSA combined tstat data by Allele
# print("working on allele_profile_data")
# final_tstat_melted_allele.to_sql('allele_profile_data', engine, if_exists='append', index=False, method=postgres_skip_on_duplicate)

working on tap_output:
working on tap_baseline_data:
working on tstat_gene_data
working on tstat_allele_data
working on gene_MSD
working on allele_MSD
working on psa_data:


### Use the below cell to just replace/update one table:

In [70]:
# Loads database config values from database.ini file and validates that user and password are set.
# (URL is imported here so the cell stays runnable on its own; it can be moved to the imports cell.)
from sqlalchemy.engine import URL

config = load_config()
# Raise instead of sys.exit(): inside a notebook an exception stops the cell
# with a clear message. Empty and None values are both treated as "not set".
if not config['user'] or not config['password']:
    raise RuntimeError("Please set your user and password in the database.ini file.")

# Creates a connection pool to PostgreSQL database using SQLAlchemy.
# The URL is assembled from its components so that special characters in the
# password (e.g. '@', ':', '/', '%') are escaped instead of corrupting the URL.
engine = create_engine(
    URL.create(
        "postgresql+psycopg",
        username=config['user'],
        password=config['password'],
        host=config['host'],
        port=config['port'] or None,
        database=config['database'],
    )
)

# Same helper as in the full-load cell; repeated so this cell can be run on its own.
def postgres_skip_on_duplicate(pd_table, conn, keys, data_iter):
    """
    Insert callback for DataFrame.to_sql(method=...): writes the rows with an
    INSERT ... ON CONFLICT DO NOTHING statement so conflicting rows are skipped.

    input:
        pd_table: pandas table wrapper; its `.table` attribute is the insert target
        conn: connection the statement is executed on
        keys: column names, in the same order as the values of each row
        data_iter: iterable of row value tuples
    """
    rows = []
    for values in data_iter:
        # pair each value with its column name
        rows.append(dict(zip(keys, values)))

    statement = insert(pd_table.table).on_conflict_do_nothing()
    conn.execute(statement, rows)


# Complete tap response data
# NOTE(review): method=None issues plain INSERTs, so the skip-on-duplicate
# helper defined in this cell is not used here. The last recorded run of this
# cell failed because tap_psa_output has a "Time" column that the
# tap_response_data table does not have - confirm which frame/columns are
# meant to be written.
print("working on tap_output:")
tap_psa_output.to_sql(
    name='tap_response_data',
    con=engine,
    if_exists='append',
    index=False,
    method=None,
)

working on tap_output:


ProgrammingError: (psycopg.errors.UndefinedColumn) column "Time" of relation "tap_response_data" does not exist
LINE 1: ...e_id", "Screen", taps, dataset, "Gene", "Allele", "Time", n,...
                                                             ^
[SQL: INSERT INTO tap_response_data (time, dura, dist, prob, speed, plate, "Date", "Plate_id", "Screen", taps, dataset, "Gene", "Allele", "Time", n, "Number", "Instantaneous Speed", "Interval Speed", "Bias", "Tap", "Morphwidth", "Midline", "Area", "Angular Speed", "Aspect Ratio", "Kink", "Curve", "Crab", "Pathlength") VALUES (%(time)s, %(dura)s, %(dist)s, %(prob)s, %(speed)s, %(plate)s, %(Date)s::BIGINT, %(Plate_id)s::VARCHAR, %(Screen)s::VARCHAR, %(taps)s, %(dataset)s::VARCHAR, %(Gene)s::VARCHAR, %(Allele)s::VARCHAR, %(Time)s, %(n)s, %(Number)s, %(Instantaneous_Speed)s, %(Interval_Speed)s, %(Bias)s, %(Tap)s, %(Morphwidth)s, %(Midline)s, %(Area)s, %(Angular_Speed)s, %(Aspect_Ratio)s, %(Kink)s, %(Curve)s, %(Crab)s, %(Pathlength)s)]
[parameters: [{'time': 599.983, 'dura': 2.22, 'dist': 0.562, 'prob': 1.0, 'speed': 0.2531531531531532, 'plate': 1.0, 'Date': 20240724, 'Plate_id': '20240724_023625_A0724', 'Screen': 'Glia_Genes_Screen', 'taps': 1.0, 'dataset': 'AMshABLATE_nsIs109', 'Gene': 'AMshABLATE', 'Allele': 'nsIs109', 'Time': 607.04, 'n': 68.13793103448276, 'Number': 11.89655172413793, 'Instantaneous_Speed': 0.08404828, 'Interval_Speed': 0.09432586, 'Bias': 0.21398276, 'Tap': 0.0, 'Morphwidth': 0.12407587, 'Midline': 1.1186051, 'Area': 0.15321799, 'Angular_Speed': 12.52931, 'Aspect_Ratio': 0.46332762, 'Kink': 83.25862, 'Curve': 41.18793, 'Crab': 0.017031034, 'Pathlength': 0.09562069}, {'time': 609.979, 'dura': 1.45, 'dist': 0.371, 'prob': 0.5454545454545454, 'speed': 0.2558620689655172, 'plate': 1.0, 'Date': 20240724, 'Plate_id': '20240724_023625_A0724', 'Screen': 'Glia_Genes_Screen', 'taps': 2.0, 'dataset': 'AMshABLATE_nsIs109', 'Gene': 'AMshABLATE', 'Allele': 'nsIs109', 'Time': 617.014, 'n': 69.36206896551724, 'Number': 25.155172413793103, 'Instantaneous_Speed': 0.16272758, 'Interval_Speed': 0.13711207, 'Bias': 0.34686208, 'Tap': 0.0, 'Morphwidth': 0.112041384, 'Midline': 1.1285137, 'Area': 0.14582771, 'Angular_Speed': 14.563793, 'Aspect_Ratio': 0.39818963, 'Kink': 65.17759, 'Curve': 36.9069, 'Crab': 0.024748277, 'Pathlength': 0.66763794}, {'time': 619.996, 'dura': 2.11, 'dist': 0.669, 'prob': 0.52, 'speed': 0.3170616113744076, 'plate': 1.0, 'Date': 20240724, 'Plate_id': '20240724_023625_A0724', 'Screen': 'Glia_Genes_Screen', 'taps': 3.0, 'dataset': 'AMshABLATE_nsIs109', 'Gene': 'AMshABLATE', 'Allele': 'nsIs109', 'Time': 627.009, 'n': 70.94444444444444, 'Number': 40.870370370370374, 'Instantaneous_Speed': 0.19981481, 'Interval_Speed': 0.107235186, 'Bias': 0.7932778, 'Tap': 0.0, 'Morphwidth': 0.10114629, 'Midline': 1.1213518, 'Area': 0.13391848, 'Angular_Speed': 16.11111, 'Aspect_Ratio': 0.35142592, 'Kink': 56.307407, 'Curve': 34.988888, 'Crab': 0.026074074, 'Pathlength': 1.1174259}, 
{'time': 629.971, 'dura': 1.59, 'dist': 0.422, 'prob': 0.8095238095238095, 'speed': 0.2654088050314465, 'plate': 1.0, 'Date': 20240724, 'Plate_id': '20240724_023625_A0724', 'Screen': 'Glia_Genes_Screen', 'taps': 4.0, 'dataset': 'AMshABLATE_nsIs109', 'Gene': 'AMshABLATE', 'Allele': 'nsIs109', 'Time': 637.026, 'n': 71.27586206896552, 'Number': 48.0, 'Instantaneous_Speed': 0.2077207, 'Interval_Speed': 0.100125864, 'Bias': 0.8733966, 'Tap': 0.0, 'Morphwidth': 0.094000004, 'Midline': 1.1287068, 'Area': 0.12957662, 'Angular_Speed': 14.318966, 'Aspect_Ratio': 0.32424137, 'Kink': 51.543102, 'Curve': 34.353447, 'Crab': 0.023929311, 'Pathlength': 1.8166034}, {'time': 639.968, 'dura': 1.5, 'dist': 0.378, 'prob': 0.8958333333333334, 'speed': 0.252, 'plate': 1.0, 'Date': 20240724, 'Plate_id': '20240724_023625_A0724', 'Screen': 'Glia_Genes_Screen', 'taps': 5.0, 'dataset': 'AMshABLATE_nsIs109', 'Gene': 'AMshABLATE', 'Allele': 'nsIs109', 'Time': 647.003, 'n': 71.21666666666667, 'Number': 54.15, 'Instantaneous_Speed': 0.213855, 'Interval_Speed': 0.09183667, 'Bias': 0.88996667, 'Tap': 0.0, 'Morphwidth': 0.09228, 'Midline': 1.1386, 'Area': 0.12954302, 'Angular_Speed': 11.34, 'Aspect_Ratio': 0.28436667, 'Kink': 43.355, 'Curve': 31.72, 'Crab': 0.021453332, 'Pathlength': 2.6799166}, {'time': 649.961, 'dura': 1.14, 'dist': 0.264, 'prob': 0.7777777777777778, 'speed': 0.231578947368421, 'plate': 1.0, 'Date': 20240724, 'Plate_id': '20240724_023625_A0724', 'Screen': 'Glia_Genes_Screen', 'taps': 6.0, 'dataset': 'AMshABLATE_nsIs109', 'Gene': 'AMshABLATE', 'Allele': 'nsIs109', 'Time': 657.006, 'n': 69.56896551724138, 'Number': 55.98275862068966, 'Instantaneous_Speed': 0.2045, 'Interval_Speed': 0.09959138, 'Bias': 0.91255176, 'Tap': 0.0, 'Morphwidth': 0.09471552, 'Midline': 1.1400639, 'Area': 0.13040562, 'Angular_Speed': 9.693104, 'Aspect_Ratio': 0.26172414, 'Kink': 38.524136, 'Curve': 30.543104, 'Crab': 0.019868966, 'Pathlength': 3.8448102}, {'time': 659.938, 'dura': 1.43, 'dist': 0.322, 
'prob': 0.6666666666666666, 'speed': 0.2251748251748251, 'plate': 1.0, 'Date': 20240724, 'Plate_id': '20240724_023625_A0724', 'Screen': 'Glia_Genes_Screen', 'taps': 7.0, 'dataset': 'AMshABLATE_nsIs109', 'Gene': 'AMshABLATE', 'Allele': 'nsIs109', 'Time': 667.015, 'n': 73.65, 'Number': 53.5, 'Instantaneous_Speed': 0.20054667, 'Interval_Speed': 0.09267333, 'Bias': 0.9363833, 'Tap': 0.0, 'Morphwidth': 0.09511001, 'Midline': 1.1319416, 'Area': 0.12948398, 'Angular_Speed': 9.343333, 'Aspect_Ratio': 0.27366668, 'Kink': 43.4, 'Curve': 31.721668, 'Crab': 0.019683333, 'Pathlength': 4.91475}, {'time': 669.955, 'dura': 1.28, 'dist': 0.287, 'prob': 0.7454545454545455, 'speed': 0.2242187499999999, 'plate': 1.0, 'Date': 20240724, 'Plate_id': '20240724_023625_A0724', 'Screen': 'Glia_Genes_Screen', 'taps': 8.0, 'dataset': 'AMshABLATE_nsIs109', 'Gene': 'AMshABLATE', 'Allele': 'nsIs109', 'Time': 677.01, 'n': 70.51785714285714, 'Number': 45.839285714285715, 'Instantaneous_Speed': 0.17549464, 'Interval_Speed': 0.09807321, 'Bias': 0.7779107, 'Tap': 0.0, 'Morphwidth': 0.09253214, 'Midline': 1.1299518, 'Area': 0.12797293, 'Angular_Speed': 8.1839285, 'Aspect_Ratio': 0.25605357, 'Kink': 42.976784, 'Curve': 30.696428, 'Crab': 0.016401786, 'Pathlength': 5.529607}  ... displaying 10 of 9858 total bound parameter sets ...  
{'time': 889.93, 'dura': 0.57, 'dist': 0.095, 'prob': 0.3137254901960784, 'speed': 0.1666666666666666, 'plate': 75.0, 'Date': 20250319, 'Plate_id': '20250319_174606_C0319', 'Screen': 'Glia_Genes_Screen', 'taps': 30.0, 'dataset': 'N2', 'Gene': 'N2', 'Allele': 'N2', 'Time': 897.079, 'n': 73.94117647058823, 'Number': 48.76470588235294, 'Instantaneous_Speed': 0.14076765, 'Interval_Speed': 0.08852353, 'Bias': 0.6981765, 'Tap': 0.0, 'Morphwidth': 0.12265588, 'Midline': 1.11555, 'Area': 0.15100908, 'Angular_Speed': 4.8088236, 'Aspect_Ratio': 0.2377353, 'Kink': 36.814705, 'Curve': 28.602942, 'Crab': 0.011588235, 'Pathlength': 7.5375295}, {'time': 1189.928, 'dura': 2.9, 'dist': 0.667, 'prob': 0.5, 'speed': 0.23, 'plate': 75.0, 'Date': 20250319, 'Plate_id': '20250319_174606_C0319', 'Screen': 'Glia_Genes_Screen', 'taps': 31.0, 'dataset': 'N2', 'Gene': 'N2', 'Allele': 'N2', 'Time': 1197.555, 'n': 74.52380952380952, 'Number': 28.0, 'Instantaneous_Speed': 0.06568571, 'Interval_Speed': 0.08305238, 'Bias': 0.14719048, 'Tap': 0.0, 'Morphwidth': 0.119928576, 'Midline': 1.0866143, 'Area': 0.14990357, 'Angular_Speed': 8.276191, 'Aspect_Ratio': 0.37161905, 'Kink': 58.51905, 'Curve': 32.43333, 'Crab': 0.012709524, 'Pathlength': 5.536905}]]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [None]:
# # USE THIS CELL TO UPDATE ALL THE NEEDED TABLES (Also have baseline_output on the second line)

# conn=sqlite3.connect('/Users/lavanya/Desktop/Lavanya_Test/data_updated2.db')

# tap_output.to_sql('tap_response_data', conn, if_exists='append', index=False)

# baseline_output.to_sql('tap_baseline_data', conn, if_exists='append', index=False)

# combined_Tstats_normalize_2.reset_index().to_sql('tstat_gene_data', conn, if_exists='append', index=False)

# combined_Tstats_normalize_allele_2.reset_index().to_sql('tstat_allele_data', conn, if_exists='append', index=False)

# combined_Tstats_normalized_melted.to_sql('gene_profile_data', conn, if_exists='append', index=False)

# combined_Tstats_normalized_melted_allele.to_sql('allele_profile_data', conn, if_exists='append', index=False)

# combined_MSD.to_sql('gene_MSD', conn, if_exists='append', index=False)

# allele_combined_MSD.to_sql('allele_MSD', conn, if_exists='append', index=False)

# # combined_Tstats_melted_sorted.to_sql('allele_phenotype_data', conn, if_exists='replace', index=False)

# print(conn.total_changes)

# conn.close()


# # Want to test edge cases of pd.to_sql functionality#############