# 1. Imports and File selection 

In [26]:
import io
import ipywidgets as widgets
import math
import numpy
import psycopg
import pandas as pd
import requests
import sqlite3
import sys
import tqdm
import warnings

from config import load_config
from ipyfilechooser import FileChooser
from scipy import stats
from scipy.stats import ttest_ind
from sqlalchemy import create_engine
from sqlalchemy.dialects.postgresql import insert
from sqlite3 import Error
from sqlite3 import IntegrityError

## Select Baseline .csv File

In [7]:
# Folder in which the file pickers open.
# NOTE(review): this is a hard-coded, user-specific absolute path — consider making it configurable.
starting_directory = '/Users/gurmehak/Documents/RankinLab/Test_Datasets/'
# Picker for the baseline .csv file; its selection is read by later cells.
baseline_chooser = FileChooser(starting_directory)
display(baseline_chooser)

FileChooser(path='/Users/gurmehak/Documents/RankinLab/Test_Datasets', filename='', title='', show_hidden=False…

## Select Tap .csv File

In [27]:
# Picker for the tap .csv file. It opens in the shared `starting_directory`
# (set in the baseline-chooser cell) instead of repeating the path literal.
tap_chooser = FileChooser(starting_directory)
display(tap_chooser)

FileChooser(path='/Users/gurmehak/Documents/RankinLab/Test_Datasets', filename='', title='', show_hidden=False…

## Select Post Stimulus Arousal .csv File

In [28]:
# Picker for the post stimulus arousal (psa) .csv file. It opens in the shared
# `starting_directory` (set in the baseline-chooser cell) instead of repeating the path literal.
psa_chooser = FileChooser(starting_directory)
display(psa_chooser)

FileChooser(path='/Users/gurmehak/Documents/RankinLab/Test_Datasets', filename='', title='', show_hidden=False…

In [10]:
# Screens that can be analysed; the first entry is pre-selected in the widget.
screens = [
    'PD_Screen',
    'ASD_Screen',
    'G-Proteins_Screen',
    'Glia_Genes_Screen',
    'Neuron_Genes_Screen',
    'PD_GWAS_Locus71_Screen',
    'ASD_WGS_Screen',
]

# Single-choice list widget; its `.value` is read in the next cell.
screen_chooser = widgets.Select(
    options=screens,
    value=screens[0],
    description='Screen:',
)
display(screen_chooser)

Select(description='Screen:', options=('PD_Screen', 'ASD_Screen', 'G-Proteins_Screen', 'Glia_Genes_Screen', 'N…

In [29]:
# Capture the widget selections: the chosen screen name and the folder of the chosen baseline file.
Screen = screen_chooser.value
folder_path = baseline_chooser.selected_path
print(folder_path)

/Users/gurmehak/Documents/RankinLab/Test_Datasets/Glia_Genes_Screen_2025


## Read baseline, tap and post stimulus arousal (psa) data

In [30]:
# Read the baseline file chosen above; the first CSV column is used as the row index.
baseline_output = pd.read_csv(baseline_chooser.selected, index_col=0)

# Report the table dimensions as (rows, columns).
print(f"\nShape of the baseline .csv file: {baseline_output.shape}")

# Print the first five rows of the file
baseline_output.head()


Shape of the baseline .csv file: (710261, 22)


Unnamed: 0,Time,n,Number,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,...,Curve,Crab,Pathlength,Plate_id,Date,Screen,plate,dataset,Gene,Allele
12178,490.0,14,12,0.0278,0.0199,0.083,0.0671,0.7847,0.07166,4.8,...,30.4,0.0042,3.168,20241024_171133_B1024,20241024,Glia_Genes_Screen,0,N2,N2,N2
12179,490.0,14,12,0.027,0.0199,0.083,0.068,0.782,0.072,3.9,...,29.7,0.0048,3.168,20241024_171133_B1024,20241024,Glia_Genes_Screen,0,N2,N2,N2
12180,490.0,14,12,0.0246,0.0162,0.083,0.0683,0.779,0.0721,4.5,...,29.6,0.006,3.168,20241024_171133_B1024,20241024,Glia_Genes_Screen,0,N2,N2,N2
12181,490.0,14,12,0.0195,0.0138,0.083,0.0658,0.776,0.07074,4.2,...,30.5,0.0049,3.168,20241024_171133_B1024,20241024,Glia_Genes_Screen,0,N2,N2,N2
12182,490.0,14,12,0.027,0.0186,0.083,0.0653,0.7676,0.0693,4.3,...,30.1,0.0034,3.168,20241024_171133_B1024,20241024,Glia_Genes_Screen,0,N2,N2,N2


In [31]:
# Read the tap file chosen above; the first CSV column is used as the row index.
tap_output = pd.read_csv(tap_chooser.selected, index_col=0)

# Report the table dimensions as (rows, columns).
print(f"\nShape of the tap .csv file: {tap_output.shape}")

# Print the first five rows of the file
tap_output.head()


Shape of the tap .csv file: (9695, 13)


Unnamed: 0,time,dura,dist,prob,speed,plate,Date,Plate_id,Screen,taps,dataset,Gene,Allele
0,599.983,2.22,0.562,1.0,0.253153,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,1.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
1,609.979,1.45,0.371,0.545455,0.255862,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,2.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
2,619.996,2.11,0.669,0.52,0.317062,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,3.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
3,629.971,1.59,0.422,0.809524,0.265409,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,4.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
4,639.968,1.5,0.378,0.895833,0.252,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,5.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109


In [38]:
# Read the post stimulus arousal (psa) file chosen above; the first CSV column is used as the row index.
psa_output = pd.read_csv(psa_chooser.selected, index_col=0)

# Report the psa table dimensions as (rows, columns).
print(f"\nShape of the psa .csv file: {psa_output.shape}")

# Print the first five rows of the file
psa_output.head()


Shape of the tap .csv file: (9842, 25)


Unnamed: 0,Experiment,Tap_num,Screen,Date,Plate_id,Gene,Allele,dataset,plate,Time,...,Tap,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
0,1,1,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,0,607,...,0.0,0.063646,0.774335,0.072058,6.153131,0.32681,66.139114,34.680946,0.008325,2.666646
1,1,2,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,0,617,...,0.0,0.069399,0.790535,0.077552,13.373078,0.399368,71.857635,36.909225,0.013644,2.884301
2,1,3,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,0,627,...,0.0,0.075289,0.814492,0.083328,9.304876,0.346762,73.954636,30.78755,0.012437,2.725775
3,1,4,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,0,637,...,0.0,0.08128,0.815705,0.085755,7.869519,0.297793,50.762096,31.686745,0.012143,2.732737
4,1,5,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,0,647,...,0.0,0.078201,0.848771,0.088958,5.326356,0.269462,50.68455,30.131191,0.007745,3.019273


# ************* testing

In [None]:
# Scratch check: preview the first rows of the tap table.
tap_output.head()

Unnamed: 0,time,dura,dist,prob,speed,plate,Date,Plate_id,Screen,taps,dataset,Gene,Allele
0,599.983,2.22,0.562,1.0,0.253153,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,1.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
1,609.979,1.45,0.371,0.545455,0.255862,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,2.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
2,619.996,2.11,0.669,0.52,0.317062,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,3.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
3,629.971,1.59,0.422,0.809524,0.265409,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,4.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
4,639.968,1.5,0.378,0.895833,0.252,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,5.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109


In [41]:
# Scratch check: re-print the tap table dimensions.
print(f"\nShape of the tap .csv file: {tap_output.shape}")


Shape of the tap .csv file: (9695, 13)


In [46]:
# Second picker used to load an alternative psa .csv file for comparison.
# NOTE(review): hard-coded, user-specific absolute path.
psa_chooser2 = FileChooser('/Users/gurmehak/Documents/RankinLab/Test_Datasets')
display(psa_chooser2)

FileChooser(path='/Users/gurmehak/Documents/RankinLab/Test_Datasets', filename='', title='', show_hidden=False…

In [50]:
# Read the psa file picked with `psa_chooser2`; the first CSV column is used as the row index.
psa_output2 = pd.read_csv(psa_chooser2.selected, index_col=0)

# Report the table dimensions as (rows, columns).
print(f"\nShape of the psa .csv file: {psa_output2.shape}")

# Print the first five rows of the file
psa_output2.head()


Shape of the psa .csv file: (19048, 26)


Unnamed: 0,Experiment,Tap_num,Screen,Date,Plate_id,Gene,Allele,dataset,plate,taps,...,Tap,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
0,1,1,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,100,1.0,...,0.0,0.064501,0.777186,0.072598,7.561366,0.332338,64.738914,35.189518,0.008956,2.644342
1,1,1,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,100,2.0,...,0.0,0.062791,0.771484,0.071517,4.744897,0.321281,67.539314,34.17238,0.007694,2.688949
2,1,2,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,100,2.0,...,0.0,0.068634,0.782104,0.076162,14.339844,0.411125,77.84375,37.039062,0.014187,2.830872
3,1,2,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,100,3.0,...,0.0,0.070188,0.799238,0.078987,12.375126,0.387231,65.67843,36.7752,0.013084,2.939453
4,1,3,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,100,3.0,...,0.0,0.077635,0.802356,0.082813,8.948714,0.366227,77.90927,31.169355,0.013268,2.851184


In [54]:
# Scratch check: compare the Time, Tap_num and taps columns side by side for the first 60 rows.
psa_output2[['Time','Tap_num','taps']].head(60)

Unnamed: 0,Time,Tap_num,taps
0,607,1,1.0
1,608,1,2.0
2,617,2,2.0
3,618,2,3.0
4,627,3,3.0
5,628,3,4.0
6,637,4,4.0
7,638,4,5.0
8,647,5,5.0
9,648,5,6.0


In [49]:
# Scratch check: list the column names of the alternative psa table.
psa_output2.columns

Index(['Experiment', 'Tap_num', 'Screen', 'Date', 'Plate_id', 'Gene', 'Allele',
       'dataset', 'plate', 'taps', 'Time', 'n', 'Number',
       'Instantaneous Speed', 'Interval Speed', 'Bias', 'Tap', 'Morphwidth',
       'Midline', 'Area', 'Angular Speed', 'Aspect Ratio', 'Kink', 'Curve',
       'Crab', 'Pathlength'],
      dtype='object')

# **************** MERGE HERE

In [51]:
# Outer-join the tap table with the psa table (minus its 'Experiment' column)
# on the columns that identify a plate in both tables.
# NOTE(review): the join keys contain no tap number, so every tap row is paired with
# every psa row of the same plate — confirm this many-to-many result is intended.
tap_psa_output = tap_output.merge(
    psa_output.drop(columns='Experiment'),
    on=['Date', 'Plate_id', 'Screen', 'dataset', 'Gene', 'Allele'],
    how='outer',
)

tap_psa_output

Unnamed: 0,time,dura,dist,prob,speed,plate_x,Date,Plate_id,Screen,taps,...,Tap,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
0,599.983,2.22,0.562,1.0,0.253153,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,1.0,...,0.0,0.128116,1.113601,0.154105,12.125471,0.470118,84.205820,41.272630,0.016355,0.085145
1,599.983,2.22,0.562,1.0,0.253153,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,1.0,...,0.0,0.111871,1.127290,0.145849,14.628772,0.402264,66.401940,36.906788,0.025597,0.650264
2,599.983,2.22,0.562,1.0,0.253153,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,1.0,...,0.0,0.102091,1.120226,0.134445,16.301504,0.353800,55.842014,34.955440,0.026336,1.094564
3,599.983,2.22,0.562,1.0,0.253153,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,1.0,...,0.0,0.094335,1.127728,0.129765,14.447198,0.328744,52.698814,34.655174,0.024290,1.775559
4,599.983,2.22,0.562,1.0,0.253153,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,1.0,...,0.0,0.092195,1.138802,0.129539,11.823307,0.287520,44.550000,31.834635,0.021802,2.632422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300072,1189.928,2.90,0.667,0.5,0.230000,75,20250319,20250319_174606_C0319,Glia_Genes_Screen,31.0,...,0.0,0.115630,1.118785,0.151349,4.257694,0.238973,36.904358,26.854166,0.011197,7.228693
300073,1189.928,2.90,0.667,0.5,0.230000,75,20250319,20250319_174606_C0319,Glia_Genes_Screen,31.0,...,0.0,0.116634,1.113932,0.150542,5.000474,0.240338,36.788826,26.881628,0.011719,7.051137
300074,1189.928,2.90,0.667,0.5,0.230000,75,20250319,20250319_174606_C0319,Glia_Genes_Screen,31.0,...,0.0,0.113964,1.109600,0.148635,4.612781,0.228459,35.281650,26.856972,0.011926,7.046074
300075,1189.928,2.90,0.667,0.5,0.230000,75,20250319,20250319_174606_C0319,Glia_Genes_Screen,31.0,...,0.0,0.123110,1.114425,0.151311,4.848549,0.238749,36.765180,28.742857,0.011700,7.571205


In [20]:
# Write the merged table to a file named "tap_psa_output" in the working directory.
# NOTE(review): the file name has no .csv extension — confirm that is intended.
tap_psa_output.to_csv("tap_psa_output")

In [21]:
# Scratch check: list the merged columns ('plate' appears as plate_x / plate_y because both inputs carry it).
tap_psa_output.columns

Index(['time', 'dura', 'dist', 'prob', 'speed', 'plate_x', 'Date', 'Plate_id',
       'Screen', 'taps', 'dataset', 'Gene', 'Allele', 'Tap_num', 'plate_y',
       'Time', 'n', 'Number', 'Instantaneous Speed', 'Interval Speed', 'Bias',
       'Tap', 'Morphwidth', 'Midline', 'Area', 'Angular Speed', 'Aspect Ratio',
       'Kink', 'Curve', 'Crab', 'Pathlength'],
      dtype='object')

In [22]:
# Scratch check: display the psa table.
psa_output

Unnamed: 0,Experiment,Tap_num,Screen,Date,Plate_id,Gene,Allele,dataset,plate,Time,...,Tap,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
0,1,1,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,0,607,...,0.0,0.063646,0.774335,0.072058,6.153131,0.326810,66.139114,34.680946,0.008325,2.666646
1,1,2,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,0,617,...,0.0,0.069399,0.790535,0.077552,13.373078,0.399368,71.857635,36.909225,0.013644,2.884301
2,1,3,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,0,627,...,0.0,0.075289,0.814492,0.083328,9.304876,0.346762,73.954636,30.787550,0.012437,2.725775
3,1,4,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,0,637,...,0.0,0.081280,0.815705,0.085755,7.869519,0.297793,50.762096,31.686745,0.012143,2.732737
4,1,5,Glia_Genes_Screen,20241024,20241024_171133_B1024,N2,N2,N2,0,647,...,0.0,0.078201,0.848771,0.088958,5.326356,0.269462,50.684550,30.131191,0.007745,3.019273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9837,318,26,Glia_Genes_Screen,20250313,20250313_220015_C0313,ztf-16,ok1916,ztf-16_ok1916,0,867,...,0.0,0.109338,0.947949,0.123141,7.622991,0.287667,43.341072,29.637947,0.012086,5.264286
9838,318,27,Glia_Genes_Screen,20250313,20250313_220015_C0313,ztf-16,ok1916,ztf-16_ok1916,0,877,...,0.0,0.110248,0.939063,0.122620,7.770215,0.298352,46.813282,30.241796,0.011915,5.338965
9839,318,28,Glia_Genes_Screen,20250313,20250313_220015_C0313,ztf-16,ok1916,ztf-16_ok1916,0,887,...,0.0,0.109735,0.939132,0.123718,8.224466,0.302889,43.856710,30.286966,0.012758,5.445217
9840,318,29,Glia_Genes_Screen,20250313,20250313_220015_C0313,ztf-16,ok1916,ztf-16_ok1916,0,897,...,0.0,0.110977,0.944440,0.124626,6.387548,0.290638,42.818180,29.170929,0.011620,4.668324


In [23]:
# Scratch check: display the tap table.
tap_output

Unnamed: 0,time,dura,dist,prob,speed,plate,Date,Plate_id,Screen,taps,dataset,Gene,Allele
0,599.983,2.22,0.562,1.000000,0.253153,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,1.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
1,609.979,1.45,0.371,0.545455,0.255862,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,2.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
2,619.996,2.11,0.669,0.520000,0.317062,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,3.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
3,629.971,1.59,0.422,0.809524,0.265409,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,4.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
4,639.968,1.50,0.378,0.895833,0.252000,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,5.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
...,...,...,...,...,...,...,...,...,...,...,...,...,...
366,859.929,0.41,0.075,0.364865,0.182927,12,20250313,20250313_220015_C0313,Glia_Genes_Screen,27.0,ztf-16_ok1916,ztf-16,ok1916
367,869.928,0.73,0.117,0.409091,0.160274,12,20250313,20250313_220015_C0313,Glia_Genes_Screen,28.0,ztf-16_ok1916,ztf-16,ok1916
368,879.967,1.10,0.186,0.436364,0.169091,12,20250313,20250313_220015_C0313,Glia_Genes_Screen,29.0,ztf-16_ok1916,ztf-16,ok1916
369,889.963,1.12,0.160,0.461538,0.142857,12,20250313,20250313_220015_C0313,Glia_Genes_Screen,30.0,ztf-16_ok1916,ztf-16,ok1916


In [24]:
# Scratch check: work on an independent copy of the tap table and show its dimensions.
y = tap_output.copy()
y.shape

(9695, 13)

In [25]:
# Scratch check: show the copy with fully duplicated rows removed (the result is not stored).
y.drop_duplicates()

Unnamed: 0,time,dura,dist,prob,speed,plate,Date,Plate_id,Screen,taps,dataset,Gene,Allele
0,599.983,2.22,0.562,1.000000,0.253153,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,1.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
1,609.979,1.45,0.371,0.545455,0.255862,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,2.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
2,619.996,2.11,0.669,0.520000,0.317062,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,3.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
3,629.971,1.59,0.422,0.809524,0.265409,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,4.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
4,639.968,1.50,0.378,0.895833,0.252000,1,20240724,20240724_023625_A0724,Glia_Genes_Screen,5.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
...,...,...,...,...,...,...,...,...,...,...,...,...,...
366,859.929,0.41,0.075,0.364865,0.182927,12,20250313,20250313_220015_C0313,Glia_Genes_Screen,27.0,ztf-16_ok1916,ztf-16,ok1916
367,869.928,0.73,0.117,0.409091,0.160274,12,20250313,20250313_220015_C0313,Glia_Genes_Screen,28.0,ztf-16_ok1916,ztf-16,ok1916
368,879.967,1.10,0.186,0.436364,0.169091,12,20250313,20250313_220015_C0313,Glia_Genes_Screen,29.0,ztf-16_ok1916,ztf-16,ok1916
369,889.963,1.12,0.160,0.461538,0.142857,12,20250313,20250313_220015_C0313,Glia_Genes_Screen,30.0,ztf-16_ok1916,ztf-16,ok1916


In [45]:
# Scratch check: keep only the identifying columns plus the tap number and plate number.
x = tap_output[['Date', 'Plate_id', 'Screen', 'dataset', 'Gene', 'Allele', 'taps', 'plate']]
x

Unnamed: 0,Date,Plate_id,Screen,dataset,Gene,Allele,taps,plate
0,20240724,A0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,1.0,1
1,20240724,A0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,2.0,1
2,20240724,A0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,3.0,1
3,20240724,A0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,4.0,1
4,20240724,A0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,5.0,1
...,...,...,...,...,...,...,...,...
366,20250313,C0313,Glia_Genes_Screen,ztf-16_ok1916,ztf-16,ok1916,27.0,12
367,20250313,C0313,Glia_Genes_Screen,ztf-16_ok1916,ztf-16,ok1916,28.0,12
368,20250313,C0313,Glia_Genes_Screen,ztf-16_ok1916,ztf-16,ok1916,29.0,12
369,20250313,C0313,Glia_Genes_Screen,ztf-16_ok1916,ztf-16,ok1916,30.0,12


In [46]:
# Scratch check: show the identifier subset with duplicated rows removed (the result is not stored).
x.drop_duplicates()

Unnamed: 0,Date,Plate_id,Screen,dataset,Gene,Allele,taps,plate
0,20240724,A0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,1.0,1
1,20240724,A0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,2.0,1
2,20240724,A0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,3.0,1
3,20240724,A0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,4.0,1
4,20240724,A0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,5.0,1
...,...,...,...,...,...,...,...,...
366,20250313,C0313,Glia_Genes_Screen,ztf-16_ok1916,ztf-16,ok1916,27.0,12
367,20250313,C0313,Glia_Genes_Screen,ztf-16_ok1916,ztf-16,ok1916,28.0,12
368,20250313,C0313,Glia_Genes_Screen,ztf-16_ok1916,ztf-16,ok1916,29.0,12
369,20250313,C0313,Glia_Genes_Screen,ztf-16_ok1916,ztf-16,ok1916,30.0,12


In [None]:
# FIXME(review): aggregate() is called without an aggregation function, and this cell was
# never executed (In [None]). It looks unfinished — confirm the intended computation before running.
tap_output['plate_temp'] = tap_output.aggregate()

# 2. DataFrame preparation

### 2.1. Tap Data

In [14]:
# Dataframe for first tap: rows where taps == 1, with the response columns
# renamed to their "init_" names.
PD_first_tap = (
    tap_output.loc[tap_output["taps"] == 1]
    # drop=True discards the old row labels directly, so this no longer relies on
    # the index being unnamed (reset_index() only creates an "index" column in that case).
    .reset_index(drop=True)
    # errors="raise" fails loudly if any of the expected columns is missing.
    .rename(
        columns={"dura": "init_dura", "prob": "init_prob", "speed": "init_speed"},
        errors="raise",
    )
)

PD_first_tap.head()

Unnamed: 0,time,init_dura,dist,init_prob,init_speed,plate,Date,Plate_id,Screen,taps,dataset,Gene,Allele
0,599.983,2.22,0.562,1.0,0.253153,1,20240724,A0724,Glia_Genes_Screen,1.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
1,599.997,1.82,0.408,0.666667,0.224176,2,20240724,A0724,Glia_Genes_Screen,1.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
2,599.965,2.56,0.457,0.844444,0.178516,3,20240724,B0724,Glia_Genes_Screen,1.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
3,599.94,2.95,0.491,0.76087,0.166441,4,20240724,C0724,Glia_Genes_Screen,1.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
4,599.982,2.5,0.421,0.764706,0.1684,5,20240724,B0724,Glia_Genes_Screen,1.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109


In [15]:
# Dataframe for recovery taps: rows where taps == 31, with the response columns
# renamed to their "recov_" names.
PD_recov_taps = (
    tap_output.loc[tap_output["taps"] == 31]
    # drop=True discards the old row labels directly, so this no longer relies on
    # the index being unnamed (reset_index() only creates an "index" column in that case).
    .reset_index(drop=True)
    # errors="raise" matches the first-tap and final-tap cells: a missing column fails loudly.
    .rename(
        columns={"dura": "recov_dura", "prob": "recov_prob", "speed": "recov_speed"},
        errors="raise",
    )
)

PD_recov_taps.head()

Unnamed: 0,time,recov_dura,dist,recov_prob,recov_speed,plate,Date,Plate_id,Screen,taps,dataset,Gene,Allele
0,1189.985,1.51,0.263,0.828571,0.174172,1,20240724,A0724,Glia_Genes_Screen,31.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
1,1189.939,1.05,0.19,0.630435,0.180952,2,20240724,A0724,Glia_Genes_Screen,31.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
2,1189.93,2.11,0.271,0.886364,0.128436,3,20240724,B0724,Glia_Genes_Screen,31.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
3,1189.965,1.69,0.24,0.822222,0.142012,4,20240724,C0724,Glia_Genes_Screen,31.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109
4,1189.967,1.95,0.265,0.973684,0.135897,5,20240724,B0724,Glia_Genes_Screen,31.0,AMshABLATE_nsIs109,AMshABLATE,nsIs109


In [16]:
# Dataframe for last three taps: taps 28 through 30 (both ends included),
# averaged per plate and renamed to the "final_" names.
PD_final_taps = (
    tap_output.loc[tap_output["taps"].between(28, 30)]
    .groupby(["dataset", "Date", "Plate_id", "Screen", "Gene", "Allele", "plate"])
    .mean()
    .reset_index()
    .rename(
        columns={"dura": "final_dura", "prob": "final_prob", "speed": "final_speed"},
        errors="raise",
    )
)

PD_final_taps.head()

Unnamed: 0,dataset,Date,Plate_id,Screen,Gene,Allele,plate,time,final_dura,dist,final_prob,final_speed,taps
0,AMshABLATE_nsIs109,20240724,A0724,Glia_Genes_Screen,AMshABLATE,nsIs109,1,879.970333,0.866667,0.144333,0.41978,0.167793,29.0
1,AMshABLATE_nsIs109,20240724,A0724,Glia_Genes_Screen,AMshABLATE,nsIs109,2,879.969333,0.906667,0.144667,0.336761,0.160319,29.0
2,AMshABLATE_nsIs109,20240724,B0724,Glia_Genes_Screen,AMshABLATE,nsIs109,3,879.936,1.48,0.215667,0.611189,0.145128,29.0
3,AMshABLATE_nsIs109,20240724,B0724,Glia_Genes_Screen,AMshABLATE,nsIs109,5,879.947667,1.466667,0.217333,0.503704,0.148535,29.0
4,AMshABLATE_nsIs109,20240724,C0724,Glia_Genes_Screen,AMshABLATE,nsIs109,4,879.949,1.53,0.208,0.567925,0.136765,29.0


In [17]:
# Habituation table: left-join the first-tap rows with the averaged final taps,
# discard the time/dist/taps columns duplicated by the join, drop incomplete rows,
# then add the habituation level (initial minus final) for each response measure.
PD_habit_levels = (
    PD_first_tap
    .merge(
        PD_final_taps,
        how='left',
        on=['dataset', 'plate', "Plate_id", "Screen", "Gene", "Allele", "Date"],
    )
    .drop(columns=['time_x', 'time_y', 'dist_x', 'dist_y', 'taps_x', 'taps_y'])
    .dropna()
    .assign(
        habit_dura=lambda t: t['init_dura'] - t['final_dura'],
        habit_prob=lambda t: t['init_prob'] - t['final_prob'],
        habit_speed=lambda t: t['init_speed'] - t['final_speed'],
    )
)

In [18]:
# Continue the habituation analysis by joining in the recovery taps.

# Columns shared by the habituation and recovery tables.
habituation_keys = ['dataset', 'plate', "Plate_id", "Screen", "Gene", "Allele", "Date"]

# An empty recovery table is joined with 'outer'; otherwise 'left' is used.
recov_join = 'outer' if PD_recov_taps.empty else 'left'
PD_habituation = pd.merge(PD_habit_levels, PD_recov_taps, on=habituation_keys, how=recov_join)

# Screens for which rows with missing recovery columns are kept.
screens_without_recovery = ['Neuron_Genes_Screen', 'G-Proteins_Screen']

if Screen not in screens_without_recovery:
    PD_habituation = PD_habituation.dropna()

# Recovery: percentage change of the recovery tap relative to the first tap.
for measure in ('dura', 'prob', 'speed'):
    PD_habituation[f'recovery_{measure}'] = (
        (PD_habituation[f'recov_{measure}'] - PD_habituation[f'init_{measure}'])
        / PD_habituation[f'init_{measure}'] * 100
    )

# Memory retention: recovery tap minus the average of the final taps.
for measure in ('dura', 'prob', 'speed'):
    PD_habituation[f'memory_retention_{measure}'] = (
        PD_habituation[f'recov_{measure}'] - PD_habituation[f'final_{measure}']
    )


# `tap_data` is the cleaned `PD_habituation`: for the screens listed above only the
# non-recovery columns must be complete, for every other screen all columns must be.
if Screen in screens_without_recovery:
    tap_data = PD_habituation.dropna(subset=[
        'init_dura', 'init_prob', 'init_speed', 'plate', 'Date', 'Plate_id',
        'Screen', 'dataset', 'Gene', 'Allele', 'final_dura', 'final_prob',
        'final_speed', 'habit_dura', 'habit_prob', 'habit_speed',
    ])
else:
    tap_data = PD_habituation.dropna()


# Display final dataframe
tap_data.head()


Unnamed: 0,init_dura,init_prob,init_speed,plate,Date,Plate_id,Screen,dataset,Gene,Allele,...,dist,recov_prob,recov_speed,taps,recovery_dura,recovery_prob,recovery_speed,memory_retention_dura,memory_retention_prob,memory_retention_speed
0,2.22,1.0,0.253153,1,20240724,A0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,...,0.263,0.828571,0.174172,31.0,-31.981982,-17.142857,-31.198888,0.643333,0.408791,0.006379
1,1.82,0.666667,0.224176,2,20240724,A0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,...,0.19,0.630435,0.180952,31.0,-42.307692,-5.434783,-19.281046,0.143333,0.293673,0.020633
2,2.56,0.844444,0.178516,3,20240724,B0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,...,0.271,0.886364,0.128436,31.0,-17.578125,4.964115,-28.053346,0.63,0.275175,-0.016692
3,2.95,0.76087,0.166441,4,20240724,C0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,...,0.24,0.822222,0.142012,31.0,-42.711864,8.063492,-14.677207,0.16,0.254298,0.005247
4,2.5,0.764706,0.1684,5,20240724,B0724,Glia_Genes_Screen,AMshABLATE_nsIs109,AMshABLATE,nsIs109,...,0.265,0.973684,0.135897,31.0,-22.0,27.327935,-19.30081,0.483333,0.469981,-0.012638


### 2.2. PSA data

In [19]:
# Function to calculate the initial, recovery, peak, etc. values for a specified column (metric)

def summary_metrics(df, metric='Instantaneous Speed'):
    """
    Summarise one column of `df` by its position in the frame.

    Parameters:
        df (pd.DataFrame): rows in the order they should be summarised
        metric (str): name of the column to summarise

    Returns:
        pd.Series with six entries, each labelled 'PSA <name> <metric>':
        Initial (first row), Recovery (last row), Peak (maximum),
        Initial_to_peak (mean from the first row up to and including the peak row),
        Peak_to_recovery (mean from the peak row to the last row) and
        Average (mean of the whole column).
    """
    values = df[metric]

    first_value = values.iloc[0]
    last_value = values.iloc[-1]
    highest = values.max()
    overall_mean = values.mean()

    # Row position of the peak.
    # NOTE(review): this argmax runs on the raw NumPy values while max()/mean() are
    # pandas reductions — confirm both agree if the column can contain NaN.
    peak_pos = values.values.argmax()
    rise_mean = values.iloc[:peak_pos + 1].mean()
    fall_mean = values.iloc[peak_pos:].mean()

    labelled = {
        'Initial': first_value,
        'Recovery': last_value,
        'Peak': highest,
        'Initial_to_peak': rise_mean,
        'Peak_to_recovery': fall_mean,
        'Average': overall_mean,
    }
    return pd.Series({f'PSA {label} {metric}': value for label, value in labelled.items()})

In [20]:
# Silence all warnings from here on (applies to the whole session, not only this cell).
warnings.filterwarnings('ignore')

# columns to summarize
metrics_to_summarize = [
    'Instantaneous Speed', 'Bias', 'Angular Speed', 'Aspect Ratio',
    'Kink', 'Curve', 'Crab',
]

# standard columns
group_cols = ['Experiment', 'Plate_id', 'Date', 'Screen', 'dataset', 'Gene', 'Allele']

# Summarise each metric per group with `summary_metrics` and left-join the
# summaries onto the identifying columns of psa_output.
# NOTE(review): psa_data starts from every row of psa_output, so each group's
# summary is repeated once per original row — confirm this is intended.
psa_data = psa_output[group_cols]
for metric in metrics_to_summarize:
    summary = (
        psa_output
        .groupby(group_cols)
        .apply(summary_metrics, metric=metric)
        .reset_index()
    )
    psa_data = psa_data.merge(summary, on=group_cols, how='left')

In [21]:
# Preview the first rows of the summarised psa table.
psa_data.head()

Unnamed: 0,Experiment,Plate_id,Date,Screen,dataset,Gene,Allele,PSA Initial Instantaneous Speed,PSA Recovery Instantaneous Speed,PSA Peak Instantaneous Speed,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
0,1,B1024,20241024,Glia_Genes_Screen,N2,N2,N2,0.057722,0.036236,0.111811,...,36.832,35.691,28.960777,29.141075,0.008123,0.007293,0.013567,0.010845,0.008313,0.008306
1,1,B1024,20241024,Glia_Genes_Screen,N2,N2,N2,0.057722,0.036236,0.111811,...,36.832,35.691,28.960777,29.141075,0.008123,0.007293,0.013567,0.010845,0.008313,0.008306
2,1,B1024,20241024,Glia_Genes_Screen,N2,N2,N2,0.057722,0.036236,0.111811,...,36.832,35.691,28.960777,29.141075,0.008123,0.007293,0.013567,0.010845,0.008313,0.008306
3,1,B1024,20241024,Glia_Genes_Screen,N2,N2,N2,0.057722,0.036236,0.111811,...,36.832,35.691,28.960777,29.141075,0.008123,0.007293,0.013567,0.010845,0.008313,0.008306
4,1,B1024,20241024,Glia_Genes_Screen,N2,N2,N2,0.057722,0.036236,0.111811,...,36.832,35.691,28.960777,29.141075,0.008123,0.007293,0.013567,0.010845,0.008313,0.008306


# 3. Run Statistics (T-Test and mean sample distance) on Data

## 3.1 Generate dataframes conditioned by `baseline` ("true"/"false"/"psa") and `allele` (True/False)

In [22]:
def get_output_byplate(output, baseline=["true", "false", "psa"], allele = [False, True]):
    """
    Average `output` over each ('Plate_id','Date','Screen','dataset','Gene','Allele')
    group and keep a single identity column next to the averaged columns.

    Parameters:
        output (pd.DataFrame): table containing the six grouping columns plus the
            columns that are dropped for the chosen `baseline` kind
        baseline (str): selects which columns are dropped after averaging:
            "true"  -> baseline data ('n', 'Number', 'Time' and the identifiers),
            "false" -> tap data ('dist', 'plate', 'time', 'taps', the recov_* columns
                       and the identifiers),
            any other value (including the default list) -> psa data
                       ('Experiment' and the identifiers)
        allele (bool): truthy (including the default list) keeps 'dataset' and drops
            'Gene'; falsy keeps 'Gene' and drops 'dataset'

    Returns:
        A DataFrame with one row per group holding the column means
    """
    grouping_keys = ['Plate_id', 'Date', 'Screen', 'dataset', 'Gene', 'Allele']

    # Columns removed after averaging depend on the kind of table passed in.
    if baseline == "true":
        discarded = ['Plate_id', 'n', 'Number', 'Time', 'Screen', 'Date', 'Allele']
    elif baseline == "false":
        discarded = ['Plate_id', 'Screen', 'Date', 'Allele', 'dist', 'plate', 'time',
                     'taps', 'recov_dura', 'recov_prob', 'recov_speed']
    else:
        discarded = ['Experiment', 'Plate_id', 'Date', 'Screen', 'Allele']

    # Only one of 'Gene' / 'dataset' survives as the row label.
    discarded = discarded + (['Gene'] if allele else ['dataset'])

    plate_means = output.groupby(by=grouping_keys, as_index=False).mean()
    return plate_means.drop(columns=discarded)

#### 3.1.1 `baseline` = True, `allele` = False

In [23]:
# Plate-level means of the baseline table, labelled by gene.
baseline_output_byplate = get_output_byplate(
    baseline_output,
    baseline="true",
    allele=False,
)

print(f"Shape: {baseline_output_byplate.shape}")

baseline_output_byplate.head()

Shape: (166, 13)


Unnamed: 0,Gene,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
0,N2,0.045778,0.052441,0.092385,0.098094,0.948807,0.115285,3.099142,0.292758,48.78319,29.642596,0.00827,7.582197
1,ifa-4,0.031511,0.031892,0.039721,0.091903,0.856181,0.099356,3.122801,0.311255,50.051513,28.86479,0.008298,5.537552
2,ifa-4,0.062317,0.062015,0.262354,0.086668,0.799796,0.088779,6.00729,0.301353,45.518583,26.849336,0.009997,7.668276
3,mgl-2,0.026714,0.022262,-0.011699,0.086717,0.831531,0.093584,2.982189,0.331307,58.039498,29.521827,0.007927,3.374875
4,mgl-2xmgl-1,0.083278,0.074526,0.372784,0.096506,0.992729,0.118414,5.549343,0.303556,48.922313,32.124609,0.012376,7.768073


#### 3.1.2 `baseline` = False, `allele` = False

In [24]:
# Plate-level means of the tap table, labelled by gene.
tap_data_byplate = get_output_byplate(
    tap_data,
    baseline="false",
    allele=False,
)

print(f"Shape: {tap_data_byplate.shape}")

tap_data_byplate.head()

Shape: (164, 16)


Unnamed: 0,Gene,init_dura,init_prob,init_speed,final_dura,final_prob,final_speed,habit_dura,habit_prob,habit_speed,recovery_dura,recovery_prob,recovery_speed,memory_retention_dura,memory_retention_prob,memory_retention_speed
0,N2,2.205,0.751052,0.209041,0.983333,0.34994,0.197817,1.221667,0.401112,0.011225,35.571392,-38.785539,10.145989,2.036667,0.130452,0.030388
1,ifa-4,3.265,0.734375,0.177764,1.468333,0.498536,0.187481,1.796667,0.235839,-0.009716,-0.87634,10.833155,20.22848,1.766667,0.31772,0.026264
2,ifa-4,2.51,0.888739,0.21897,0.946667,0.533974,0.240082,1.563333,0.354765,-0.021111,13.765919,4.715673,7.474791,1.828333,0.386972,-0.004744
3,mgl-2,1.98,0.727941,0.167104,0.943333,0.509251,0.185464,1.036667,0.21869,-0.01836,-9.208909,0.30525,-6.193971,0.891667,0.214192,-0.028901
4,mgl-2xmgl-1,1.9,0.844067,0.229524,0.371667,0.135329,0.180858,1.528333,0.708738,0.048666,-39.992798,-26.448943,0.335514,0.768333,0.485883,0.050223


#### 3.1.3 `baseline` = True, `allele` = True

In [25]:
# Plate-level means of the baseline table, labelled by dataset (gene plus allele).
baseline_output_allele_byplate = get_output_byplate(
    baseline_output,
    baseline="true",
    allele=True,
)

print(f"Shape: {baseline_output_allele_byplate.shape}")

baseline_output_allele_byplate.head()

Shape: (166, 13)


Unnamed: 0,dataset,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
0,N2,0.045778,0.052441,0.092385,0.098094,0.948807,0.115285,3.099142,0.292758,48.78319,29.642596,0.00827,7.582197
1,ifa-4_ok1717,0.031511,0.031892,0.039721,0.091903,0.856181,0.099356,3.122801,0.311255,50.051513,28.86479,0.008298,5.537552
2,ifa-4_ok1734,0.062317,0.062015,0.262354,0.086668,0.799796,0.088779,6.00729,0.301353,45.518583,26.849336,0.009997,7.668276
3,mgl-2_tm355,0.026714,0.022262,-0.011699,0.086717,0.831531,0.093584,2.982189,0.331307,58.039498,29.521827,0.007927,3.374875
4,mgl-2xmgl-1_tm355xtm1811,0.083278,0.074526,0.372784,0.096506,0.992729,0.118414,5.549343,0.303556,48.922313,32.124609,0.012376,7.768073


#### 3.1.4 `baseline` = False, `allele` = True

In [26]:
# Plate-level means of the tap table, labelled by the 'dataset' column.
tap_data_allele_byplate = get_output_byplate(tap_data, baseline="false", allele=True)

print(f"Shape: {tap_data_allele_byplate.shape}")

tap_data_allele_byplate.head()

Shape: (164, 16)


Unnamed: 0,dataset,init_dura,init_prob,init_speed,final_dura,final_prob,final_speed,habit_dura,habit_prob,habit_speed,recovery_dura,recovery_prob,recovery_speed,memory_retention_dura,memory_retention_prob,memory_retention_speed
0,N2,2.205,0.751052,0.209041,0.983333,0.34994,0.197817,1.221667,0.401112,0.011225,35.571392,-38.785539,10.145989,2.036667,0.130452,0.030388
1,ifa-4_ok1717,3.265,0.734375,0.177764,1.468333,0.498536,0.187481,1.796667,0.235839,-0.009716,-0.87634,10.833155,20.22848,1.766667,0.31772,0.026264
2,ifa-4_ok1734,2.51,0.888739,0.21897,0.946667,0.533974,0.240082,1.563333,0.354765,-0.021111,13.765919,4.715673,7.474791,1.828333,0.386972,-0.004744
3,mgl-2_tm355,1.98,0.727941,0.167104,0.943333,0.509251,0.185464,1.036667,0.21869,-0.01836,-9.208909,0.30525,-6.193971,0.891667,0.214192,-0.028901
4,mgl-2xmgl-1_tm355xtm1811,1.9,0.844067,0.229524,0.371667,0.135329,0.180858,1.528333,0.708738,0.048666,-39.992798,-26.448943,0.335514,0.768333,0.485883,0.050223


In [27]:
# tap_data_allele_byplate[tap_data_allele_byplate.dataset=='N2_XJ1']

#### 3.1.5 `baseline` = "psa" , `allele` = False

In [28]:
# Plate-level means of the psa summary table, labelled by the 'Gene' column.
psa_data_byplate = get_output_byplate(psa_data, baseline="psa", allele=False)

print(f"Shape: {psa_data_byplate.shape}")

psa_data_byplate.head()

Shape: (166, 43)


Unnamed: 0,Gene,PSA Initial Instantaneous Speed,PSA Recovery Instantaneous Speed,PSA Peak Instantaneous Speed,PSA Initial_to_peak Instantaneous Speed,PSA Peak_to_recovery Instantaneous Speed,PSA Average Instantaneous Speed,PSA Initial Bias,PSA Recovery Bias,PSA Peak Bias,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
0,N2,0.082989,0.081324,0.211282,0.171682,0.164803,0.165542,0.098555,0.271785,0.971985,...,40.262514,40.262514,30.415904,30.415904,0.017088,0.011654,0.023461,0.02024,0.014211,0.014513
1,ifa-4,0.060561,0.072115,0.141085,0.109887,0.105121,0.10463,0.141648,0.177686,0.832196,...,38.676,38.271405,32.961621,33.029883,0.013514,0.014503,0.021946,0.01805,0.014523,0.014657
2,ifa-4,0.081006,0.113214,0.175476,0.150985,0.144711,0.145782,0.190421,0.397394,0.882951,...,34.588867,34.588867,30.122254,30.122254,0.014715,0.017266,0.022484,0.019563,0.017703,0.017899
3,mgl-2,0.080292,0.038243,0.144953,0.113508,0.108358,0.107737,0.24832,0.23357,0.925011,...,37.352,35.808222,29.777603,29.938466,0.014796,0.009388,0.018553,0.016305,0.010981,0.011176
4,mgl-2xmgl-1,0.078938,0.093982,0.192454,0.150704,0.149263,0.148514,0.32609,0.458584,0.936397,...,39.135632,39.135632,32.119114,32.119114,0.014984,0.016075,0.022621,0.019761,0.016044,0.016312


#### 3.1.6 `baseline` = "psa" , `allele` = True

In [29]:
# Plate-level means of the psa summary table, labelled by the 'dataset' column.
psa_data_allele_byplate = get_output_byplate(psa_data, baseline="psa", allele=True)

print(f"Shape: {psa_data_allele_byplate.shape}")

psa_data_allele_byplate.head()

Shape: (166, 43)


Unnamed: 0,dataset,PSA Initial Instantaneous Speed,PSA Recovery Instantaneous Speed,PSA Peak Instantaneous Speed,PSA Initial_to_peak Instantaneous Speed,PSA Peak_to_recovery Instantaneous Speed,PSA Average Instantaneous Speed,PSA Initial Bias,PSA Recovery Bias,PSA Peak Bias,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
0,N2,0.082989,0.081324,0.211282,0.171682,0.164803,0.165542,0.098555,0.271785,0.971985,...,40.262514,40.262514,30.415904,30.415904,0.017088,0.011654,0.023461,0.02024,0.014211,0.014513
1,ifa-4_ok1717,0.060561,0.072115,0.141085,0.109887,0.105121,0.10463,0.141648,0.177686,0.832196,...,38.676,38.271405,32.961621,33.029883,0.013514,0.014503,0.021946,0.01805,0.014523,0.014657
2,ifa-4_ok1734,0.081006,0.113214,0.175476,0.150985,0.144711,0.145782,0.190421,0.397394,0.882951,...,34.588867,34.588867,30.122254,30.122254,0.014715,0.017266,0.022484,0.019563,0.017703,0.017899
3,mgl-2_tm355,0.080292,0.038243,0.144953,0.113508,0.108358,0.107737,0.24832,0.23357,0.925011,...,37.352,35.808222,29.777603,29.938466,0.014796,0.009388,0.018553,0.016305,0.010981,0.011176
4,mgl-2xmgl-1_tm355xtm1811,0.078938,0.093982,0.192454,0.150704,0.149263,0.148514,0.32609,0.458584,0.936397,...,39.135632,39.135632,32.119114,32.119114,0.014984,0.016075,0.022621,0.019761,0.016044,0.016312


## 3.2 Calculate Mean Distances and CIs

In [30]:

def extract_phenotypes(df):
    '''
    Break a wide table into one two-column table per phenotype.

    input:
        df (pd.DataFrame): the first column is the grouping key, every
            remaining column is treated as a phenotype

    returns:
        list of pd.DataFrame copies, one per phenotype column and in the same
        order as in df; each holds [first column of df, phenotype column]
    '''
    key_col = df.columns[0]
    phenotype_cols = df.columns[1:]
    # .copy() so later in-place edits never write back into df
    return [df[[key_col, phenotype]].copy() for phenotype in phenotype_cols]



def ci95(df):
    """
    Add 95% confidence-interval bounds for every metric of a summary table.

    input:
        df (pd.DataFrame): columns are a two-level MultiIndex (metric, statistic);
            every metric must provide the statistics 'mean', 'count' and 'sem'.
            A top-level label equal to 'Gene' is skipped.

    returns:
        the same DataFrame object (modified in place) with two extra columns per
        metric, (metric, 'ci95_hi') and (metric, 'ci95_lo'), appended in that order.

    The bounds come from the two-sided Student-t interval with count - 1 degrees
    of freedom, located at the mean and scaled by the standard error of the mean.
    """
    for metric in df.columns.levels[0]:
        if metric == 'Gene':
            continue

        summary = df[metric]
        # One vectorised call per metric instead of two scalar calls per row.
        lower, upper = stats.t.interval(
            confidence=0.95,
            df=summary['count'].to_numpy(dtype=float) - 1,
            loc=summary['mean'].to_numpy(dtype=float),
            scale=summary['sem'].to_numpy(dtype=float),
        )
        # Column order (hi, then lo) is relied on by positional consumers.
        df[metric, 'ci95_hi'] = upper
        df[metric, 'ci95_lo'] = lower

    return df



def calculate_MSD(list_of_dfs, by):
    """
    Summarise each phenotype per group and express it relative to the N2 control.

    NOTE: this function is defined a second time in the following cell; after a
    top-to-bottom run the later definition is the one in effect.

    input:
        list_of_dfs: list of two-column DataFrames [grouping column, phenotype column]
        by: name of the grouping column ("Gene" or "dataset")

    returns:
        list of DataFrames, one per input, indexed by `by`, with MultiIndex columns
        (phenotype, stat) for stat in mean, count, sem, ci95_hi, ci95_lo.
        The control mean is subtracted from mean, ci95_hi and ci95_lo;
        count and sem are left as computed.

    raises:
        ValueError: when no control row exists for a phenotype.

    Reads the notebook-level variable `Screen` to decide which index labels
    count as control.
    """
    new_list_of_dfs = []

    for df in list_of_dfs:
        # The 2nd column is the phenotype being summarised
        pheno_col = df.columns[1]

        # Named `summary` (not `stats`) so the imported scipy.stats module is not shadowed
        summary = df.groupby(by)[pheno_col].agg(['mean', 'count', 'sem'])

        # ci95 expects (metric, statistic) MultiIndex columns
        if not isinstance(summary.columns, pd.MultiIndex):
            summary.columns = pd.MultiIndex.from_tuples([(pheno_col, col) for col in summary.columns])

        summary = ci95(summary)

        # Locate the control row(s)
        if Screen == "Neuron_Genes_Screen" and by != "Gene":
            control_mask = summary.index.isin(['N2_XJ1', 'N2_N2'])
        else:
            control_mask = summary.index == 'N2'

        control_means = summary.loc[control_mask, (pheno_col, 'mean')]
        if control_means.empty:
            raise ValueError(
                f"no N2 control group found for phenotype {pheno_col!r} when grouping by {by!r}"
            )
        # NOTE(review): when several control rows match, only the first one is
        # used as the reference - confirm this is intended.
        control_mean = control_means.iloc[0]

        # Shift the mean and both CI bounds by the control mean (label-based, not positional)
        for stat in ('mean', 'ci95_hi', 'ci95_lo'):
            summary[(pheno_col, stat)] = summary[(pheno_col, stat)] - control_mean

        new_list_of_dfs.append(summary)

    return new_list_of_dfs

In [31]:
def calculate_MSD(list_of_dfs, by):
    """
    Summarise each phenotype per group and express it relative to the N2 control.

    NOTE: this repeats the definition from the previous cell; keep the two in
    sync or remove one of them.

    input:
        list_of_dfs: list of two-column DataFrames [grouping column, phenotype column]
        by: name of the grouping column ("Gene" or "dataset")

    returns:
        list of DataFrames, one per input, indexed by `by`, with MultiIndex columns
        (phenotype, stat) for stat in mean, count, sem, ci95_hi, ci95_lo.
        The control mean is subtracted from mean, ci95_hi and ci95_lo;
        count and sem are left as computed.

    raises:
        ValueError: when no control row exists for a phenotype.

    Reads the notebook-level variable `Screen` to decide which index labels
    count as control.
    """
    new_list_of_dfs = []

    for df in list_of_dfs:
        # The 2nd column is the phenotype being summarised
        pheno_col = df.columns[1]

        # Named `summary` (not `stats`) so the imported scipy.stats module is not shadowed
        summary = df.groupby(by)[pheno_col].agg(['mean', 'count', 'sem'])

        # ci95 expects (metric, statistic) MultiIndex columns
        if not isinstance(summary.columns, pd.MultiIndex):
            summary.columns = pd.MultiIndex.from_tuples([(pheno_col, col) for col in summary.columns])

        summary = ci95(summary)

        # Locate the control row(s)
        if Screen == "Neuron_Genes_Screen" and by != "Gene":
            control_mask = summary.index.isin(['N2_XJ1', 'N2_N2'])
        else:
            control_mask = summary.index == 'N2'

        control_means = summary.loc[control_mask, (pheno_col, 'mean')]
        if control_means.empty:
            raise ValueError(
                f"no N2 control group found for phenotype {pheno_col!r} when grouping by {by!r}"
            )
        # NOTE(review): when several control rows match, only the first one is
        # used as the reference - confirm this is intended.
        control_mean = control_means.iloc[0]

        # Shift the mean and both CI bounds by the control mean (label-based, not positional)
        for stat in ('mean', 'ci95_hi', 'ci95_lo'):
            summary[(pheno_col, stat)] = summary[(pheno_col, stat)] - control_mean

        new_list_of_dfs.append(summary)

    return new_list_of_dfs

In [32]:
def get_MSD(list_MSD):
    '''
    Join the per-phenotype MSD tables side by side into one table.

    input: non-empty list of dataframes, each representing a phenotype with
        calculated MSD (expected to share the same index).

    returns: the first dataframe with every following dataframe joined onto it
        (DataFrame.join, i.e. left join on the index) in list order.

    raises: ValueError if list_MSD is empty.
    '''
    if len(list_MSD) == 0:
        raise ValueError("get_MSD needs at least one dataframe to combine")

    # Start from the first table by position rather than by comparing column
    # labels, which relied on the truth value of an Index comparison.
    MSD = list_MSD[0]
    for phenotype_df in list_MSD[1:]:
        MSD = MSD.join(phenotype_df)
    return MSD

In [33]:
def get_combined_MSD(baseline_byplate,tap_byplate, psa_byplate, by=['Gene','dataset']):
    """
    Build one flat table of control-relative statistics for the baseline, tap and PSA data.

    input:
        - baseline_byplate: baseline data by plate
        - tap_byplate: tap data by plate
        - psa_byplate: post stimulus arousal data by plate
        - by: column to group by, "Gene" or "dataset"; pass a single string,
          because the name is concatenated with "-" when the columns are flattened
    returns:
        - dataframe with the `by` column, one "<phenotype>-<statistic>" column per
          phenotype/statistic pair and a 'Screen' column filled from the
          notebook-level variable Screen
    """
    # Long display names for the tap-response phenotypes
    tap_display_names = {
        "habit_dura":"Habituation of Response Duration",
        "habit_prob": "Habituation of Respones Probability",
        "habit_speed":"Habituation of Response Speed",
        "init_dura": "Initial Response Duration",
        "init_prob": "Initial Response Probability",
        "init_speed": "Initial Response Speed",
        "final_dura": "Final Response Duration",
        "final_prob": "Final Response Probability",
        "final_speed": "Final Response Speed",
        "recovery_dura": "Spontaneous Recovery of Response Duration",
        "recovery_prob": "Spontaneous Recovery of Response Probability",
        "recovery_speed": "Spontaneous Recovery of Response Speed",
        "memory_retention_dura": "Memory Retention of Response Duration",
        "memory_retention_prob": "Memory Retention of Response Probability",
        "memory_retention_speed": "Memory Retention of Response Speed",
    }

    # Same pipeline for each of the three inputs: split -> summarise -> join
    assay_tables = []
    for plates in (baseline_byplate, tap_byplate, psa_byplate):
        per_phenotype = calculate_MSD(extract_phenotypes(plates), by=by)
        assay_tables.append(get_MSD(per_phenotype))
    baseline_MSD, tap_MSD, psa_MSD = assay_tables

    baseline_and_tap = pd.merge(baseline_MSD, tap_MSD, on=by, how='outer')
    merged = pd.merge(baseline_and_tap, psa_MSD, on=by, how='outer')

    flat = merged.rename(columns=tap_display_names).reset_index()
    # Collapse (phenotype, statistic) into "phenotype-statistic"; the former
    # index column comes out as "<by>-" and is renamed back to `by`.
    flat.columns = flat.columns.to_flat_index().str.join('-')
    flat = flat.rename(columns={by+"-": by})
    flat['Screen'] = Screen

    return flat

### 3.2.1 Gene-level MSD

In [34]:
# Gene-level table: the by-plate data are grouped on the 'Gene' column.
combined_MSD = get_combined_MSD(
    baseline_output_byplate,
    tap_data_byplate,
    psa_data_byplate,
    by='Gene',
)

combined_MSD.head()

Unnamed: 0,Gene,Instantaneous Speed-mean,Instantaneous Speed-count,Instantaneous Speed-sem,Instantaneous Speed-ci95_hi,Instantaneous Speed-ci95_lo,Interval Speed-mean,Interval Speed-count,Interval Speed-sem,Interval Speed-ci95_hi,...,PSA Peak_to_recovery Crab-count,PSA Peak_to_recovery Crab-sem,PSA Peak_to_recovery Crab-ci95_hi,PSA Peak_to_recovery Crab-ci95_lo,PSA Average Crab-mean,PSA Average Crab-count,PSA Average Crab-sem,PSA Average Crab-ci95_hi,PSA Average Crab-ci95_lo,Screen
0,AMshABLATE,-0.043684,3,0.002396,-0.033375,-0.053992,-0.047518,3,0.001217,-0.042283,...,3,0.000446,4e-06,-0.003837,-0.001832,3,0.000472,0.0002,-0.003865,Glia_Genes_Screen
1,N2,0.0,40,0.002712,0.005486,-0.005486,0.0,40,0.002266,0.004584,...,40,0.000306,0.000618,-0.000618,0.0,40,0.000313,0.000632,-0.000632,Glia_Genes_Screen
2,ced-10,-0.033728,6,0.003237,-0.025407,-0.04205,-0.041124,6,0.003215,-0.032859,...,6,0.003309,0.009962,-0.00705,-0.001232,6,0.000903,0.00109,-0.003553,Glia_Genes_Screen
3,ced-5,-0.031062,5,0.001218,-0.02768,-0.034443,-0.042521,5,0.001681,-0.037853,...,5,0.000606,-0.004665,-0.008028,-0.006759,5,0.000312,-0.005892,-0.007625,Glia_Genes_Screen
4,delm-1,0.002854,6,0.010184,0.029033,-0.023325,-0.00215,6,0.017955,0.044006,...,6,0.001534,0.006306,-0.001581,0.001049,6,0.000746,0.002968,-0.00087,Glia_Genes_Screen


### 3.2.2 Allele-level MSD

In [35]:
# Allele-level table: the allele by-plate data are grouped on the 'dataset' column.
allele_combined_MSD = get_combined_MSD(
    baseline_output_allele_byplate,
    tap_data_allele_byplate,
    psa_data_allele_byplate,
    by='dataset',
)

allele_combined_MSD.head()

Unnamed: 0,dataset,Instantaneous Speed-mean,Instantaneous Speed-count,Instantaneous Speed-sem,Instantaneous Speed-ci95_hi,Instantaneous Speed-ci95_lo,Interval Speed-mean,Interval Speed-count,Interval Speed-sem,Interval Speed-ci95_hi,...,PSA Peak_to_recovery Crab-count,PSA Peak_to_recovery Crab-sem,PSA Peak_to_recovery Crab-ci95_hi,PSA Peak_to_recovery Crab-ci95_lo,PSA Average Crab-mean,PSA Average Crab-count,PSA Average Crab-sem,PSA Average Crab-ci95_hi,PSA Average Crab-ci95_lo,Screen
0,AMshABLATE_nsIs109,-0.043684,3,0.002396,-0.033375,-0.053992,-0.047518,3,0.001217,-0.042283,...,3,0.000446,4e-06,-0.003837,-0.001832,3,0.000472,0.0002,-0.003865,Glia_Genes_Screen
1,N2,0.0,40,0.002712,0.005486,-0.005486,0.0,40,0.002266,0.004584,...,40,0.000306,0.000618,-0.000618,0.0,40,0.000313,0.000632,-0.000632,Glia_Genes_Screen
2,ced-10_n3246,-0.033728,6,0.003237,-0.025407,-0.04205,-0.041124,6,0.003215,-0.032859,...,6,0.003309,0.009962,-0.00705,-0.001232,6,0.000903,0.00109,-0.003553,Glia_Genes_Screen
3,ced-5_n2002,-0.031062,5,0.001218,-0.02768,-0.034443,-0.042521,5,0.001681,-0.037853,...,5,0.000606,-0.004665,-0.008028,-0.006759,5,0.000312,-0.005892,-0.007625,Glia_Genes_Screen
4,delm-1_ok1226,0.002854,6,0.010184,0.029033,-0.023325,-0.00215,6,0.017955,0.044006,...,6,0.001534,0.006306,-0.001581,0.001049,6,0.000746,0.002968,-0.00087,Glia_Genes_Screen


## 3.3 T-Stat analysis

In [36]:
def baseline_metrics(by=["Gene","dataset"]):
    """
    Prepare the empty result tables and the metric names for the baseline t-tests.

    input:
        by: name of the grouping column, "Gene" or "dataset"
            (callers in this notebook pass a single string)

    returns:
        list_baseline_Tstats: one empty two-column dataframe [by, metric] per
            metric, to be filled with t-statistics
        list_baseline_metrics: the metric names, in the same order
    """
    list_baseline_metrics = [
        "Instantaneous Speed",
        "Interval Speed",
        "Bias",
        "Morphwidth",
        "Midline",
        "Area",
        "Angular Speed",
        "Aspect Ratio",
        "Kink",
        "Curve",
        "Crab",
        "Pathlength",
    ]

    # A separate frame per metric: each one is filled independently later on
    list_baseline_Tstats = [
        pd.DataFrame(columns=[by, metric]) for metric in list_baseline_metrics
    ]

    return list_baseline_Tstats, list_baseline_metrics

In [37]:
def tap_metrics(by=["Gene","dataset"]):
    """
    Prepare the empty result tables and the metric names for the tap-response t-tests.

    input:
        by: name of the grouping column, "Gene" or "dataset"
            (callers in this notebook pass a single string)

    returns:
        list_tap_Tstats: one empty two-column dataframe [by, label] per metric,
            to be filled with t-statistics
        list_tap_metrics: the short metric names, in the same order
    """
    # (short metric name, column label of its result table)
    metric_labels = [
        ("recovery_dura", "Recovery Duration"),
        ("recovery_prob", "Recovery Probability"),
        ("recovery_speed", "Recovery Speed"),
        ("memory_retention_dura", "Memory Retention Duration"),
        ("memory_retention_prob", "Memory Retention Probability"),
        ("memory_retention_speed", "Memory Retention Speed"),
        ("init_dura", "Initial Duration"),
        ("init_prob", "Initial Probability"),
        ("init_speed", "Initial Speed"),
        ("final_dura", "Final Duration"),
        ("final_prob", "Final Probability"),
        ("final_speed", "Final Speed"),
        ("habit_dura", "Habituation of Duration"),
        ("habit_prob", "Habituation of Probability"),
        ("habit_speed", "Habituation of Speed"),
    ]

    list_tap_Tstats = [pd.DataFrame(columns=[by, label]) for _, label in metric_labels]
    list_tap_metrics = [metric for metric, _ in metric_labels]

    return list_tap_Tstats, list_tap_metrics

In [38]:
def psa_metrics(by=["Gene", "dataset"]):
    """
    Prepare the empty result tables and the metric names for the PSA t-tests.

    input:
        by: name of the grouping column, "Gene" or "dataset"
            (callers in this notebook pass a single string)

    returns:
        list_psa_Tstats: one empty two-column dataframe [by, metric] per metric,
            to be filled with t-statistics
        list_psa_metrics: the 42 metric names "PSA <phase> <measure>", in the same order
    """
    measures = [
        "Instantaneous Speed",
        "Bias",
        "Angular Speed",
        "Aspect Ratio",
        "Kink",
        "Curve",
        "Crab",
    ]
    phases = [
        "Initial",
        "Recovery",
        "Peak",
        "Initial_to_peak",
        "Peak_to_recovery",
        "Average",
    ]

    # Measure-major order: all six phases of one measure, then the next measure
    list_psa_metrics = [
        f"PSA {phase} {measure}" for measure in measures for phase in phases
    ]
    list_psa_Tstats = [
        pd.DataFrame(columns=[by, metric]) for metric in list_psa_metrics
    ]

    return list_psa_Tstats, list_psa_metrics


In [39]:
def TTest(Type, DF_ref, output, by=["Gene", "dataset"]):
    """
    Run a two-sample t-test (unequal variances) for every unique value of
    DF_ref[by] against the control rows, appending the results to `output`.

    input:
        - Type: name of the column holding the values to compare
        - DF_ref: reference dataframe containing control and non-control rows
        - output: two-column dataframe [group label, t-statistic]; one row is
          appended in place per unique value of DF_ref[by]
        - by: what to group by, "Gene" or "dataset"

    returns: None (`output` is modified in place)

    Which rows are compared:
        - by == "Gene": rows with Gene == label vs. rows with Gene == "N2"
        - otherwise: rows with dataset == label vs. rows whose Allele is
          "XJ1" or "N2"

    Only the comparison matching `by` is computed, so gene-level calls no
    longer need the `dataset`/`Allele` columns and no t-test is run on an
    empty sample as a by-product.
    """
    labels = DF_ref[by].unique()
    if len(labels) == 0:
        return

    # The control sample does not depend on the label, so build it once.
    if by == "Gene":
        group_col = "Gene"
        control_values = DF_ref[DF_ref.Gene == "N2"][Type]
    else:
        group_col = "dataset"
        control_values = DF_ref[DF_ref.Allele.isin(["XJ1", "N2"])][Type]

    for a in labels:
        group_values = DF_ref[DF_ref[group_col] == a][Type]
        Tstat = ttest_ind(group_values, control_values, equal_var=False)[0]
        output.loc[len(output)] = [a, Tstat]

def do_TTest(by=["Gene", "dataset"], baseline=["true", "false", "psa"]):
    """
    Run TTest for every non-control group and average the t-statistics per group.

    input:
        - by: what to group by, "Gene" or "dataset"
        - baseline: "true" uses baseline_output, "false" uses tap_data,
          any other value uses psa_data

    returns: dataframe indexed by `by` with one column per metric, holding the
        mean of the t-statistics collected for each group

    Reads the notebook-level variables Screen, baseline_output, tap_data and psa_data.
    """
    if baseline == "true":
        list_Tstats, list_metrics = baseline_metrics(by)
        data = baseline_output
    elif baseline == "false":
        list_Tstats, list_metrics = tap_metrics(by)
        data = tap_data
    else:
        list_Tstats, list_metrics = psa_metrics(by)
        data = psa_data

    # Labels that identify control rows for this screen / grouping
    if Screen == "Neuron_Genes_Screen" and by != "Gene":
        control_labels = ["N2_XJ1", "N2_N2"]
    else:
        control_labels = ["N2"]

    for x in data[by].unique():
        if x in control_labels:
            continue

        # Restrict to the dates on which x appears, then to x plus the controls
        dates_with_x = data[data[by] == x]['Date'].unique()
        same_date_rows = data[data['Date'].isin(dates_with_x)]
        comparison_rows = same_date_rows[same_date_rows[by].isin(control_labels + [x])]

        for metric, tstat_frame in zip(list_metrics, list_Tstats):
            TTest(metric, comparison_rows, tstat_frame, by)  # appends rows to tstat_frame

    # Average per group; the first metric's table seeds the result and every
    # other metric contributes its value column via a positional-index join.
    PD_Tstats = pd.DataFrame()
    for tstat_frame in list_Tstats:
        averaged = tstat_frame.groupby([by], as_index=False).mean()
        is_first_metric = averaged.columns.values[1] == list_Tstats[0].columns.values[1]
        if is_first_metric:
            PD_Tstats = averaged
        else:
            PD_Tstats = PD_Tstats.join(averaged.iloc[:, 1])

    return PD_Tstats.set_index(by)
            

### T-stat on Baseline data:

### 3.3.1 Allele-level T-stat analysis of baseline data

In [40]:
warnings.filterwarnings('ignore')

# Allele-level t-statistics for the baseline metrics, indexed by 'dataset'
PD_baseline_Tstats_allele = do_TTest(by="dataset", baseline="true")

PD_baseline_Tstats_allele.head()

Unnamed: 0_level_0,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AMshABLATE_nsIs109,-170.402072,-185.50277,-129.051227,8.436649,-72.062483,-19.016885,-72.427009,67.186598,60.225199,-59.206564,-76.040905,-174.635472
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ced-10_n3246,-178.516631,-121.15098,-101.994376,-96.420955,-590.22632,-328.825337,86.467808,324.132657,276.5116,81.799335,7.8202,34.234078
ced-5_n2002,-124.360515,-132.839687,-87.457709,-162.535954,-311.278255,-347.893201,6.824703,-59.527752,-97.326696,-156.138037,-58.958304,-33.193366
delm-1_ok1226,-3.381671,-7.564952,-7.161008,-335.410406,-670.970972,-489.588565,92.161494,58.931913,9.494694,-2.329122,56.520463,106.468057


### 3.3.2 Gene-level T-stat analysis of baseline data

In [41]:
warnings.filterwarnings('ignore')

# Gene-level t-statistics for the baseline metrics, indexed by 'Gene'
PD_baseline_Tstats = do_TTest(by="Gene", baseline="true")

PD_baseline_Tstats.head()

Unnamed: 0_level_0,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AMshABLATE,-170.402072,-185.50277,-129.051227,8.436649,-72.062483,-19.016885,-72.427009,67.186598,60.225199,-59.206564,-76.040905,-174.635472
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ced-10,-178.516631,-121.15098,-101.994376,-96.420955,-590.22632,-328.825337,86.467808,324.132657,276.5116,81.799335,7.8202,34.234078
ced-5,-124.360515,-132.839687,-87.457709,-162.535954,-311.278255,-347.893201,6.824703,-59.527752,-97.326696,-156.138037,-58.958304,-33.193366
delm-1,-3.381671,-7.564952,-7.161008,-335.410406,-670.970972,-489.588565,92.161494,58.931913,9.494694,-2.329122,56.520463,106.468057


### T-stat analysis for tap-response data:

### 3.3.3 Allele-level T-stat analysis of tap response data

In [42]:
warnings.filterwarnings('ignore')

# Allele-level t-statistics for the tap-response metrics, indexed by 'dataset'
PD_habituation_Tstats_allele = do_TTest(by="dataset", baseline="false")

PD_habituation_Tstats_allele.head()

Unnamed: 0_level_0,Recovery Duration,Recovery Probability,Recovery Speed,Memory Retention Duration,Memory Retention Probability,Memory Retention Speed,Initial Duration,Initial Probability,Initial Speed,Final Duration,Final Probability,Final Speed,Habituation of Duration,Habituation of Probability,Habituation of Speed
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AMshABLATE_nsIs109,-1.961762,1.918844,-5.224931,-5.484683,-3.125683,-3.300698,-1.895033,-0.967447,-2.824795,1.802522,5.264003,-4.91969,-3.987772,-4.36802,-0.235936
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ced-10_n3246,-0.010591,1.286679,-0.208076,-1.905773,-3.286405,-3.637684,-1.262823,-3.630366,-7.253569,-5.115169,1.645364,-6.711686,0.029125,-4.365596,-2.704125
ced-5_n2002,0.908764,-2.60661,-1.672565,-1.643956,-11.953176,-1.227305,-2.330504,-2.829142,-8.294158,0.691405,0.44202,-13.289988,-2.931369,-1.497765,-0.3966
delm-1_ok1226,1.479204,0.679554,-0.081651,0.352711,0.613993,-0.746649,-0.896926,-1.425577,-2.631308,-0.986245,-2.943531,-1.50898,-0.414799,0.264624,-0.414722


### 3.3.4 Gene-level T-stat analysis of Tap response data

In [43]:
warnings.filterwarnings('ignore')

# Gene-level t-statistics for the tap-response metrics, indexed by 'Gene'
PD_habituation_Tstats = do_TTest(by="Gene", baseline="false")

# Copy of the table ordered by its index
PD_habituation_Tstats_sorted = PD_habituation_Tstats.sort_index()

PD_habituation_Tstats.head()

Unnamed: 0_level_0,Recovery Duration,Recovery Probability,Recovery Speed,Memory Retention Duration,Memory Retention Probability,Memory Retention Speed,Initial Duration,Initial Probability,Initial Speed,Final Duration,Final Probability,Final Speed,Habituation of Duration,Habituation of Probability,Habituation of Speed
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AMshABLATE,-1.961762,1.918844,-5.224931,-5.484683,-3.125683,-3.300698,-1.895033,-0.967447,-2.824795,1.802522,5.264003,-4.91969,-3.987772,-4.36802,-0.235936
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ced-10,-0.010591,1.286679,-0.208076,-1.905773,-3.286405,-3.637684,-1.262823,-3.630366,-7.253569,-5.115169,1.645364,-6.711686,0.029125,-4.365596,-2.704125
ced-5,0.908764,-2.60661,-1.672565,-1.643956,-11.953176,-1.227305,-2.330504,-2.829142,-8.294158,0.691405,0.44202,-13.289988,-2.931369,-1.497765,-0.3966
delm-1,1.479204,0.679554,-0.081651,0.352711,0.613993,-0.746649,-0.896926,-1.425577,-2.631308,-0.986245,-2.943531,-1.50898,-0.414799,0.264624,-0.414722


### T-stat analysis for PSA data:

### 3.3.5 Allele-level T-stat analysis of PSA data

In [44]:
warnings.filterwarnings('ignore')

# Allele-level t-statistics for the PSA metrics, indexed by 'dataset'
psa_tstats_allele = do_TTest(by="dataset", baseline="psa")

psa_tstats_allele.head()

Unnamed: 0_level_0,PSA Initial Instantaneous Speed,PSA Recovery Instantaneous Speed,PSA Peak Instantaneous Speed,PSA Initial_to_peak Instantaneous Speed,PSA Peak_to_recovery Instantaneous Speed,PSA Average Instantaneous Speed,PSA Initial Bias,PSA Recovery Bias,PSA Peak Bias,PSA Initial_to_peak Bias,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AMshABLATE_nsIs109,-17.660847,-41.933829,-14.101386,-20.841655,-46.828136,-48.772164,-22.20351,-34.683665,-20.059075,-28.180968,...,0.877889,5.280821,-0.945358,19.311291,-10.818214,-30.917813,14.061402,1.773753,-13.582627,-10.644029
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ced-10_n3246,-48.519881,-4.130517,-25.071121,-90.265434,-15.856838,-89.570486,-29.995072,-23.574878,-29.999482,-52.089736,...,7.733273,-20.98229,61.679598,56.974491,-34.116441,2.205571,-5.471755,-30.231187,4.401344,-3.54012
ced-5_n2002,-49.955543,-45.376395,-25.441985,-91.261218,-83.746668,-103.132291,-57.751024,-28.623309,-31.051577,-51.238078,...,-18.351049,-17.330642,-28.759385,-20.13346,-32.222774,-27.17547,-16.797011,-41.02766,-40.017833,-39.638776
delm-1_ok1226,-11.267871,-12.710428,-55.187683,-62.001504,-51.611106,-60.267509,-21.633538,-8.505825,-6.958793,-10.848089,...,0.364854,-11.808009,49.94368,49.026101,3.453865,5.848819,-1.765663,-13.195271,13.604711,12.329933


### 3.3.6 Gene-level T-stat analysis of PSA data

In [45]:
warnings.filterwarnings('ignore')

# Gene-level t-statistics for the PSA metrics, indexed by 'Gene'
psa_tstats = do_TTest(by="Gene", baseline="psa")

psa_tstats.head()

Unnamed: 0_level_0,PSA Initial Instantaneous Speed,PSA Recovery Instantaneous Speed,PSA Peak Instantaneous Speed,PSA Initial_to_peak Instantaneous Speed,PSA Peak_to_recovery Instantaneous Speed,PSA Average Instantaneous Speed,PSA Initial Bias,PSA Recovery Bias,PSA Peak Bias,PSA Initial_to_peak Bias,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AMshABLATE,-17.660847,-41.933829,-14.101386,-20.841655,-46.828136,-48.772164,-22.20351,-34.683665,-20.059075,-28.180968,...,0.877889,5.280821,-0.945358,19.311291,-10.818214,-30.917813,14.061402,1.773753,-13.582627,-10.644029
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ced-10,-48.519881,-4.130517,-25.071121,-90.265434,-15.856838,-89.570486,-29.995072,-23.574878,-29.999482,-52.089736,...,7.733273,-20.98229,61.679598,56.974491,-34.116441,2.205571,-5.471755,-30.231187,4.401344,-3.54012
ced-5,-49.955543,-45.376395,-25.441985,-91.261218,-83.746668,-103.132291,-57.751024,-28.623309,-31.051577,-51.238078,...,-18.351049,-17.330642,-28.759385,-20.13346,-32.222774,-27.17547,-16.797011,-41.02766,-40.017833,-39.638776
delm-1,-11.267871,-12.710428,-55.187683,-62.001504,-51.611106,-60.267509,-21.633538,-8.505825,-6.958793,-10.848089,...,0.364854,-11.808009,49.94368,49.026101,3.453865,5.848819,-1.765663,-13.195271,13.604711,12.329933


# 4. Merging t-stat data into one dataset

In [46]:
def pop_cols(combined):
    """
    Move the morphology columns to the front of `combined` and park
    "Angular Speed" at position 5. The frame is modified in place.

    input:
        combined: dataframe containing the columns "Area", "Midline",
                  "Morphwidth" and "Angular Speed"

    returns:
        None (the caller's dataframe is reordered directly)
    """
    # Applied strictly in this order: each insert at 0 pushes the previous one
    # to the right, so the frame ends up starting with
    # Morphwidth, Midline, Area.
    placements = (
        ("Area", 0),
        ("Midline", 0),
        ("Morphwidth", 0),
        ("Angular Speed", 5),
    )
    for column_name, target_position in placements:
        # pop() runs before insert(), so the target position refers to the
        # frame with the column already removed.
        combined.insert(target_position, column_name, combined.pop(column_name))

def pop_last(combined, position=26):
    """
    Regroup the six recovery / memory-retention columns of `combined` so they
    sit together starting at `position`. The frame is modified in place.

    Each column is popped and re-inserted at `position`, in the order listed
    below, so later columns end up in front of earlier ones.

    input:
        combined: dataframe containing the three
                  "Spontaneous Recovery of Response ..." and the three
                  "Memory Retention of Response ..." columns
        position: column slot the columns are inserted at (default 26)

    returns:
        None (the caller's dataframe is reordered directly)

    raises:
        KeyError:   a required column is missing
        IndexError: `position` does not fit the frame
        Both are raised before the frame is touched, so a failed call never
        leaves `combined` half reordered or with a column removed.
    """
    trailing_columns = (
        "Spontaneous Recovery of Response Duration",
        "Spontaneous Recovery of Response Probability",
        "Spontaneous Recovery of Response Speed",
        "Memory Retention of Response Duration",
        "Memory Retention of Response Probability",
        "Memory Retention of Response Speed",
    )

    missing = [name for name in trailing_columns if name not in combined.columns]
    if missing:
        raise KeyError(f"pop_last: missing columns {missing}")

    # After a pop the frame has one column fewer, so that is the largest slot
    # insert() will accept. Checking first avoids popping a column that can
    # then not be put back.
    largest_slot = len(combined.columns) - 1
    if not 0 <= position <= largest_slot:
        raise IndexError(
            f"pop_last: position {position} is out of bounds for a frame "
            f"with {len(combined.columns)} columns"
        )

    for column_name in trailing_columns:
        combined.insert(position, column_name, combined.pop(column_name))

def rename_columns(df):
    '''
    Give the tap-response columns their long, descriptive labels.

    input:
        df: dataframe whose columns should be relabelled; columns that are not
            listed below keep their name
    returns:
        a renamed copy of the dataframe (the input is not modified)
    '''
    # (short label, descriptive label)
    # NOTE(review): "Respones" in the second pair looks like a typo, but it is
    # the label that ends up in the output table, so it is left untouched here.
    label_pairs = [
        ("Habituation of Duration", "Habituation of Response Duration"),
        ("Habituation of Probability", "Habituation of Respones Probability"),
        ("Habituation of Speed", "Habituation of Response Speed"),
        ("Initial Duration", "Initial Response Duration"),
        ("Initial Probability", "Initial Response Probability"),
        ("Initial Speed", "Initial Response Speed"),
        ("Final Duration", "Final Response Duration"),
        ("Final Probability", "Final Response Probability"),
        ("Final Speed", "Final Response Speed"),
        ("Recovery Duration", "Spontaneous Recovery of Response Duration"),
        ("Recovery Probability", "Spontaneous Recovery of Response Probability"),
        ("Recovery Speed", "Spontaneous Recovery of Response Speed"),
        ("Memory Retention Duration", "Memory Retention of Response Duration"),
        ("Memory Retention Probability", "Memory Retention of Response Probability"),
        ("Memory Retention Speed", "Memory Retention of Response Speed"),
    ]
    descriptive_labels = dict(label_pairs)
    return df.rename(descriptive_labels, axis="columns")

def merge_Tstats(baseline, habituation, by=["Gene", "dataset"], Screen=Screen, psa=False):
    """
    Left-merge a baseline T-statistics frame with a second T-statistics frame
    (tap habituation or PSA), tidy the column order and tag the result with
    its screen.

    input:
        - baseline: baseline dataframe to merge (left side; all of its rows are kept)
        - habituation: dataframe to merge onto the baseline (tap or PSA T-stats)
        - by: key to merge on, "Gene" or "dataset". Pass a single name: the
              default list would merge on both keys at once.
        - Screen: value written to the "Screen" column. The default is the
              global `Screen` as it was when this cell was executed, so re-run
              this cell after changing the screen selection or pass it explicitly.
        - psa: True when `habituation` holds PSA metrics; the tap-specific
              renaming and trailing-column reordering are then skipped.

    returns:
        the merged dataframe, sorted by index, with a "Screen" column added

    Normalisation (z-scoring, or subtracting the "N2" control row / "N2_XJ1"
    for dataset-level Neuron_Genes_Screen) and melting to a long
    Metric / T_score table are intentionally not done here; those steps are
    slated to move to the dashboard.
    """
    # merge baseline and habituation data, then sort by index
    combined_Tstats = pd.merge(baseline, habituation, on=by, how='left')
    combined_Tstats = combined_Tstats.sort_index()

    # Tap columns get their descriptive names; PSA columns are left as they are.
    if not psa:
        combined_Tstats = rename_columns(combined_Tstats)

    # One reorder is enough: pop_cols() yields the same layout when repeated,
    # and the rename above does not touch the columns it moves.
    pop_cols(combined_Tstats)
    if not psa:
        pop_last(combined_Tstats)

    # add Screen column to df
    combined_Tstats['Screen'] = Screen

    return combined_Tstats



## 4.1 Gene-level

- Pass Tap and baseline through merge_Tstats() as df1
- Pass PSA and baseline through merge_Tstats() as df2
- pd.merge df1 and df2 using all columns of baseline

In [47]:
# Baseline + Tap (gene level)
combined_Tstats = merge_Tstats(
    baseline=PD_baseline_Tstats,
    habituation=PD_habituation_Tstats,
    by="Gene",
)

In [48]:
# Baseline + PSA (gene level); psa=True skips the tap-only renaming/reordering
combined_Tstats_psa = merge_Tstats(
    baseline=PD_baseline_Tstats,
    habituation=psa_tstats,
    by="Gene",
    psa=True,
)

In [49]:
# Baseline + Tap + PSA
# Join on the identifier, every baseline metric and Screen - the columns that both
# frames carry. The key list is built by concatenation: list.append() returns None,
# and on=None makes pd.merge silently join on whatever columns the frames share.
# NOTE(review): this gene-level merge uses how='inner' while the allele-level one
# uses how='outer' - confirm which is intended.
gene_merge_keys = ["Gene", *PD_baseline_Tstats.columns.to_list(), "Screen"]
final_tstat = pd.merge(
    combined_Tstats.reset_index(),
    combined_Tstats_psa.reset_index(),
    on=gene_merge_keys,
    how='inner',
)

final_tstat.head()

Unnamed: 0,Gene,Morphwidth,Midline,Area,Instantaneous Speed,Interval Speed,Angular Speed,Bias,Aspect Ratio,Kink,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
0,AMshABLATE,8.436649,-72.062483,-19.016885,-170.402072,-185.50277,-72.427009,-129.051227,67.186598,60.225199,...,0.877889,5.280821,-0.945358,19.311291,-10.818214,-30.917813,14.061402,1.773753,-13.582627,-10.644029
1,N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ced-10,-96.420955,-590.22632,-328.825337,-178.516631,-121.15098,86.467808,-101.994376,324.132657,276.5116,...,7.733273,-20.98229,61.679598,56.974491,-34.116441,2.205571,-5.471755,-30.231187,4.401344,-3.54012
3,ced-5,-162.535954,-311.278255,-347.893201,-124.360515,-132.839687,6.824703,-87.457709,-59.527752,-97.326696,...,-18.351049,-17.330642,-28.759385,-20.13346,-32.222774,-27.17547,-16.797011,-41.02766,-40.017833,-39.638776
4,delm-1,-335.410406,-670.970972,-489.588565,-3.381671,-7.564952,92.161494,-7.161008,58.931913,9.494694,...,0.364854,-11.808009,49.94368,49.026101,3.453865,5.848819,-1.765663,-13.195271,13.604711,12.329933


In [50]:
# # Baseline + Tap + PSA melted
# final_tstat_melted = pd.concat([combined_Tstats_normalized_melted, combined_Tstats_psa_melted]).drop_duplicates()

# final_tstat_melted.head()

## 4.2 Allele level 


- Pass Tap and baseline through merge_Tstats() as df3
- Pass PSA and baseline through merge_Tstats() as df4
- pd.merge df3 and df4 using all columns of baseline

In [51]:
# Baseline + Tap (allele level)
combined_Tstats_allele = merge_Tstats(
    baseline=PD_baseline_Tstats_allele,
    habituation=PD_habituation_Tstats_allele,
    by="dataset",
)

In [52]:
# Baseline + PSA (allele level); psa=True skips the tap-only renaming/reordering
combined_Tstats_psa_allele = merge_Tstats(
    baseline=PD_baseline_Tstats_allele,
    habituation=psa_tstats_allele,
    by="dataset",
    psa=True,
)

In [53]:
# Baseline + Tap + PSA
# Join on the identifier, every baseline metric and Screen - the columns that both
# frames carry. The key list is built by concatenation: list.append() returns None,
# and on=None makes pd.merge silently join on whatever columns the frames share.
# NOTE(review): this allele-level merge uses how='outer' while the gene-level one
# uses how='inner' - confirm which is intended.
allele_merge_keys = ["dataset", *PD_baseline_Tstats_allele.columns.to_list(), "Screen"]
final_tstat_allele = pd.merge(
    combined_Tstats_allele.reset_index(),
    combined_Tstats_psa_allele.reset_index(),
    on=allele_merge_keys,
    how='outer',
)

final_tstat_allele.head()

Unnamed: 0,dataset,Morphwidth,Midline,Area,Instantaneous Speed,Interval Speed,Angular Speed,Bias,Aspect Ratio,Kink,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
0,AMshABLATE_nsIs109,8.436649,-72.062483,-19.016885,-170.402072,-185.50277,-72.427009,-129.051227,67.186598,60.225199,...,0.877889,5.280821,-0.945358,19.311291,-10.818214,-30.917813,14.061402,1.773753,-13.582627,-10.644029
1,N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ced-10_n3246,-96.420955,-590.22632,-328.825337,-178.516631,-121.15098,86.467808,-101.994376,324.132657,276.5116,...,7.733273,-20.98229,61.679598,56.974491,-34.116441,2.205571,-5.471755,-30.231187,4.401344,-3.54012
3,ced-5_n2002,-162.535954,-311.278255,-347.893201,-124.360515,-132.839687,6.824703,-87.457709,-59.527752,-97.326696,...,-18.351049,-17.330642,-28.759385,-20.13346,-32.222774,-27.17547,-16.797011,-41.02766,-40.017833,-39.638776
4,delm-1_ok1226,-335.410406,-670.970972,-489.588565,-3.381671,-7.564952,92.161494,-7.161008,58.931913,9.494694,...,0.364854,-11.808009,49.94368,49.026101,3.453865,5.848819,-1.765663,-13.195271,13.604711,12.329933


In [54]:
# # Baseline + Tap + PSA melted
# final_tstat_melted_allele = pd.concat([combined_Tstats_normalized_melted_allele, combined_Tstats_psa_melted_allele]).drop_duplicates()

# final_tstat_melted_allele.head()

# 5. Save data to database (PostgreSQL; a legacy sqlite3 cell is kept at the end)

#### A janky way to add data and update the sql 

1. Read table to pd.DataFrame
2. Add new data to pd.DataFrame
3. Replace old table with newly updated pd.DataFrame

# Primary Keys For Each SQL Table:

####  -- Gene_Allele_WormBaseID:
WBGene, WBAllele
#### -- allele_MSD:
dataset, Screen
#### -- gene_MSD:
Gene, Screen
#### -- allele_profile_data:
dataset, Metric, Screen
#### -- gene_profile_data:
Gene, Metric, Screen
#### -- tap_baseline_data:
Time, Plate_id, Date, Screen, dataset
#### -- tap_response_data:
plate, Date, Plate_id, Screen, taps, dataset, Gene, Allele
#### -- tstat_allele_data:
dataset, Screen
#### -- tstat_gene_data:
Gene, Screen
#### -- psa_summarised_data:
Plate_id, Date, Screen, dataset, Gene, Allele

In [55]:
# Preview both frames before the selected screen is stamped onto them
display(tap_output.head(5), baseline_output.head(5))

# Item assignment always writes the "Screen" column. Attribute assignment
# (df.Screen = ...) only does so when the column already exists; otherwise pandas
# sets a plain Python attribute and no column is created.
tap_output["Screen"] = Screen
baseline_output["Screen"] = Screen

# Same preview after stamping, to confirm the Screen value
display(tap_output.head(5), baseline_output.head(5))

      time  dura   dist      prob     speed  plate      Date Plate_id  \
0  599.983  2.22  0.562  1.000000  0.253153      1  20240724    A0724   
1  609.979  1.45  0.371  0.545455  0.255862      1  20240724    A0724   
2  619.996  2.11  0.669  0.520000  0.317062      1  20240724    A0724   
3  629.971  1.59  0.422  0.809524  0.265409      1  20240724    A0724   
4  639.968  1.50  0.378  0.895833  0.252000      1  20240724    A0724   

              Screen  taps             dataset        Gene   Allele  
0  Glia_Genes_Screen   1.0  AMshABLATE_nsIs109  AMshABLATE  nsIs109  
1  Glia_Genes_Screen   2.0  AMshABLATE_nsIs109  AMshABLATE  nsIs109  
2  Glia_Genes_Screen   3.0  AMshABLATE_nsIs109  AMshABLATE  nsIs109  
3  Glia_Genes_Screen   4.0  AMshABLATE_nsIs109  AMshABLATE  nsIs109  
4  Glia_Genes_Screen   5.0  AMshABLATE_nsIs109  AMshABLATE  nsIs109  
         Time   n  Number  Instantaneous Speed  Interval Speed   Bias  \
2241  490.041  14      12               0.0270          0.0186  0.08

In [56]:

### This code will connect to PostgreSQL database and write non-duplicate data into the database tables.

# Loads database config values from database.ini file and validates that user and password are set.
config = load_config()
# .get() covers a missing key as well as a blank value, so both cases produce the
# same clear message instead of a bare KeyError.
if not config.get('user') or not config.get('password'):
    # Raising gives a normal traceback in the notebook instead of a SystemExit warning.
    raise ValueError("Please set your user and password in the database.ini file.")

# Creates a connection pool to PostgreSQL database using SQLAlchemy.
# NOTE(review): the credentials are interpolated into the URL verbatim, so a password
# containing URL-reserved characters ('@', ':', '/') would need percent-encoding.
engine = create_engine(f"postgresql+psycopg://{config['user']}:{config['password']}@{config['host']}:{config['port']}/{config['database']}")

# Function to insert data into PostgreSQL table, skipping duplicates based on primary keys.
def postgres_skip_on_duplicate(pd_table, conn, keys, data_iter):
    """
    Custom insertion method for DataFrame.to_sql(method=...).

    Builds one record per row and issues a PostgreSQL
    INSERT ... ON CONFLICT DO NOTHING, so rows that collide with an existing
    unique / primary-key entry are skipped rather than raising an error.

    pd_table:  pandas SQL table wrapper; its `.table` attribute is the insert target
    conn:      connection the statement is executed on
    keys:      column names, in the same order as the values of each row
    data_iter: iterable yielding one sequence of values per row
    """
    records = []
    for row in data_iter:
        records.append(dict(zip(keys, row)))
    # No conflict target is given, so any constraint conflict skips the row.
    statement = insert(pd_table.table).on_conflict_do_nothing()
    conn.execute(statement, records)

# --------- Write the dataframes to PostgreSQL tables -----------

# One entry per table: (progress message, destination table, frame to append).
# Every frame is resolved here, before the first write, so a missing variable stops
# the cell before anything has been sent to the database.
# NOTE(review): final_tstat / final_tstat_allele come out of pd.merge, so reset_index()
# appears to add an "index" column of row numbers to what is written - confirm the
# tstat tables expect that column.
table_writes = [
    # Complete tap response data
    ("working on tap_output:", 'tap_response_data', tap_output),
    # Complete baseline data
    ("working on tap_baseline_data:", 'tap_baseline_data', baseline_output),
    # Baseline + Tap + PSA combined tstat data by Gene
    ("working on tstat_gene_data", 'tstat_gene_data', final_tstat.reset_index()),
    # Baseline + Tap + PSA combined tstat data by Allele
    ("working on tstat_allele_data", 'tstat_allele_data', final_tstat_allele.reset_index()),
    # MSD Baseline + Tap + PSA by Gene
    ("working on gene_MSD", 'gene_MSD', combined_MSD),
    # MSD Baseline + Tap + PSA by Allele
    ("working on allele_MSD", 'allele_MSD', allele_combined_MSD),
    # Summarised PSA data (speed, kink, curve, etc.)
    ("working on psa_data:", 'psa_summarised_data', psa_data),
]

for progress_message, table_name, frame in table_writes:
    print(progress_message)
    # Append only; rows that conflict with existing ones are skipped by the insert method.
    frame.to_sql(table_name, engine, if_exists='append', index=False, method=postgres_skip_on_duplicate)

# # Melted Baseline + Tap + PSA combined tstat data by Gene
# print("working on gene_profile_data")
# final_tstat_melted.to_sql('gene_profile_data', engine, if_exists='append', index=False, method=postgres_skip_on_duplicate)

# # Melted Baseline + Tap + PSA combined tstat data by Allele
# print("working on allele_profile_data")
# final_tstat_melted_allele.to_sql('allele_profile_data', engine, if_exists='append', index=False, method=postgres_skip_on_duplicate)

working on tap_output:
working on tap_baseline_data:
working on tstat_gene_data
working on tstat_allele_data
working on gene_MSD
working on allele_MSD
working on psa_data:


### Use the below cell to just replace/update one table:

In [None]:
# Loads database config values from database.ini file and validates that user and password are set.
config = load_config()
# .get() covers a missing key as well as a blank value, so both cases produce the
# same clear message instead of a bare KeyError.
if not config.get('user') or not config.get('password'):
    # Raising gives a normal traceback in the notebook instead of a SystemExit warning.
    raise ValueError("Please set your user and password in the database.ini file.")

# Creates a connection pool to PostgreSQL database using SQLAlchemy.
# NOTE(review): the credentials are interpolated into the URL verbatim, so a password
# containing URL-reserved characters ('@', ':', '/') would need percent-encoding.
engine = create_engine(f"postgresql+psycopg://{config['user']}:{config['password']}@{config['host']}:{config['port']}/{config['database']}")

# Function to insert data into PostgreSQL table, skipping duplicates based on primary keys.
def postgres_skip_on_duplicate(pd_table, conn, keys, data_iter):
    """
    Custom insertion method for DataFrame.to_sql(method=...).

    Builds one record per row and issues a PostgreSQL
    INSERT ... ON CONFLICT DO NOTHING, so rows that collide with an existing
    unique / primary-key entry are skipped rather than raising an error.

    pd_table:  pandas SQL table wrapper; its `.table` attribute is the insert target
    conn:      connection the statement is executed on
    keys:      column names, in the same order as the values of each row
    data_iter: iterable yielding one sequence of values per row
    """
    records = []
    for row in data_iter:
        records.append(dict(zip(keys, row)))
    # No conflict target is given, so any constraint conflict skips the row.
    statement = insert(pd_table.table).on_conflict_do_nothing()
    conn.execute(statement, records)


# Complete tap response data
# WARNING: if_exists='replace' drops the existing table before inserting, so afterwards
# the table holds only the rows of the current tap_output.
# NOTE(review): the table is recreated from the DataFrame, so constraints defined on the
# old table (e.g. its primary key) are presumably not restored - verify before running.
print("working on tap_output:")
tap_output.to_sql(
    name='tap_response_data',
    con=engine,
    if_exists='replace',
    index=False,
    method=None,
)

In [None]:
# # USE THIS CELL TO UPDATE ALL THE NEEDED TABLES (Also have baseline_output on the second line)

# conn=sqlite3.connect('/Users/lavanya/Desktop/Lavanya_Test/data_updated2.db')

# tap_output.to_sql('tap_response_data', conn, if_exists='append', index=False)

# baseline_output.to_sql('tap_baseline_data', conn, if_exists='append', index=False)

# combined_Tstats_normalize_2.reset_index().to_sql('tstat_gene_data', conn, if_exists='append', index=False)

# combined_Tstats_normalize_allele_2.reset_index().to_sql('tstat_allele_data', conn, if_exists='append', index=False)

# combined_Tstats_normalized_melted.to_sql('gene_profile_data', conn, if_exists='append', index=False)

# combined_Tstats_normalized_melted_allele.to_sql('allele_profile_data', conn, if_exists='append', index=False)

# combined_MSD.to_sql('gene_MSD', conn, if_exists='append', index=False)

# allele_combined_MSD.to_sql('allele_MSD', conn, if_exists='append', index=False)

# # combined_Tstats_melted_sorted.to_sql('allele_phenotype_data', conn, if_exists='replace', index=False)

# print(conn.total_changes)

# conn.close()


# # Want to test edge cases of pd.to_sql functionality#############