# 1. Imports and File selection 

In [1]:
import io
import ipywidgets as widgets
import math
import numpy
import psycopg
import pandas as pd
import requests
import sqlite3
import sys
import tqdm
import warnings

from config import load_config
from ipyfilechooser import FileChooser
from scipy import stats
from scipy.stats import ttest_ind
from sqlalchemy import create_engine
from sqlalchemy.dialects.postgresql import insert
from sqlite3 import Error
from sqlite3 import IntegrityError

## Select Baseline .csv File

In [2]:
starting_directory = '/Users'
baseline_chooser = FileChooser(starting_directory)
display(baseline_chooser)

FileChooser(path='/Users', filename='', title='', show_hidden=False, select_desc='Select', change_desc='Change…

## Select Tap .csv File

In [3]:
tap_chooser=FileChooser('/Users')
display(tap_chooser)

FileChooser(path='/Users', filename='', title='', show_hidden=False, select_desc='Select', change_desc='Change…

## Select Post Stimulus Arousal .csv File

In [4]:
psa_chooser = FileChooser('/Users')
display(psa_chooser)

FileChooser(path='/Users', filename='', title='', show_hidden=False, select_desc='Select', change_desc='Change…

In [5]:
screens = ['PD_Screen', 'ASD_Screen', 'G-Proteins_Screen', 'Glia_Genes_Screen', 
           'Neuron_Genes_Screen', 'PD_GWAS_Locus71_Screen', 'ASD_WGS_Screen']

screen_chooser = widgets.Select(options=screens, value=screens[0], description='Screen:')
display(screen_chooser)

Select(description='Screen:', options=('PD_Screen', 'ASD_Screen', 'G-Proteins_Screen', 'Glia_Genes_Screen', 'N…

In [25]:
Screen=screen_chooser.value
folder_path=baseline_chooser.selected_path
print(folder_path)

/Users/gurmehak/Documents/RankinLab/Test_Datasets/PDScreen_TapHab_August15_2022


## Read baseline, tap and post stimulus arousal (psa) data

In [26]:
# Read the baseline file
baseline_output = pd.read_csv(baseline_chooser.selected, index_col=0)#.drop(columns=['index'])

print(f"\nShape of the baseline .csv file: {baseline_output.shape}")

# Print the first five rows of the file
baseline_output.head()


Shape of the baseline .csv file: (30487, 21)


Unnamed: 0,Time,n,Number,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,...,Kink,Curve,Crab,Pathlength,Plate_id,Date,Screen,dataset,Gene,Allele
1870,490.016,14,12,0.0823,0.1195,0.25,0.1078,1.0908,0.142641,6.1,...,31.1,25.0,0.0066,11.893,B0811ab,20220815,PD_Screen,N2,N2,N2
1871,490.056,14,12,0.0736,0.1024,0.25,0.1059,1.088,0.14015,5.3,...,30.9,24.7,0.0064,11.896,B0811ab,20220815,PD_Screen,N2,N2,N2
1872,490.103,14,12,0.0784,0.1024,0.25,0.105,1.0914,0.138935,5.2,...,31.0,24.6,0.0057,11.898,B0811ab,20220815,PD_Screen,N2,N2,N2
1873,490.144,14,12,0.097,0.1118,0.25,0.1054,1.0935,0.140575,5.8,...,30.6,24.5,0.0103,11.901,B0811ab,20220815,PD_Screen,N2,N2,N2
1874,490.186,14,12,0.0994,0.1197,0.25,0.111,1.1026,0.146894,5.6,...,31.4,24.4,0.0097,11.904,B0811ab,20220815,PD_Screen,N2,N2,N2


In [27]:
# Read the tap file
tap_output = pd.read_csv(tap_chooser.selected, index_col=0)

print(f"\nShape of the tap .csv file: {tap_output.shape}")

# Print the first five rows of the file
tap_output.head()


Shape of the tap .csv file: (395, 13)


Unnamed: 0,time,dura,dist,prob,speed,plate,Date,Plate_id,Screen,taps,dataset,Gene,Allele
0,599.985,2.83,0.696,0.928571,0.245936,1,20220815,B0811ab,PD_Screen,1.0,N2,N2,N2
1,609.993,2.98,0.746,0.857143,0.250336,1,20220815,B0811ab,PD_Screen,2.0,N2,N2,N2
2,619.699,1.97,0.536,0.8,0.272081,1,20220815,B0811ab,PD_Screen,3.0,N2,N2,N2
3,629.956,2.57,0.686,0.9,0.266926,1,20220815,B0811ab,PD_Screen,4.0,N2,N2,N2
4,639.957,1.34,0.383,0.909091,0.285821,1,20220815,B0811ab,PD_Screen,5.0,N2,N2,N2


In [28]:
# Read the psa file
psa_output = pd.read_csv(psa_chooser.selected, index_col=0)

print(f"\nShape of the tap .csv file: {psa_output.shape}")

# Print the first five rows of the file
psa_output.head()


Shape of the tap .csv file: (402, 24)


Unnamed: 0,Experiment,Tap_num,Plate_id,Date,Screen,dataset,Gene,Allele,Time,n,...,Tap,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
0,1,1,B0811ab,20220815,PD_Screen,N2,N2,N2,607,16.925,...,0.0,0.112155,1.072315,0.14181,7.34,0.34405,48.1075,36.13,0.015625,6.499075
1,1,2,B0811ab,20220815,PD_Screen,N2,N2,N2,617,16.972973,...,0.0,0.111914,1.051191,0.137795,16.145946,0.315459,53.440541,32.040541,0.026011,6.607365
2,1,3,B0811ab,20220815,PD_Screen,N2,N2,N2,627,16.629032,...,0.0,0.109982,1.07951,0.140502,12.866129,0.293274,51.256452,30.583871,0.02571,7.168435
3,1,4,B0811ab,20220815,PD_Screen,N2,N2,N2,637,13.134328,...,0.0,0.105619,1.042824,0.133841,20.716418,0.33309,58.326866,31.059701,0.026849,8.156448
4,1,5,B0811ab,20220815,PD_Screen,N2,N2,N2,647,16.108108,...,0.0,0.103418,1.07535,0.137081,13.786486,0.281851,39.606757,29.005405,0.027261,9.523203


# 2. DataFrame preparation

### 2.1. Tap Data

In [29]:
# Dataframe for first tap
PD_first_tap = (
    tap_output[(tap_output.taps==1)]
    .reset_index().drop(columns="index")
    .rename(columns={"dura": "init_dura", "prob": "init_prob", "speed": "init_speed"}, errors="raise")
)

PD_first_tap.head()

Unnamed: 0,time,init_dura,dist,init_prob,init_speed,plate,Date,Plate_id,Screen,taps,dataset,Gene,Allele
0,599.985,2.83,0.696,0.928571,0.245936,1,20220815,B0811ab,PD_Screen,1.0,N2,N2,N2
1,599.991,2.14,0.524,0.9375,0.24486,2,20220815,A0811aa,PD_Screen,1.0,N2,N2,N2
2,599.981,2.96,0.783,1.0,0.264527,3,20220815,A0811ad,PD_Screen,1.0,N2,N2,N2
3,599.962,2.38,0.501,0.913043,0.210504,4,20220815,B0811ae,PD_Screen,1.0,N2,N2,N2
4,599.972,2.52,0.622,0.863636,0.246825,5,20220815,C0811ac,PD_Screen,1.0,N2,N2,N2


In [30]:
# Dataframe for recovery taps
PD_recov_taps = (
    tap_output[(tap_output.taps==31)]
    .reset_index().drop(columns="index")
    .rename(columns={"dura": "recov_dura", "prob": "recov_prob", "speed":"recov_speed"})
)

PD_recov_taps.head()

Unnamed: 0,time,recov_dura,dist,recov_prob,recov_speed,plate,Date,Plate_id,Screen,taps,dataset,Gene,Allele
0,1189.987,2.61,0.464,0.545455,0.177778,1,20220815,B0811ab,PD_Screen,31.0,N2,N2,N2
1,1189.979,1.97,0.445,0.692308,0.225888,2,20220815,A0811aa,PD_Screen,31.0,N2,N2,N2
2,1189.962,1.79,0.61,0.882353,0.340782,3,20220815,A0811ad,PD_Screen,31.0,N2,N2,N2
3,1189.978,1.7,0.44,0.761905,0.258824,4,20220815,B0811ae,PD_Screen,31.0,N2,N2,N2
4,1189.966,2.01,0.561,0.607143,0.279104,5,20220815,C0811ac,PD_Screen,31.0,N2,N2,N2


In [31]:
# Dataframe for last three taps
PD_final_taps = (
    tap_output[((tap_output.taps >= 28) & (tap_output.taps <= 30))]
    .groupby(["dataset", "Date","Plate_id","Screen","Gene","Allele","plate"])
    .mean()
    .reset_index()
    .rename(columns={"dura": "final_dura", "prob": "final_prob", "speed": "final_speed"}, errors="raise")
)

PD_final_taps.head()

Unnamed: 0,dataset,Date,Plate_id,Screen,Gene,Allele,plate,time,final_dura,dist,final_prob,final_speed,taps
0,N2,20220815,A0811aa,PD_Screen,N2,N2,2,879.973,0.99,0.233333,0.302832,0.221688,29.0
1,N2,20220815,A0811ad,PD_Screen,N2,N2,3,879.989333,0.6,0.124,0.319444,0.202159,29.0
2,N2,20220815,B0811ab,PD_Screen,N2,N2,1,879.936333,1.233333,0.206333,0.355556,0.162658,29.0
3,N2,20220815,B0811ae,PD_Screen,N2,N2,4,879.99,0.846667,0.196,0.314762,0.23771,29.0
4,N2,20220815,C0811ac,PD_Screen,N2,N2,5,879.968333,1.346667,0.341,0.381481,0.248545,29.0


In [32]:
# Dataframe to analyse habituation behaviour after merging first tap and final taps

PD_habit_levels = pd.merge(
    PD_first_tap, 
    PD_final_taps, 
    on =['dataset', 'plate', "Plate_id", "Screen", "Gene", "Allele", "Date"], how ='left'
).drop(columns=['time_x','time_y','dist_x','dist_y', 'taps_x', 'taps_y']).dropna()

PD_habit_levels['habit_dura'] = PD_habit_levels['init_dura'] - PD_habit_levels['final_dura']

PD_habit_levels['habit_prob'] = PD_habit_levels['init_prob'] - PD_habit_levels['final_prob']

PD_habit_levels['habit_speed'] = PD_habit_levels['init_speed'] - PD_habit_levels['final_speed']

In [33]:
# Continue to analyse habituation behaviour after merging with recovery taps

if PD_recov_taps.empty:
    PD_habituation = pd.merge(PD_habit_levels, PD_recov_taps, on =['dataset','plate',"Plate_id","Screen","Gene","Allele","Date"], how ='outer')
else:
    PD_habituation = pd.merge(PD_habit_levels, PD_recov_taps, on =['dataset','plate',"Plate_id","Screen","Gene","Allele","Date"], how ='left')

if Screen not in ['Neuron_Genes_Screen', 'G-Proteins_Screen']:
    PD_habituation = PD_habituation.dropna() 

PD_habituation['recovery_dura']=(PD_habituation.recov_dura-PD_habituation.init_dura)/PD_habituation.init_dura*100

PD_habituation['recovery_prob']=(PD_habituation.recov_prob-PD_habituation.init_prob)/PD_habituation.init_prob*100

PD_habituation['recovery_speed']=(PD_habituation.recov_speed-PD_habituation.init_speed)/PD_habituation.init_speed*100

PD_habituation['memory_retention_dura']=(PD_habituation.recov_dura-PD_habituation.final_dura)

PD_habituation['memory_retention_prob']=(PD_habituation.recov_prob-PD_habituation.final_prob)

PD_habituation['memory_retention_speed']=(PD_habituation.recov_speed-PD_habituation.final_speed)


# Rename `PD_habituation` to `tap_data` based on the condition below
if Screen in ['Neuron_Genes_Screen', 'G-Proteins_Screen']:
    tap_data=PD_habituation.dropna(subset = ['init_dura', 'init_prob', 'init_speed', 'plate', 'Date', 'Plate_id',
       'Screen', 'dataset', 'Gene', 'Allele', 'final_dura', 'final_prob',
       'final_speed', 'habit_dura', 'habit_prob', 'habit_speed'])
else:
    tap_data=PD_habituation.dropna() 


# Display final dataframe
tap_data.head()


Unnamed: 0,init_dura,init_prob,init_speed,plate,Date,Plate_id,Screen,dataset,Gene,Allele,...,dist,recov_prob,recov_speed,taps,recovery_dura,recovery_prob,recovery_speed,memory_retention_dura,memory_retention_prob,memory_retention_speed
0,2.83,0.928571,0.245936,1,20220815,B0811ab,PD_Screen,N2,N2,N2,...,0.464,0.545455,0.177778,31.0,-7.773852,-41.258741,-27.713921,1.376667,0.189899,0.015119
1,2.14,0.9375,0.24486,2,20220815,A0811aa,PD_Screen,N2,N2,N2,...,0.445,0.692308,0.225888,31.0,-7.943925,-26.153846,-7.747898,0.98,0.389475,0.0042
2,2.96,1.0,0.264527,3,20220815,A0811ad,PD_Screen,N2,N2,N2,...,0.61,0.882353,0.340782,31.0,-39.527027,-11.764706,28.826958,1.19,0.562908,0.138623
3,2.38,0.913043,0.210504,4,20220815,B0811ae,PD_Screen,N2,N2,N2,...,0.44,0.761905,0.258824,31.0,-28.571429,-16.553288,22.954092,0.853333,0.447143,0.021114
4,2.52,0.863636,0.246825,5,20220815,C0811ac,PD_Screen,N2,N2,N2,...,0.561,0.607143,0.279104,31.0,-20.238095,-29.699248,13.077698,0.663333,0.225661,0.03056


### 2.2. PSA data

In [34]:
# function to calculate Inidial, Final, Peak, ect values for specified column (metric)

def summary_metrics(df, metric = 'Instantaneous Speed'):

    initial = df[metric].iloc[0]
    recovery = df[metric].iloc[-1]
    peak = df[metric].max()
    mean = df[metric].mean()
    peak_id = df[metric].values.argmax()
    initial_to_peak = df[metric].iloc[: peak_id+1].mean()
    peak_to_recovery = df[metric].iloc[peak_id:].mean()
    

    return pd.Series({
        f'PSA Initial {metric}': initial, 
        f'PSA Recovery {metric}': recovery, 
        f'PSA Peak {metric}': peak,
        f'PSA Initial_to_peak {metric}': initial_to_peak, 
        f'PSA Peak_to_recovery {metric}': peak_to_recovery,
        f'PSA Average {metric}': mean
        })

In [35]:
warnings.filterwarnings('ignore')

# columns to summarize
metrics_to_summarize = ['Instantaneous Speed', 'Bias', 'Angular Speed', 'Aspect Ratio', 'Kink', 'Curve', 'Crab']

# standard columns
group_cols = ['Experiment', 'Plate_id', 'Date', 'Screen', 'dataset', 'Gene', 'Allele']

# pass each column to summarise through `summary_metrics` function and merge the summarised values to psa_output
psa_data = psa_output[group_cols]
for metric in metrics_to_summarize:
    summary = psa_output.groupby(group_cols).apply(lambda x: summary_metrics(x, metric)).reset_index()
    psa_data = pd.merge(psa_data, summary, on=group_cols, how='left')

In [36]:
psa_data.head()

Unnamed: 0,Experiment,Plate_id,Date,Screen,dataset,Gene,Allele,PSA Initial Instantaneous Speed,PSA Recovery Instantaneous Speed,PSA Peak Instantaneous Speed,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
0,1,B0811ab,20220815,PD_Screen,N2,N2,N2,0.125645,0.170966,0.266666,...,36.13,36.13,28.7957,28.7957,0.015625,0.020022,0.027261,0.024291,0.018463,0.019119
1,1,B0811ab,20220815,PD_Screen,N2,N2,N2,0.125645,0.170966,0.266666,...,36.13,36.13,28.7957,28.7957,0.015625,0.020022,0.027261,0.024291,0.018463,0.019119
2,1,B0811ab,20220815,PD_Screen,N2,N2,N2,0.125645,0.170966,0.266666,...,36.13,36.13,28.7957,28.7957,0.015625,0.020022,0.027261,0.024291,0.018463,0.019119
3,1,B0811ab,20220815,PD_Screen,N2,N2,N2,0.125645,0.170966,0.266666,...,36.13,36.13,28.7957,28.7957,0.015625,0.020022,0.027261,0.024291,0.018463,0.019119
4,1,B0811ab,20220815,PD_Screen,N2,N2,N2,0.125645,0.170966,0.266666,...,36.13,36.13,28.7957,28.7957,0.015625,0.020022,0.027261,0.024291,0.018463,0.019119


# 3. Run Statistics (T-Test and mean sample distance) on Data

## 3.1 Generate dataframes conditioned by `baseline` (True/False) and `allele` (True/False)

In [78]:
def get_output_byplate(output, baseline=["true", "false", "psa"], allele = [False, True]):
    """
    Aggregates data by 'Plate_id','Date','Screen','dataset','Gene','Allele'

    Parameters:
        output (pd.DataFrame): Input DataFrame (either baseline_output or tap_data)
        baseline (boolean): whether data is baseline (True) or tap response (False)
        allele (boolean): group by allele (True) or group by gene (False)

    Returns:
        A DataFrame with plate-level averages
    """
    
    # columns to delete if baseline = true
    if baseline == "true":
        drop_col = ['Plate_id','n','Number','Time','Screen','Date','Allele']
    # columns to delete if baseline = false
    elif baseline == "false":
        drop_col = ['Plate_id','Screen','Date','Allele','dist','plate','time',
                       'taps','recov_dura','recov_prob','recov_speed']
    # columns to delete if baseline = psa
    else: 
        drop_col = ['Experiment', 'Plate_id', 'Date', 'Screen', 'Allele']

    drop_col.append('Gene') if allele else drop_col.append('dataset')
     
    output_byplate = output.groupby(
        by=['Plate_id','Date','Screen','dataset','Gene','Allele'],
        as_index=False).mean().drop(columns=drop_col)
    
    return output_byplate

#### 3.1.1 `baseline` = True, `allele` = False

In [None]:
baseline_output_byplate=get_output_byplate(baseline_output, baseline= "true", allele=False)

print(f"Shape: {baseline_output_byplate.shape}")

baseline_output_byplate.head()

Shape: (13, 13)


Unnamed: 0,Gene,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
0,N2,0.033536,0.032413,0.030834,0.116511,1.231836,0.169018,1.909915,0.261825,47.466667,29.442805,0.005735,8.756541
1,N2,0.05908,0.066365,0.203949,0.111072,1.19486,0.153823,3.502214,0.261859,49.367271,28.56715,0.008075,4.712773
2,hipr-1,0.111014,0.130682,0.426766,0.104552,1.100872,0.136805,10.201907,0.406782,63.183664,35.107551,0.016845,21.158225
3,hipr-1,0.0272,0.028163,0.00846,0.09632,0.937245,0.108482,2.342027,0.279167,48.979768,27.0252,0.004914,9.26598
4,N2,0.08401,0.103804,0.275993,0.108908,1.085004,0.14279,5.541694,0.272735,44.438571,28.031377,0.011272,8.938643


#### 3.1.2 `baseline` = False, `allele` = False

In [None]:
tap_data_byplate=get_output_byplate(tap_data, baseline="false", allele=False)

print(f"Shape: {tap_data_byplate.shape}")

tap_data_byplate.head()

Shape: (13, 16)


Unnamed: 0,Gene,init_dura,init_prob,init_speed,final_dura,final_prob,final_speed,habit_dura,habit_prob,habit_speed,recovery_dura,recovery_prob,recovery_speed,memory_retention_dura,memory_retention_prob,memory_retention_speed
0,N2,2.14,0.9375,0.24486,0.99,0.302832,0.221688,1.15,0.634668,0.023172,-7.943925,-26.153846,-7.747898,0.98,0.389475,0.0042
1,N2,2.96,1.0,0.264527,0.6,0.319444,0.202159,2.36,0.680556,0.062368,-39.527027,-11.764706,28.826958,1.19,0.562908,0.138623
2,hipr-1,2.07,0.8,0.396135,2.18,0.5,0.174771,-0.11,0.3,0.221365,-10.628019,25.0,-18.264338,-0.33,0.5,0.149013
3,hipr-1,2.53,0.826087,0.204348,0.976667,0.734271,0.203894,1.553333,0.091816,0.000454,-4.743083,21.052632,3.354816,1.433333,0.265729,0.00731
4,N2,2.83,0.928571,0.245936,1.233333,0.355556,0.162658,1.596667,0.573016,0.083278,-7.773852,-41.258741,-27.713921,1.376667,0.189899,0.015119


#### 3.1.3 `baseline` = True, `allele` = True

In [79]:
baseline_output_allele_byplate = get_output_byplate(baseline_output,baseline="true", allele=True)

print(f"Shape: {baseline_output_allele_byplate.shape}")

baseline_output_allele_byplate.head()

Shape: (13, 13)


Unnamed: 0,dataset,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
0,N2,0.033536,0.032413,0.030834,0.116511,1.231836,0.169018,1.909915,0.261825,47.466667,29.442805,0.005735,8.756541
1,N2,0.05908,0.066365,0.203949,0.111072,1.19486,0.153823,3.502214,0.261859,49.367271,28.56715,0.008075,4.712773
2,hipr-1_ok1081,0.111014,0.130682,0.426766,0.104552,1.100872,0.136805,10.201907,0.406782,63.183664,35.107551,0.016845,21.158225
3,hipr-1_tm10120,0.0272,0.028163,0.00846,0.09632,0.937245,0.108482,2.342027,0.279167,48.979768,27.0252,0.004914,9.26598
4,N2,0.08401,0.103804,0.275993,0.108908,1.085004,0.14279,5.541694,0.272735,44.438571,28.031377,0.011272,8.938643


#### 3.1.4 `baseline` = False, `allele` = True

In [80]:
tap_data_allele_byplate = get_output_byplate(tap_data, baseline="false", allele=True)

print(f"Shape: {tap_data_allele_byplate.shape}")

tap_data_allele_byplate.head()

Shape: (13, 16)


Unnamed: 0,dataset,init_dura,init_prob,init_speed,final_dura,final_prob,final_speed,habit_dura,habit_prob,habit_speed,recovery_dura,recovery_prob,recovery_speed,memory_retention_dura,memory_retention_prob,memory_retention_speed
0,N2,2.14,0.9375,0.24486,0.99,0.302832,0.221688,1.15,0.634668,0.023172,-7.943925,-26.153846,-7.747898,0.98,0.389475,0.0042
1,N2,2.96,1.0,0.264527,0.6,0.319444,0.202159,2.36,0.680556,0.062368,-39.527027,-11.764706,28.826958,1.19,0.562908,0.138623
2,hipr-1_ok1081,2.07,0.8,0.396135,2.18,0.5,0.174771,-0.11,0.3,0.221365,-10.628019,25.0,-18.264338,-0.33,0.5,0.149013
3,hipr-1_tm10120,2.53,0.826087,0.204348,0.976667,0.734271,0.203894,1.553333,0.091816,0.000454,-4.743083,21.052632,3.354816,1.433333,0.265729,0.00731
4,N2,2.83,0.928571,0.245936,1.233333,0.355556,0.162658,1.596667,0.573016,0.083278,-7.773852,-41.258741,-27.713921,1.376667,0.189899,0.015119


In [42]:
# tap_data_allele_byplate[tap_data_allele_byplate.dataset=='N2_XJ1']

#### 3.1.5 `baseline` = "psa" , `allele` = False

In [83]:
psa_data_byplate = get_output_byplate(psa_data, baseline="psa", allele=False)

print(f"Shape: {psa_data_byplate.shape}")

psa_data_byplate.head()

Shape: (13, 43)


Unnamed: 0,Gene,PSA Initial Instantaneous Speed,PSA Recovery Instantaneous Speed,PSA Peak Instantaneous Speed,PSA Initial_to_peak Instantaneous Speed,PSA Peak_to_recovery Instantaneous Speed,PSA Average Instantaneous Speed,PSA Initial Bias,PSA Recovery Bias,PSA Peak Bias,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
0,N2,0.15386,0.141464,0.284323,0.256036,0.219366,0.232648,0.481764,0.539307,1.0,...,36.784722,36.784722,29.546121,29.546121,0.021233,0.019003,0.031349,0.026835,0.019316,0.019655
1,N2,0.193287,0.214873,0.281526,0.251662,0.230287,0.230703,0.660563,0.905987,0.991716,...,36.138028,36.138028,29.514369,29.514369,0.02493,0.022515,0.033245,0.029087,0.020009,0.020168
2,hipr-1,0.172732,0.222835,0.351954,0.284509,0.261104,0.263459,0.529114,0.857,1.0,...,38.510127,38.510127,31.4698,31.4698,0.028865,0.023517,0.057941,0.041115,0.031142,0.032207
3,hipr-1,0.117753,0.07112,0.249123,0.200501,0.185769,0.185151,0.303446,0.372135,0.989895,...,35.427027,35.427027,27.91832,27.91832,0.0161,0.009481,0.027043,0.021572,0.014943,0.01498
4,N2,0.125645,0.170966,0.266666,0.21573,0.198955,0.199476,0.350025,0.591224,1.0,...,36.13,36.13,28.7957,28.7957,0.015625,0.020022,0.027261,0.024291,0.018463,0.019119


#### 3.1.6 `baseline` = "psa" , `allele` = True

In [85]:
psa_data_allele_byplate = get_output_byplate(psa_data, baseline="psa", allele=True)

print(f"Shape: {psa_data_allele_byplate.shape}")

psa_data_allele_byplate.head()

Shape: (13, 43)


Unnamed: 0,dataset,PSA Initial Instantaneous Speed,PSA Recovery Instantaneous Speed,PSA Peak Instantaneous Speed,PSA Initial_to_peak Instantaneous Speed,PSA Peak_to_recovery Instantaneous Speed,PSA Average Instantaneous Speed,PSA Initial Bias,PSA Recovery Bias,PSA Peak Bias,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
0,N2,0.15386,0.141464,0.284323,0.256036,0.219366,0.232648,0.481764,0.539307,1.0,...,36.784722,36.784722,29.546121,29.546121,0.021233,0.019003,0.031349,0.026835,0.019316,0.019655
1,N2,0.193287,0.214873,0.281526,0.251662,0.230287,0.230703,0.660563,0.905987,0.991716,...,36.138028,36.138028,29.514369,29.514369,0.02493,0.022515,0.033245,0.029087,0.020009,0.020168
2,hipr-1_ok1081,0.172732,0.222835,0.351954,0.284509,0.261104,0.263459,0.529114,0.857,1.0,...,38.510127,38.510127,31.4698,31.4698,0.028865,0.023517,0.057941,0.041115,0.031142,0.032207
3,hipr-1_tm10120,0.117753,0.07112,0.249123,0.200501,0.185769,0.185151,0.303446,0.372135,0.989895,...,35.427027,35.427027,27.91832,27.91832,0.0161,0.009481,0.027043,0.021572,0.014943,0.01498
4,N2,0.125645,0.170966,0.266666,0.21573,0.198955,0.199476,0.350025,0.591224,1.0,...,36.13,36.13,28.7957,28.7957,0.015625,0.020022,0.027261,0.024291,0.018463,0.019119


## 3.2 Calculate Mean Distances and CIs

In [43]:

def extract_phenotypes(df):
    ''' 
    Splits a multi-column DataFrame into a list of DataFrames, each containing one phenotype

    input: 
        df (pd.DataFrame): dataframe with multiple columns (1st column is the index, the other are phenotypes)

    returns:
        list_phenotypes_df: list with 2 columns - one for index and one for phenotype, 
            for how many phenotypes there are in the input
    '''
    list_phenotypes_df = []
    index = df.columns[0]
    for i in df.columns[1:]:
        list_phenotypes_df.append(df[[index, i]].copy())

    return list_phenotypes_df



def ci95(df):
    """
    input: df of 4 columns: index, mean, count, std

    returns: df of 6 columns: index, mean, count, std, ci95_hi, ci95_low

    """
    for metric in df.columns.levels[0]:
        if metric == 'Gene':
            pass
        else:
            ci95_hi = []
            ci95_lo = []
            for i in df[metric].index:
                m = df[metric]['mean'].loc[i]
                c = df[metric]['count'].loc[i]
                s = df[metric]['sem'].loc[i]
                ci95_hi.append(stats.t.interval(confidence=0.95, df=c-1, loc=m, scale=s)[1])
                ci95_lo.append(stats.t.interval(confidence=0.95, df=c-1, loc=m, scale=s)[0])
            df[metric,'ci95_hi'] = ci95_hi
            df[metric,'ci95_lo'] = ci95_lo
            # df[metric,'ci95']=list(zip(ci95_lo,ci95_hi))
            
    return df



def calculate_MSD(list_of_dfs, by):
    new_list_of_dfs = []
    
    for df in list_of_dfs:
        # Get phenotype column name (assuming 2nd column is the metric)
        pheno_col = df.columns[1]
        
        # Calculate statistics
        stats = df.groupby(by)[df.columns[1]].agg(['mean', 'count', 'sem'])

        
        # Convert to MultiIndex if needed (more robust version)
        if not isinstance(stats.columns, pd.MultiIndex):
            stats.columns = pd.MultiIndex.from_tuples([(pheno_col, col) for col in stats.columns])
        
        # Calculate CI
        stats_2 = ci95(stats)
        
        # Get N2 control data
        if Screen == "Neuron_Genes_Screen":
            N2_mask = stats_2.index == 'N2' if by == "Gene" else stats_2.index.isin(['N2_XJ1','N2_N2'])
        else:
            N2_mask = stats_2.index == 'N2'
            
        N2_data = stats_2[N2_mask]
        
        # Subtract N2 values
        stats_2.iloc[:, 0] -= N2_data.iloc[0, 0]  # mean
        stats_2.iloc[:, 3] -= N2_data.iloc[0, 0]  # ci95_hi
        stats_2.iloc[:, 4] -= N2_data.iloc[0, 0]  # ci95_low
        
        new_list_of_dfs.append(stats_2)
    
    return new_list_of_dfs

In [44]:
def calculate_MSD(list_of_dfs, by):
    new_list_of_dfs = []
    
    for df in list_of_dfs:
        # Get phenotype column name (assuming 2nd column is the metric)
        pheno_col = df.columns[1]
        
        # Create proper MultiIndex structure
        stats = df.groupby(by)[df.columns[1]].agg(['mean', 'count', 'sem'])

        # Convert to MultiIndex if needed (more robust version)
        if not isinstance(stats.columns, pd.MultiIndex):
            stats.columns = pd.MultiIndex.from_tuples([(pheno_col, col) for col in stats.columns])
        
        # Calculate CIs
        stats_2 = ci95(stats)
        
        # Get N2 control data
        if Screen == "Neuron_Genes_Screen":
            N2_mask = stats_2.index == 'N2' if by == "Gene" else stats_2.index.isin(['N2_XJ1','N2_N2'])
        else:
            N2_mask = stats_2.index == 'N2'
            
        N2_data = stats_2[N2_mask]
        
        # Subtract N2 values
        stats_2.iloc[:, 0] -= N2_data.iloc[0, 0]  # mean
        stats_2.iloc[:, 3] -= N2_data.iloc[0, 0]  # ci95_hi
        stats_2.iloc[:, 4] -= N2_data.iloc[0, 0]  # ci95_low
        
        new_list_of_dfs.append(stats_2)
    
    return new_list_of_dfs

In [45]:
def get_MSD(list_MSD):
    '''
    input: List of dataframes, each representing a phenotype with calculated MSD.

    returns: Single combined dataframe joining all input dataframes with MSD values.
    '''
    for a in list_MSD:
        if a.columns.levels[0] == list_MSD[0].columns.levels[0]:
            MSD=a
        else:
            MSD=MSD.join(a)
    return MSD

In [46]:
def get_combined_MSD(baseline_byplate,tap_byplate, by=['Gene','dataset']):
    """
    Combines MSD datafram from baseline plates and tap plates

    input:
        - baseline_byplate: baseline data by plate
        - tap_byplate: tap data by plate
        - by: what to group by "Gene" or "dataset"
    returns:
        - combined MSD dataframe
    """
    list_baseline_MSD=calculate_MSD(extract_phenotypes(baseline_byplate), by=by)

    list_tap_MSD=calculate_MSD(extract_phenotypes(tap_byplate), by=by)

    baseline_MSD = get_MSD(list_baseline_MSD)
    
    tap_MSD = get_MSD(list_tap_MSD)

    combined_MSD = baseline_MSD.join(tap_MSD, on=by)

    combined_MSD=combined_MSD.rename(columns={"habit_dura":"Habituation of Response Duration",
                                         "habit_prob": "Habituation of Respones Probability",
                                         "habit_speed":"Habituation of Response Speed",
                                         "init_dura": "Initial Response Duration",
                                         "init_prob": "Initial Response Probability",
                                         "init_speed": "Initial Response Speed",
                                         "final_dura": "Final Response Duration",
                                         "final_prob": "Final Response Probability",
                                         "final_speed": "Final Response Speed",
                                         "recovery_dura": "Spontaneous Recovery of Response Duration",
                                         "recovery_prob": "Spontaneous Recovery of Response Probability",
                                         "recovery_speed": "Spontaneous Recovery of Response Speed",
                                         "memory_retention_dura": "Memory Retention of Response Duration",
                                         "memory_retention_prob": "Memory Retention of Response Probability",
                                         "memory_retention_speed": "Memory Retention of Response Speed"})

    combined_MSD=combined_MSD.reset_index()
    combined_MSD.columns = combined_MSD.columns.to_flat_index().str.join('-')
    combined_MSD=combined_MSD.rename(columns={by+"-": by})
    combined_MSD['Screen']=Screen
    
    return combined_MSD

### 3.2.1 Gene-level SMD

In [47]:
combined_MSD=get_combined_MSD(baseline_output_byplate,
                              tap_data_byplate, 
                              by='Gene')

combined_MSD.head()

Unnamed: 0,Gene,Instantaneous Speed-mean,Instantaneous Speed-count,Instantaneous Speed-sem,Instantaneous Speed-ci95_hi,Instantaneous Speed-ci95_lo,Interval Speed-mean,Interval Speed-count,Interval Speed-sem,Interval Speed-ci95_hi,...,Memory Retention of Response Probability-count,Memory Retention of Response Probability-sem,Memory Retention of Response Probability-ci95_hi,Memory Retention of Response Probability-ci95_lo,Memory Retention of Response Speed-mean,Memory Retention of Response Speed-count,Memory Retention of Response Speed-sem,Memory Retention of Response Speed-ci95_hi,Memory Retention of Response Speed-ci95_lo,Screen
0,N2,0.0,5,0.009221,0.025602,-0.025602,0.0,5,0.011803,0.032771,...,5,0.069487,0.192928,-0.192928,0.0,5,0.02455,0.068163,-0.068163,PD_Screen
1,hipr-1,-0.016382,8,0.01048,0.0084,-0.041163,-0.018904,8,0.013263,0.012458,...,8,0.03787,0.018558,-0.160538,-0.004841,8,0.018488,0.038876,-0.048558,PD_Screen


### 3.2.2 Allele-level SMD

In [48]:
allele_combined_MSD=get_combined_MSD(baseline_output_allele_byplate,
                                     tap_data_allele_byplate, 
                                     by='dataset')

allele_combined_MSD.head()

Unnamed: 0,dataset,Instantaneous Speed-mean,Instantaneous Speed-count,Instantaneous Speed-sem,Instantaneous Speed-ci95_hi,Instantaneous Speed-ci95_lo,Interval Speed-mean,Interval Speed-count,Interval Speed-sem,Interval Speed-ci95_hi,...,Memory Retention of Response Probability-count,Memory Retention of Response Probability-sem,Memory Retention of Response Probability-ci95_hi,Memory Retention of Response Probability-ci95_lo,Memory Retention of Response Speed-mean,Memory Retention of Response Speed-count,Memory Retention of Response Speed-sem,Memory Retention of Response Speed-ci95_hi,Memory Retention of Response Speed-ci95_lo,Screen
0,N2,0.0,5,0.009221,0.025602,-0.025602,0.0,5,0.011803,0.032771,...,5,0.069487,0.192928,-0.192928,0.0,5,0.02455,0.068163,-0.068163,PD_Screen
1,hipr-1_ok1081,0.002161,4,0.016581,0.05493,-0.050608,0.006861,4,0.019126,0.06773,...,4,0.055742,0.175766,-0.179028,0.030483,4,0.02562,0.112016,-0.051051,PD_Screen
2,hipr-1_tm10120,-0.034924,4,0.002895,-0.025712,-0.044136,-0.044669,4,0.003538,-0.033409,...,4,0.01945,-0.078451,-0.202247,-0.040164,4,0.010338,-0.007266,-0.073063,PD_Screen


## 3.3 T-Stat analysis

In [49]:
def baseline_metrics(by=["Gene","dataset"]):
    """
    Create a list of empty dataframe and list of metrics for baseline analysis

    input:
        by (list): what to group by "Gene" or "dataset"
        
    returns:
        list_baseline_Tstats: dataframes to store t-statistics
        list_baseline_metrics: dataframes to store metic names
    """
    PD_baseline_instantspeed_T=pd.DataFrame(columns = [by,"Instantaneous Speed"])
    PD_baseline_intspeed_T=pd.DataFrame(columns = [by,"Interval Speed"])
    PD_baseline_bias_T=pd.DataFrame(columns = [by,"Bias"])
    PD_baseline_morphwidth_T=pd.DataFrame(columns = [by,"Morphwidth"])
    PD_baseline_midline_T=pd.DataFrame(columns = [by,"Midline"])
    PD_baseline_area_T=pd.DataFrame(columns = [by,"Area"])
    PD_baseline_angularspeed_T=pd.DataFrame(columns = [by,"Angular Speed"])
    PD_baseline_aspectratio_T=pd.DataFrame(columns = [by,"Aspect Ratio"])
    PD_baseline_kink_T=pd.DataFrame(columns = [by,"Kink"])
    PD_baseline_curve_T=pd.DataFrame(columns = [by,"Curve"])
    PD_baseline_crab_T=pd.DataFrame(columns = [by,"Crab"])
    PD_baseline_pathlength_T=pd.DataFrame(columns = [by,"Pathlength"])

    list_baseline_Tstats=[PD_baseline_instantspeed_T,
                        PD_baseline_intspeed_T,
                        PD_baseline_bias_T,
                        PD_baseline_morphwidth_T,
                        PD_baseline_midline_T,
                        PD_baseline_area_T,
                        PD_baseline_angularspeed_T,
                        PD_baseline_aspectratio_T,
                        PD_baseline_kink_T,
                        PD_baseline_curve_T,
                        PD_baseline_crab_T,
                        PD_baseline_pathlength_T]

    list_baseline_metrics=["Instantaneous Speed",
                        "Interval Speed",
                        "Bias",
                        "Morphwidth",
                        "Midline",
                        "Area",
                        "Angular Speed",
                        "Aspect Ratio",
                        "Kink",
                        "Curve",
                        "Crab",
                        "Pathlength"]
    
    return list_baseline_Tstats, list_baseline_metrics

In [50]:
def tap_metrics(by=["Gene","dataset"]):
    """
    Create a list of empty dataframes and list of metrics for tap analysis

    input:
        by (list): what to group by "Gene" or "dataset"
        
    returns:
        list_tap_Tstats: dataframes to store t-statistics
        list_tap_metrics: dataframes to store metic names
    """
    recovery_dura=pd.DataFrame(columns = [by,"Recovery Duration"])
    recovery_prob=pd.DataFrame(columns = [by,"Recovery Probability"])
    recovery_speed=pd.DataFrame(columns = [by,"Recovery Speed"])
    memory_retention_dura=pd.DataFrame(columns = [by,"Memory Retention Duration"])
    memory_retention_prob=pd.DataFrame(columns = [by,"Memory Retention Probability"])
    memory_retention_speed=pd.DataFrame(columns = [by,"Memory Retention Speed"])
    init_dura=pd.DataFrame(columns = [by,"Initial Duration"])
    init_prob=pd.DataFrame(columns = [by,"Initial Probability"])
    init_speed=pd.DataFrame(columns = [by,"Initial Speed"])
    final_dura=pd.DataFrame(columns = [by,"Final Duration"])
    final_prob=pd.DataFrame(columns = [by,"Final Probability"])
    final_speed=pd.DataFrame(columns = [by,"Final Speed"])
    hab_dura=pd.DataFrame(columns = [by,"Habituation of Duration"])
    hab_prob=pd.DataFrame(columns = [by,"Habituation of Probability"])
    hab_speed=pd.DataFrame(columns = [by,"Habituation of Speed"])
   
    # add additional columns from PSA

    list_tap_Tstats = [recovery_dura,
                    recovery_prob,
                    recovery_speed,
                    memory_retention_dura,
                    memory_retention_prob,
                    memory_retention_speed,
                    init_dura,
                    init_prob,
                    init_speed,
                    final_dura,
                    final_prob,
                    final_speed,
                    hab_dura,
                    hab_prob,
                    hab_speed]
    
    list_tap_metrics = ["recovery_dura",
                        "recovery_prob",
                        "recovery_speed",
                        "memory_retention_dura",
                        "memory_retention_prob",
                        "memory_retention_speed",
                        "init_dura",
                        "init_prob",
                        "init_speed",
                        "final_dura",
                        "final_prob",
                        "final_speed",
                        "habit_dura",
                        "habit_prob",
                        "habit_speed"]
    
    return list_tap_Tstats, list_tap_metrics

In [51]:
def psa_metrics(by=["Gene", "dataset"]):
    """
    Create a list of empty dataframes and list of metric names for PSA summary analysis.

    input:
        by (list): what to group by ("Gene" or "dataset")

    returns:
        list_psa_Tstats: list of empty DataFrames for t-statistics
        list_psa_metrics: list of metric names (short strings)
    """

    # PSA metric DataFrames
    psa_initial_speed = pd.DataFrame(columns=[by,"PSA Initial Instantaneous Speed"])
    psa_recovery_speed = pd.DataFrame(columns=[by,"PSA Recovery Instantaneous Speed"])
    psa_peak_speed = pd.DataFrame(columns=[by,"PSA Peak Instantaneous Speed"])
    psa_initial_to_peak_speed = pd.DataFrame(columns=[by,"PSA Initial_to_peak Instantaneous Speed"])
    psa_peak_to_recovery_speed = pd.DataFrame(columns=[by,"PSA Peak_to_recovery Instantaneous Speed"])
    psa_avg_speed = pd.DataFrame(columns=[by,"PSA Average Instantaneous Speed"])

    psa_initial_bias = pd.DataFrame(columns=[by,"PSA Initial Bias"])
    psa_recovery_bias = pd.DataFrame(columns=[by,"PSA Recovery Bias"])
    psa_peak_bias = pd.DataFrame(columns=[by,"PSA Peak Bias"])
    psa_initial_to_peak_bias = pd.DataFrame(columns=[by,"PSA Initial_to_peak Bias"])
    psa_peak_to_recovery_bias = pd.DataFrame(columns=[by,"PSA Peak_to_recovery Bias"])
    psa_avg_bias = pd.DataFrame(columns=[by,"PSA Average Bias"])

    psa_initial_ang_speed = pd.DataFrame(columns=[by,"PSA Initial Angular Speed"])
    psa_recovery_ang_speed = pd.DataFrame(columns=[by,"PSA Recovery Angular Speed"])
    psa_peak_ang_speed = pd.DataFrame(columns=[by,"PSA Peak Angular Speed"])
    psa_initial_to_peak_ang_speed = pd.DataFrame(columns=[by,"PSA Initial_to_peak Angular Speed"])
    psa_peak_to_recovery_ang_speed = pd.DataFrame(columns=[by,"PSA Peak_to_recovery Angular Speed"])
    psa_avg_ang_speed = pd.DataFrame(columns=[by,"PSA Average Angular Speed"])

    psa_initial_aspect = pd.DataFrame(columns=[by,"PSA Initial Aspect Ratio"])
    psa_recovery_aspect = pd.DataFrame(columns=[by,"PSA Recovery Aspect Ratio"])
    psa_peak_aspect = pd.DataFrame(columns=[by,"PSA Peak Aspect Ratio"])
    psa_initial_to_peak_aspect = pd.DataFrame(columns=[by,"PSA Initial_to_peak Aspect Ratio"])
    psa_peak_to_recovery_aspect = pd.DataFrame(columns=[by,"PSA Peak_to_recovery Aspect Ratio"])
    psa_avg_aspect = pd.DataFrame(columns=[by,"PSA Average Aspect Ratio"])

    psa_initial_kink = pd.DataFrame(columns=[by,"PSA Initial Kink"])
    psa_recovery_kink = pd.DataFrame(columns=[by,"PSA Recovery Kink"])
    psa_peak_kink = pd.DataFrame(columns=[by,"PSA Peak Kink"])
    psa_initial_to_peak_kink = pd.DataFrame(columns=[by,"PSA Initial_to_peak Kink"])
    psa_peak_to_recovery_kink = pd.DataFrame(columns=[by,"PSA Peak_to_recovery Kink"])
    psa_avg_kink = pd.DataFrame(columns=[by,"PSA Average Kink"])

    psa_initial_curve = pd.DataFrame(columns=[by,"PSA Initial Curve"])
    psa_recovery_curve = pd.DataFrame(columns=[by,"PSA Recovery Curve"])
    psa_peak_curve = pd.DataFrame(columns=[by,"PSA Peak Curve"])
    psa_initial_to_peak_curve = pd.DataFrame(columns=[by,"PSA Initial_to_peak Curve"])
    psa_peak_to_recovery_curve = pd.DataFrame(columns=[by,"PSA Peak_to_recovery Curve"])
    psa_avg_curve = pd.DataFrame(columns=[by,"PSA Average Curve"])

    psa_initial_crab = pd.DataFrame(columns=[by,"PSA Initial Crab"])
    psa_recovery_crab = pd.DataFrame(columns=[by,"PSA Recovery Crab"])
    psa_peak_crab = pd.DataFrame(columns=[by,"PSA Peak Crab"])
    psa_initial_to_peak_crab = pd.DataFrame(columns=[by,"PSA Initial_to_peak Crab"])
    psa_peak_to_recovery_crab = pd.DataFrame(columns=[by,"PSA Peak_to_recovery Crab"])
    psa_avg_crab = pd.DataFrame(columns=[by,"PSA Average Crab"])



    # Collect all into a list
    list_psa_Tstats = [
        psa_initial_speed, psa_recovery_speed, psa_peak_speed,
        psa_initial_to_peak_speed, psa_peak_to_recovery_speed, psa_avg_speed,

        psa_initial_bias, psa_recovery_bias, psa_peak_bias,
        psa_initial_to_peak_bias, psa_peak_to_recovery_bias, psa_avg_bias,

        psa_initial_ang_speed, psa_recovery_ang_speed, psa_peak_ang_speed,
        psa_initial_to_peak_ang_speed, psa_peak_to_recovery_ang_speed, psa_avg_ang_speed,

        psa_initial_aspect, psa_recovery_aspect, psa_peak_aspect,
        psa_initial_to_peak_aspect, psa_peak_to_recovery_aspect, psa_avg_aspect,

        psa_initial_kink, psa_recovery_kink, psa_peak_kink,
        psa_initial_to_peak_kink, psa_peak_to_recovery_kink, psa_avg_kink,

        psa_initial_curve, psa_recovery_curve, psa_peak_curve,
        psa_initial_to_peak_curve, psa_peak_to_recovery_curve, psa_avg_curve,

        psa_initial_crab, psa_recovery_crab, psa_peak_crab,
        psa_initial_to_peak_crab, psa_peak_to_recovery_crab, psa_avg_crab
    ]

    list_psa_metrics = [
    "PSA Initial Instantaneous Speed",
    "PSA Recovery Instantaneous Speed",
    "PSA Peak Instantaneous Speed",
    "PSA Initial_to_peak Instantaneous Speed",
    "PSA Peak_to_recovery Instantaneous Speed",
    "PSA Average Instantaneous Speed",

    "PSA Initial Bias",
    "PSA Recovery Bias",
    "PSA Peak Bias",
    "PSA Initial_to_peak Bias",
    "PSA Peak_to_recovery Bias",
    "PSA Average Bias",

    "PSA Initial Angular Speed",
    "PSA Recovery Angular Speed",
    "PSA Peak Angular Speed",
    "PSA Initial_to_peak Angular Speed",
    "PSA Peak_to_recovery Angular Speed",
    "PSA Average Angular Speed",

    "PSA Initial Aspect Ratio",
    "PSA Recovery Aspect Ratio",
    "PSA Peak Aspect Ratio",
    "PSA Initial_to_peak Aspect Ratio",
    "PSA Peak_to_recovery Aspect Ratio",
    "PSA Average Aspect Ratio",

    "PSA Initial Kink",
    "PSA Recovery Kink",
    "PSA Peak Kink",
    "PSA Initial_to_peak Kink",
    "PSA Peak_to_recovery Kink",
    "PSA Average Kink",

    "PSA Initial Curve",
    "PSA Recovery Curve",
    "PSA Peak Curve",
    "PSA Initial_to_peak Curve",
    "PSA Peak_to_recovery Curve",
    "PSA Average Curve",

    "PSA Initial Crab",
    "PSA Recovery Crab",
    "PSA Peak Crab",
    "PSA Initial_to_peak Crab",
    "PSA Peak_to_recovery Crab",
    "PSA Average Crab"
]
    
    
    return list_psa_Tstats, list_psa_metrics


In [52]:
def TTest(Type, DF_ref, output, by=["Gene", "dataset"]):
    """
    Perform two sample t-test for each unique Gene/dataset column in the Df_ref
    input: 
        - a:column name of values 
        - DF_ref:reference dataframe
        - output: output df to store results in 
        - by: what to group by "Gene" or "dataset"
        
    """
    for a in DF_ref[by].unique():
        Tstat_a = ttest_ind(DF_ref[DF_ref.dataset == a][Type], DF_ref[DF_ref.Allele.isin(["XJ1","N2"])][Type],equal_var=False)[0]
        Tstat_g = ttest_ind(DF_ref[DF_ref.Gene == a][Type], DF_ref[DF_ref.Gene == "N2"][Type],equal_var=False)[0]
        Tstat = Tstat_g if by=="Gene" else Tstat_a
        row = [a, Tstat]
        output.loc[len(output)] = row
    # print(output)

def do_TTest(by=["Gene", "dataset"], baseline=["true", "false", "psa"]):
    """
    Perform TTest function for each unique Gene/dataset column in baseline_output/tap_data
    
    input: 
        - by: what to group by "Gene" or "dataset"
        - baseline: whether or not to use baseline data

    returns: sorted T-statistics dataframe
    """

    if baseline=="true":
        list_Tstats, list_metrics = baseline_metrics(by)
        data = baseline_output
    elif baseline=="false":
        list_Tstats,list_metrics = tap_metrics(by)
        data = tap_data
    else:
        list_Tstats,list_metrics = psa_metrics(by)
        data = psa_data
    for x in data[by].unique():
        if Screen=="Neuron_Genes_Screen":
            condition = x in (["N2"] if by == "Gene" else ["N2_XJ1", "N2_N2"])
        else:
            condition = (x =="N2")
        if condition:
            pass
        else:
            output_gene=data[data[by]==x]
            gene_data=data[data['Date'].isin(output_gene['Date'].unique())]
            if Screen=="Neuron_Genes_Screen":
                gene_data_final = gene_data[gene_data[by].isin(['N2', x])] if by=="Gene" else gene_data[gene_data[by].isin(['N2_N2','N2_XJ1', x])]
            else:
                gene_data_final = gene_data[gene_data[by].isin(['N2', x])]

            for a,b in zip(list_metrics, list_Tstats):
                TTest(a, gene_data_final, b, by) # calls t test function
    
    PD_Tstats=pd.DataFrame()
    for a in list_Tstats:
        b=a.groupby([by], as_index=False).mean()
        if b.columns.values[1] == list_Tstats[0].columns.values[1]:
            PD_Tstats=b
        else:
            PD_Tstats=PD_Tstats.join(b.iloc[:,1])
            
    PD_Tstats=PD_Tstats.set_index(by)
    
    return PD_Tstats
            

### T-stat on Baseline data:

### 3.3.1 Allele-level T-stat analysis of baseline data

In [53]:
warnings.filterwarnings('ignore')

PD_baseline_Tstats_allele = do_TTest("dataset", baseline="true") # get sorted T-statistics DataFrame 

# PD_baseline_Tstats_allele_sorted=PD_baseline_Tstats_allele.sort_index()

PD_baseline_Tstats_allele.head()

Unnamed: 0_level_0,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hipr-1_ok1081,8.720217,17.272716,2.091868,-93.709991,-151.77574,-190.66633,28.561905,84.318945,43.915102,50.559487,35.51648,165.20275
hipr-1_tm10120,-149.130061,-135.875107,-121.209079,-107.524828,-362.066521,-339.738765,-65.349506,24.355089,-47.450876,-134.061984,-39.992945,-52.444177


### 3.3.2 Gene-level T-stat analysis of baseline data

In [54]:
warnings.filterwarnings('ignore')

PD_baseline_Tstats=do_TTest("Gene", baseline="true") # get sorted T-statistics DataFrame 

# PD_baseline_Tstats_sorted=PD_baseline_Tstats.sort_index()

PD_baseline_Tstats.head()

Unnamed: 0_level_0,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hipr-1,-46.583094,-38.802844,-46.56871,-120.799168,-228.506402,-253.233922,1.911753,71.74101,11.424131,-11.893595,2.659493,65.740765


### T-stat analysis for tap-response data:

### 3.3.3 Allele level T-stat analysis of tap response data

In [55]:
warnings.filterwarnings('ignore')

PD_habituation_Tstats_allele = do_TTest("dataset", baseline="false") # get sorted T-statistics DataFrame 

# PD_habituation_Tstats_allele_sorted=PD_habituation_Tstats_allele.sort_index()

PD_habituation_Tstats_allele.head()

Unnamed: 0_level_0,Recovery Duration,Recovery Probability,Recovery Speed,Memory Retention Duration,Memory Retention Probability,Memory Retention Speed,Initial Duration,Initial Probability,Initial Speed,Final Duration,Final Probability,Final Speed,Habituation of Duration,Habituation of Probability,Habituation of Speed
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hipr-1_ok1081,0.196572,3.99498,0.140053,-1.280902,-0.018308,0.859058,-0.551071,-1.173992,1.144826,1.097106,7.13957,0.378627,-1.020111,-4.161169,0.52416
hipr-1_tm10120,1.596165,7.425059,0.666986,0.500673,-1.945015,-1.507782,-0.565598,-4.166082,-5.10381,0.524222,10.158379,-0.429403,-0.630178,-11.167895,-1.96336


### 3.3.4 Gene-level T-stat analysis of Tap response data

In [56]:
warnings.filterwarnings('ignore')

PD_habituation_Tstats = do_TTest("Gene", baseline="false") # get sorted T-statistics DataFrame 

PD_habituation_Tstats_sorted=PD_habituation_Tstats.sort_index()

PD_habituation_Tstats

Unnamed: 0_level_0,Recovery Duration,Recovery Probability,Recovery Speed,Memory Retention Duration,Memory Retention Probability,Memory Retention Speed,Initial Duration,Initial Probability,Initial Speed,Final Duration,Final Probability,Final Speed,Habituation of Duration,Habituation of Probability,Habituation of Speed
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hipr-1,0.839034,6.279144,0.434678,-0.763127,-0.897052,-0.157515,-0.684199,-2.883587,-0.273891,1.096964,8.654085,0.080648,-1.106724,-6.794556,-0.24353


### T-stat analysis for psa data:

### 3.3.5 Allele level T-stat analysis of PSA data

In [57]:
warnings.filterwarnings('ignore')

psa_tstats_allele = do_TTest("dataset", baseline="psa") # get sorted T-statistics DataFrame 

psa_tstats_allele.head()

Unnamed: 0_level_0,PSA Initial Instantaneous Speed,PSA Recovery Instantaneous Speed,PSA Peak Instantaneous Speed,PSA Initial_to_peak Instantaneous Speed,PSA Peak_to_recovery Instantaneous Speed,PSA Average Instantaneous Speed,PSA Initial Bias,PSA Recovery Bias,PSA Peak Bias,PSA Initial_to_peak Bias,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hipr-1_ok1081,0.602504,-3.62639,25.144748,23.150201,12.740997,18.092101,-2.245146,-3.690079,5.0183,-5.796811,...,5.002613,0.630892,29.74538,30.985187,5.929513,1.217049,37.860613,36.124877,76.24622,65.088265
hipr-1_tm10120,-9.440858,-32.441377,-20.068628,-11.168859,-17.755404,-17.124992,-7.810246,-22.932385,-2.201955,5.404033,...,-15.507447,-17.027849,-41.61336,-40.977091,-8.178221,-36.644505,-7.812564,-16.748773,-29.834931,-32.934787


### 3.3.6 Gene-level T-stat analysis of PSA data

In [58]:
warnings.filterwarnings('ignore')

psa_tstats = do_TTest("Gene", baseline="psa") # get sorted T-statistics DataFrame 

psa_tstats.head()

Unnamed: 0_level_0,PSA Initial Instantaneous Speed,PSA Recovery Instantaneous Speed,PSA Peak Instantaneous Speed,PSA Initial_to_peak Instantaneous Speed,PSA Peak_to_recovery Instantaneous Speed,PSA Average Instantaneous Speed,PSA Initial Bias,PSA Recovery Bias,PSA Peak Bias,PSA Initial_to_peak Bias,...,PSA Peak Curve,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hipr-1,-3.907776,-13.993124,2.237667,3.218483,-2.516068,-1.158818,-5.062816,-11.739209,1.318407,-0.486886,...,-4.103168,-7.431258,0.35839,1.256847,1.125706,-11.394267,11.868959,7.60901,6.289317,5.968087


# 4. Merging t-stat data into one dataset

In [59]:
def pop_cols(combined):
    """
    Reorders columns in the combined dataframe. 
    (pops specific columns["Area", "Midline", "Morphwidth", "Angular Speed"] and
    reinserts at different positions)

    input:
        combined: dataframe with columns to be reordered

    returns: 
        NA    
        
    """
    first_col=combined.pop("Area")
    combined.insert(0,"Area",first_col)

    first_col=combined.pop("Midline")
    combined.insert(0,"Midline",first_col)

    first_col=combined.pop("Morphwidth")
    combined.insert(0,"Morphwidth",first_col)

    first_col=combined.pop("Angular Speed")
    combined.insert(5,"Angular Speed",first_col)

def pop_last(combined):
    """
    Reorders the last three columns of the combined dataframe.
    input:
        combined: dataframe with columns to be reordered

    """
    last_col=combined.pop("Spontaneous Recovery of Response Duration")
    combined.insert(26,"Spontaneous Recovery of Response Duration",last_col)

    last_col=combined.pop("Spontaneous Recovery of Response Probability")
    combined.insert(26,"Spontaneous Recovery of Response Probability",last_col)

    last_col=combined.pop("Spontaneous Recovery of Response Speed")
    combined.insert(26,"Spontaneous Recovery of Response Speed",last_col)

    last_col=combined.pop("Memory Retention of Response Duration")
    combined.insert(26,"Memory Retention of Response Duration",last_col)

    last_col=combined.pop("Memory Retention of Response Probability")
    combined.insert(26,"Memory Retention of Response Probability",last_col)

    last_col=combined.pop("Memory Retention of Response Speed")
    combined.insert(26,"Memory Retention of Response Speed",last_col)

def rename_columns(df):
    '''
    Renames columns in the input dataframe
    input:
        combined: dataframe with columns to be renamed   
    returns:
        input dataframe with renamed columns 
    '''
    renames = {
        "Habituation of Duration": "Habituation of Response Duration",
        "Habituation of Probability": "Habituation of Respones Probability",
        "Habituation of Speed": "Habituation of Response Speed",
        "Initial Duration": "Initial Response Duration",
        "Initial Probability": "Initial Response Probability",
        "Initial Speed": "Initial Response Speed",
        "Final Duration": "Final Response Duration",
        "Final Probability": "Final Response Probability",
        "Final Speed": "Final Response Speed",
        "Recovery Duration": "Spontaneous Recovery of Response Duration",
        "Recovery Probability": "Spontaneous Recovery of Response Probability",
        "Recovery Speed": "Spontaneous Recovery of Response Speed",
        "Memory Retention Duration": "Memory Retention of Response Duration",
        "Memory Retention Probability": "Memory Retention of Response Probability",
        "Memory Retention Speed": "Memory Retention of Response Speed"
    }
    return df.rename(columns=renames)

def merge_Tstats(baseline, habituation, by=["Gene", "dataset"], Screen=Screen, psa=False):
    """
    merge baseline and tap response dataframes based on the Gene/dataset
    normalize the merged dataframe and then return it with melted version

    input:
        - baseline: baseline dataframe to merge
        - habituation: habituation dataframe to merge
        - by: what to group by "Gene" or "dataset"
    """

    #merge baseline and habituation data
    combined_Tstats = pd.merge(baseline, habituation, on=by, how='left')
    combined_Tstats = combined_Tstats.sort_index() # sort by index

    # ------------ NORMALISATION STEPS TO BE MOVED TO DASHBOARD -------------------
    # # normalise combined dataframe by subtracting mean and div by sd
    # combined_Tstats_normalized = (combined_Tstats-combined_Tstats.mean())/combined_Tstats.std()

    # if by=="dataset" and Screen=="Neuron_Genes_Screen":
    #     combined_Tstats_normalized_2 = combined_Tstats-combined_Tstats[combined_Tstats.index=="N2_XJ1"].squeeze()
    # else :
    #     combined_Tstats_normalized_2 = combined_Tstats-combined_Tstats[combined_Tstats.index=="N2"].squeeze()  

    pop_cols(combined_Tstats) # reorder columns

    # Skip this step if data = psa
    if not psa:
        #rename columns of combined and normalized df
        combined_Tstats = rename_columns(combined_Tstats)
        # combined_Tstats_normalized_2=rename_columns(combined_Tstats_normalized_2)
        pop_cols(combined_Tstats) # reorder columns
        pop_last(combined_Tstats) # reorder columns

    # -------------- PIVOTING STEPS TO BE MOVED TO DASHBOARD ---------------------
    # # Melt the combined dataframe
    # combined_Tstats_melted=combined_Tstats.reset_index()
    # combined_Tstats_melted=pd.melt(combined_Tstats_melted, id_vars=[by],
    #                             var_name='Metric',
    #                             value_name='T_score')
    
    # # Sort the melted dataframe by T_score
    # combined_Tstats_melted_sorted=combined_Tstats_melted.sort_values(by=['T_score'])

    # # Melt the normalized dataframe
    # combined_Tstats_normalized_melted=combined_Tstats.reset_index()
    # combined_Tstats_normalized_melted=pd.melt(combined_Tstats_normalized_melted, id_vars=[by],
    #                                                var_name='Metric',
    #                                                value_name='T_score')

    # add Screen column to df and its melted version
    combined_Tstats['Screen']=Screen
    # combined_Tstats_normalized_melted['Screen']=Screen

    return combined_Tstats#, combined_Tstats_normalized_melted



## 4.1 Gene-level

- Pass Tap and baseline through merge_Tstats() as df1
- Pass PSA and baseline through merge_Tstats()as df2
- pd.merge df1 and df2 using all columns of baseline

In [60]:
# Baseline + Tap
combined_Tstats = merge_Tstats(PD_baseline_Tstats, PD_habituation_Tstats, "Gene")

In [61]:
# Baseline + PSA 
combined_Tstats_psa = merge_Tstats(
    PD_baseline_Tstats, psa_tstats, by="Gene", psa=True
)

In [62]:
# Baseline + Tap + PSA
final_tstat = pd.merge(combined_Tstats, combined_Tstats_psa, on = PD_baseline_Tstats.columns.to_list(), how = 'outer')

final_tstat.head()

Unnamed: 0,Morphwidth,Midline,Area,Instantaneous Speed,Interval Speed,Angular Speed,Bias,Aspect Ratio,Kink,Curve,...,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab,Screen_y
0,-120.799168,-228.506402,-253.233922,-46.583094,-38.802844,1.911753,-46.56871,71.74101,11.424131,-11.893595,...,-7.431258,0.35839,1.256847,1.125706,-11.394267,11.868959,7.60901,6.289317,5.968087,PD_Screen
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,PD_Screen


In [63]:
# # Baseline + Tap + PSA melted
# final_tstat_melted = pd.concat([combined_Tstats_normalized_melted, combined_Tstats_psa_melted]).drop_duplicates()

# final_tstat_melted.head()

## 4.2 Allele level 


- Pass Tap and baseline through merge_Tstats() as df3
- Pass PSA and baseline through merge_Tstats()as df4
- pd.merge df3 and df4 using all columns of basline

In [64]:
# Baseline + Tap
combined_Tstats_allele = merge_Tstats(PD_baseline_Tstats_allele,PD_habituation_Tstats_allele, "dataset")

In [65]:
# Baseline + PSA 
combined_Tstats_psa_allele = merge_Tstats(
    PD_baseline_Tstats_allele, psa_tstats_allele, by="dataset", psa=True
)

In [66]:
# Baseline + Tap + PSA
final_tstat_allele = pd.merge(combined_Tstats_allele, combined_Tstats_psa_allele, on = PD_baseline_Tstats_allele.columns.to_list(), how = 'outer')

final_tstat_allele.head()

Unnamed: 0,Morphwidth,Midline,Area,Instantaneous Speed,Interval Speed,Angular Speed,Bias,Aspect Ratio,Kink,Curve,...,PSA Initial_to_peak Curve,PSA Peak_to_recovery Curve,PSA Average Curve,PSA Initial Crab,PSA Recovery Crab,PSA Peak Crab,PSA Initial_to_peak Crab,PSA Peak_to_recovery Crab,PSA Average Crab,Screen_y
0,-107.524828,-362.066521,-339.738765,-149.130061,-135.875107,-65.349506,-121.209079,24.355089,-47.450876,-134.061984,...,-17.027849,-41.61336,-40.977091,-8.178221,-36.644505,-7.812564,-16.748773,-29.834931,-32.934787,PD_Screen
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,PD_Screen
2,-93.709991,-151.77574,-190.66633,8.720217,17.272716,28.561905,2.091868,84.318945,43.915102,50.559487,...,0.630892,29.74538,30.985187,5.929513,1.217049,37.860613,36.124877,76.24622,65.088265,PD_Screen


In [67]:
# # Baseline + Tap + PSA melted
# final_tstat_melted_allele = pd.concat([combined_Tstats_normalized_melted_allele, combined_Tstats_psa_melted_allele]).drop_duplicates()

# final_tstat_melted_allele.head()

# 5. Save data to database (sqlite3)

#### A janky way to add data and update the sql 

1. Read table to pd.DataFrame
2. Add new data to pd.DataFrame
3. Replace old table with newly updated pd.DataFrame

# Primary Keys For Each SQL Table:

####  -- Gene_Allele_WormBaseID:
WBGene, WBAllele
#### -- alleleMSD:
dataset, Screen
#### -- gene_MSD:
Gene, Screen
#### -- allele_profile_data:
dataset, Metric, Screen
#### -- gene_profile_data:
Gene, Metric, Screen
#### -- tap_baseline_data:
Time, Plate_id, Date, Screen, dataset
#### -- tap_response_data:
plate, Date, Plate_id, Screen, taps, dataset
#### -- tstat_allele_data:
dataset, Screen
#### -- tstat_gene_data:
Gene, Screen

In [None]:
# Code might be useful as reference for accessing server???? Keep here just in case.

# # tap_url = 'https://osf.io/du9bj/files/osfstorage/650a2f9f1e76a4230e8a99a5?raw=true'
# tap_url='https://github.com/MyYummyPancake/NRSC510B/blob/main/tap_output.csv?raw=true'
# # s=requests.get(tap_url).content
# # tap_output=pd.read_csv(io.StringIO(s.decode('utf-8')))
# tap_output=pd.read_csv(tap_url, on_bad_lines='skip', index_col=0)
# print(tap_output)

In [None]:
print(tap_output.head(5))
print(baseline_output.head(5))

tap_output.Screen = Screen
baseline_output.Screen = Screen

print(tap_output.head(5))
print(baseline_output.head(5))

In [None]:

### This code will connect to PostgreSQL database and write non-duplicate data into the database tables.

# Loads database config values from database.ini file and validates that user and password are set.
config = load_config()
if (config['user'] == "" or config['password'] == ""):
    print("Please set your user and password in the database.ini file.")
    sys.exit(1)
    
# Creates a connection pool to PostgreSQL database using SQLAlchemy.
engine = create_engine(f"postgresql+psycopg://{config['user']}:{config['password']}@{config['host']}:{config['port']}/{config['database']}")

# Function to insert data into PostgreSQL table, skipping duplicates based on primary keys.
def postgres_skip_on_duplicate(pd_table, conn, keys, data_iter):
    data = [dict(zip(keys,row)) for row in data_iter]
    conn.execute(insert(pd_table.table).on_conflict_do_nothing(), data)

# --------- Write the dataframes to PostgreSQL tables -----------

# Complete tap response data
print("working on tap_output:") 
tap_output.to_sql('tap_response_data', engine, if_exists='append', index=False, method=postgres_skip_on_duplicate)

# Complete baseline data
print("working on tap_baseline_data:") 
baseline_output.to_sql('tap_baseline_data', engine, if_exists='append', index=False, method=postgres_skip_on_duplicate)

# Baseline + Tap + PSA combined tstat data by Gene
print("working on tstat_gene_data")
final_tstat.reset_index().to_sql('tstat_gene_data', engine, if_exists='append', index=False, method=postgres_skip_on_duplicate)

# Baseline + Tap + PSA combined tstat data by Allele
print("working on tstat_allele_data")
final_tstat_allele.reset_index().to_sql('tstat_allele_data', engine, if_exists='append', index=False, method=postgres_skip_on_duplicate)

# # Melted Baseline + Tap + PSA combined tstat data by Gene
# print("working on gene_profile_data")
# final_tstat_melted.to_sql('gene_profile_data', engine, if_exists='append', index=False, method=postgres_skip_on_duplicate)

# # Melted Baseline + Tap + PSA combined tstat data by Allele
# print("working on allele_profile_data")
# final_tstat_melted_allele.to_sql('allele_profile_data', engine, if_exists='append', index=False, method=postgres_skip_on_duplicate)

# MSD Baseline + Tap by Gene
print("working on gene_MSD")
combined_MSD.to_sql('gene_MSD', engine, if_exists='append', index=False, method=postgres_skip_on_duplicate)

# MSD Baseline + Tap by Allele
print("working on allele_MSD")
allele_combined_MSD.to_sql('allele_MSD', engine, if_exists='append', index=False, method=postgres_skip_on_duplicate)

# Summarised PSA data (speed, kink, curve, etc.)
print("working on psa_data:") 
psa_data.to_sql('psa_summarised_data', engine, if_exists='append', index=False, method=postgres_skip_on_duplicate)

In [None]:
# # USE THIS CELL TO UPDATE ALL THE NEED TALBES (Also have baseline_output on the second line)

# conn=sqlite3.connect('/Users/lavanya/Desktop/Lavanya_Test/data_updated2.db')

# tap_output.to_sql('tap_response_data', conn, if_exists='append', index=False)

# baseline_output.to_sql('tap_baseline_data', conn, if_exists='append', index=False)

# combined_Tstats_normalize_2.reset_index().to_sql('tstat_gene_data', conn, if_exists='append', index=False)

# combined_Tstats_normalize_allele_2.reset_index().to_sql('tstat_allele_data', conn, if_exists='append', index=False)

# combined_Tstats_normalized_melted.to_sql('gene_profile_data', conn, if_exists='append', index=False)

# combined_Tstats_normalized_melted_allele.to_sql('allele_profile_data', conn, if_exists='append', index=False)

# combined_MSD.to_sql('gene_MSD', conn, if_exists='append', index=False)

# allele_combined_MSD.to_sql('allele_MSD', conn, if_exists='append', index=False)

# # combined_Tstats_melted_sorted.to_sql('allele_phenotype_data', conn, if_exists='replace', index=False)

# print(conn.total_changes)

# conn.close()


# # Want to test edge cases of pd.to_sql functionality#############