# 1. Imports and File selection 

In [36]:
import sqlite3
from sqlite3 import Error
from sqlite3 import IntegrityError
from ipyfilechooser import FileChooser
import pandas as pd
import io
import requests
import numpy
from scipy import stats
import math
from scipy.stats import ttest_ind
import ipywidgets as widgets
import tqdm
import warnings

In [3]:
# Folder in which the file-chooser widgets open.
# NOTE(review): '/Users' is an absolute, machine-specific location; confirm it
# exists on the machine running this notebook.
starting_directory = '/Users'
# Widget used to pick the baseline .csv (read later through `.selected`).
baseline_chooser = FileChooser(starting_directory)
display(baseline_chooser)

FileChooser(path='/Users', filename='', title='', show_hidden=False, select_desc='Select', change_desc='Change…

In [4]:
# Second chooser, for the tap-response .csv; it opens in the same starting directory.
tap_chooser=FileChooser(starting_directory)
display(tap_chooser)

FileChooser(path='/Users', filename='', title='', show_hidden=False, select_desc='Select', change_desc='Change…

In [5]:
# Screen names offered in the selector. Later cells branch on the chosen value:
# 'Neuron_Genes_Screen' and 'G-Proteins_Screen' get special handling.
screens = ['PD_Screen', 'ASD_Screen', 'G-Proteins_Screen', 'Glia_Genes_Screen', 'Neuron_Genes_Screen']

# The first entry is pre-selected.
screen_chooser = widgets.Select(options=screens, value=screens[0], description='Screen:')
display(screen_chooser)

Select(description='Screen:', options=('PD_Screen', 'ASD_Screen', 'G-Proteins_Screen', 'Glia_Genes_Screen', 'N…

In [6]:
# Snapshot the widget selections. `Screen` is read as a global by the analysis
# functions defined further down, so re-run this cell after changing a widget.
Screen=screen_chooser.value
folder_path=baseline_chooser.selected_path
print(folder_path)

/Users/gurmehak/Documents/RankinLab/Test_Datasets/PDScreen_TapHab_August15_2022


In [7]:
# Read the baseline file picked in the baseline FileChooser.
# `selected` stays None until a file has been chosen, so fail early with an
# actionable message instead of letting read_csv complain about a NoneType path.
if baseline_chooser.selected is None:
    raise ValueError(
        "No baseline file selected: pick a .csv in the baseline FileChooser and re-run this cell."
    )

# The first CSV column becomes the row index; the stored 'index' column is dropped.
baseline_output = pd.read_csv(baseline_chooser.selected, index_col=0).drop(columns=['index'])

print(f"\nShape of the baseline .csv file: {baseline_output.shape}")

# Print the first five rows of the file
baseline_output.head()


Shape of the baseline .csv file: (30487, 21)


Unnamed: 0,Time,n,Number,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,...,Kink,Curve,Crab,Pathlength,Plate_id,Date,Screen,dataset,Gene,Allele
0,490.016,14,12,0.0823,0.1195,0.25,0.1078,1.0908,0.142641,6.1,...,31.1,25.0,0.0066,11.893,B0811ab,20220815,PD_Screen,N2,N2,N2
1,490.056,14,12,0.0736,0.1024,0.25,0.1059,1.088,0.14015,5.3,...,30.9,24.7,0.0064,11.896,B0811ab,20220815,PD_Screen,N2,N2,N2
2,490.103,14,12,0.0784,0.1024,0.25,0.105,1.0914,0.138935,5.2,...,31.0,24.6,0.0057,11.898,B0811ab,20220815,PD_Screen,N2,N2,N2
3,490.144,14,12,0.097,0.1118,0.25,0.1054,1.0935,0.140575,5.8,...,30.6,24.5,0.0103,11.901,B0811ab,20220815,PD_Screen,N2,N2,N2
4,490.186,14,12,0.0994,0.1197,0.25,0.111,1.1026,0.146894,5.6,...,31.4,24.4,0.0097,11.904,B0811ab,20220815,PD_Screen,N2,N2,N2


In [8]:
# Read the tap file picked in the tap FileChooser.
# `selected` stays None until a file has been chosen, so fail early with an
# actionable message instead of letting read_csv complain about a NoneType path.
if tap_chooser.selected is None:
    raise ValueError(
        "No tap file selected: pick a .csv in the tap FileChooser and re-run this cell."
    )

# The first CSV column becomes the row index.
tap_output = pd.read_csv(tap_chooser.selected, index_col=0)

print(f"\nShape of the tap .csv file: {tap_output.shape}")

# Print the first five rows of the file
tap_output.head()


Shape of the tap .csv file: (395, 13)


Unnamed: 0,time,dura,dist,prob,speed,plate,Date,Plate_id,Screen,taps,dataset,Gene,Allele
0,599.985,2.83,0.696,0.928571,0.245936,1,20220815,B0811ab,PD_Screen,1.0,N2,N2,N2
1,609.993,2.98,0.746,0.857143,0.250336,1,20220815,B0811ab,PD_Screen,2.0,N2,N2,N2
2,619.699,1.97,0.536,0.8,0.272081,1,20220815,B0811ab,PD_Screen,3.0,N2,N2,N2
3,629.956,2.57,0.686,0.9,0.266926,1,20220815,B0811ab,PD_Screen,4.0,N2,N2,N2
4,639.957,1.34,0.383,0.909091,0.285821,1,20220815,B0811ab,PD_Screen,5.0,N2,N2,N2


# 2. DataFrame preparation

In [9]:
# Dataframe for first tap: one row per plate, holding the initial response.
# reset_index(drop=True) discards the old row labels directly; the previous
# reset_index().drop(columns="index") only worked while the index was unnamed.
PD_first_tap = (
    tap_output[(tap_output.taps==1)]
    .reset_index(drop=True)
    .rename(columns={"dura": "init_dura", "prob": "init_prob", "speed": "init_speed"}, errors="raise")
)

PD_first_tap.head()

Unnamed: 0,time,init_dura,dist,init_prob,init_speed,plate,Date,Plate_id,Screen,taps,dataset,Gene,Allele
0,599.985,2.83,0.696,0.928571,0.245936,1,20220815,B0811ab,PD_Screen,1.0,N2,N2,N2
1,599.991,2.14,0.524,0.9375,0.24486,2,20220815,A0811aa,PD_Screen,1.0,N2,N2,N2
2,599.981,2.96,0.783,1.0,0.264527,3,20220815,A0811ad,PD_Screen,1.0,N2,N2,N2
3,599.962,2.38,0.501,0.913043,0.210504,4,20220815,B0811ae,PD_Screen,1.0,N2,N2,N2
4,599.972,2.52,0.622,0.863636,0.246825,5,20220815,C0811ac,PD_Screen,1.0,N2,N2,N2


In [10]:
# Dataframe for recovery taps (tap number 31). It may be empty when a dataset has
# no recovery tap; the merge cell further down checks `PD_recov_taps.empty`.
# reset_index(drop=True) discards the old row labels directly; the previous
# reset_index().drop(columns="index") only worked while the index was unnamed.
# errors="raise" matches the first-tap and final-tap cells, so a missing source
# column is reported here rather than surfacing later as a missing recov_* column.
PD_recov_taps = (
    tap_output[(tap_output.taps==31)]
    .reset_index(drop=True)
    .rename(columns={"dura": "recov_dura", "prob": "recov_prob", "speed":"recov_speed"}, errors="raise")
)

PD_recov_taps.head()

Unnamed: 0,time,recov_dura,dist,recov_prob,recov_speed,plate,Date,Plate_id,Screen,taps,dataset,Gene,Allele
0,1189.987,2.61,0.464,0.545455,0.177778,1,20220815,B0811ab,PD_Screen,31.0,N2,N2,N2
1,1189.979,1.97,0.445,0.692308,0.225888,2,20220815,A0811aa,PD_Screen,31.0,N2,N2,N2
2,1189.962,1.79,0.61,0.882353,0.340782,3,20220815,A0811ad,PD_Screen,31.0,N2,N2,N2
3,1189.978,1.7,0.44,0.761905,0.258824,4,20220815,B0811ae,PD_Screen,31.0,N2,N2,N2
4,1189.966,2.01,0.561,0.607143,0.279104,5,20220815,C0811ac,PD_Screen,31.0,N2,N2,N2


In [11]:
# Final response level: per-plate mean over taps 28, 29 and 30 (both ends inclusive).
is_final_tap = tap_output.taps.between(28, 30)
plate_keys = ["dataset", "Date","Plate_id","Screen","Gene","Allele","plate"]

PD_final_taps = tap_output[is_final_tap].groupby(plate_keys).mean().reset_index()
PD_final_taps = PD_final_taps.rename(
    columns={"dura": "final_dura", "prob": "final_prob", "speed": "final_speed"},
    errors="raise",
)

PD_final_taps.head()

Unnamed: 0,dataset,Date,Plate_id,Screen,Gene,Allele,plate,time,final_dura,dist,final_prob,final_speed,taps
0,N2,20220815,A0811aa,PD_Screen,N2,N2,2,879.973,0.99,0.233333,0.302832,0.221688,29.0
1,N2,20220815,A0811ad,PD_Screen,N2,N2,3,879.989333,0.6,0.124,0.319444,0.202159,29.0
2,N2,20220815,B0811ab,PD_Screen,N2,N2,1,879.936333,1.233333,0.206333,0.355556,0.162658,29.0
3,N2,20220815,B0811ae,PD_Screen,N2,N2,4,879.99,0.846667,0.196,0.314762,0.23771,29.0
4,N2,20220815,C0811ac,PD_Screen,N2,N2,5,879.968333,1.346667,0.341,0.381481,0.248545,29.0


In [12]:
# Habituation levels: pair each plate's first-tap response with its final-tap
# average, then express habituation as (initial - final) for each measure.

first_vs_final = PD_first_tap.merge(
    PD_final_taps,
    on=['dataset', 'plate', "Plate_id", "Screen", "Gene", "Allele", "Date"],
    how='left',
)

# Bookkeeping columns duplicated by the merge are not needed downstream;
# plates with any missing value are discarded.
first_vs_final = first_vs_final.drop(
    columns=['time_x','time_y','dist_x','dist_y', 'taps_x', 'taps_y']
).dropna()

PD_habit_levels = first_vs_final.assign(
    habit_dura=lambda d: d['init_dura'] - d['final_dura'],
    habit_prob=lambda d: d['init_prob'] - d['final_prob'],
    habit_speed=lambda d: d['init_speed'] - d['final_speed'],
)

In [13]:
# Attach the recovery tap to the habituation table and derive the recovery and
# memory-retention measures.

merge_keys = ['dataset','plate',"Plate_id","Screen","Gene","Allele","Date"]

# With no recovery rows at all an outer merge is used; otherwise a left merge
# keeps exactly the plates present in PD_habit_levels.
merge_how = 'outer' if PD_recov_taps.empty else 'left'
PD_habituation = pd.merge(PD_habit_levels, PD_recov_taps, on=merge_keys, how=merge_how)

# These two screens are allowed to keep plates that lack recovery values.
recovery_optional = Screen in ['Neuron_Genes_Screen', 'G-Proteins_Screen']

if not recovery_optional:
    PD_habituation = PD_habituation.dropna()

# Spontaneous recovery: percent change of the recovery response relative to the initial one.
recovery_columns = {
    'recovery_dura': ('recov_dura', 'init_dura'),
    'recovery_prob': ('recov_prob', 'init_prob'),
    'recovery_speed': ('recov_speed', 'init_speed'),
}
for new_col, (recov_col, init_col) in recovery_columns.items():
    PD_habituation[new_col] = (
        (PD_habituation[recov_col] - PD_habituation[init_col]) / PD_habituation[init_col] * 100
    )

# Memory retention: recovery response minus the final (habituated) response.
retention_columns = {
    'memory_retention_dura': ('recov_dura', 'final_dura'),
    'memory_retention_prob': ('recov_prob', 'final_prob'),
    'memory_retention_speed': ('recov_speed', 'final_speed'),
}
for new_col, (recov_col, final_col) in retention_columns.items():
    PD_habituation[new_col] = PD_habituation[recov_col] - PD_habituation[final_col]


# `tap_data` is the cleaned table used from here on. For the recovery-optional
# screens only the non-recovery columns must be complete; every other screen
# requires a fully populated row.
if recovery_optional:
    tap_data = PD_habituation.dropna(
        subset=['init_dura', 'init_prob', 'init_speed', 'plate', 'Date', 'Plate_id',
                'Screen', 'dataset', 'Gene', 'Allele', 'final_dura', 'final_prob',
                'final_speed', 'habit_dura', 'habit_prob', 'habit_speed']
    )
else:
    tap_data = PD_habituation.dropna()


# Display final dataframe
tap_data


Unnamed: 0,init_dura,init_prob,init_speed,plate,Date,Plate_id,Screen,dataset,Gene,Allele,...,dist,recov_prob,recov_speed,taps,recovery_dura,recovery_prob,recovery_speed,memory_retention_dura,memory_retention_prob,memory_retention_speed
0,2.83,0.928571,0.245936,1,20220815,B0811ab,PD_Screen,N2,N2,N2,...,0.464,0.545455,0.177778,31.0,-7.773852,-41.258741,-27.713921,1.376667,0.189899,0.015119
1,2.14,0.9375,0.24486,2,20220815,A0811aa,PD_Screen,N2,N2,N2,...,0.445,0.692308,0.225888,31.0,-7.943925,-26.153846,-7.747898,0.98,0.389475,0.0042
2,2.96,1.0,0.264527,3,20220815,A0811ad,PD_Screen,N2,N2,N2,...,0.61,0.882353,0.340782,31.0,-39.527027,-11.764706,28.826958,1.19,0.562908,0.138623
3,2.38,0.913043,0.210504,4,20220815,B0811ae,PD_Screen,N2,N2,N2,...,0.44,0.761905,0.258824,31.0,-28.571429,-16.553288,22.954092,0.853333,0.447143,0.021114
4,2.52,0.863636,0.246825,5,20220815,C0811ac,PD_Screen,N2,N2,N2,...,0.561,0.607143,0.279104,31.0,-20.238095,-29.699248,13.077698,0.663333,0.225661,0.03056
5,2.07,0.8,0.396135,1,20220815,A0811bc,PD_Screen,hipr-1_ok1081,hipr-1,ok1081,...,0.599,1.0,0.323784,31.0,-10.628019,25.0,-18.264338,-0.33,0.5,0.149013
6,2.5,0.578947,0.2196,2,20220815,C0811bb,PD_Screen,hipr-1_ok1081,hipr-1,ok1081,...,0.832,0.75,0.317557,31.0,4.8,29.545455,44.607127,1.476667,0.241422,0.052565
7,1.63,1.0,0.26319,3,20220815,B0811bd,PD_Screen,hipr-1_ok1081,hipr-1,ok1081,...,0.318,1.0,0.230435,31.0,-15.337423,0.0,-12.445525,0.39,0.393939,0.045252
8,3.24,0.9,0.269753,4,20220815,C0811be,PD_Screen,hipr-1_ok1081,hipr-1,ok1081,...,0.507,0.888889,0.32293,31.0,-51.54321,-1.234568,19.713157,0.506667,0.310185,0.042794
9,2.28,0.7,0.175877,1,20220815,B0811cc,PD_Screen,hipr-1_tm10120,hipr-1,tm10120,...,0.489,0.88,0.216372,31.0,-0.877193,25.714286,23.024298,0.953333,0.213362,0.028495


In [14]:
# # tap_url = 'https://osf.io/du9bj/files/osfstorage/650a2f9f1e76a4230e8a99a5?raw=true'
# tap_url='https://github.com/MyYummyPancake/NRSC510B/blob/main/tap_output.csv?raw=true'
# # s=requests.get(tap_url).content
# # tap_output=pd.read_csv(io.StringIO(s.decode('utf-8')))
# tap_output=pd.read_csv(tap_url, on_bad_lines='skip', index_col=0)
# print(tap_output)

In [15]:
# print(tap_output['Gene'].unique())
# print(len(tap_output['Gene'].unique()))
# print(baseline_output['Gene'].unique())
# print(len(baseline_output['Gene'].unique()))

In [16]:
# for x in tap_output['Gene'].unique():
#     tap_output_gene=tap_output[tap_output['Gene']==x]
#     gene_tap_data=tap_output[tap_output['Date'].isin(tap_output_gene['Date'].unique())]
#     gene_tap_data_final=gene_tap_data[gene_tap_data['Gene'].isin(['N2', x])]
#     gene_tap_data_final['taps']=gene_tap_data_final['taps'].astype(int)


# for x in baseline_output['Gene'].unique():
#     baseline_output_gene=baseline_output[baseline_output['Gene']==x]
#     gene_baseline_data=baseline_output[baseline_output['Date'].isin(baseline_output_gene['Date'].unique())]
#     gene_baseline_data_final=gene_baseline_data[gene_baseline_data['Gene'].isin(['N2', x])]
#     for a,b in zip(list_baseline_metrics, list_baseline_Tstats):
#         TTest_Allele(x, a, baseline_output, b)


# 3. Run Statistics (T-Test and sample-mean distance) on Data

## 3.1 Generate dataframes conditioned by `baseline` (True/False) and `allele` (True/False)

In [17]:
def get_output_byplate(output, baseline, allele):
    """
    Average every measurement per plate and keep a single label column.

    Rows are grouped by 'Plate_id', 'Date', 'Screen', 'dataset', 'Gene' and
    'Allele'; each remaining column is averaged within the group. Identifier and
    bookkeeping columns are then removed so that only one label column plus the
    phenotype columns remain.

    Parameters:
        output (pd.DataFrame): baseline_output or tap_data
        baseline (boolean): True for baseline data, False for tap-response data
        allele (boolean): True keeps 'dataset' as the label column (allele level),
            False keeps 'Gene' (gene level)

    Returns:
        A DataFrame with one row per plate: the label column followed by the
        plate-level means.
    """
    plate_keys = ['Plate_id','Date','Screen','dataset','Gene','Allele']

    if baseline:
        # identifiers and per-frame bookkeeping of the baseline table
        unwanted = ['Plate_id','n','Number','Time','Screen','Date','Allele']
    else:
        # identifiers, bookkeeping and raw recovery values of the tap table
        unwanted = ['Plate_id','Screen','Date','Allele','dist','plate','time',
                    'taps','recov_dura','recov_prob','recov_speed']

    # Remove whichever of the two label columns is not the requested level.
    unwanted = unwanted + ['Gene' if allele else 'dataset']

    plate_means = output.groupby(by=plate_keys, as_index=False).mean()

    return plate_means.drop(columns=unwanted)

#### 3.1.1 `baseline` = True, `allele` = False

In [18]:
# Plate-level means of the baseline measures, labelled by `Gene`
# (allele=False removes the `dataset` column).
baseline_output_byplate=get_output_byplate(baseline_output, baseline= True, allele=False)

print(f"Shape: {baseline_output_byplate.shape}")

baseline_output_byplate.head()

Shape: (13, 13)


Unnamed: 0,Gene,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
0,N2,0.033536,0.032413,0.030834,0.116511,1.231836,0.169018,1.909915,0.261825,47.466667,29.442805,0.005735,8.756541
1,N2,0.05908,0.066365,0.203949,0.111072,1.19486,0.153823,3.502214,0.261859,49.367271,28.56715,0.008075,4.712773
2,hipr-1,0.111014,0.130682,0.426766,0.104552,1.100872,0.136805,10.201907,0.406782,63.183664,35.107551,0.016845,21.158225
3,hipr-1,0.0272,0.028163,0.00846,0.09632,0.937245,0.108482,2.342027,0.279167,48.979768,27.0252,0.004914,9.26598
4,N2,0.08401,0.103804,0.275993,0.108908,1.085004,0.14279,5.541694,0.272735,44.438571,28.031377,0.011272,8.938643


#### 3.1.2 `baseline` = False, `allele` = False

In [19]:
# Plate-level means of the tap-response measures, labelled by `Gene`
# (allele=False removes the `dataset` column).
tap_data_byplate=get_output_byplate(tap_data, baseline=False, allele=False)

print(f"Shape: {tap_data_byplate.shape}")

tap_data_byplate.head()

Shape: (13, 16)


Unnamed: 0,Gene,init_dura,init_prob,init_speed,final_dura,final_prob,final_speed,habit_dura,habit_prob,habit_speed,recovery_dura,recovery_prob,recovery_speed,memory_retention_dura,memory_retention_prob,memory_retention_speed
0,N2,2.14,0.9375,0.24486,0.99,0.302832,0.221688,1.15,0.634668,0.023172,-7.943925,-26.153846,-7.747898,0.98,0.389475,0.0042
1,N2,2.96,1.0,0.264527,0.6,0.319444,0.202159,2.36,0.680556,0.062368,-39.527027,-11.764706,28.826958,1.19,0.562908,0.138623
2,hipr-1,2.07,0.8,0.396135,2.18,0.5,0.174771,-0.11,0.3,0.221365,-10.628019,25.0,-18.264338,-0.33,0.5,0.149013
3,hipr-1,2.53,0.826087,0.204348,0.976667,0.734271,0.203894,1.553333,0.091816,0.000454,-4.743083,21.052632,3.354816,1.433333,0.265729,0.00731
4,N2,2.83,0.928571,0.245936,1.233333,0.355556,0.162658,1.596667,0.573016,0.083278,-7.773852,-41.258741,-27.713921,1.376667,0.189899,0.015119


#### 3.1.3 `baseline` = True, `allele` = True

In [20]:
# Plate-level means of the baseline measures, labelled by `dataset`
# (allele=True removes the `Gene` column).
baseline_output_allele_byplate = get_output_byplate(baseline_output,baseline=True, allele=True)

print(f"Shape: {baseline_output_allele_byplate.shape}")

baseline_output_allele_byplate.head()

Shape: (13, 13)


Unnamed: 0,dataset,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
0,N2,0.033536,0.032413,0.030834,0.116511,1.231836,0.169018,1.909915,0.261825,47.466667,29.442805,0.005735,8.756541
1,N2,0.05908,0.066365,0.203949,0.111072,1.19486,0.153823,3.502214,0.261859,49.367271,28.56715,0.008075,4.712773
2,hipr-1_ok1081,0.111014,0.130682,0.426766,0.104552,1.100872,0.136805,10.201907,0.406782,63.183664,35.107551,0.016845,21.158225
3,hipr-1_tm10120,0.0272,0.028163,0.00846,0.09632,0.937245,0.108482,2.342027,0.279167,48.979768,27.0252,0.004914,9.26598
4,N2,0.08401,0.103804,0.275993,0.108908,1.085004,0.14279,5.541694,0.272735,44.438571,28.031377,0.011272,8.938643


#### 3.1.4 `baseline` = False, `allele` = True

In [21]:
# Plate-level means of the tap-response measures, labelled by `dataset`
# (allele=True removes the `Gene` column).
tap_data_allele_byplate = get_output_byplate(tap_data, baseline=False, allele=True)

print(f"Shape: {tap_data_allele_byplate.shape}")

tap_data_allele_byplate.head()

Shape: (13, 16)


Unnamed: 0,dataset,init_dura,init_prob,init_speed,final_dura,final_prob,final_speed,habit_dura,habit_prob,habit_speed,recovery_dura,recovery_prob,recovery_speed,memory_retention_dura,memory_retention_prob,memory_retention_speed
0,N2,2.14,0.9375,0.24486,0.99,0.302832,0.221688,1.15,0.634668,0.023172,-7.943925,-26.153846,-7.747898,0.98,0.389475,0.0042
1,N2,2.96,1.0,0.264527,0.6,0.319444,0.202159,2.36,0.680556,0.062368,-39.527027,-11.764706,28.826958,1.19,0.562908,0.138623
2,hipr-1_ok1081,2.07,0.8,0.396135,2.18,0.5,0.174771,-0.11,0.3,0.221365,-10.628019,25.0,-18.264338,-0.33,0.5,0.149013
3,hipr-1_tm10120,2.53,0.826087,0.204348,0.976667,0.734271,0.203894,1.553333,0.091816,0.000454,-4.743083,21.052632,3.354816,1.433333,0.265729,0.00731
4,N2,2.83,0.928571,0.245936,1.233333,0.355556,0.162658,1.596667,0.573016,0.083278,-7.773852,-41.258741,-27.713921,1.376667,0.189899,0.015119


In [22]:
# tap_data_allele_byplate[tap_data_allele_byplate.dataset=='N2_XJ1']

## 3.2 Calculate Mean Distances and CIs

In [23]:

def extract_phenotypes(df):
    '''
    Break a wide table into one two-column table per phenotype.

    input:
        df (pd.DataFrame): first column is the label column, every further
            column is a phenotype

    returns:
        list of DataFrames, one per phenotype column and in the same order,
        each an independent copy holding [label column, phenotype column]
    '''
    label_col = df.columns[0]
    phenotype_cols = df.columns[1:]

    return [df[[label_col, phenotype]].copy() for phenotype in phenotype_cols]



def ci95(df):
    """
    Append the bounds of a two-sided 95% Student-t confidence interval.

    input:
        df (pd.DataFrame): columns are a two-level MultiIndex (metric, statistic).
            For every metric the statistics 'mean', 'count' and 'sem' (standard
            error of the mean, not the standard deviation) must be present.
            A top-level entry called 'Gene' is skipped.

    returns:
        The same DataFrame object, modified in place: for each metric the columns
        (metric, 'ci95_hi') and (metric, 'ci95_lo') are added, in that order.
        The interval is centred on 'mean', scaled by 'sem', with count - 1
        degrees of freedom.
    """
    # Snapshot of the top-level labels, taken before any column is added.
    for metric in df.columns.levels[0]:
        if metric == 'Gene':
            continue

        summary = df[metric]
        # One vectorised call covers every row. The confidence level is passed
        # positionally because the keyword was renamed between SciPy releases.
        lower, upper = stats.t.interval(
            0.95,
            df=summary['count'].to_numpy() - 1,
            loc=summary['mean'].to_numpy(),
            scale=summary['sem'].to_numpy(),
        )
        df[metric, 'ci95_hi'] = upper
        df[metric, 'ci95_lo'] = lower

    return df



def calculate_MSD(list_of_dfs, by):
    """
    Summarise each phenotype per group and express it relative to the N2 control.

    NOTE(review): this notebook defines calculate_MSD a second time in a later
    cell; after a top-to-bottom run that later definition is the one in effect.
    Keep a single definition.

    input:
        list_of_dfs: list of two-column DataFrames [label column, phenotype column]
        by: name of the label column to group on ("Gene" or "dataset")

    returns:
        list of DataFrames, one per input, indexed by `by`, with MultiIndex
        columns (phenotype, stat) for stat in mean, count, sem, ci95_hi, ci95_lo.
        The control mean has been subtracted from mean, ci95_hi and ci95_lo.

    raises:
        IndexError: when no control row is present for a phenotype.
    """
    # Labels that identify the control group. Only the Neuron screen at the
    # dataset level uses the two named N2 controls.
    if Screen == "Neuron_Genes_Screen" and by != "Gene":
        control_labels = ['N2_XJ1', 'N2_N2']
    else:
        control_labels = ['N2']

    new_list_of_dfs = []

    for df in list_of_dfs:
        # The second column holds the phenotype values.
        pheno_col = df.columns[1]

        # Named `summary` rather than `stats` so the scipy.stats import is not shadowed.
        summary = df.groupby(by)[pheno_col].agg(['mean', 'count', 'sem'])

        # ci95() expects (phenotype, statistic) MultiIndex columns.
        if not isinstance(summary.columns, pd.MultiIndex):
            summary.columns = pd.MultiIndex.from_tuples([(pheno_col, col) for col in summary.columns])

        summary = ci95(summary)

        control_rows = summary[summary.index.isin(control_labels)]
        if control_rows.empty:
            raise IndexError(
                f"No control row {control_labels} found in column '{by}' for phenotype '{pheno_col}'"
            )

        # NOTE(review): when several control labels match, only the first matching
        # row is used as the reference; confirm that is intended.
        control_mean = control_rows.iloc[0, 0]

        # Shift the mean and both interval bounds by the control mean, addressing
        # the columns by name rather than by position.
        for stat in ('mean', 'ci95_hi', 'ci95_lo'):
            summary[(pheno_col, stat)] = summary[(pheno_col, stat)] - control_mean

        new_list_of_dfs.append(summary)

    return new_list_of_dfs



def get_MSD(list_MSD):
    '''
    Join per-phenotype MSD tables side by side into one table.

    input: non-empty list of DataFrames, one per phenotype, sharing the same
        index and carrying distinct phenotype names in their columns.

    returns: the first DataFrame left-joined, in order, with every following
        one. With a single element that DataFrame is returned unchanged.

    raises: ValueError when the list is empty (this previously surfaced as an
        UnboundLocalError on the return statement).
    '''
    if len(list_MSD) == 0:
        raise ValueError("get_MSD needs at least one phenotype dataframe")

    # Start from the first table and attach the rest in order, instead of
    # detecting the first element by comparing column-level Index objects.
    MSD = list_MSD[0]
    for pheno_df in list_MSD[1:]:
        MSD = MSD.join(pheno_df)
    return MSD

In [24]:
def get_combined_MSD(baseline_byplate,tap_byplate, by=['Gene','dataset']):
    """
    Combine the MSD dataframes of the baseline plates and the tap plates.

    input:
        - baseline_byplate: baseline data by plate (label column first)
        - tap_byplate: tap data by plate (label column first)
        - by: the label column to group by, the string "Gene" or "dataset".
          The list default only documents the two options; a value must be
          passed explicitly.
    returns:
        - DataFrame with one row per group: the `by` column, one
          "<phenotype>-<stat>" column per phenotype and statistic, and a
          'Screen' column holding the currently selected screen.
    raises:
        - TypeError: when `by` is not a single column name.
    """
    # A non-string `by` cannot work (the rename below concatenates it with "-"),
    # so reject it up front with a clear message.
    if not isinstance(by, str):
        raise TypeError(
            f"`by` must be a single column name ('Gene' or 'dataset'), got {by!r}"
        )

    list_baseline_MSD = calculate_MSD(extract_phenotypes(baseline_byplate), by=by)
    list_tap_MSD = calculate_MSD(extract_phenotypes(tap_byplate), by=by)

    baseline_MSD = get_MSD(list_baseline_MSD)
    tap_MSD = get_MSD(list_tap_MSD)

    # Left join: groups that occur only in the tap data are not carried over.
    combined_MSD = baseline_MSD.join(tap_MSD, on=by)

    # Human-readable phenotype names for the tap measures.
    # NOTE(review): "Respones" below is a typo in an output column label. It is
    # kept unchanged because code outside this view may rely on the exact name.
    combined_MSD=combined_MSD.rename(columns={"habit_dura":"Habituation of Response Duration",
                                         "habit_prob": "Habituation of Respones Probability",
                                         "habit_speed":"Habituation of Response Speed",
                                         "init_dura": "Initial Response Duration",
                                         "init_prob": "Initial Response Probability",
                                         "init_speed": "Initial Response Speed",
                                         "final_dura": "Final Response Duration",
                                         "final_prob": "Final Response Probability",
                                         "final_speed": "Final Response Speed",
                                         "recovery_dura": "Spontaneous Recovery of Response Duration",
                                         "recovery_prob": "Spontaneous Recovery of Response Probability",
                                         "recovery_speed": "Spontaneous Recovery of Response Speed",
                                         "memory_retention_dura": "Memory Retention of Response Duration",
                                         "memory_retention_prob": "Memory Retention of Response Probability",
                                         "memory_retention_speed": "Memory Retention of Response Speed"})

    # Flatten (phenotype, stat) to "phenotype-stat". The former index column
    # becomes "<by>-" in the process and is renamed back to plain `by`.
    combined_MSD=combined_MSD.reset_index()
    combined_MSD.columns = combined_MSD.columns.to_flat_index().str.join('-')
    combined_MSD=combined_MSD.rename(columns={by+"-": by})
    combined_MSD['Screen']=Screen

    return combined_MSD

In [25]:
def calculate_MSD(list_of_dfs, by):
    """
    Summarise each phenotype per group and express it relative to the N2 control.

    NOTE(review): this cell re-defines calculate_MSD, which an earlier cell
    already defines; being the later one, this definition is in effect after a
    top-to-bottom run. Keep a single definition.

    input:
        list_of_dfs: list of two-column DataFrames [label column, phenotype column]
        by: name of the label column to group on ("Gene" or "dataset")

    returns:
        list of DataFrames, one per input, indexed by `by`, with MultiIndex
        columns (phenotype, stat) for stat in mean, count, sem, ci95_hi, ci95_lo.
        The control mean has been subtracted from mean, ci95_hi and ci95_lo.

    raises:
        IndexError: when no control row is present for a phenotype.
    """
    # Labels that identify the control group. Only the Neuron screen at the
    # dataset level uses the two named N2 controls.
    if Screen == "Neuron_Genes_Screen" and by != "Gene":
        control_labels = ['N2_XJ1', 'N2_N2']
    else:
        control_labels = ['N2']

    new_list_of_dfs = []

    for df in list_of_dfs:
        # The second column holds the phenotype values.
        pheno_col = df.columns[1]

        # Named `summary` rather than `stats` so the scipy.stats import is not shadowed.
        summary = df.groupby(by)[pheno_col].agg(['mean', 'count', 'sem'])

        # ci95() expects (phenotype, statistic) MultiIndex columns.
        if not isinstance(summary.columns, pd.MultiIndex):
            summary.columns = pd.MultiIndex.from_tuples([(pheno_col, col) for col in summary.columns])

        summary = ci95(summary)

        control_rows = summary[summary.index.isin(control_labels)]
        if control_rows.empty:
            raise IndexError(
                f"No control row {control_labels} found in column '{by}' for phenotype '{pheno_col}'"
            )

        # NOTE(review): when several control labels match, only the first matching
        # row is used as the reference; confirm that is intended.
        control_mean = control_rows.iloc[0, 0]

        # Shift the mean and both interval bounds by the control mean, addressing
        # the columns by name rather than by position.
        for stat in ('mean', 'ci95_hi', 'ci95_lo'):
            summary[(pheno_col, stat)] = summary[(pheno_col, stat)] - control_mean

        new_list_of_dfs.append(summary)

    return new_list_of_dfs

### 3.2.1 Gene-level SMD

In [45]:
# Gene-level table: per-gene mean difference from the N2 control (with count,
# sem and 95% CI bounds) for every baseline and tap phenotype.
combined_MSD=get_combined_MSD(baseline_output_byplate,
                              tap_data_byplate, 
                              by='Gene')

combined_MSD.head()

Unnamed: 0,Gene,Instantaneous Speed-mean,Instantaneous Speed-count,Instantaneous Speed-sem,Instantaneous Speed-ci95_hi,Instantaneous Speed-ci95_lo,Interval Speed-mean,Interval Speed-count,Interval Speed-sem,Interval Speed-ci95_hi,...,Memory Retention of Response Probability-count,Memory Retention of Response Probability-sem,Memory Retention of Response Probability-ci95_hi,Memory Retention of Response Probability-ci95_lo,Memory Retention of Response Speed-mean,Memory Retention of Response Speed-count,Memory Retention of Response Speed-sem,Memory Retention of Response Speed-ci95_hi,Memory Retention of Response Speed-ci95_lo,Screen
0,N2,0.0,5,0.009221,0.025602,-0.025602,0.0,5,0.011803,0.032771,...,5,0.069487,0.192928,-0.192928,0.0,5,0.02455,0.068163,-0.068163,PD_Screen
1,hipr-1,-0.016382,8,0.01048,0.0084,-0.041163,-0.018904,8,0.013263,0.012458,...,8,0.03787,0.018558,-0.160538,-0.004841,8,0.018488,0.038876,-0.048558,PD_Screen


### 3.2.2 Allele-level SMD

In [44]:
# Allele-level table: the same statistics, grouped by `dataset` instead of `Gene`.
allele_combined_MSD=get_combined_MSD(baseline_output_allele_byplate,
                                     tap_data_allele_byplate, 
                                     by='dataset')

allele_combined_MSD.head()

Unnamed: 0,dataset,Instantaneous Speed-mean,Instantaneous Speed-count,Instantaneous Speed-sem,Instantaneous Speed-ci95_hi,Instantaneous Speed-ci95_lo,Interval Speed-mean,Interval Speed-count,Interval Speed-sem,Interval Speed-ci95_hi,...,Memory Retention of Response Probability-count,Memory Retention of Response Probability-sem,Memory Retention of Response Probability-ci95_hi,Memory Retention of Response Probability-ci95_lo,Memory Retention of Response Speed-mean,Memory Retention of Response Speed-count,Memory Retention of Response Speed-sem,Memory Retention of Response Speed-ci95_hi,Memory Retention of Response Speed-ci95_lo,Screen
0,N2,0.0,5,0.009221,0.025602,-0.025602,0.0,5,0.011803,0.032771,...,5,0.069487,0.192928,-0.192928,0.0,5,0.02455,0.068163,-0.068163,PD_Screen
1,hipr-1_ok1081,0.002161,4,0.016581,0.05493,-0.050608,0.006861,4,0.019126,0.06773,...,4,0.055742,0.175766,-0.179028,0.030483,4,0.02562,0.112016,-0.051051,PD_Screen
2,hipr-1_tm10120,-0.034924,4,0.002895,-0.025712,-0.044136,-0.044669,4,0.003538,-0.033409,...,4,0.01945,-0.078451,-0.202247,-0.040164,4,0.010338,-0.007266,-0.073063,PD_Screen


## 3.3 T-Stat analysis

In [None]:
def baseline_metrics(by=["Gene","dataset"]):
    """
    Build the empty result tables and the metric names for the baseline analysis.

    input:
        by: label column the t-statistics will be keyed on ("Gene" or "dataset")

    returns:
        list_baseline_Tstats: list of empty DataFrames, one per metric, each with
            the two columns [by, <metric name>], ready to receive t-statistics
        list_baseline_metrics: list of the metric (column) names, same order
    """
    list_baseline_metrics = [
        "Instantaneous Speed",
        "Interval Speed",
        "Bias",
        "Morphwidth",
        "Midline",
        "Area",
        "Angular Speed",
        "Aspect Ratio",
        "Kink",
        "Curve",
        "Crab",
        "Pathlength",
    ]

    # One independent, empty frame per metric.
    list_baseline_Tstats = [
        pd.DataFrame(columns=[by, metric_name]) for metric_name in list_baseline_metrics
    ]

    return list_baseline_Tstats, list_baseline_metrics

In [None]:
def tap_metrics(by=["Gene","dataset"]):
    """
    Build the empty result tables and the metric names for the tap analysis.

    input:
        by: label column the t-statistics will be keyed on ("Gene" or "dataset")

    returns:
        list_tap_Tstats: list of empty DataFrames, one per metric, each with the
            two columns [by, <display label>], ready to receive t-statistics
        list_tap_metrics: list of the metric column names as they appear in the
            tap data, in the same order as list_tap_Tstats
    """
    # (column name in the tap data, display label of the result column)
    metric_labels = [
        ("recovery_dura", "Recovery Duration"),
        ("recovery_prob", "Recovery Probability"),
        ("recovery_speed", "Recovery Speed"),
        ("memory_retention_dura", "Memory Retention Duration"),
        ("memory_retention_prob", "Memory Retention Probability"),
        ("memory_retention_speed", "Memory Retention Speed"),
        ("init_dura", "Initial Duration"),
        ("init_prob", "Initial Probability"),
        ("init_speed", "Initial Speed"),
        ("final_dura", "Final Duration"),
        ("final_prob", "Final Probability"),
        ("final_speed", "Final Speed"),
        ("habit_dura", "Habituation of Duration"),
        ("habit_prob", "Habituation of Probability"),
        ("habit_speed", "Habituation of Speed"),
    ]

    list_tap_Tstats = [pd.DataFrame(columns=[by, label]) for _, label in metric_labels]
    list_tap_metrics = [column for column, _ in metric_labels]

    return list_tap_Tstats, list_tap_metrics

In [None]:
def TTest(Type, DF_ref, output, by=["Gene", "dataset"]):
    """
    Welch two-sample t-test of every group in DF_ref against the control group.

    input:
        - Type: name of the column holding the values to compare
        - DF_ref: reference dataframe containing the groups and the controls
        - output: two-column dataframe the results are appended to, in place;
          one row [group label, t-statistic] per unique value of DF_ref[by]
        - by: what to group by, "Gene" or "dataset"

    Control sample:
        - by == "Gene": rows whose Gene is "N2"
        - otherwise: rows whose Allele is "XJ1" or "N2"

    returns: None (results are written into `output`)
    """
    # Only the comparison matching `by` is computed. Running the other level too
    # needed columns this mode does not use and often tested an empty sample.
    gene_level = by == "Gene"

    # The control sample is the same for every group, so select it once.
    if gene_level:
        control = DF_ref[DF_ref.Gene == "N2"][Type]
    else:
        control = DF_ref[DF_ref.Allele.isin(["XJ1","N2"])][Type]

    for a in DF_ref[by].unique():
        if gene_level:
            sample = DF_ref[DF_ref.Gene == a][Type]
        else:
            sample = DF_ref[DF_ref.dataset == a][Type]

        # equal_var=False selects Welch's test; element [0] is the t-statistic.
        Tstat = ttest_ind(sample, control, equal_var=False)[0]
        output.loc[len(output)] = [a, Tstat]

def do_TTest(by=["Gene", "dataset"], baseline=["true", "false"]):
    """
    Build a table of mean T-statistics: one row per Gene/dataset, one column per metric.

    Every non-control group is compared, metric by metric (via TTest), against
    the controls recorded on the same dates as that group. The rows collected
    for each metric are then averaged per group and stitched into one dataframe.

    input:
        - by: what to group by, "Gene" or "dataset"
        - baseline: the string "true" uses baseline_output with baseline_metrics;
                    any other value uses tap_data with tap_metrics

    returns: dataframe of mean T-statistics indexed by `by`
    """
    # Pick the data source together with its empty per-metric result frames.
    if baseline == "true":
        stat_frames, metric_names = baseline_metrics(by)
        data = baseline_output
    else:
        stat_frames, metric_names = tap_metrics(by)
        data = tap_data

    for x in data[by].unique():
        # Labels that identify the control strain for this screen / grouping.
        if Screen == "Neuron_Genes_Screen" and by != "Gene":
            control_labels = ["N2_N2", "N2_XJ1"]
        else:
            control_labels = ["N2"]

        # Controls are never tested as a group of their own.
        if x in control_labels:
            continue

        # Restrict to the dates on which x was run, then keep only x and the controls.
        dates_with_x = data.loc[data[by] == x, 'Date'].unique()
        same_day = data[data['Date'].isin(dates_with_x)]
        gene_data_final = same_day[same_day[by].isin(control_labels + [x])]

        for metric, stat_frame in zip(metric_names, stat_frames):
            TTest(metric, gene_data_final, stat_frame, by)  # appends rows to stat_frame

    # Average the collected rows per group and place the metrics side by side.
    PD_Tstats = pd.DataFrame()
    for stat_frame in stat_frames:
        means = stat_frame.groupby([by], as_index=False).mean()
        # The first metric seeds the table (it carries the `by` column); the
        # remaining metrics contribute only their value column.
        is_first_metric = means.columns.values[1] == stat_frames[0].columns.values[1]
        PD_Tstats = means if is_first_metric else PD_Tstats.join(means.iloc[:, 1])

    return PD_Tstats.set_index(by)
            

### T-stat analysis of baseline data:

### 3.3.1 Allele-level T-stat analysis of baseline data

In [None]:
# Installs a global "ignore" filter: every Python warning is suppressed from this point on.
warnings.filterwarnings('ignore')

# Allele-level run: groups are the values of the "dataset" column of the baseline data.
PD_baseline_Tstats_allele = do_TTest("dataset", baseline="true") # mean T-statistics, indexed by dataset

# PD_baseline_Tstats_allele_sorted=PD_baseline_Tstats_allele.sort_index()

PD_baseline_Tstats_allele.head()

Unnamed: 0_level_0,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hipr-1_ok1081,8.720217,17.272716,2.091868,-93.709991,-151.77574,-190.66633,28.561905,84.318945,43.915102,50.559487,35.51648,165.20275
hipr-1_tm10120,-149.130061,-135.875107,-121.209079,-107.524828,-362.066521,-339.738765,-65.349506,24.355089,-47.450876,-134.061984,-39.992945,-52.444177


### 3.3.2 Gene-level T-stat analysis of baseline data

In [None]:
# Installs a global "ignore" filter: every Python warning is suppressed from this point on.
warnings.filterwarnings('ignore')

# Gene-level run: groups are the values of the "Gene" column of the baseline data.
PD_baseline_Tstats=do_TTest("Gene", baseline="true") # mean T-statistics, indexed by Gene

# PD_baseline_Tstats_sorted=PD_baseline_Tstats.sort_index()

PD_baseline_Tstats.head()

Unnamed: 0_level_0,Instantaneous Speed,Interval Speed,Bias,Morphwidth,Midline,Area,Angular Speed,Aspect Ratio,Kink,Curve,Crab,Pathlength
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hipr-1,-46.583094,-38.802844,-46.56871,-120.799168,-228.506402,-253.233922,1.911753,71.74101,11.424131,-11.893595,2.659493,65.740765


### T-stat analysis for tap-response data:

### 3.3.3 Allele-level T-stat analysis of tap response data

In [41]:
# Installs a global "ignore" filter: every Python warning is suppressed from this point on.
warnings.filterwarnings('ignore')

# Allele-level run on the tap-response data (baseline="false" selects tap_data / tap_metrics).
PD_habituation_Tstats_allele = do_TTest("dataset", baseline="false") # mean T-statistics, indexed by dataset

# PD_habituation_Tstats_allele_sorted=PD_habituation_Tstats_allele.sort_index()

PD_habituation_Tstats_allele.head()

Unnamed: 0_level_0,Recovery Duration,Recovery Probability,Recovery Speed,Memory Retention Duration,Memory Retention Probability,Memory Retention Speed,Initial Duration,Initial Probability,Initial Speed,Final Duration,Final Probability,Final Speed,Habituation of Duration,Habituation of Probability,Habituation of Speed
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hipr-1_ok1081,0.196572,3.99498,0.140053,-1.280902,-0.018308,0.859058,-0.551071,-1.173992,1.144826,1.097106,7.13957,0.378627,-1.020111,-4.161169,0.52416
hipr-1_tm10120,1.596165,7.425059,0.666986,0.500673,-1.945015,-1.507782,-0.565598,-4.166082,-5.10381,0.524222,10.158379,-0.429403,-0.630178,-11.167895,-1.96336


### 3.3.4 Gene-level T-stat analysis of tap response data

In [43]:
# Installs a global "ignore" filter: every Python warning is suppressed from this point on.
warnings.filterwarnings('ignore')

# Gene-level run on the tap-response data (baseline="false" selects tap_data / tap_metrics).
PD_habituation_Tstats = do_TTest("Gene", baseline="false") # mean T-statistics, indexed by Gene

# Index-sorted copy. NOTE(review): the cell displays the unsorted frame below and this
# copy is not referenced in the visible part of the notebook - confirm it is still needed.
PD_habituation_Tstats_sorted=PD_habituation_Tstats.sort_index()

PD_habituation_Tstats

Unnamed: 0_level_0,Recovery Duration,Recovery Probability,Recovery Speed,Memory Retention Duration,Memory Retention Probability,Memory Retention Speed,Initial Duration,Initial Probability,Initial Speed,Final Duration,Final Probability,Final Speed,Habituation of Duration,Habituation of Probability,Habituation of Speed
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hipr-1,0.839034,6.279144,0.434678,-0.763127,-0.897052,-0.157515,-0.684199,-2.883587,-0.273891,1.096964,8.654085,0.080648,-1.106724,-6.794556,-0.24353


# 4. Merging t-stat data into one dataset

In [54]:
def pop_cols(combined):
    """
    Move four morphology/speed columns to fixed positions, in place.

    "Area", "Midline" and "Morphwidth" are moved to the front one after the
    other (so the frame ends up starting Morphwidth, Midline, Area), then
    "Angular Speed" is moved to position 5.

    input:
        combined: dataframe with columns to be reordered (modified in place)

    returns:
        None
    """
    # (column label, target position), applied strictly in this order because
    # every move shifts the positions of the columns behind it.
    moves = (
        ("Area", 0),
        ("Midline", 0),
        ("Morphwidth", 0),
        ("Angular Speed", 5),
    )
    for label, position in moves:
        # pop() runs first (argument evaluation), then the column is re-inserted.
        combined.insert(position, label, combined.pop(label))

def pop_last(combined):
    """
    Move the six recovery / memory-retention columns to the end of the
    dataframe, in place.

    After the call the frame ends with, in this order: the three
    "Spontaneous Recovery of Response ..." columns (Duration, Probability,
    Speed) followed by the three "Memory Retention of Response ..." columns
    (Duration, Probability, Speed).

    input:
        combined: dataframe with columns to be reordered (modified in place)

    returns:
        None
    """
    trailing_columns = (
        "Spontaneous Recovery of Response Duration",
        "Spontaneous Recovery of Response Probability",
        "Spontaneous Recovery of Response Speed",
        "Memory Retention of Response Duration",
        "Memory Retention of Response Probability",
        "Memory Retention of Response Speed",
    )
    for label in trailing_columns:
        column = combined.pop(label)
        # Append at the current end instead of a fixed position, so the result
        # does not depend on the frame having exactly 27 columns.
        combined.insert(len(combined.columns), label, column)

def rename_columns(df):
    '''
    Return a copy of `df` whose tap-response metric columns carry their long,
    descriptive labels. Columns that are not listed below keep their name.

    input:
        df: dataframe with columns to be renamed (left untouched)
    returns:
        new dataframe with renamed columns
    '''
    # (short label, descriptive label) pairs.
    # NOTE(review): "Respones" is misspelt, but the label is written to the
    # database as a column name further down the notebook, so it is kept as is.
    label_pairs = (
        ("Habituation of Duration", "Habituation of Response Duration"),
        ("Habituation of Probability", "Habituation of Respones Probability"),
        ("Habituation of Speed", "Habituation of Response Speed"),
        ("Initial Duration", "Initial Response Duration"),
        ("Initial Probability", "Initial Response Probability"),
        ("Initial Speed", "Initial Response Speed"),
        ("Final Duration", "Final Response Duration"),
        ("Final Probability", "Final Response Probability"),
        ("Final Speed", "Final Response Speed"),
        ("Recovery Duration", "Spontaneous Recovery of Response Duration"),
        ("Recovery Probability", "Spontaneous Recovery of Response Probability"),
        ("Recovery Speed", "Spontaneous Recovery of Response Speed"),
        ("Memory Retention Duration", "Memory Retention of Response Duration"),
        ("Memory Retention Probability", "Memory Retention of Response Probability"),
        ("Memory Retention Speed", "Memory Retention of Response Speed"),
    )
    renamed = df.rename(columns=dict(label_pairs))
    return renamed

def merge_Tstats(baseline, habituation, by=["Gene", "dataset"], Screen='PD_Screen'):
    """
    Merge baseline and habituation T-statistics on Gene/dataset, normalise the
    result against the control strain and return it in wide and melted form.

    input:
        - baseline: baseline T-statistics dataframe to merge (keyed by `by`)
        - habituation: habituation T-statistics dataframe to merge (keyed by `by`)
        - by: what to group by "Gene" or "dataset"
        - Screen: screen label written to the "Screen" column of both results;
                  with by="dataset" and Screen="Neuron_Genes_Screen" the control
                  row is "N2_XJ1", otherwise it is "N2"

    returns:
        (normalised wide dataframe, melted dataframe with the columns
        [by, "Metric", "T_score", "Screen"])
    """
    # Left merge keeps every baseline group, then order the rows by index.
    combined_Tstats = pd.merge(baseline, habituation, on=by, how='left').sort_index()

    # Normalise each metric column by subtracting its mean and dividing by its sd.
    combined_Tstats_normalized = (combined_Tstats - combined_Tstats.mean()) / combined_Tstats.std()

    # Shift every row by the control row so the control sits at exactly 0.
    if by == "dataset" and Screen == "Neuron_Genes_Screen":
        control_label = "N2_XJ1"
    else:
        control_label = "N2"
    control_row = combined_Tstats_normalized[combined_Tstats_normalized.index == control_label].squeeze()
    combined_Tstats_normalized_2 = combined_Tstats_normalized - control_row

    # Descriptive column labels first: pop_last looks the columns up by those labels.
    combined_Tstats_normalized_2 = rename_columns(combined_Tstats_normalized_2)
    pop_cols(combined_Tstats_normalized_2)  # reorder columns
    pop_last(combined_Tstats_normalized_2)  # reorder columns

    # Melt the normalised frame: one row per (group, metric) pair.
    combined_Tstats_normalized_melted = pd.melt(combined_Tstats_normalized_2.reset_index(),
                                                id_vars=[by],
                                                var_name='Metric',
                                                value_name='T_score')

    # Screen is added only after melting so that it is not treated as a metric.
    combined_Tstats_normalized_2['Screen'] = Screen
    combined_Tstats_normalized_melted['Screen'] = Screen

    return combined_Tstats_normalized_2, combined_Tstats_normalized_melted

## 4.1 Gene-level

In [55]:
# Pass the screen selected at the top of the notebook; without it merge_Tstats
# falls back to its 'PD_Screen' default for both the control lookup and the Screen column.
combined_Tstats_normalize_2, combined_Tstats_normalized_melted = merge_Tstats(
    PD_baseline_Tstats, PD_habituation_Tstats, "Gene", Screen=Screen
)

In [56]:
combined_Tstats_normalize_2.head()

Unnamed: 0_level_0,Morphwidth,Midline,Area,Instantaneous Speed,Interval Speed,Angular Speed,Bias,Aspect Ratio,Kink,Curve,...,Habituation of Response Duration,Habituation of Respones Probability,Habituation of Response Speed,Spontaneous Recovery of Response Duration,Spontaneous Recovery of Response Probability,Spontaneous Recovery of Response Speed,Memory Retention of Response Duration,Memory Retention of Response Probability,Memory Retention of Response Speed,Screen
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,PD_Screen
hipr-1,-1.414214,-1.414214,-1.414214,-1.414214,-1.414214,1.414214,-1.414214,1.414214,1.414214,-1.414214,...,-1.414214,-1.414214,-1.414214,1.414214,1.414214,1.414214,-1.414214,-1.414214,-1.414214,PD_Screen


In [57]:
combined_Tstats_normalized_melted.head()

Unnamed: 0,Gene,Metric,T_score,Screen
0,N2,Morphwidth,0.0,PD_Screen
1,hipr-1,Morphwidth,-1.414214,PD_Screen
2,N2,Midline,0.0,PD_Screen
3,hipr-1,Midline,-1.414214,PD_Screen
4,N2,Area,0.0,PD_Screen


## 4.2 Allele-level

In [58]:
# Pass the screen selected at the top of the notebook; without it merge_Tstats
# falls back to its 'PD_Screen' default for both the control lookup and the Screen column.
combined_Tstats_normalize_allele_2, combined_Tstats_normalized_melted_allele = merge_Tstats(
    PD_baseline_Tstats_allele, PD_habituation_Tstats_allele, "dataset", Screen=Screen
)

In [59]:
combined_Tstats_normalize_allele_2.head()

Unnamed: 0_level_0,Morphwidth,Midline,Area,Instantaneous Speed,Interval Speed,Angular Speed,Bias,Aspect Ratio,Kink,Curve,...,Habituation of Response Duration,Habituation of Respones Probability,Habituation of Response Speed,Spontaneous Recovery of Response Duration,Spontaneous Recovery of Response Probability,Spontaneous Recovery of Response Speed,Memory Retention of Response Duration,Memory Retention of Response Probability,Memory Retention of Response Speed,Screen
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,PD_Screen
hipr-1_ok1081,-1.601861,-0.83476,-1.119636,0.098284,0.205923,0.593289,0.029633,1.943077,0.961061,0.52994,...,-1.981764,-0.737267,0.399721,0.22585,1.075044,0.398223,-1.394071,-0.01638,0.716989,PD_Screen
hipr-1_tm10120,-1.838009,-1.99135,-1.995022,-1.680816,-1.619884,-1.357443,-1.717044,0.561248,-1.03844,-1.405172,...,-1.224243,-1.978704,-1.497245,1.833897,1.998073,1.896481,0.544909,-1.740183,-1.25843,PD_Screen


In [60]:
combined_Tstats_normalized_melted_allele.head()

Unnamed: 0,dataset,Metric,T_score,Screen
0,N2,Morphwidth,0.0,PD_Screen
1,hipr-1_ok1081,Morphwidth,-1.601861,PD_Screen
2,hipr-1_tm10120,Morphwidth,-1.838009,PD_Screen
3,N2,Midline,0.0,PD_Screen
4,hipr-1_ok1081,Midline,-0.83476,PD_Screen


# 5. Save data to database (sqlite3)

#### A janky way to add data to, and update, the SQLite database

1. Read table to pd.DataFrame
2. Add new data to pd.DataFrame
3. Replace old table with newly updated pd.DataFrame

In [None]:
# USE THIS CELL TO UPDATE ALL THE NEEDED TABLES (baseline_output is written by the second to_sql call)
# NOTE: every write uses if_exists='append', so re-running this cell adds the same rows again.
# NOTE(review): the database path is an absolute, machine-specific path - adjust before running elsewhere.

conn=sqlite3.connect('/Users/lavanya/Desktop/Lavanya_Test/data_updated2.db')

try:
    tap_output.to_sql('tap_response_data', conn, if_exists='append', index=False)

    baseline_output.to_sql('tap_baseline_data', conn, if_exists='append', index=False)

    combined_Tstats_normalize_2.reset_index().to_sql('tstat_gene_data', conn, if_exists='append', index=False)

    combined_Tstats_normalize_allele_2.reset_index().to_sql('tstat_allele_data', conn, if_exists='append', index=False)

    combined_Tstats_normalized_melted.to_sql('gene_profile_data', conn, if_exists='append', index=False)

    combined_Tstats_normalized_melted_allele.to_sql('allele_profile_data', conn, if_exists='append', index=False)

    combined_MSD.to_sql('gene_MSD', conn, if_exists='append', index=False)

    allele_combined_MSD.to_sql('allele_MSD', conn, if_exists='append', index=False)

    # combined_Tstats_melted_sorted.to_sql('allele_phenotype_data', conn, if_exists='replace', index=False)

    print(conn.total_changes)
finally:
    # Close the connection even when one of the writes fails, so the database
    # file is not left open by the kernel.
    conn.close()


# Want to test edge cases of pd.to_sql functionality#############

7522633
