In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import shapiro, boxcox, ttest_ind, mannwhitneyu, pearsonr, friedmanchisquare, kruskal, kstest, \
    lognorm, gamma, weibull_min, probplot, f_oneway, linregress, norm, spearmanr, ttest_1samp, wilcoxon 
import seaborn as sns
import os
import ast
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.formula.api import ols, mixedlm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import scikit_posthocs as sp
import itertools

In [76]:
# Root folder that holds the result subfolders.
input_dir = "Results"

# Which results to analyze: the name of the subfolder inside input_dir
# (an integer or a string both work).
number = 1

# Animal numbers listed in the order in which they were recorded.
recording_order = (15, 2, 1, 6, 10, 4)

In [None]:
# Load every results workbook of the selected run into `dfs`, keyed by the
# part of the file name before the first dot.
dfs = {}

results_dir = os.path.join(input_dir, str(number))

# os.listdir() returns entries in arbitrary order, but the three frame names
# below are picked by position, so the file names are sorted to make that
# assignment deterministic.
# Files starting with '~$' are the lock files Excel creates next to an open
# workbook; they are not readable workbooks and are skipped, as are the
# OVERVIEW files.
files = sorted(
    file for file in os.listdir(results_dir)
    if file.endswith(('.xlsx', '.xls'))
    and not file.startswith(('OVERVIEW', '~$'))
)

for file in files:
    file_path = os.path.join(results_dir, file)
    dfs[file.split('.')[0]] = pd.read_excel(file_path)

framenames = list(dfs.keys())
if len(framenames) < 3:
    # Fail with a readable message instead of a bare IndexError below.
    raise FileNotFoundError(
        f"Expected at least 3 result workbooks in {results_dir}, "
        f"found {len(framenames)}: {framenames}"
    )

# Positional assignment; relies on the sorted order established above.
results = framenames[0]
results_mtt = framenames[1]
results_tt = framenames[2]

print(f"Found {framenames} in {results_dir}.")

# Names of the merged frames. They use the '<name>_MERGED' form because the
# rest of the notebook filters on the '_MERGED' suffix and looks the frames up
# as 'RESULTS_MTT_MERGED' / 'RESULTS_TT_MERGED'.
framenames.append('RESULTS_MERGED')
framenames.append('RESULTS_MTT_MERGED')
framenames.append('RESULTS_TT_MERGED')

Found ['RESULTS', 'RESULTS_MTT', 'RESULTS_TT'] in Results\1.


# Distributions

In [78]:
# read experiment column and create a list for every experiment without duplicates
# run shapiro on every experiment with sex split and save the results as booleans into a dictionary
# run correct tests on every metric, using the normality dictionary for guidance

In [79]:
# Split the columns of the main results frame by position: the first four
# columns are treated as parameters, every later column as a variable.
parameters = dfs[results].columns.tolist()[:4]
variables = dfs[results].columns.tolist()[4:]

# Distinct values of the 'experiment' column, in order of first appearance.
experiments = pd.unique(dfs[results]['experiment']).tolist()

# Summary: one count line, then one indented comma-separated line per list.
print(f"Found {len(experiments)} experiments, {len(variables)} variables and {len(parameters)} parameters:")
print(f" {', '.join(experiments)}")
print(f" {', '.join(variables)}")
print(f" {', '.join(parameters)}")

Found 8 experiments, 7 variables and 4 parameters:
 ASR_control, gap_depth, tone_in_noise, gap_duration_4, gap_duration_8, gap_duration_10, gap_duration_20, gap_duration_50
 reactionTime, peakTime, difference, peakValue, RMS, tau, AUC
 animal, sex, date, experiment


In [93]:
# For every merged dataframe, collect the (experiment, variable) pairs whose
# values fail a Shapiro-Wilk normality test (p < 0.05). The test is run
# separately for each sex, so a pair is recorded once per sex that fails and
# can therefore appear twice in the resulting frame.
non_parametric_dfs = {}
for name, df in dfs.items():
    if not name.endswith('_MERGED'):
        continue  # Only process merged dataframes
    # Flagged pairs are gathered as plain tuples and turned into a DataFrame
    # once per input frame; growing a DataFrame with pd.concat inside the loop
    # re-copies all previous rows on every hit.
    records = []
    not_enough_data = 0  # number of (variable, experiment, sex) groups skipped
    for var in variables:
        for exp in experiments:
            for sex in ['male', 'female']:
                data = df[(df['sex'] == sex) & (df['experiment'] == exp)][var].dropna()
                if len(data) > 2:  # shapiro() needs at least 3 observations
                    p = shapiro(data).pvalue
                    if p < 0.05:
                        records.append((exp, var))
                else:
                    not_enough_data += 1
    non_parametric = pd.DataFrame(records, columns=['experiment', 'var'])
    non_parametric_dfs[name] = non_parametric
    #print(f"Non-parametric entries in {name}: {len(non_parametric)}")
    #if not_enough_data != 0: print(f"Warning, not enough data for {not_enough_data} entries.")

  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)


In [94]:
# Number of flagged (experiment, variable) rows per merged dataframe.
for merged_name in ('RESULTS_MERGED', 'RESULTS_MTT_MERGED', 'RESULTS_TT_MERGED'):
    print(len(non_parametric_dfs[merged_name]))

20
20
22


In [98]:
# Names of the five gap-duration experiments.
gap_durations = [f'gap_duration_{suffix}' for suffix in (4, 8, 10, 20, 50)]

# Stack the flagged pairs of the three merged dataframes and keep each
# (experiment, var) combination once.
dfs_to_merge = [
    non_parametric_dfs[key]
    for key in ('RESULTS_MERGED', 'RESULTS_MTT_MERGED', 'RESULTS_TT_MERGED')
]
non_parametric = pd.concat(dfs_to_merge, ignore_index=True).drop_duplicates()

# Whenever a variable is flagged for one gap-duration experiment, flag it for
# every other gap-duration experiment as well.
rows_to_add = [
    {'experiment': other_gap, 'var': flagged_var}
    for flagged_exp, flagged_var in zip(non_parametric['experiment'], non_parametric['var'])
    if "gap_duration" in flagged_exp
    for other_gap in gap_durations
    if other_gap != flagged_exp
]

# Append the extra rows (if any) and remove the duplicates they introduce.
if rows_to_add:
    non_parametric = pd.concat(
        [non_parametric, pd.DataFrame(rows_to_add)],
        ignore_index=True,
    ).drop_duplicates()

print(non_parametric)

         experiment           var
0       ASR_control  reactionTime
1    gap_duration_4  reactionTime
2    gap_duration_8  reactionTime
3   gap_duration_10  reactionTime
4   gap_duration_50  reactionTime
5       ASR_control      peakTime
6         gap_depth      peakTime
7     tone_in_noise      peakTime
8    gap_duration_4      peakTime
9    gap_duration_8      peakTime
10  gap_duration_10      peakTime
11  gap_duration_20      peakTime
12  gap_duration_50      peakTime
13      ASR_control    difference
14  gap_duration_20    difference
15  gap_duration_50    difference
16        gap_depth           AUC
17        gap_depth           RMS
18        gap_depth  reactionTime
19    tone_in_noise  reactionTime
20  gap_duration_20  reactionTime
21        gap_depth    difference
22  gap_duration_10    difference
23   gap_duration_4           tau
24  gap_duration_10           AUC
61   gap_duration_4    difference
62   gap_duration_8    difference
77   gap_duration_8           tau
78  gap_durati

In [81]:
# input dfs:
# df: main df, merged in date and reps
# date_df: df with date, merged in reps
# reps_df: df with reps, merged in date
# df_top_10: top 10 df, merged in date and reps
# minus_top_10: df with top 10 removed, merged in date and reps

# df
# tt, mtt, dt, rp
# tt_dt, tt_rp, mt_dt, mt_rp
# dt_rp, tt_dt_rp, mt_dt_rp

# averaging across reps and dates only occurs in Analyzer
# Peakfinder only outputs a merged df for overview
# it also gives a top_10, minus_top_10 and main dataframe, each with repetitions and dates not averaged


# abbreviations: 
    # tt: top 10
    # mtt: minus top 10
    # dt: date
    # rp: reps
    # examples: tt_dt_rp: top 10 with date and reps | dt_rp: date and reps | df: fully merged df

    # test_df is the currently tested df and can change depending on previous test results

# test order depends on each previous result (as in if we can't merge across days, then use date_df for rest, for example)

# compare date df to df
# compare top_10 df to minus_top_10 df
# compare reps df to df
# compare strength metrics between males and females
# compare reaction time metrics between males and females
# compare time of day (i.e. animal number in order 15, 2, 1, 6, 10, 4) for all metrics
# compare experiment for all metrics