In [143]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import shapiro, boxcox, ttest_ind, mannwhitneyu, pearsonr, friedmanchisquare, kruskal, kstest, \
    lognorm, gamma, weibull_min, probplot, f_oneway, linregress, norm, spearmanr, ttest_1samp, wilcoxon 
import seaborn as sns
import os
import ast
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.formula.api import ols, mixedlm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import scikit_posthocs as sp
import itertools

In [None]:
# --- Analysis configuration -------------------------------------------------

# Directory that holds one subfolder per result set.
input_dir = "Results"

# Result set to analyze: the subfolder name inside `input_dir`
# (may be given as an integer or a string).
number = 3

# Metric columns that are analyzed.
variables = [
    'reactionTime',
    'peakTime',
    'difference',
    'peakValue',
    'RMS',
    'tau',
    'AUC',
]

# Animal numbers listed in recording order.
recording_order = (15, 2, 1, 6, 10, 4)

In [145]:
# Load the result workbook(s) of the selected result set (<input_dir>/<number>).
results_dir = os.path.join(input_dir, str(number))

# Regular files in the result folder. Both the workbook list and the
# "exactly two files" check below are derived from this single listing.
dir_files = [
    name for name in os.listdir(results_dir)
    if os.path.isfile(os.path.join(results_dir, name))
]

# os.listdir() returns names in arbitrary order; sorting makes the choice of
# files[0] / files[1] reproducible across platforms and runs.
# NOTE(review): this assumes the main workbook sorts before the top-10
# workbook - confirm against the file names Peakfinder writes.
files = sorted(name for name in dir_files if name.endswith(('.xlsx', '.xls')))
if not files:
    raise FileNotFoundError(f"No Excel files (.xlsx/.xls) found in {results_dir!r}")

# Main dataframe: first workbook in sorted order.
file_path = os.path.join(results_dir, files[0])
df = pd.read_excel(file_path)

# A second (top-10) workbook is only read when the folder holds exactly two
# files; both must be Excel workbooks so that files[1] cannot raise IndexError.
if len(dir_files) == 2 and len(files) == 2:
    file_path_top_10 = os.path.join(results_dir, files[1])
    df_top_10 = pd.read_excel(file_path_top_10)
else:
    df_top_10 = None

# Distributions

In [146]:
# read experiment column and create a list for every experiment without duplicates
# run shapiro on every experiment with sex split and save the results as booleans into a dictionary
# run correct tests on every metric, using the normality dictionary for guidance

In [147]:
# Distinct experiment labels, in order of first appearance in the main dataframe.
experiments = pd.unique(df['experiment']).tolist()

# Report how many experiments were found, followed by their names on one line.
print(
    f"Found {len(experiments)} experiments:",
    ', '.join(experiments),
    sep="\n",
)

Found 8 experiments:
ASR_control, gap_depth, tone_in_noise, gap_duration_4, gap_duration_8, gap_duration_10, gap_duration_20, gap_duration_50


In [None]:
# Shapiro-Wilk normality screen.
# For every metric in `variables`, every experiment and each sex, test the
# non-missing values for normality. Each rejection adds one
# (experiment, var) row to `non_parametric`; a combination rejected for both
# sexes therefore appears twice.
SHAPIRO_ALPHA = 0.05  # p-values below this threshold count as non-normal
SHAPIRO_MIN_N = 3     # Shapiro needs at least 3 values

# Collect plain records and build the DataFrame once after the loops instead
# of calling pd.concat per hit (quadratic, and concatenating onto an empty
# frame triggers a pandas FutureWarning).
non_parametric_records = []

for var in variables:
    for exp in df['experiment'].unique():
        for sex in ['male', 'female']:
            data = df.loc[(df['sex'] == sex) & (df['experiment'] == exp), var].dropna()
            if len(data) < SHAPIRO_MIN_N:
                # Abort: a group this small cannot be tested. The metric name is
                # included so the failing group can be identified.
                raise ValueError(f"{var} | {exp} | {sex}: Not enough data")
            stat, p = shapiro(data)
            if p < SHAPIRO_ALPHA:
                non_parametric_records.append({'experiment': exp, 'var': var})
                # Include the metric name; otherwise identical "experiment | sex"
                # lines from different metrics cannot be told apart.
                print(f"{var} | {exp} | {sex}: Non-normal distribution (p={p:.3f})")

# Explicit columns keep the frame's layout stable even when nothing was flagged.
non_parametric = pd.DataFrame(non_parametric_records, columns=['experiment', 'var'])

print("--------------------------------------------------------------")
print(f"Non-parametric entries after adjustment: {len(non_parametric)}")
print(non_parametric)

gap_duration_8 | male: Non-normal distribution (p=0.007)
gap_duration_10 | male: Non-normal distribution (p=0.041)
gap_duration_50 | male: Non-normal distribution (p=0.006)
ASR_control | female: Non-normal distribution (p=0.001)
tone_in_noise | female: Non-normal distribution (p=0.000)
gap_duration_4 | female: Non-normal distribution (p=0.002)
gap_duration_10 | male: Non-normal distribution (p=0.032)
gap_duration_10 | female: Non-normal distribution (p=0.044)
gap_duration_50 | male: Non-normal distribution (p=0.024)
gap_duration_50 | female: Non-normal distribution (p=0.000)
gap_duration_4 | female: Non-normal distribution (p=0.049)
gap_duration_50 | male: Non-normal distribution (p=0.007)
gap_duration_50 | female: Non-normal distribution (p=0.001)
gap_duration_4 | female: Non-normal distribution (p=0.050)
gap_depth | male: Non-normal distribution (p=0.041)
Non-parametric entries after adjustment: 15
         experiment           var
0    gap_duration_8  reactionTime
1   gap_duration_1

In [None]:
# input dfs:
# df: main df, merged in date and reps
# date_df: df with date, merged in reps
# reps_df: df with reps, merged in date
# df_top_10: top 10 df, merged in date and reps
# minus_top_10: df with top 10 removed, merged in date and reps

# df
# tt, mtt, dt, rp
# tt_dt, tt_rp, mt_dt, mt_rp
# dt_rp, tt_dt_rp, mt_dt_rp

# averaging across reps and dates only occurs in Analyzer
# Peakfinder only outputs a merged df for overview
# it also gives a top_10, minus_top_10 and main dataframe, each with repetitions and dates not averaged


# abbreviations: 
    # tt: top 10
    # mtt: minus top 10
    # dt: date
    # rp: reps
    # examples: tt_dt_rp: top 10 with date and reps | dt_rp: date and reps | df: fully merged df

    # test_df is the currently tested df and can change depending on previous test results

# test order depends on each previous result (as in if we can't merge across days, then use date_df for rest, for example)

# compare date df to df
# compare top_10 df to minus_top_10 df
# compare reps df to df
# compare strength metrics between males and females
# compare reaction time metrics between males and females
# compare time of day (i.e. animal number in order 15, 2, 1, 6, 10, 4) for all metrics
# compare experiment for all metrics

In [None]:
# Work on an independent (deep) copy so the main dataframe `df` stays untouched.
test_df = df.copy(deep=True)