In [1]:
import sys
import pandas as pd
import numpy as np
import scipy.stats as stats

from constants import MORPHOLOGICAL, TEMPORAL, SPATIAL, feature_names

In [2]:
pd.options.display.max_columns = 500

In [3]:
PATH = 'ml/results_rf_290322.csv'
BASE = 'ml/results_rf_310322_chance_balanced.csv'

df = pd.read_csv(PATH, index_col=0)
df = df[df.restriction == 'complete']
df = df[df.chunk_size == 0]
 
df_base = pd.read_csv(BASE, index_col=0)
df_base = df_base[df_base.restriction == 'complete']
df_base = df_base[df_base.chunk_size == 0]

## Spatial

In [4]:
spatial_df = df[df.modality == 'spatial']
spatial_df = spatial_df.dropna(how='all', axis=1)
keep = [f'test feature {i+1}' for i in SPATIAL[:-1]]
drop = [c for c in spatial_df.columns if c not in keep]
spatial_df = spatial_df.drop(columns=drop)
mapper = {f'test feature {i+1}': feature_names[i] for i in SPATIAL[:-1]}
spatial_df = spatial_df.rename(columns=mapper)

In [5]:
spatial_df.describe()

Unnamed: 0,spatial_dispersion_count,spatial_dispersion_sd,spatial_dispersion_area,dep_red,dep_sd,fzc_red,fzc_sd,szc_red,szc_sd,dep_graph_avg_speed,dep_graph_slowest_path,dep_graph_fastest_path,fzc_graph_avg_speed,fzc_graph_slowest_path,fzc_graph_fastest_path,szc_graph_avg_speed,szc_graph_slowest_path,szc_graph_fastest_path
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,0.008157,0.022639,0.012875,0.038441,0.034296,0.069903,0.046033,0.010683,0.019578,0.01759,0.020301,0.012056,0.021556,0.063163,0.012582,0.021149,0.015083,0.012384
std,0.003486,0.005773,0.004187,0.009396,0.007402,0.010528,0.006553,0.002496,0.005734,0.004641,0.005943,0.003428,0.005174,0.00819,0.003555,0.005211,0.004028,0.003693
min,0.002394,0.011486,0.005234,0.020363,0.019161,0.047404,0.03214,0.006844,0.007898,0.007571,0.010005,0.006634,0.005972,0.038346,0.005654,0.009828,0.008478,0.004772
25%,0.005683,0.018831,0.008811,0.030304,0.029692,0.061267,0.041863,0.008139,0.016065,0.014211,0.016627,0.009288,0.018613,0.058623,0.010653,0.016863,0.012389,0.009731
50%,0.007561,0.02187,0.013089,0.037615,0.033859,0.070965,0.045569,0.010798,0.018779,0.017128,0.019148,0.012274,0.021413,0.062288,0.011859,0.021352,0.015126,0.012881
75%,0.010354,0.027174,0.015569,0.045413,0.038427,0.077899,0.050663,0.012171,0.023099,0.019669,0.023761,0.013883,0.024288,0.068641,0.014512,0.024704,0.01759,0.013849
max,0.018238,0.034083,0.020488,0.061534,0.056202,0.088816,0.061116,0.01809,0.037043,0.02759,0.037424,0.021799,0.034057,0.079624,0.022949,0.032519,0.026987,0.020881


In [6]:
spatial_df_base = df_base[df_base.modality == 'spatial']
spatial_df_base = spatial_df_base.dropna(how='all', axis=1)
spatial_df_base = spatial_df_base.drop(columns=drop)
spatial_df_base = spatial_df_base.rename(columns=mapper)

for col in spatial_df.columns:
    col_test = spatial_df[col].to_numpy()
    col_base = spatial_df_base[col].to_numpy()
    
    test_median, test_prec25, test_prec75 = np.percentile(col_test, [50, 25, 75])
    base_median, base_prec25, base_prec75 = np.percentile(col_base, [50, 25, 75])
    
    print(f"Median of test {col} column is {test_median} [{test_prec25}, {test_prec75}]")
    print(f"Median of base {col} column is {base_median} [{base_prec25}, {base_prec75}]")
    
    statistic, p_val = stats.mannwhitneyu(col_test, col_base, alternative='greater')
    print(f"Mann-Whitney statistical test results for feature {col} are p-value={p_val} (statistic={statistic})")
    print()

Median of test spatial_dispersion_count column is 0.007560866764329715 [0.005683382003977987, 0.010354124025802968]
Median of base spatial_dispersion_count column is 0.007041026756129176 [0.0007181572344090141, 0.017858875895120335]
Mann-Whitney statistical test results for feature spatial_dispersion_count are p-value=0.3157800554343734 (statistic=1320.0)

Median of test spatial_dispersion_sd column is 0.021870090322510485 [0.018831197043526413, 0.027174286800734096]
Median of base spatial_dispersion_sd column is 0.024264596939157078 [0.016083419370950938, 0.03660531439831923]
Mann-Whitney statistical test results for feature spatial_dispersion_sd are p-value=0.8982980874234672 (statistic=1066.0)

Median of test spatial_dispersion_area column is 0.013088653565821467 [0.008810958959543064, 0.015569175283565736]
Median of base spatial_dispersion_area column is 0.02369019140630764 [0.013339157592098035, 0.03398820932875382]
Mann-Whitney statistical test results for feature spatial_dispers

In [7]:
spatial_families = {'SPD': ['spatial_dispersion_count', 'spatial_dispersion_sd', 'spatial_dispersion_area'],
                  'time_lag': ['dep_red', 'dep_sd','fzc_red', 'fzc_sd', 'szc_red', 'szc_sd'],
                  'graph': ['dep_graph_avg_speed', 'dep_graph_slowest_path', 'dep_graph_fastest_path',
                           'fzc_graph_avg_speed', 'fzc_graph_slowest_path', 'fzc_graph_fastest_path',
                           'szc_graph_avg_speed', 'szc_graph_slowest_path', 'szc_graph_fastest_path']}

for fam in spatial_families:
    spatial_df[fam] = spatial_df[spatial_families[fam]].sum(axis=1)

In [8]:
spatial_fams_df = spatial_df[[fam for fam in spatial_families]]
spatial_fams_df.describe()

Unnamed: 0,SPD,time_lag,graph
count,50.0,50.0,50.0
mean,0.043671,0.218934,0.195864
std,0.009686,0.017582,0.016523
min,0.022169,0.18665,0.152139
25%,0.036502,0.205508,0.187862
50%,0.044168,0.216428,0.199188
75%,0.049776,0.228643,0.205598
max,0.067595,0.279442,0.247232


In [9]:
spd = spatial_fams_df['SPD'].to_numpy()
time_lag =  spatial_fams_df['time_lag'].to_numpy()
graph = spatial_fams_df['graph'].to_numpy()
statistic, p_val = stats.mannwhitneyu(spd, time_lag)
print(f"Mann-Whitney statistical test results for spd vs time lag are p-value={p_val} (statistic={statistic})")
statistic, p_val = stats.mannwhitneyu(spd, graph)
print(f"Mann-Whitney statistical test results for spd vs graph are p-value={p_val} (statistic={statistic})")

Mann-Whitney statistical test results for spd vs time lag are p-value=3.533035965194466e-18 (statistic=0.0)
Mann-Whitney statistical test results for spd vs graph are p-value=3.533035965194466e-18 (statistic=0.0)


## Temporal

In [10]:
temporal_df = df[df.modality == 'temporal']
temporal_df = temporal_df.dropna(how='all', axis=1)
keep = [f'test feature {i+1}' for i in TEMPORAL[:-1]]
drop = [c for c in temporal_df.columns if c not in keep]
temporal_df = temporal_df.drop(columns=drop)
mapper = {f'test feature {i+1}': feature_names[i] for i in TEMPORAL[:-1]}
temporal_df = temporal_df.rename(columns=mapper)

In [11]:
temporal_df.describe()

Unnamed: 0,firing_rate,d_kl_start,d_kl_mid,jump,psd_center,der_psd_center,rise_time,unif_dist
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,0.052899,0.105035,0.03342,0.092405,0.012436,0.0095,0.078004,0.107604
std,0.011018,0.018072,0.010276,0.017967,0.004968,0.002914,0.018191,0.017256
min,0.006244,0.06303,0.006426,0.068679,0.005715,0.004536,0.045992,0.046523
25%,0.046924,0.096629,0.029169,0.083318,0.009824,0.007329,0.068723,0.103108
50%,0.05188,0.104068,0.034059,0.090258,0.011709,0.009118,0.076061,0.108339
75%,0.058698,0.113917,0.038758,0.096716,0.014569,0.011002,0.088071,0.114505
max,0.078281,0.186239,0.068116,0.185832,0.037576,0.018495,0.136932,0.164365


In [12]:
temporal_df_base = df_base[df_base.modality == 'temporal']
temporal_df_base = temporal_df_base.dropna(how='all', axis=1)
temporal_df_base = temporal_df_base.drop(columns=drop)
temporal_df_base = temporal_df_base.rename(columns=mapper)

for col in temporal_df.columns:
    col_test = temporal_df[col].to_numpy()
    col_base = temporal_df_base[col].to_numpy()
    
    test_median, test_prec25, test_prec75 = np.percentile(col_test, [50, 25, 75])
    base_median, base_prec25, base_prec75 = np.percentile(col_base, [50, 25, 75])
    
    print(f"Median of test {col} column is {test_median} [{test_prec25}, {test_prec75}]")
    print(f"Median of base {col} column is {base_median} [{base_prec25}, {base_prec75}]")
    
    statistic, p_val = stats.mannwhitneyu(col_test, col_base, alternative='greater')
    print(f"Mann-Whitney statistical test results for feature {col} are p-value={p_val} (statistic={statistic})")
    print()

Median of test firing_rate column is 0.051879928673604185 [0.04692356464597692, 0.05869761229930266]
Median of base firing_rate column is 0.03829658380013462 [0.023315651285245355, 0.06439050672051325]
Mann-Whitney statistical test results for feature firing_rate are p-value=0.004817937304755006 (statistic=1626.0)

Median of test d_kl_start column is 0.1040682573644285 [0.09662923922992477, 0.1139172146512519]
Median of base d_kl_start column is 0.04139203064877442 [0.024169827705760805, 0.06424576508042038]
Mann-Whitney statistical test results for feature d_kl_start are p-value=9.20877522024638e-12 (statistic=2225.0)

Median of test d_kl_mid column is 0.034059464422271715 [0.029169156976765566, 0.03875761177911381]
Median of base d_kl_mid column is 0.04006437864454516 [0.021583831853368155, 0.07213037960234733]
Mann-Whitney statistical test results for feature d_kl_mid are p-value=0.952713809874565 (statistic=1008.0)

Median of test jump column is 0.09025846809002697 [0.0833182092598

In [13]:
temporal_families = {'short': ['d_kl_start', 'unif_dist', 'rise_time'],
                     'long': ['d_kl_mid', 'jump'],
                     'general': ['firing_rate', 'psd_center', 'der_psd_center']}

for fam in temporal_families:
    temporal_df[fam] = temporal_df[temporal_families[fam]].sum(axis=1)

In [14]:
temporal_fams_df = temporal_df[[fam for fam in temporal_families]]
temporal_fams_df.describe()

Unnamed: 0,short,long,general
count,50.0,50.0,50.0
mean,0.290643,0.125825,0.074835
std,0.016133,0.01672,0.012568
min,0.236666,0.098497,0.030454
25%,0.281345,0.115836,0.067393
50%,0.292104,0.124713,0.074867
75%,0.30012,0.132389,0.083284
max,0.344703,0.192518,0.099593


In [15]:
hf = temporal_fams_df['short'].to_numpy()
lf =  temporal_fams_df['long'].to_numpy()
wb = temporal_fams_df['general'].to_numpy()
statistic, p_val = stats.mannwhitneyu(hf, lf)
print(f"Mann-Whitney statistical test results for short vs long are p-value={p_val} (statistic={statistic})")
statistic, p_val = stats.mannwhitneyu(hf, wb)
print(f"Mann-Whitney statistical test results for short vs wb are p-value={p_val} (statistic={statistic})")

Mann-Whitney statistical test results for short vs long are p-value=3.533035965194466e-18 (statistic=0.0)
Mann-Whitney statistical test results for short vs wb are p-value=3.533035965194466e-18 (statistic=0.0)


## WF (Morphological)

In [16]:
morph_df = df[df.modality == 'morphological']
morph_df = morph_df.dropna(how='all', axis=1)
keep = [f'test feature {i+1}' for i in MORPHOLOGICAL[:-1]]
drop = [c for c in morph_df.columns if c not in keep]
morph_df = morph_df.drop(columns=drop)
mapper = {f'test feature {i+1}': feature_names[i] for i in MORPHOLOGICAL[:-1]}
morph_df = morph_df.rename(columns=mapper)

In [17]:
morph_df.describe()

Unnamed: 0,break_measure,fwhm,get_acc,max_speed,peak2peak,trough2peak,rise_coef,smile_cry
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,0.008521,0.035224,0.101185,0.009032865,0.097699,0.190561,0.01137827,0.035826
std,0.003545,0.017608,0.030634,0.004271006,0.022694,0.039112,0.004846682,0.012157
min,0.00062,0.003057,0.032343,4.0365680000000004e-17,0.025971,0.115236,1.912679e-17,0.005013
25%,0.006373,0.026276,0.083422,0.006592689,0.08753,0.163373,0.008671741,0.029316
50%,0.009037,0.033434,0.09802,0.009046122,0.098019,0.19232,0.01082665,0.036026
75%,0.01069,0.037255,0.115389,0.01077517,0.109689,0.217798,0.01450952,0.042025
max,0.015998,0.119573,0.205952,0.02382298,0.157351,0.321972,0.02576309,0.058694


In [18]:
morph_df_base = df_base[df_base.modality == 'morphological']
morph_df_base = morph_df_base.dropna(how='all', axis=1)
morph_df_base = morph_df_base.drop(columns=drop)
morph_df_base = morph_df_base.rename(columns=mapper)

for col in morph_df.columns:
    col_test = morph_df[col].to_numpy()
    col_base = morph_df_base[col].to_numpy()
    
    test_median, test_prec25, test_prec75 = np.percentile(col_test, [50, 25, 75])
    base_median, base_prec25, base_prec75 = np.percentile(col_base, [50, 25, 75])
    
    print(f"Median of test {col} column is {test_median} [{test_prec25}, {test_prec75}]")
    print(f"Median of base {col} column is {base_median} [{base_prec25}, {base_prec75}]")
    
    statistic, p_val = stats.mannwhitneyu(col_test, col_base, alternative='greater')
    print(f"Mann-Whitney statistical test results for feature {col} are p-value={p_val} (statistic={statistic})")
    print()

Median of test break_measure column is 0.009037026611095764 [0.006373333199739064, 0.010689796671595322]
Median of base break_measure column is 0.04680470119665257 [0.03347800825164413, 0.07255631741175245]
Mann-Whitney statistical test results for feature break_measure are p-value=0.9999999999999947 (statistic=129.0)

Median of test fwhm column is 0.03343398969362932 [0.026275573482633006, 0.03725518846725583]
Median of base fwhm column is 0.03414247525118232 [0.018709312045894398, 0.05936908344658501]
Mann-Whitney statistical test results for feature fwhm are p-value=0.6361332590062931 (statistic=1200.0)

Median of test get_acc column is 0.09802039850451001 [0.08342243129659441, 0.11538873204148653]
Median of base get_acc column is 0.05326338083463089 [0.036494628521940374, 0.0773086367972419]
Mann-Whitney statistical test results for feature get_acc are p-value=2.9103215693128533e-09 (statistic=2095.0)

Median of test max_speed column is 0.009046122356535251 [0.006592688723489834, 0

In [19]:
morph_families = {'WF': ['trough2peak', 'peak2peak', 'fwhm', 'rise_coef'],
                  'first': ['max_speed'],
                  'second': ['break_measure', 'smile_cry', 'get_acc']}

for fam in morph_families:
    morph_df[fam] = morph_df[morph_families[fam]].sum(axis=1)

In [20]:
morph_fams_df = morph_df[[fam for fam in morph_families]]
morph_fams_df.describe()

Unnamed: 0,WF,first,second
count,50.0,50.0,50.0
mean,0.334862,0.009032865,0.145532
std,0.039035,0.004271006,0.032946
min,0.257241,4.0365680000000004e-17,0.064161
25%,0.305781,0.006592689,0.124677
50%,0.338627,0.009046122,0.144082
75%,0.361445,0.01077517,0.163922
max,0.414542,0.02382298,0.255883


In [29]:
from perm_test import calc_perm_test
wf = morph_fams_df['WF'].to_numpy()
first =  morph_fams_df['first'].to_numpy()
calc_perm_test(wf, first, reps=1000)
#statistic, p_val = stats.mannwhitneyu(wf, second)
#print(f"Mann-Whitney statistical test results for wf vs second derivative are p-value={p_val} (statistic={statistic})")

0.000999000999000999

In [25]:
statistic, p_val = stats.mannwhitneyu([0.83, 0.85, 0.88], [0.26, 0.28, 0.55, 0.69])

In [26]:
p_val

0.02591496360895484

In [None]:
from perm_test import calc_perm_test
calc_perm_test([5, 6, 7], [1, 2, 3], reps=10000)