In [1]:
import sys
import pandas as pd
import numpy as np
import scipy.stats as stats

from constants import MORPHOLOGICAL, TEMPORAL, SPATIAL, feature_names

In [2]:
# Raise pandas' display limits so the wide summary tables below are shown in full.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [3]:
# Result tables read by every section below: the main run and the baseline run.
# NOTE(review): the baseline file name suggests a chance-level, class-balanced control - confirm.
PATH = 'ml/results_rf_290322.csv'
BASE = 'ml/results_rf_100522_chance_balanced.csv'

# Load each table (first CSV column is the index) and keep only the rows whose
# `restriction` value is 'complete'.
df = pd.read_csv(PATH, index_col=0).loc[lambda frame: frame.restriction == 'complete']
df_base = pd.read_csv(BASE, index_col=0).loc[lambda frame: frame.restriction == 'complete']

## Spatial

In [4]:
# Spatial-modality rows at chunk size 25; columns that are entirely NaN are discarded.
spatial_df = df[(df.chunk_size == 25) & (df.modality == 'spatial')].dropna(how='all', axis=1)

# Raw column label -> readable feature name, for every index in SPATIAL except the last.
# NOTE(review): the reason the final SPATIAL entry is excluded is not visible here - confirm in constants.
mapper = {f'test feature {i+1}': feature_names[i] for i in SPATIAL[:-1]}
keep = [f'test feature {i+1}' for i in SPATIAL[:-1]]

# Everything that is not one of the selected feature columns is removed, then the
# survivors are renamed to their readable names.
drop = [c for c in spatial_df.columns if c not in keep]
spatial_df = spatial_df.drop(columns=drop).rename(columns=mapper)

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for each spatial feature column.
spatial_df.describe()

Unnamed: 0,spatial_dispersion_count,spatial_dispersion_sd,spatial_dispersion_area,dep_red,dep_sd,fzc_red,fzc_sd,szc_red,szc_sd,dep_graph_avg_speed,dep_graph_slowest_path,dep_graph_fastest_path,fzc_graph_avg_speed,fzc_graph_slowest_path,fzc_graph_fastest_path,szc_graph_avg_speed,szc_graph_slowest_path,szc_graph_fastest_path
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,0.019258,0.058443,0.031728,0.035669,0.027845,0.063139,0.051633,0.02874,0.045023,0.00964,0.014251,0.006413,0.020086,0.0723,0.005395,0.013227,0.012959,0.012497
std,0.006339,0.017121,0.010504,0.008144,0.00534,0.009957,0.007045,0.004305,0.005264,0.003124,0.004353,0.002231,0.004397,0.009955,0.001999,0.002891,0.002577,0.002751
min,0.009053,0.030911,0.011543,0.022099,0.017391,0.046091,0.038347,0.020238,0.033775,0.003332,0.006407,0.002771,0.014084,0.051748,0.002638,0.007905,0.005728,0.007664
25%,0.014467,0.043755,0.023549,0.029141,0.024118,0.057414,0.046568,0.025986,0.041168,0.00773,0.011179,0.004526,0.017391,0.064755,0.003529,0.010944,0.011207,0.010384
50%,0.018991,0.056425,0.031634,0.034724,0.027476,0.062116,0.051159,0.028232,0.045011,0.009005,0.014082,0.006408,0.019002,0.071313,0.005008,0.012786,0.013312,0.012437
75%,0.021894,0.067822,0.03911,0.040799,0.030855,0.068011,0.056136,0.030989,0.048901,0.011428,0.016427,0.008287,0.022048,0.078043,0.007222,0.015316,0.014641,0.01423
max,0.036648,0.111211,0.057228,0.05894,0.041908,0.09403,0.06958,0.03968,0.055964,0.017001,0.024763,0.0112,0.032934,0.094637,0.009885,0.025115,0.019429,0.019433


In [6]:
# Build the raw-label -> feature-name map locally instead of reusing the notebook-level
# `drop` / `mapper` variables: the temporal and morphological sections rebind those
# names, so relying on them makes this cell break when it is re-run out of order.
spatial_names = {f'test feature {i+1}': feature_names[i] for i in SPATIAL[:-1]}

# Baseline rows matching the spatial selection (chunk size 25, spatial modality).
spatial_df_base = df_base[(df_base.chunk_size == 25) & (df_base.modality == 'spatial')]
spatial_df_base = spatial_df_base.dropna(how='all', axis=1)

# Select the feature columns by name. Dropping a list computed from the *test* frame
# raises KeyError whenever the baseline frame has a different set of columns.
spatial_df_base = spatial_df_base[[c for c in spatial_df_base.columns if c in spatial_names]]
spatial_df_base = spatial_df_base.rename(columns=spatial_names)

# Compare only columns present in both frames, in the test frame's order. This keeps
# the cell re-runnable after the per-family sum columns have been added to spatial_df.
spatial_shared_cols = [c for c in spatial_df.columns if c in spatial_df_base.columns]

for col in spatial_shared_cols:
    col_test = spatial_df[col].to_numpy()
    col_base = spatial_df_base[col].to_numpy()

    # Median and inter-quartile range of each sample.
    test_median, test_prec25, test_prec75 = np.percentile(col_test, [50, 25, 75])
    base_median, base_prec25, base_prec75 = np.percentile(col_base, [50, 25, 75])

    print(f"Median of test {col} column is {test_median} [{test_prec25}, {test_prec75}]")
    print(f"Median of base {col} column is {base_median} [{base_prec25}, {base_prec75}]")

    # One-sided test: are the test values stochastically greater than the baseline values?
    statistic, p_val = stats.mannwhitneyu(col_test, col_base, alternative='greater')
    print(f"Mann-Whitney statistical test results for feature {col} are p-value={p_val} (statistic={statistic})")
    print()

Median of test spatial_dispersion_count column is 0.018991344565626643 [0.014467204223043697, 0.02189368017189274]
Median of base spatial_dispersion_count column is 0.000762749869746702 [0.00013649913635856316, 0.004435113408341001]
Mann-Whitney statistical test results for feature spatial_dispersion_count are p-value=9.75846411252353e-18 (statistic=2483.0)

Median of test spatial_dispersion_sd column is 0.05642479098099444 [0.04375465974480055, 0.06782213230753473]
Median of base spatial_dispersion_sd column is 0.003232878519698299 [0.0014147326186980903, 0.014838949714394723]
Mann-Whitney statistical test results for feature spatial_dispersion_sd are p-value=4.495934685445001e-17 (statistic=2457.0)

Median of test spatial_dispersion_area column is 0.031633850801470936 [0.023548669027378483, 0.039109812237353846]
Median of base spatial_dispersion_area column is 0.0031471374716517016 [0.0018681125281366621, 0.014960100193386085]
Mann-Whitney statistical test results for feature spatial

In [7]:
# Groups of spatial features; each group's columns are summed into one total below.
spatial_families = {
    'value-based': [
        'spatial_dispersion_count', 'spatial_dispersion_sd', 'spatial_dispersion_area',
    ],
    'time-based': [
        'dep_red', 'dep_sd',
        'fzc_red', 'fzc_sd',
        'szc_red', 'szc_sd',
    ],
    'graph-based': [
        'dep_graph_avg_speed', 'dep_graph_slowest_path', 'dep_graph_fastest_path',
        'fzc_graph_avg_speed', 'fzc_graph_slowest_path', 'fzc_graph_fastest_path',
        'szc_graph_avg_speed', 'szc_graph_slowest_path', 'szc_graph_fastest_path',
    ],
}

# Add one column per family holding the row-wise sum of its member features.
# Note: this extends spatial_df in place, so the frame gains three extra columns.
for family, members in spatial_families.items():
    spatial_df[family] = spatial_df[members].sum(axis=1)

In [8]:
# Keep only the per-family totals (one column per key of spatial_families) and summarise them.
spatial_fams_df = spatial_df[list(spatial_families)]
spatial_fams_df.describe()

Unnamed: 0,value-based,time-based,graph-based
count,50.0,50.0,50.0
mean,0.109428,0.252049,0.166768
std,0.026085,0.014809,0.010581
min,0.061902,0.222347,0.143385
25%,0.087887,0.241442,0.16119
50%,0.113492,0.251294,0.16661
75%,0.128816,0.261832,0.172597
max,0.162594,0.291679,0.190046


In [9]:
# Per-family totals as plain arrays.
value_based = spatial_fams_df['value-based'].to_numpy()
time_based = spatial_fams_df['time-based'].to_numpy()
graph_based = spatial_fams_df['graph-based'].to_numpy()

# Pairwise comparisons between the three family totals.
# `alternative` is pinned to 'two-sided' because no direction is hypothesised here and
# the default differs between SciPy versions: older releases (before the 1.7 rewrite)
# default to a deprecated mode that reports min(U1, U2) and half the two-sided p-value.
spatial_pairs = [
    ('spd vs time lag', value_based, time_based),
    ('spd vs graph', value_based, graph_based),
    ('time lag vs graph', time_based, graph_based),
]
for pair_label, sample_a, sample_b in spatial_pairs:
    statistic, p_val = stats.mannwhitneyu(sample_a, sample_b, alternative='two-sided')
    print(f"Mann-Whitney statistical test results for {pair_label} are p-value={p_val} (statistic={statistic})")

Mann-Whitney statistical test results for spd vs time lag are p-value=3.533035965194466e-18 (statistic=0.0)
Mann-Whitney statistical test results for spd vs graph are p-value=2.819367238236675e-17 (statistic=35.0)
Mann-Whitney statistical test results for time lag vs graph are p-value=3.533035965194466e-18 (statistic=0.0)


## Temporal

In [10]:
# Temporal-modality rows at chunk size 800; columns that are entirely NaN are discarded.
temporal_df = df[(df.chunk_size == 800) & (df.modality == 'temporal')].dropna(how='all', axis=1)

# Raw column label -> readable feature name, for every index in TEMPORAL except the last.
# NOTE(review): the reason the final TEMPORAL entry is excluded is not visible here - confirm in constants.
mapper = {f'test feature {i+1}': feature_names[i] for i in TEMPORAL[:-1]}
keep = [f'test feature {i+1}' for i in TEMPORAL[:-1]]

# Everything that is not one of the selected feature columns is removed, then the
# survivors are renamed to their readable names.
drop = [c for c in temporal_df.columns if c not in keep]
temporal_df = temporal_df.drop(columns=drop).rename(columns=mapper)

In [11]:
# Summary statistics (count, mean, std, quartiles, min/max) for each temporal feature column.
temporal_df.describe()

Unnamed: 0,firing_rate,d_kl_start,d_kl_mid,jump,psd_center,der_psd_center,rise_time,unif_dist
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,0.087031,0.044065,0.152247,0.053496,0.014397,0.005942,0.054236,0.113284
std,0.01746,0.011069,0.015606,0.009466,0.003299,0.001262,0.010251,0.014072
min,0.026596,0.019195,0.111697,0.022683,0.00828,0.003852,0.033203,0.08716
25%,0.076712,0.038511,0.141429,0.048797,0.011963,0.005116,0.047422,0.104253
50%,0.086425,0.044727,0.150269,0.053942,0.014737,0.005567,0.054282,0.111365
75%,0.101863,0.050708,0.162917,0.059275,0.016648,0.006645,0.061086,0.118013
max,0.115412,0.071568,0.19583,0.075442,0.02239,0.00945,0.083833,0.14445


In [12]:
# Build the raw-label -> feature-name map locally instead of reusing the notebook-level
# `drop` / `mapper` variables: other sections rebind those names, so relying on them
# makes this cell break when it is re-run out of order.
temporal_names = {f'test feature {i+1}': feature_names[i] for i in TEMPORAL[:-1]}

# Baseline rows matching the temporal selection (chunk size 800, temporal modality).
temporal_df_base = df_base[(df_base.chunk_size == 800) & (df_base.modality == 'temporal')]
temporal_df_base = temporal_df_base.dropna(how='all', axis=1)

# Select the feature columns by name. Dropping a list computed from the *test* frame
# raises KeyError whenever the baseline frame has a different set of columns.
temporal_df_base = temporal_df_base[[c for c in temporal_df_base.columns if c in temporal_names]]
temporal_df_base = temporal_df_base.rename(columns=temporal_names)

# Compare only columns present in both frames, in the test frame's order. This keeps
# the cell re-runnable after the per-family sum columns have been added to temporal_df.
temporal_shared_cols = [c for c in temporal_df.columns if c in temporal_df_base.columns]

for col in temporal_shared_cols:
    col_test = temporal_df[col].to_numpy()
    col_base = temporal_df_base[col].to_numpy()

    # Median and inter-quartile range of each sample.
    test_median, test_prec25, test_prec75 = np.percentile(col_test, [50, 25, 75])
    base_median, base_prec25, base_prec75 = np.percentile(col_base, [50, 25, 75])

    print(f"Median of test {col} column is {test_median} [{test_prec25}, {test_prec75}]")
    print(f"Median of base {col} column is {base_median} [{base_prec25}, {base_prec75}]")

    # One-sided test: are the test values stochastically greater than the baseline values?
    statistic, p_val = stats.mannwhitneyu(col_test, col_base, alternative='greater')
    print(f"Mann-Whitney statistical test results for feature {col} are p-value={p_val} (statistic={statistic})")
    print()

Median of test firing_rate column is 0.08642455502192373 [0.07671230385376865, 0.10186277054343819]
Median of base firing_rate column is 0.016516217464446245 [0.006505234578000194, 0.03322154780299706]
Mann-Whitney statistical test results for feature firing_rate are p-value=4.432167636937588e-16 (statistic=2417.0)

Median of test d_kl_start column is 0.04472722787576548 [0.03851128997385544, 0.05070818368305733]
Median of base d_kl_start column is 0.015568285000797887 [0.007320871806926052, 0.029712829906454713]
Mann-Whitney statistical test results for feature d_kl_start are p-value=7.13292606270825e-09 (statistic=2073.0)

Median of test d_kl_mid column is 0.15026869252651015 [0.14142938489459253, 0.16291650829465304]
Median of base d_kl_mid column is 0.017431048456719445 [0.008097744102484788, 0.03219746897456569]
Mann-Whitney statistical test results for feature d_kl_mid are p-value=3.533035965194466e-18 (statistic=2500.0)

Median of test jump column is 0.053942103048228295 [0.0487

In [13]:
# Groups of temporal features; each group's columns are summed into one total below.
temporal_families = {
    'short': ['d_kl_start', 'unif_dist', 'rise_time'],
    'long': ['d_kl_mid', 'jump'],
    'wb': ['firing_rate', 'psd_center', 'der_psd_center'],
}

# Add one column per family holding the row-wise sum of its member features.
# Note: this extends temporal_df in place, so the frame gains three extra columns.
for family, members in temporal_families.items():
    temporal_df[family] = temporal_df[members].sum(axis=1)

In [14]:
# Keep only the per-family totals (one column per key of temporal_families) and summarise them.
temporal_fams_df = temporal_df[list(temporal_families)]
temporal_fams_df.describe()

Unnamed: 0,short,long,wb
count,50.0,50.0,50.0
mean,0.211585,0.205743,0.10737
std,0.017423,0.013835,0.018577
min,0.177947,0.170321,0.050085
25%,0.198735,0.195974,0.097235
50%,0.21229,0.205285,0.105786
75%,0.220138,0.214322,0.12105
max,0.249922,0.237449,0.140289


In [15]:
# Per-family totals as plain arrays.
short = temporal_fams_df['short'].to_numpy()
long = temporal_fams_df['long'].to_numpy()
wb = temporal_fams_df['wb'].to_numpy()

# Pairwise comparisons between the three family totals.
# The third comparison is long vs wb; it was previously printed with the label
# "short vs wb", which made two different tests indistinguishable in the output.
# `alternative` is pinned to 'two-sided' because no direction is hypothesised here and
# the default differs between SciPy versions: older releases (before the 1.7 rewrite)
# default to a deprecated mode that reports min(U1, U2) and half the two-sided p-value.
temporal_pairs = [
    ('short vs long', short, long),
    ('short vs wb', short, wb),
    ('long vs wb', long, wb),
]
for pair_label, sample_a, sample_b in temporal_pairs:
    statistic, p_val = stats.mannwhitneyu(sample_a, sample_b, alternative='two-sided')
    print(f"Mann-Whitney statistical test results for {pair_label} are p-value={p_val} (statistic={statistic})")

Mann-Whitney statistical test results for short vs long are p-value=0.05150971731880441 (statistic=1013.0)
Mann-Whitney statistical test results for short vs wb are p-value=3.533035965194466e-18 (statistic=0.0)
Mann-Whitney statistical test results for short vs wb are p-value=3.533035965194466e-18 (statistic=0.0)


## WF (Morphological)

In [16]:
# Morphological-modality rows at chunk size 800; columns that are entirely NaN are discarded.
morph_df = df[(df.chunk_size == 800) & (df.modality == 'morphological')].dropna(how='all', axis=1)

# Raw column label -> readable feature name, for every index in MORPHOLOGICAL except the last.
# NOTE(review): the reason the final MORPHOLOGICAL entry is excluded is not visible here - confirm in constants.
mapper = {f'test feature {i+1}': feature_names[i] for i in MORPHOLOGICAL[:-1]}
keep = [f'test feature {i+1}' for i in MORPHOLOGICAL[:-1]]

# Everything that is not one of the selected feature columns is removed, then the
# survivors are renamed to their readable names.
drop = [c for c in morph_df.columns if c not in keep]
morph_df = morph_df.drop(columns=drop).rename(columns=mapper)

In [17]:
# Summary statistics (count, mean, std, quartiles, min/max) for each morphological feature column.
morph_df.describe()

Unnamed: 0,break_measure,fwhm,get_acc,max_speed,peak2peak,trough2peak,rise_coef,smile_cry
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,0.006434,0.029031,0.104781,0.009399,0.128916,0.20362,0.009885,0.032507
std,0.00207,0.012791,0.021722,0.004087,0.022366,0.032143,0.004837,0.012497
min,0.002446,0.006071,0.049581,0.002093,0.061892,0.155509,0.001187,0.009446
25%,0.004901,0.022513,0.090842,0.006481,0.123358,0.17906,0.006995,0.025205
50%,0.00629,0.026655,0.105666,0.00833,0.130038,0.198458,0.009638,0.031454
75%,0.007447,0.03334,0.114034,0.011358,0.14111,0.216723,0.011474,0.041643
max,0.013381,0.087564,0.16535,0.020086,0.167905,0.312614,0.028532,0.065323


In [18]:
# Build the raw-label -> feature-name map locally instead of reusing the notebook-level
# `drop` / `mapper` variables: other sections rebind those names, so relying on them
# makes this cell break when it is re-run out of order.
morph_names = {f'test feature {i+1}': feature_names[i] for i in MORPHOLOGICAL[:-1]}

# Baseline rows matching the morphological selection (chunk size 800, morphological modality).
morph_df_base = df_base[(df_base.chunk_size == 800) & (df_base.modality == 'morphological')]
morph_df_base = morph_df_base.dropna(how='all', axis=1)

# Select the feature columns by name. Dropping a list computed from the *test* frame
# raises KeyError whenever the baseline frame has a different set of columns.
morph_df_base = morph_df_base[[c for c in morph_df_base.columns if c in morph_names]]
morph_df_base = morph_df_base.rename(columns=morph_names)

# Compare only columns present in both frames, in the test frame's order. This keeps
# the cell re-runnable after the per-family sum columns have been added to morph_df.
morph_shared_cols = [c for c in morph_df.columns if c in morph_df_base.columns]

for col in morph_shared_cols:
    col_test = morph_df[col].to_numpy()
    col_base = morph_df_base[col].to_numpy()

    # Median and inter-quartile range of each sample.
    test_median, test_prec25, test_prec75 = np.percentile(col_test, [50, 25, 75])
    base_median, base_prec25, base_prec75 = np.percentile(col_base, [50, 25, 75])

    print(f"Median of test {col} column is {test_median} [{test_prec25}, {test_prec75}]")
    print(f"Median of base {col} column is {base_median} [{base_prec25}, {base_prec75}]")

    # One-sided test: are the test values stochastically greater than the baseline values?
    statistic, p_val = stats.mannwhitneyu(col_test, col_base, alternative='greater')
    print(f"Mann-Whitney statistical test results for feature {col} are p-value={p_val} (statistic={statistic})")
    print()

Median of test break_measure column is 0.006290068559986869 [0.0049014040395007055, 0.0074465328017902545]
Median of base break_measure column is 0.018596447807124633 [0.010007567980126846, 0.03007820472513449]
Mann-Whitney statistical test results for feature break_measure are p-value=0.9999999999355599 (statistic=318.0)

Median of test fwhm column is 0.026655396674595464 [0.02251320875155285, 0.03333963498351692]
Median of base fwhm column is 0.0074789350527292115 [0.005019649586453379, 0.01565499470816915]
Mann-Whitney statistical test results for feature fwhm are p-value=1.3178582755491657e-09 (statistic=2114.0)

Median of test get_acc column is 0.10566587422029056 [0.09084170386223323, 0.11403398687374647]
Median of base get_acc column is 0.014869591514665801 [0.009090014454518798, 0.03500368640539315]
Mann-Whitney statistical test results for feature get_acc are p-value=2.2485208149267606e-16 (statistic=2429.0)

Median of test max_speed column is 0.008329826025534585 [0.006481367

In [19]:
# Groups of morphological features; each group's columns are summed into one total below.
morph_families = {
    'WF': ['trough2peak', 'peak2peak', 'fwhm', 'rise_coef'],
    'first': ['max_speed'],
    'second': ['break_measure', 'smile_cry', 'get_acc'],
}

# Add one column per family holding the row-wise sum of its member features.
# Note: this extends morph_df in place, so the frame gains three extra columns.
for family, members in morph_families.items():
    morph_df[family] = morph_df[members].sum(axis=1)

In [20]:
# Keep only the per-family totals (one column per key of morph_families) and summarise them.
morph_fams_df = morph_df[list(morph_families)]
morph_fams_df.describe()

Unnamed: 0,WF,first,second
count,50.0,50.0,50.0
mean,0.371452,0.009399,0.143722
std,0.02682,0.004087,0.027379
min,0.309516,0.002093,0.069636
25%,0.354583,0.006481,0.128598
50%,0.370537,0.00833,0.14271
75%,0.388478,0.011358,0.159242
max,0.44701,0.020086,0.216445


In [21]:
# Per-family totals as plain arrays (`org` holds the 'WF' family).
org = morph_fams_df['WF'].to_numpy()
first = morph_fams_df['first'].to_numpy()
second = morph_fams_df['second'].to_numpy()

# Compare the WF family total against each of the other two families.
# `alternative` is pinned to 'two-sided' because no direction is hypothesised here and
# the default differs between SciPy versions: older releases (before the 1.7 rewrite)
# default to a deprecated mode that reports min(U1, U2) and half the two-sided p-value.
morph_pairs = [
    ('WF vs first', org, first),
    ('WF vs second', org, second),
]
for pair_label, sample_a, sample_b in morph_pairs:
    statistic, p_val = stats.mannwhitneyu(sample_a, sample_b, alternative='two-sided')
    print(f"Mann-Whitney statistical test results for {pair_label} are p-value={p_val} (statistic={statistic})")

Mann-Whitney statistical test results for WF vs first are p-value=3.533035965194466e-18 (statistic=0.0)
Mann-Whitney statistical test results for WF vs second are p-value=3.533035965194466e-18 (statistic=0.0)
