# Individual subject averages



In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import joblib
from glob import glob
from pathlib import Path
import json
import os.path as op
import pyarrow.feather as feather

import mne
import mne.stats
from mne.stats import linear_regression, fdr_correction
from mne.viz import plot_compare_evokeds
from mne.channels import find_ch_adjacency, make_1020_channel_selections

from mne.stats import spatio_temporal_cluster_test
from scipy.stats import ttest_ind, zscore
from statsmodels.stats.multitest import multipletests

# Restrict MNE's logging to error messages only.
mne.set_log_level(verbose='error')

# Remove the cap on the number of DataFrame rows pandas will display.
pd.set_option('display.max_rows', None)

## Read config file

In [None]:
bids_root = '../..'

# All study-specific settings are read from a single JSON file at the BIDS root.
config_file = op.join(bids_root, 'config.json')
# Context manager so the file handle is closed as soon as parsing is done.
with open(config_file) as config_fh:
    config = json.load(config_fh)

study_name = config['Study']['Name']
# Task label used when building the epochs file names (..._task-<task>-epo.fif).
# It must live in its own variable: assigning it to `study_name` would both
# overwrite the study name and leave `task` undefined for the loading cell.
task = config['Study']['TaskName']
data_type = config['EEG']['data_type']
eog = config['EEG']['eog']
montage_fname = config['EEG']['montage']

epoch_p = config['Preprocessing']['Epoch']
outlier_thresh = config['Analysis']['outlier_thresh']

# NOTE(review): eval() executes whatever expression the config file contains.
# Acceptable only while config.json is trusted; consider ast.literal_eval if
# the baseline is always a plain tuple literal.
baseline = eval(epoch_p['baseline'])

# Per-component measurement settings (t_min, t_max, tw_width, component_meas).
components = config['Analysis']['components']


## Paths

In [None]:
# Input: per-subject epochs (presumably written by the preprocessing notebook).
source_path = op.join(bids_root, 'derivatives', 'erp_preprocessing')

# Everything this notebook writes goes under one derivatives folder.
derivatives_path = op.join(bids_root, 'derivatives', 'erp_indiv_subject_analysis')
out_path = op.join(derivatives_path, 'data')
report_path = op.join(derivatives_path, 'reports')
fig_path = op.join(derivatives_path, 'figures')

# exist_ok=True makes creation idempotent, so no separate exists() check
# (and no window between the check and the mkdir) is needed.
for _out_dir in (derivatives_path, out_path, report_path, fig_path):
    Path(_out_dir).mkdir(parents=True, exist_ok=True)

# Shared settings for the waveform figures saved below.
waveplot_figsize = (18, 6)
fig_format = 'pdf'
waveplot_stem = fig_path + '/waveforms_'

epochs_suffix = '-epo.fif'

## List of subjects

In [None]:
prefix = 'sub-'
# Take each match's own directory name as the subject ID. Slicing a fixed
# number of trailing characters only works when every ID has that exact length.
subjects = sorted(op.basename(s) for s in glob(source_path + '/' + prefix + '*'))
print("n subjects = ", len(subjects))
print(subjects)

## Define conditions and labels

In [None]:
# Fully crossed emotion x colour x target-status labels, followed by the two
# labels that pool over emotion and colour.
conditions = [f'{emotion}/{colour}/{status}'
              for emotion in ('Angry', 'Neutral')
              for colour in ('Grey', 'Red')
              for status in ('target', 'nontarget')]
conditions += ['target', 'nontarget']

# Conditions of interest for the per-subject waveform plots.
coi = ['target', 'nontarget']

# Contrast label -> [first condition, second condition]; difference waves are
# computed as first minus second.
contrasts = {f'{emotion}/{colour}': [f'{emotion}/{colour}/target',
                                     f'{emotion}/{colour}/nontarget']
             for emotion in ('Angry', 'Neutral')
             for colour in ('Grey', 'Red')}
contrasts['Target-Nontarget'] = ['target', 'nontarget']

## Load in data

In [None]:
# Read each subject's epochs file fully into memory, keyed by subject ID.
epochs = {}
for subject in subjects:
    subj_path = op.join(source_path, subject, 'eeg')
    epochs_file = f"{subj_path}/{subject}_task-{task}-epo.fif"
    epochs[subject] = mne.read_epochs(epochs_file, preload=True, verbose=None)


## Create evokeds
Averages over trials for each subject

In [None]:
# evoked[subject][condition] -> baseline-corrected average over that
# subject's trials in that condition.
evoked = {}
for subject in subjects:
    subject_evoked = {}
    for cond in conditions:
        trial_average = epochs[subject][cond].average()
        subject_evoked[cond] = trial_average.apply_baseline(baseline)
    evoked[subject] = subject_evoked

## Create Difference waves


In [None]:
# diff[subject][contrast] -> first condition minus second condition.
# weights=[1, -1] yields the plain difference. The 'equal' strategy gives every
# input a weight of 1/N, which (with a negated second input) halves the
# difference and disagrees with the [1, -1] weighting used for `evoked_diff`.
diff = {}
for subject in subjects:
    diff[subject] = {contr: mne.combine_evoked([evoked[subject][conds[0]],
                                                evoked[subject][conds[1]]],
                                               weights=[1, -1])
                     for contr, conds in contrasts.items()}

## Grand Averages

In [None]:
# Grand average per condition: mean across subjects of each subject's trial average.
gavg = {}
for cond in conditions:
    subject_averages = [epochs[subject][cond].average() for subject in subjects]
    gavg[cond] = mne.grand_average(subject_averages)

## Plot montage


In [None]:
# Any grand average carries the channel layout; use the first condition's.
# (The figure is not saved; chain .savefig(fig_path + '/montage.' + fig_format) to keep it.)
layout_source = gavg[conditions[0]]
layout_source.plot_sensors(show_names=True)
plt.show()

## Define region of interest (ROI)

Cluster of electrodes around the vertex, where we expect N400 to be largest

In [None]:
# config['rois'] is a sequence of dicts; merge them into one
# {roi_name: spec} mapping (later entries win on duplicate names).
rois = {}
for roi_entry in config['rois']:
    rois.update(roi_entry)

# Split every item of each spec on ', ' and keep the first result
# (presumably each spec is a one-item list holding a comma-separated channel string).
rois = {roi: [c.split(', ') for c in chs][0] for roi, chs in rois.items()}

#### Create mask identifying ROI electrodes

In [None]:
chs = pd.Series(gavg[conditions[0]].ch_names)

# Flatten the per-ROI channel lists into a single list of ROI electrodes.
roi_elec = []
for roi_channels in rois.values():
    roi_elec.extend(roi_channels)

# Boolean (channels x time points) array: True on every sample of a ROI channel.
num_tp = gavg[conditions[0]].data.shape[1]
in_roi = chs.isin(roi_elec).to_numpy()
mask = np.tile(in_roi[:, np.newaxis], (1, num_tp))

---
## Visualization

## Joint plot of grand averages

In [None]:
uv_range = 5
ylim = 6

# One joint (butterfly + topomaps) figure per condition, in alphabetical order.
for cond in sorted(conditions):
    # Fresh option dicts on every call, as in a literal written inline.
    butterfly_opts = dict(hline=[0], ylim=dict(eeg=[-ylim, ylim]))
    topo_opts = dict(sensors=False, contours=False,
                     vmin=-uv_range, vmax=uv_range)
    gavg[cond].plot_joint(title=cond,
                          ts_args=butterfly_opts,
                          topomap_args=topo_opts)

## Plot grand averaged waveforms
Across all participants

In [None]:
# evk[condition] -> list with one trial-averaged Evoked per subject.
evk = {}
for cond in conditions:
    evk[cond] = [epochs[subject][cond].average() for subject in subjects]

ylim = {'eeg': [-3.25, 3.5]}
# One panel per ROI, so the figure always matches the configured ROIs.
panels = len(rois)

for contr, conds in contrasts.items():
    # squeeze=False always returns a 2-D axes array, even with a single ROI.
    fig, axs = plt.subplots(1, panels, figsize=waveplot_figsize, squeeze=False)
    for ax, (roi, chans) in zip(axs[0], rois.items()):
        # show=False for every panel: the figure is written to disk once all
        # panels are drawn. (The former per-panel `show` switch was evaluated
        # once, before the loop, and therefore was always False.)
        mne.viz.plot_compare_evokeds({c: evk[c] for c in conds},
                                     picks=chans,
                                     combine='mean',
                                     title='Grand Average',
                                     ylim=ylim,
                                     legend='upper right',
                                     show_sensors='lower right',
                                     axes=ax, show=False
                                     )
    fig.savefig(waveplot_stem + 'grandavg_' + '_'.join(contr.split('/')) + '.' + fig_format)

## Difference waves

In [None]:
# evoked_diff[contrast] -> per-subject difference waves (first minus second condition).
evoked_diff = {}

for contr, conds in contrasts.items():
    minuends, subtrahends = evk[conds[0]], evk[conds[1]]
    subject_diffs = []
    for c1, c2 in zip(minuends, subtrahends):
        subject_diffs.append(mne.combine_evoked([c1, c2], weights=[1, -1]))
    evoked_diff[contr] = subject_diffs
    

In [None]:
ylim = {'eeg': [-3.25, 3.5]}
# One panel per ROI, so the figure always matches the configured ROIs.
panels = len(rois)

for contr in contrasts:
    # squeeze=False always returns a 2-D axes array, even with a single ROI.
    fig, axs = plt.subplots(1, panels, figsize=waveplot_figsize, squeeze=False)
    for ax, (roi, chans) in zip(axs[0], rois.items()):
        # show=False for every panel: the figure is saved after all panels
        # are drawn. (The former `show` switch was computed once with the
        # panel counter at 0, so it could never become True.)
        mne.viz.plot_compare_evokeds({contr: evoked_diff[contr]},
                                     picks=chans,
                                     combine='mean',
                                     title='Grand Average:' + contr,
                                     ylim=ylim,
                                     legend='upper right',
                                     show_sensors='lower right',
                                     axes=ax, show=False
                                     )
    fig.savefig(waveplot_stem + 'grandavg_diff_' + '_'.join(contr.split('/')) + '.' + fig_format)

## Show all difference waveforms overlaid

In [None]:
# Contrasts overlaid in each panel (the pooled Target-Nontarget contrast is left out).
cond_contr = ['Angry/Grey', 'Angry/Red', 'Neutral/Grey', 'Neutral/Red']

ylim = {'eeg': [-3.25, 3.5]}
# One panel per ROI, so the figure always matches the configured ROIs.
panels = len(rois)

# squeeze=False always returns a 2-D axes array, even with a single ROI.
fig, axs = plt.subplots(1, panels, figsize=waveplot_figsize, squeeze=False)
for ax, (roi, chans) in zip(axs[0], rois.items()):
    mne.viz.plot_compare_evokeds({contr: evoked_diff[contr] for contr in cond_contr},
                                 picks=chans,
                                 combine='mean',
                                 # Fixed title: every panel overlays all of
                                 # cond_contr, so no single contrast name applies
                                 # (a `contr` left over from another cell would
                                 # mislabel the figure).
                                 title='Grand Average: difference waves',
                                 ylim=ylim,
                                 legend='upper right',
                                 show_sensors='lower right',
                                 ci=False,
                                 axes=ax, show=False
                                 )
fig.savefig(waveplot_stem + 'grandavg_diff_' + '_all.' + fig_format)

## Topo maps

In [None]:
# Display the grand-average difference wave for one contrast.
# NOTE(review): `contr` is not assigned in this cell; it is whatever value the
# most recently executed loop over contrasts left behind.
mne.grand_average(evoked_diff[contr])

In [None]:
vlim = [-5, 5]

for contr in contrasts:
    # Average the per-subject difference waves once and reuse the result.
    gavg_diff = mne.grand_average(evoked_diff[contr])
    # Centres of consecutive 50 ms windows spanning the epoch, taken from the
    # plotted object's own time axis rather than from `diff[subject]`, which
    # depended on a `subject` value left over from an earlier loop.
    topo_times = np.arange(gavg_diff.tmin + .025, gavg_diff.tmax, .050)
    fig = gavg_diff.plot_topomap(topo_times, average=0.050,
                                 show_names=False, sensors=False, contours=False,
                                 colorbar=False,
                                 vlim=vlim,
                                 title=str(contr),
                                 )

    # NOTE(review): there is no separator between 'topoplot' and the contrast
    # name; kept as-is so existing output file names do not change.
    fig.savefig(waveplot_stem + 'grandavg_diff_topoplot' + '_'.join(contr.split('/')) + '.' + fig_format)

## Generate results for each subject

Saved in an HTML report file for each participant:
- joint plots
- topo plots
- waveform plots
- difference wave plots
- t-tests between target & nontarget




In [None]:
# Per subject: build an HTML report (joint plots, topomaps, waveforms,
# difference waves, t-test table), measure single-trial component amplitudes,
# drop outlier trials, and run condition-contrast t tests.
# Results are accumulated across subjects in df_all / df_ttest_all.
df_all_list = []
df_ttest_all_list = []

for subject in subjects:
    report = mne.Report(subject=subject,
                        title=study_name + ' participant_id-level analysis: ' + subject,
                        verbose='WARNING')

    # joint plot -- drawn from this subject's own averages so that each
    # report shows that participant's data rather than the grand average
    uv_range = 5
    ylim = 6
    for cond in sorted(conditions):
        fig = evoked[subject][cond].plot_joint(title=(cond),
                                               ts_args={'hline': [0],
                                                        'ylim': {'eeg': [-ylim, ylim]}
                                                        },
                                               topomap_args={'sensors': False, 'contours': False,
                                                             'vmin': -uv_range, 'vmax': uv_range},
                                               show=False
                                               )
        report.add_figure(fig=fig, title=cond)
        plt.close(fig)

    # topo plot: 50 ms averages of the subject's difference wave per contrast
    vlim = [-3, 3]
    for contr in contrasts:
        subj_diff = diff[subject][contr]
        fig = subj_diff.plot_topomap(np.arange(subj_diff.tmin + .025, subj_diff.tmax, .050), average=0.050,
                                     show_names=False, sensors=False, contours=False,
                                     colorbar=False,
                                     vlim=vlim,
                                     title=str(subject + ' ' + contr),
                                     show=False)
        report.add_figure(fig=fig, title=contr)
        plt.close(fig)

    # waveform plot: conditions of interest, one panel per ROI
    ylim = {'eeg': [-3.25, 3.5]}
    panels = 3
    fig, axs = plt.subplots(1, panels, figsize=(18, 6))
    ax = 0
    for roi, chans in rois.items():
        mne.viz.plot_compare_evokeds({c: evoked[subject][c] for c in coi},
                                     picks=chans,
                                     combine='mean',
                                     title=subject + ' target - nontarget',
                                     ylim=ylim,
                                     legend='upper right',
                                     show_sensors='lower right',
                                     axes=axs[ax], show=False
                                     )
        ax += 1
    report.add_figure(fig=fig, title='Waveform plots')
    plt.close(fig)

    # difference wave plot: all contrasts overlaid, one panel per ROI
    ylim = {'eeg': [-3.25, 3.5]}
    panels = 3
    fig, axs = plt.subplots(1, panels, figsize=(18, 6))
    ax = 0
    for roi, chans in rois.items():
        mne.viz.plot_compare_evokeds(diff[subject],
                                     picks=chans,
                                     combine='mean',
                                     title=subject + ' target - nontarget',
                                     ylim=ylim,
                                     legend='upper right',
                                     show_sensors='lower right',
                                     axes=axs[ax], show=False
                                     )
        ax += 1
    # Own title so this section is distinguishable from the waveform section.
    report.add_figure(fig=fig, title='Difference wave plots')
    plt.close(fig)

    # Measure components
    df_list = []

    for c, cp in components.items():
        tw_start = cp['t_min']
        tw_end = cp['t_max']
        tw_width = cp['tw_width']
        component_meas = cp['component_meas']
        # Latency used when no peak is searched for or none can be found.
        window_centre = np.median([tw_start, tw_end])

        for cond in conditions:
            cond_epochs = epochs[subject][cond]
            for roi, chans in rois.items():
                # Always derive the ROI data for *this* condition/ROI: its time
                # axis is needed below in every branch, including 'meana'.
                tmp_dat = evoked[subject][cond].copy().pick_channels(chans)
                if component_meas == 'meana':
                    # plain mean amplitude: window centred on the configured range
                    peak = np.array([np.nan, window_centre, np.nan])
                else:
                    # find peak amplitude in specified timewindow, among channels in ROI(s) of interest
                    try:
                        peak = tmp_dat.get_peak(tmin=tw_start,
                                                tmax=tw_end,
                                                mode=component_meas,
                                                )
                    except ValueError:
                        # e.g. no samples of the requested polarity in the
                        # window: fall back to the window centre
                        peak = np.array([np.nan, window_centre, np.nan])

                # sample indices of the measurement window centred on the peak latency
                peak_window = ((peak[1] - (tw_width / 2)),
                               (peak[1] + (tw_width / 2))
                               )
                idx_start, idx_stop = np.searchsorted(tmp_dat.times, peak_window)

                # Window mean for every trial x channel, scaled by 1e6
                # (volts -> microvolts, assuming data are stored in volts).
                # Flattening (trials, channels) row-major matches the tiled
                # channel labels.
                amplitudes = (cond_epochs.get_data(picks=chans)[:, :, idx_start:idx_stop]
                              .mean(axis=-1).flatten() * 1e6)
                df_list.append(pd.DataFrame({'participant_id': subject,
                                             'Condition': cond,
                                             'Component': c,
                                             'ROI': roi,
                                             'PeakLat': peak[1],
                                             'PeakChan': peak[0],
                                             'Channel': np.tile(chans, cond_epochs.selection.shape),
                                             'Amplitude': amplitudes,
                                             }))

    df = pd.concat(df_list, ignore_index=True)

    # remove outliers
    # cutoff for defining outliers, in SD -- the value loaded from the
    # Analysis section of the config
    z_thresh = outlier_thresh

    # Compute standard (z) scores within participant and component
    df['Peak.Ampl.z'] = (df.groupby(['participant_id', 'Component'])['Amplitude']
                           .transform(zscore))

    # Drop outliers based on z_thresh (bounds inclusive)
    df = df[df['Peak.Ampl.z'].between(-z_thresh, z_thresh)]

    df_all_list.append(df)

    # Ttests: independent-samples t test between the two conditions of each
    # contrast, per component and ROI, on the outlier-trimmed amplitudes
    df_list = []
    dfi = df.set_index(['participant_id', 'Component', 'ROI', 'Condition'])
    for component in components:
        for contr_name, contr in contrasts.items():
            for roi in rois:
                a = dfi.loc[(subject, component, roi, contr[0]), 'Amplitude']
                b = dfi.loc[(subject, component, roi, contr[1]), 'Amplitude']
                # conduct t test
                t, p = ttest_ind(a, b)
                df_list.append(pd.DataFrame({'participant_id': subject,
                                             'Component': component,
                                             'ROI': roi,
                                             'Contrast': contr_name,
                                             't': t.round(2),
                                             'p': p.round(4)
                                             },
                                            index=[0]
                                            )
                               )

    df_ttest = pd.concat(df_list, ignore_index=True)
    # FDR correction across this subject's tests only
    df_ttest['p (corr.)'] = multipletests(df_ttest['p'], alpha=.05, method='fdr_bh')[1].round(4)
    df_ttest_all_list.append(df_ttest)
    t_table = df_ttest.set_index(['Component', 'participant_id', 'Contrast', 'ROI']).to_html()
    report.add_html(t_table, title='t tests of contrasts (corrected using FDR)')

    # Write report
    report_name = report_path + '/' + subject + '.html'
    report.save(report_name, overwrite=True)

df_all = pd.concat(df_all_list, ignore_index=True)
df_ttest_all = pd.concat(df_ttest_all_list, ignore_index=True)

### Export Trimmed Data For Analysis in R

In [None]:
# generate 1 file/subject because the aggregated file is big and creates issues eg pushing to GitHub
# One feather file per subject: the combined table is large and awkward to
# handle as a single file (e.g. when pushing to GitHub).
for subj in subjects:
    out_dir = op.join(out_path, subj)
    subj_dir = Path(out_dir)
    if not subj_dir.exists():
        subj_dir.mkdir(parents=True)
    subj_trials = df_all[df_all['participant_id'] == subj]
    feather.write_feather(subj_trials, out_dir + '/' + subj + '_indiv_trials' + '.feather')

---
## Measure differences for each subject and component

The code first finds the peak negative value in the target-nontarget difference, for each component and ROI, for each individual. 

Then, it computes the average over a time window centered around the time of the component peak for that individual

In [None]:
# %%time

# For every component x subject x contrast x ROI: locate the peak of the
# difference wave (unless the component is measured as a plain mean), then
# average each ROI channel over a window centred on that latency.
df_list = []

for c, cp in components.items():
    tw_start = cp['t_min']
    tw_end = cp['t_max']
    tw_width = cp['tw_width']
    component_meas = cp['component_meas']
    # Latency used when no peak is searched for or none can be found.
    window_centre = np.median([tw_start, tw_end])

    for subj in subjects:
        for contr in contrasts:
            for roi, chans in rois.items():
                # Select this subject/contrast/ROI *before* branching so that
                # every branch measures the right data; otherwise the 'meana'
                # branch would read a stale object left by a previous
                # iteration or cell.
                tmp_dat = diff[subj][contr].copy().pick_channels(chans)
                if component_meas == 'meana':
                    # plain mean amplitude: no peak channel; only the latency
                    # (index 1) and channel (index 0) entries are read below
                    peak = np.array([np.nan, window_centre, np.nan])
                else:
                    # find peak amplitude in specified timewindow, among channels in ROI(s) of interest
                    try:
                        peak = tmp_dat.get_peak(tmin=tw_start,
                                                tmax=tw_end,
                                                mode=component_meas,
                                                )
                    except ValueError:
                        # e.g. no samples of the requested polarity in the
                        # window: fall back to the window centre
                        peak = np.array([np.nan, window_centre, np.nan])

                # sample indices of the measurement window centred on the peak latency
                peak_window = ((peak[1] - (tw_width / 2)),
                               (peak[1] + (tw_width / 2))
                               )
                idx_start, idx_stop = np.searchsorted(tmp_dat.times, peak_window)

                # One row per ROI channel; amplitudes scaled by 1e6
                # (volts -> microvolts, assuming data are stored in volts).
                amplitudes = (tmp_dat.get_data(picks=chans)[:, idx_start:idx_stop]
                              .mean(axis=-1).flatten() * 1e6)
                df_list.append(pd.DataFrame({'participant_id': subj,
                                             'Contrast': contr,
                                             'Component': c,
                                             'ROI': roi,
                                             'Amplitude': amplitudes,
                                             'PeakLat': peak[1],
                                             'PeakChan': peak[0],
                                             'Channel': chans}))

df_diff = pd.concat(df_list, ignore_index=True)

In [None]:
# Spot-check 12 randomly chosen rows of the difference-wave measurements.
df_diff.sample(12)

### Export Trimmed T-NT difference Data For Analysis in R

In [None]:
# generate 1 file/subject because the aggregated file is big and creates issues eg pushing to GitHub
# One feather file per subject: the combined table is large and awkward to
# handle as a single file (e.g. when pushing to GitHub).
for subj in subjects:
    out_dir = op.join(out_path, subj)
    subj_dir = Path(out_dir)
    if not subj_dir.exists():
        subj_dir.mkdir(parents=True)
    subj_diffs = df_diff[df_diff['participant_id'] == subj]
    feather.write_feather(subj_diffs, out_dir + '/' + subj + '_diff_T-NT' + '.feather')

## Examine distribution of (adaptive) mean amplitudes

In [None]:
# Histogram of the difference-wave amplitudes, drawn separately for each component.
df_diff.groupby(['Component'])['Amplitude'].hist(grid=False)
plt.show()

## Examine distribution of peak latencies

In [None]:
# Histogram of the peak latencies, drawn separately for each component.
df_diff.groupby(['Component'])['PeakLat'].hist(grid=False)
plt.show()

## Examine distribution of peak channels

In [None]:
# Histogram of the peak channel per component.
# NOTE(review): PeakChan presumably holds channel-name strings (or NaN when no
# peak was found) -- confirm that hist() handles this non-numeric column.
df_diff.groupby(['Component'])['PeakChan'].hist(grid=False, alpha=.8)
plt.show()

## Statistics

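We perform a *t*-test for each individual, between the two conditions of each contrast (e.g., target vs. nontarget), for each component and ROI, based on the single-trial measurements from above — i.e., mean amplitudes in a time window centred on the peak latency, at the channels within each ROI.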

### Aggregate (average) over channels within each ROI/Component/participant_id

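This table summarizes the measurements; note that the *t* tests below are computed on the trial-level values in `df_all`, not on these averages.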

In [None]:
# Mean of the numeric columns for each participant x component x ROI x condition.
# numeric_only=True skips the string columns (e.g. channel names), which
# cannot be averaged and make .mean() raise in pandas >= 2.0.
df_agg = (df_all
          .groupby(['participant_id', 'Component', 'ROI', 'Condition'])
          .mean(numeric_only=True))

df_agg

In [None]:
# Display the contrast definitions (label -> pair of condition names).
contrasts

In [None]:
# Spot-check 12 random rows of the indexed trial table.
# NOTE(review): `dfi` is whatever was assigned last -- the final subject's
# table from the report loop, or the all-subject table from the t-test cell
# below -- depending on the order in which cells were run.
dfi.sample(12)

In [None]:
%%time
# Independent-samples t test between the two conditions of every contrast,
# for each component x subject x ROI, with one FDR correction over all tests.
dfi = df_all.set_index(['participant_id', 'Component', 'ROI', 'Condition'])

ttest_rows = []
for component in components:
    for subj in subjects:
        for contr_name, contr in contrasts.items():
            for roi in rois:
                a = dfi.loc[(subj, component, roi, contr[0]), 'Amplitude']
                b = dfi.loc[(subj, component, roi, contr[1]), 'Amplitude']
                t, p = ttest_ind(a, b)
                ttest_rows.append({'participant_id': subj,
                                   'Component': component,
                                   'ROI': roi,
                                   'Contrast': contr_name,
                                   't': t.round(2),
                                   'p': p.round(4)})

df_ttest = pd.DataFrame(ttest_rows)
corrected_p = multipletests(df_ttest['p'], alpha=.05, method='fdr_bh')[1]
df_ttest['p (corr.)'] = corrected_p.round(4)
df_ttest.to_csv('indiv_subj_ttests.csv', index=False)

### See all t values

In [None]:
# Per-subject t-test tables stacked together; the 'p (corr.)' column here was
# FDR-corrected within each subject separately.
df_ttest_all.set_index([ 'Component', 'participant_id', 'Contrast', 'ROI'])

## Show only the significant t values (after FDR correction)

Using a corrected *p* threshold of .05

In [None]:
# Filter the same table the significance mask is computed from, so mask and
# rows line up one-to-one. (df_ttest_all is ordered subject-first while
# df_ttest is ordered component-first, so a mask from one cannot be applied
# to the other.)
df_ttest.set_index(['Component', 'participant_id', 'Contrast', 'ROI']).loc[multipletests(df_ttest['p'], alpha=.05, method='fdr_bh')[0], :]

In [None]:
# Share of all t tests that remain significant after FDR correction.
fdr_reject = multipletests(df_ttest['p'], alpha=.05, method='fdr_bh')[0]
sig_subjs = fdr_reject.sum()
pct_sig_subj = round(sig_subjs / len(df_ttest) * 100, 1)
print('Percentage of t tests showing significant effects:', str(pct_sig_subj), '%')
print('(corrected for multiple comparisions using false discovery rate)')