In [1]:
import os
from itertools import product
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from statistics import NormalDist
import glob
from collections import defaultdict

from scipy.stats import ttest_ind, f_oneway

from CrpStats import *

In [2]:

label_f = '~/Ephys/Data/all_pats_bipole_soz_labels.csv'
label_df = pd.read_csv(label_f, header=None)
label_df.columns = ['subj','bipole','label']
label_df.head(4)
dx_df = pd.read_csv("../../Data/Pat_Epilepsy_Subtype.csv")

In [3]:
TLE_IDS = dx_df[dx_df["Pure mTLE"] ==1].PatID.values
NON_TLE_IDS = dx_df[dx_df["Pure mTLE"] !=1].PatID.values

In [4]:

label_f = '~/Ephys/Data/all_pats_bipole_soz_labels.csv'
label_df = pd.read_csv(label_f, header=None)
label_df.columns = ['subj','bipole','label']



label_df = split_bipole(label_df)

label_df.label = label_df.label.apply(map_label)
label_df['bipole'] = label_df.bipole.apply(lambda x: x.replace(" ", ""))

relabel = defaultdict(lambda:"NZ")
relabel['SOZ'] = 'SOZ' 
label_df['label'] = [relabel[x] for x in label_df.label.values]



In [5]:
label_df

Unnamed: 0,subj,bipole,label,contact
0,Epat31,AH3-AH4,SOZ,AH3
1,Epat31,AH4-AH5,SOZ,AH4
2,Epat31,AH5-AH6,SOZ,AH5
3,Epat31,PH1-PH2,SOZ,PH1
4,Epat31,PH2-PH3,SOZ,PH2
...,...,...,...,...
7048,Spat55,LCM6-LCM7,NZ,LCM7
7049,Spat55,LCM8-LCM9,NZ,LCM9
7050,Spat55,LCM9-LCM10,NZ,LCM10
7051,Spat55,LCM10-LCM11,NZ,LCM11


In [6]:
tst = '/mnt/ernie_main/Ghassan/ephys/data/Epat26/Epat26_stim.csv'
tst_df = pd.read_csv(tst)
tmp = merge_label(tst_df,label_df, 'resp_reg', 'bipole')


## Aggregating Results File
This stage reads through the results data on ernie and loads all stim csvs into one dataframe. 
This portion also labels the stim location as SOZ, NIZ, PZ, EZ, using label_df from above. For any regions unlabelled, the stim data is thrown out. 

In [7]:
RES_FILES = glob.glob('/mnt/ernie_main/Ghassan/ephys/data/*pat*/*pat*_crp.csv')
print(len(RES_FILES))
RES_FILES.remove( '/mnt/ernie_main/Ghassan/ephys/data/Spat56/Spat56_crp.csv')
#no label for file currently
print(len(RES_FILES))


25
24


In [8]:
for f in RES_FILES:
    tst_df = pd.read_csv(f)
    tst_df['stim_reg'] = tst_df.stim_resp.apply(lambda x: x.split("_")[0])
    tst_df['resp_reg'] = tst_df.stim_resp.apply(lambda x: x.split("_")[1])
    tst_df['ma'] = tst_df.stim_resp.apply(lambda x: x.split("_")[2])
    tst_df.to_csv(f)
res_df = agg_subject_results(RES_FILES, label_df.drop_duplicates())


24 subjects, total of 151996 trials, dropped: 34576


### Loading Euclidean Distances

The all_pats_euclidean contains a truly massive csv of all bipole pairs and their calculated euclidean distance. This will become important in our analysis for 2 reasons:
1. In the ISH paper, we consider that contacts within 20mm of the stimulation electrode are overriden with stim artifact, thus we need to know to discount them.
2. We can use distance from stimulation source to understand the persistence of certain effects. For example, we expect that electrodes closer to the stimulation site will likely have higher explained variance (conduction effect + stim artifact that remains). So plotting explained variance over distance should help to disentangle some of these confounds

In [9]:
dist_df = pd.read_csv('/mnt/ernie_main/000_Data/SEEG/SEEG_EyesClosed_RestingState/labels/all_pats_euc_dist_list.csv',header=None)
dist_df.columns = ['subj', 'bipole1', 'bipole2','dist']
subj_set = set(res_df.subj)
dist_df = dist_df[dist_df.subj.isin(subj_set)]

In [10]:
dist_df

Unnamed: 0,subj,bipole1,bipole2,dist
220081,Epat26,LAC1-LAC2,LAC1-LAC2,0.0000
220082,Epat26,LAC1-LAC2,LP1-LP2,65.8629
220083,Epat26,LAC1-LAC2,LP2-LP3,65.3933
220084,Epat26,LAC1-LAC2,LTP1-LTP2,66.3953
220085,Epat26,LAC1-LAC2,LTP2-LTP3,66.1579
...,...,...,...,...
667058,Spat55,RAH12-RAH13,RAH8-RAH9,14.1735
667059,Spat55,RAH12-RAH13,RAH9-RAH10,10.5745
667060,Spat55,RAH12-RAH13,RAH10-RAH11,7.0017
667061,Spat55,RAH12-RAH13,RAH11-RAH12,3.5366


In [11]:
## Create a dictionary which takes in a subject and 2 contacts and maps those values 
## to a distance

dist_map = defaultdict(lambda: -1)
dist_map.update({f'{subj}-{bip1}-{bip2}':d for subj,bip1,bip2,d in dist_df[['subj','bipole1','bipole2','dist']].values})
dist_map.update({f'{subj}-{bip2}-{bip1}':d for subj,bip1,bip2,d in dist_df[['subj','bipole1','bipole2','dist']].values})

tst_df = res_df.copy(deep=True)
tst_df['dist'] = [dist_map[f"{subj}-{stim_reg}-{resp_reg}"] for subj,stim_reg,resp_reg in tst_df[['subj','stim_reg','resp_reg']].values]

resdist_df = tst_df

In [12]:
set(resdist_df.stim_reg_label)

{'NZ', 'SOZ'}

##### Filtering Down on the SPES Dataset
2 criteria here: 
1. Distance from the stim electrode - ISH used 20mm to ensure no stim artifact, I used 5 mm in my initial pass to be most permissive
2. Trial parameters : Only using 3mA trials. Most subcortical structures are not probed above 3mA for safety concerns. So this is the highest amperage we can assess most structures at. 

In [13]:
#Filtera conditions
resdist_df = resdist_df[resdist_df.dist > 20] #20mm is in line with ISH
resdist_df = resdist_df[resdist_df.ma == '3mA']
#resdist_df = resdist_df[resdist_df.TR > 20]
print(f"New shape for dataframe: {resdist_df.shape}")

New shape for dataframe: (43476, 530)


In [14]:
resdist_df.columns

Index(['Unnamed: 0.7', 'Unnamed: 0.6', 'Unnamed: 0.5', 'Unnamed: 0.4',
       'Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0',
       'global_index', 'subj',
       ...
       '509', '510', '511', 'stim_reg', 'resp_reg', 'mA', 'ma',
       'resp_reg_label', 'stim_reg_label', 'dist'],
      dtype='object', length=530)

In [18]:
subj_dict = dict()
dim=0
for subj_id in set(resdist_df.subj):
    df = resdist_df[resdist_df.subj == subj_id]
    crp_values = df[[str(i) for i in range(512)]].values
    sim = np.dot(crp_values, crp_values.T)
    dim += sim.shape[0]
    subj_dict[subj_id] = sim 
print(f"total rows = {dim}")

total rows = 43476


In [19]:
resdist_df

Unnamed: 0.8,Unnamed: 0.7,Unnamed: 0.6,Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,global_index,subj,...,509,510,511,stim_reg,resp_reg,mA,ma,resp_reg_label,stim_reg_label,dist
323,323,323,323,323,323,323,323,323,323,Epat26,...,0.0,0.0,0.0,LIT11-LIT12,LA2-LA3,3mA,3mA,NZ,NZ,31.7971
324,324,324,324,324,324,324,324,324,324,Epat26,...,0.0,0.0,0.0,LIT11-LIT12,LA3-LA4,3mA,3mA,NZ,NZ,30.8061
325,325,325,325,325,325,325,325,325,325,Epat26,...,0.0,0.0,0.0,LIT11-LIT12,LA4-LA5,3mA,3mA,NZ,NZ,30.1873
326,326,326,326,326,326,326,326,326,326,Epat26,...,0.0,0.0,0.0,LIT11-LIT12,LA6-LA7,3mA,3mA,NZ,NZ,30.4215
327,327,327,327,327,327,327,327,327,327,Epat26,...,0.0,0.0,0.0,LIT11-LIT12,LAC1-LAC2,3mA,3mA,NZ,NZ,76.9833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9369,9369,9369,9369,9369,9369,9369,9369,9369,9369,Spat55,...,0.0,0.0,0.0,LCM10-LCM11,RAH12-RAH13,3mA,3mA,NZ,NZ,112.0913
9370,9370,9370,9370,9370,9370,9370,9370,9370,9370,Spat55,...,0.0,0.0,0.0,LCM10-LCM11,RAH2-RAH3,3mA,3mA,NZ,NZ,86.2346
9371,9371,9371,9371,9371,9371,9371,9371,9371,9371,Spat55,...,0.0,0.0,0.0,LCM10-LCM11,RAH8-RAH9,3mA,3mA,NZ,NZ,101.4144
9372,9372,9372,9372,9372,9372,9372,9372,9372,9372,Spat55,...,0.0,0.0,0.0,LCM10-LCM11,RPH12-RPH13,3mA,3mA,NZ,NZ,114.4181


In [20]:
resdist_df['stim_resp_label'] = [resdist_df.stim_reg_label.values[i]\
                                  +"_"+resdist_df.resp_reg_label.values[i] \
                                 for i in range(resdist_df.shape[0])]
set(resdist_df['stim_resp_label'])

{'NZ_NZ', 'NZ_SOZ', 'SOZ_NZ', 'SOZ_SOZ'}

In [None]:
sim_df = pd.DataFrame(data=sim, columns=resdist_df['stim_resp_label'].values,\
                       index=resdist_df['stim_resp_label'].values)

In [22]:
subj_sim = dict()
for subj_id in set(resdist_df.subj):
    df = resdist_df[resdist_df.subj ==subj_id]
    labels = df['stim_resp_label'].values
    subj_sim[subj_id] = (subj_dict[subj_id], labels)

In [23]:
flat_df_list = list()

for key in subj_sim:
    subj_id = key

    #access tuple of matrix, labels for a given subject, then ignore top triangle of matrix
    mat, labels = subj_sim[subj_id] 
    mat[np.triu_indices(mat.shape[0])] = np.nan 

    #cartesian product of labels for a given subject's stim_response matrix, will serve as index for flattened matric
    cart_prods = list(product(labels, labels))
    flat_labels = ["-".join(tup) for tup in cart_prods]

    #flatten a given subject's stim_response matrix
    flat_mat = mat.flatten()

    #combine flattened matrix and flattened labels array with subj_id to create dataframe
    flat_df = pd.DataFrame()
    flat_df['stim_resp'] = flat_labels
    flat_df['dot_sim'] = flat_mat
    flat_df['subj_id'] = subj_id

    #append this subject's dataframe to flat_df_list
    flat_df_list.append(flat_df)

In [27]:
#dataframe with columns for flattened dot product similarity metrics, cartesian product stim-response pair labels, and subject id for all subjects
mega_df = pd.concat(flat_df_list).dropna()

In [30]:
mega_df.head(5  )

Unnamed: 0,stim_resp,dot_sim,subj_id
1306,NZ_NZ-NZ_NZ,0.252564,Spat49
2612,NZ_NZ-NZ_NZ,-0.096296,Spat49
2613,NZ_NZ-NZ_NZ,0.407681,Spat49
3918,NZ_NZ-NZ_NZ,-0.13067,Spat49
3919,NZ_NZ-NZ_NZ,-0.337374,Spat49


In [32]:
crp_mean_df = mega_df.groupby(['subj_id', 'stim_resp']).mean()

In [34]:
crp_mean_df.reset_index().to_csv('/mnt/ernie_main/Ghassan/ephys/data/crp_dotprod_mean.csv')

In [20]:
#sim_df.to_csv("/mnt/ernie_main/Ghassan/ephys/data/crp_similarity.csv")
