In [1]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from statistics import NormalDist
import glob
from collections import defaultdict

from scipy.stats import ttest_ind, f_oneway

from CrpStats import *

In [2]:

# Load the per-bipole SOZ label table. The CSV has no header row, so the
# column names are assigned explicitly after reading.
label_f = '~/Ephys/Data/all_pats_bipole_soz_labels.csv'
label_df = pd.read_csv(label_f, header=None)
label_df.columns = ['subj','bipole','label']

# Per-patient epilepsy subtype table.
dx_df = pd.read_csv("../../Data/Pat_Epilepsy_Subtype.csv")

# The preview must be the LAST expression of the cell: a notebook only renders
# the final expression, so a mid-cell `.head()` is silently discarded.
label_df.head(4)

In [3]:
# Partition patient IDs on the "Pure mTLE" flag: rows where the flag equals 1
# versus every other row (including rows where the flag is missing).
is_pure_mtle = dx_df["Pure mTLE"] == 1
TLE_IDS = dx_df.loc[is_pure_mtle, "PatID"].values
NON_TLE_IDS = dx_df.loc[~is_pure_mtle, "PatID"].values

In [4]:

# Re-read the raw label table (no header row in the CSV) so this cell starts
# from an untouched frame.
label_f = '~/Ephys/Data/all_pats_bipole_soz_labels.csv'
label_df = pd.read_csv(label_f, header=None)
label_df.columns = ['subj','bipole','label']

# split_bipole and map_label are not defined in this notebook; presumably they
# come from the `CrpStats` star import -- confirm their exact behaviour there.
label_df = split_bipole(label_df)
label_df['label'] = label_df['label'].apply(map_label)

# Strip every space out of the bipole names.
label_df['bipole'] = [name.replace(" ", "") for name in label_df['bipole']]

# Collapse the labels to two classes: 'SOZ' stays 'SOZ', anything else -> 'NZ'.
relabel = defaultdict(lambda: "NZ", {'SOZ': 'SOZ'})
label_df['label'] = label_df['label'].apply(lambda lab: relabel[lab])



In [5]:
# Inspect the cleaned label table (labels have been collapsed to SOZ / NZ).
label_df

Unnamed: 0,subj,bipole,label,contact
0,Epat31,AH3-AH4,SOZ,AH3
1,Epat31,AH4-AH5,SOZ,AH4
2,Epat31,AH5-AH6,SOZ,AH5
3,Epat31,PH1-PH2,SOZ,PH1
4,Epat31,PH2-PH3,SOZ,PH2
...,...,...,...,...
7048,Spat55,LCM6-LCM7,NZ,LCM7
7049,Spat55,LCM8-LCM9,NZ,LCM9
7050,Spat55,LCM9-LCM10,NZ,LCM10
7051,Spat55,LCM10-LCM11,NZ,LCM11


In [6]:
# Smoke test on a single subject (Epat26): read that subject's stim CSV and run
# merge_label against the label table.
tst = '/mnt/ernie_main/Ghassan/ephys/data/Epat26/Epat26_stim.csv'
tst_df = pd.read_csv(tst)
# NOTE(review): merge_label is not defined in this notebook (presumably from
# CrpStats); the arguments suggest it matches tst_df['resp_reg'] against
# label_df['bipole'] -- confirm. `tmp` is not read by any cell visible here.
tmp = merge_label(tst_df,label_df, 'resp_reg', 'bipole')


## Aggregating Results File
This stage reads through the results data on ernie and loads every subject's results CSV (`*_crp.csv`) into one dataframe.
This step also labels the stim and response locations as SOZ or NZ using `label_df` from above (all non-SOZ labels, e.g. NIZ, PZ, EZ, were collapsed to NZ there). Stim data for any unlabelled region is thrown out.

In [7]:
# Collect every per-subject CRP results file. glob returns matches in
# arbitrary order, so sort to make the processing order reproducible.
RES_FILES = sorted(glob.glob('/mnt/ernie_main/Ghassan/ephys/data/*pat*/*pat*_crp.csv'))
print(len(RES_FILES))

# No label for this file currently, so it is excluded. Filtering (instead of
# list.remove) means the cell does not raise ValueError when the file is
# missing from the glob result.
UNLABELLED_RES_FILES = {'/mnt/ernie_main/Ghassan/ephys/data/Spat56/Spat56_crp.csv'}
RES_FILES = [path for path in RES_FILES if path not in UNLABELLED_RES_FILES]
print(len(RES_FILES))


25
24


In [8]:
# Split each file's `stim_resp` column ("<stim>_<resp>_<amplitude>") into
# stim_reg / resp_reg / ma and write the augmented table back to the same path.
# Presumably agg_subject_results (not defined in this notebook) reads these
# columns from disk -- confirm in CrpStats.
for f in RES_FILES:
    tst_df = pd.read_csv(f)
    # Split once per row instead of three times.
    stim_resp_parts = [s.split("_") for s in tst_df.stim_resp]
    tst_df['stim_reg'] = [parts[0] for parts in stim_resp_parts]
    tst_df['resp_reg'] = [parts[1] for parts in stim_resp_parts]
    tst_df['ma'] = [parts[2] for parts in stim_resp_parts]
    # index=False: the file is overwritten in place on every run, and writing
    # the RangeIndex back each time adds another 'Unnamed: 0*' column to it.
    tst_df.to_csv(f, index=False)
res_df = agg_subject_results(RES_FILES, label_df.drop_duplicates())


24 subjects, total of 151996 trials, dropped: 34576


### Loading Euclidean Distances

The `all_pats_euc_dist_list.csv` file is a very large CSV listing every bipole pair and the Euclidean distance between them. This matters for our analysis for two reasons:
1. In the ISH paper, contacts within 20 mm of the stimulation electrode are considered to be overridden by stim artifact, so we need to know which ones to discount.
2. We can use distance from the stimulation source to understand how far certain effects persist. For example, we expect electrodes closer to the stimulation site to have higher explained variance (conduction effect plus residual stim artifact), so plotting explained variance against distance should help disentangle these confounds.

In [9]:
# Pairwise bipole distance table for all patients; the CSV has no header row,
# so the column names are assigned after reading.
dist_f = '/mnt/ernie_main/000_Data/SEEG/SEEG_EyesClosed_RestingState/labels/all_pats_euc_dist_list.csv'
dist_df = pd.read_csv(dist_f, header=None)
dist_df.columns = ['subj', 'bipole1', 'bipole2','dist']

# Keep only the rows whose subject also appears in res_df.
subj_set = set(res_df['subj'])
dist_df = dist_df.loc[dist_df['subj'].isin(subj_set)]

In [10]:
# Inspect the distance table after restricting it to subjects present in res_df.
dist_df

Unnamed: 0,subj,bipole1,bipole2,dist
220081,Epat26,LAC1-LAC2,LAC1-LAC2,0.0000
220082,Epat26,LAC1-LAC2,LP1-LP2,65.8629
220083,Epat26,LAC1-LAC2,LP2-LP3,65.3933
220084,Epat26,LAC1-LAC2,LTP1-LTP2,66.3953
220085,Epat26,LAC1-LAC2,LTP2-LTP3,66.1579
...,...,...,...,...
667058,Spat55,RAH12-RAH13,RAH8-RAH9,14.1735
667059,Spat55,RAH12-RAH13,RAH9-RAH10,10.5745
667060,Spat55,RAH12-RAH13,RAH10-RAH11,7.0017
667061,Spat55,RAH12-RAH13,RAH11-RAH12,3.5366


In [24]:
## Distance lookup keyed by "<subj>-<bipoleA>-<bipoleB>". Both orderings of every
## pair are stored (forward keys first, then reversed keys), and any key that is
## not in the table resolves to -1.
dist_map = defaultdict(lambda: -1)
pair_rows = dist_df[['subj','bipole1','bipole2','dist']].values
dist_map.update((f'{subj}-{bip1}-{bip2}', d) for subj, bip1, bip2, d in pair_rows)
dist_map.update((f'{subj}-{bip2}-{bip1}', d) for subj, bip1, bip2, d in pair_rows)

# Attach the stim -> response distance to a deep copy of the aggregated results,
# leaving res_df itself untouched.
resdist_df = res_df.copy(deep=True)
resdist_df['dist'] = [
    dist_map[f"{subj}-{stim_reg}-{resp_reg}"]
    for subj, stim_reg, resp_reg in resdist_df[['subj','stim_reg','resp_reg']].values
]

# Both names refer to the same frame.
tst_df = resdist_df

In [27]:
# Distinct stim-site labels present before filtering.
set(resdist_df['stim_reg_label'])

{'NZ', 'SOZ'}

##### Filtering Down on the SPES Dataset
Two criteria here:
1. Distance from the stim electrode: ISH used 20 mm to ensure no stim artifact. I used 5 mm in my initial pass to be most permissive; the filter below currently uses the 20 mm cutoff.
2. Trial parameters: only 3 mA trials are used. Most subcortical structures are not probed above 3 mA for safety reasons, so this is the highest amperage at which we can assess most structures.

In [28]:
# Filter conditions
MIN_STIM_DIST_MM = 20    # 20mm is in line with ISH
STIM_AMPLITUDE = '3mA'   # only trials whose `ma` value matches are kept

# Pairs missing from the distance table were given dist == -1 upstream, so the
# distance cut drops them as well.
keep_trial = (resdist_df['dist'] > MIN_STIM_DIST_MM) & (resdist_df['ma'] == STIM_AMPLITUDE)

# .copy(): make the filtered result an independent frame so that columns added
# to it later are not written onto a slice of the unfiltered frame
# (SettingWithCopyWarning).
resdist_df = resdist_df[keep_trial].copy()
#resdist_df = resdist_df[resdist_df.TR > 20]
print(f"New shape for dataframe: {resdist_df.shape}")

New shape for dataframe: (43476, 526)


In [29]:
# Distinct stim-site labels that remain after filtering.
set(resdist_df['stim_reg_label'])

{'NZ', 'SOZ'}

In [15]:
# The leading 'Unnamed: 0.2' / 'Unnamed: 0.1' / 'Unnamed: 0' columns are left in
# the frame; the columns used here are selected by name ('0' .. '511').
#resdist_df = resdist_df.drop(columns=resdist_df.columns[0:3])
crp_cols = [str(col_idx) for col_idx in range(512)]
crp_values = resdist_df[crp_cols].values

# Raw (un-normalised) dot product between every pair of rows; the result is a
# square matrix with one row and one column per row of resdist_df.
sim = crp_values @ crp_values.T

In [32]:
# Per-row pair label "<stim label>_<resp label>".
resdist_df['stim_resp_label'] = [
    stim_lab + "_" + resp_lab
    for stim_lab, resp_lab in zip(resdist_df['stim_reg_label'].values,
                                  resdist_df['resp_reg_label'].values)
]
set(resdist_df['stim_resp_label'])

{'NZ_NZ', 'NZ_SOZ', 'SOZ_NZ', 'SOZ_SOZ'}

In [33]:
# Wrap the dot-product matrix in a frame whose rows and columns both carry the
# per-row stim/resp pair label (labels repeat, so the index is not unique).
pair_labels = resdist_df['stim_resp_label'].values
sim_df = pd.DataFrame(data=sim, index=pair_labels, columns=pair_labels)

In [38]:
# Persist the labelled dot-product matrix as CSV (index labels included).
sim_out_f = "/mnt/ernie_main/Ghassan/ephys/data/crp_similarity.csv"
sim_df.to_csv(sim_out_f)
