In [2]:
import pandas as pd
import numpy as np
from scipy.interpolate import UnivariateSpline
from scipy.interpolate import interp1d
from scipy import interpolate
import pickle
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


import torch
# Seed torch's global RNG with a fixed value and configure cuDNN
# (deterministic algorithms on, benchmark autotuner off) so that torch
# computations are repeatable from run to run.
# NOTE(review): none of the cells visible here actually use torch - confirm
# whether these settings are still needed in this notebook.
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
import torch.nn.functional as F
from torch import nn
from torchvision import transforms
from torch.autograd import Variable


In [3]:
######################### Read eeg data ##########################

# Location of the second-by-second EEG export (CSV file).
EEG_DATA_LOCATION="D:/Projects/Prognostic Predictions/Data/second by second data/High res aEEG_(2020-06-01T09.23.27.972)/output_all_long_502second by second.csv"

# Load the export; cells holding "N/A" or "Insuff. Data" are read as missing.
# Column headers are then normalised by turning every space into "_".
eeg_data=pd.read_csv(
    EEG_DATA_LOCATION,
    skiprows=0,
    na_values=["N/A","Insuff. Data"],
).rename(columns=lambda header: header.replace(" ","_"))

# Debug helper kept from the original: restrict to the first 10 time bins.
# eeg_data=eeg_data[eeg_data["Timebins"]<=10]

# Number of rows handled per chunk by the fixed-size loops in later cells.
# NOTE(review): presumably one row per second for 6 hours per patient
# (6 * 3600 = 21600) - confirm against the export.
Timebins=21600



In [4]:
# Cast Timebins to object so that the rolling sum below does not aggregate it.
eeg_data=eeg_data.astype({"Timebins":"object"},copy=False)

# Per patient, walk the rows in descending (PID, Timebins) order and take a
# rolling sum of aEEG over at most 21600 rows with min_periods=1. With that
# ordering each window covers the current row plus the rows that come after it
# in time, so the sum stays NaN only where the row and everything after it is
# missing - i.e. the NaNs left over are the ones towards the end of a recording.
#
# The column subset is passed as a list: selecting several groupby columns with
# a bare tuple (`groupby(...)["PID","Timebins","aEEG"]`) is deprecated since
# pandas 1.0 and raises in pandas 2.x, while the list form works on all versions.
# NOTE(review): the layout of the result (the `level_1` column and the column
# positions relied on by the next cell) depends on the pandas version - confirm
# against the version this notebook is pinned to.
temp1=(
    eeg_data.sort_values(by=["PID","Timebins"], ascending=False)
    .groupby(["PID"])[["PID","Timebins","aEEG"]]
    .rolling(min_periods=1, on="Timebins", window=21600)
    .sum()
    .reset_index()
)



  


In [5]:
# Replace every column from position 3 onward with its missing-value mask,
# so a remaining NaN in the rolling sum becomes True.
temp1.iloc[:,3:]=temp1.iloc[:,3:].isna()
# The masked aEEG column is now a flag: name it accordingly and discard the
# `level_1` column that reset_index() produced.
temp1=temp1.rename(columns={"aEEG":"Na_rows"}).drop(columns="level_1")

In [6]:
# Bring the columns of temp1 (including the Na_rows flag) onto the EEG rows via
# an inner join on patient and time bin; clashing temp1 columns get "_flag".
eeg_data=eeg_data.merge(temp1,how="inner",on=["PID","Timebins"],suffixes=("","_flag"))
# Timebins was object-typed for the rolling step; restore the integer dtype.
eeg_data=eeg_data.astype({"Timebins":"int"},copy=False)

##### Keep only patients with more than 4 non-missing aEEG values #######

# Non-missing aEEG count per patient, restricted to counts above 4 ...
eeg_row_flag=eeg_data.groupby(["PID"])["aEEG"].agg(["count"])
eeg_row_flag=eeg_row_flag.loc[eeg_row_flag["count"]>4]
# ... then an inner join drops every other patient; the helper `count`
# column is removed again straight away.
eeg_data=eeg_data.merge(eeg_row_flag,how="inner",on="PID").drop(columns="count")
# eeg_data.drop(columns="Na_rows",inplace=True)

In [7]:
# Work on the values of columns 2 and 3 of eeg_data: array column 0 is used as
# the spline x-axis, every further column is interpolated against it.
# NOTE(review): by position these are presumably Timebins and aEEG - confirm.
arr_to_interpol=eeg_data.iloc[:,2:4].values

# The rows are processed in consecutive chunks of `Timebins` rows; any
# incomplete chunk at the end is ignored.
# NOTE(review): this assumes each patient occupies exactly one chunk with x in
# ascending order - confirm upstream.
n_chunks=int(len(eeg_data)/Timebins)
for chunk_no in range(n_chunks):
    chunk_rows=slice(chunk_no*Timebins,(chunk_no+1)*Timebins)
    x_values=arr_to_interpol[chunk_rows,0]
    for value_col in range(1,int(arr_to_interpol.shape[1])):
        y_values=arr_to_interpol[chunk_rows,value_col]
        # Fit a degree-2 spline through the finite samples only ...
        finite_mask=np.isfinite(y_values)
        spline_repr=interpolate.splrep(x_values[finite_mask],y_values[finite_mask],k=2)
        # ... and evaluate it at every x of the chunk, overwriting the column.
        arr_to_interpol[chunk_rows,value_col]=interpolate.splev(x_values,spline_repr)

# Copy the frame and write the interpolated values back into column 3.
eeg_data_quadratic_spline=eeg_data.copy(deep=True)
eeg_data_quadratic_spline.iloc[:,3:4]=arr_to_interpol[:,1:]


In [8]:
# Variant of the interpolated frame in which every row flagged in Na_rows
# has its aEEG value forced to 0.0.
eeg_data_quadratic_spline_with_zeros=eeg_data_quadratic_spline.copy()
flagged_rows=eeg_data_quadratic_spline_with_zeros["Na_rows"]==True
eeg_data_quadratic_spline_with_zeros.loc[flagged_rows,"aEEG"]=0.0


In [9]:
# Load the pre-processed time-invariant patient table.
# The file is opened in a `with` block so the handle is closed deterministically
# (the previous `pickle.load(open(...))` never closed it).
# NOTE: pickle.load can execute arbitrary code while deserialising - only load
# pickle files that come from a trusted source.
with open(
    "D:/Projects/Prognostic Predictions/Proccessed data/Time_Invariant_data.pickle", "rb"
) as time_invariant_file:
    processed_time_invariant_data=pickle.load(time_invariant_file)

# processed_time_invariant_data=pd.concat((train_data,test_data),axis=0)

# Attach PID/outcome to every interpolated EEG row (default inner join on PID).
# Resulting column order: PID, outcome, Timebins, aEEG.
# NOTE(review): this uses eeg_data_quadratic_spline, not the
# eeg_data_quadratic_spline_with_zeros frame built in the previous cell, so the
# zero-filled variant is never used in the visible cells - confirm this is intended.
processed_data=processed_time_invariant_data[["PID","outcome"]].merge(eeg_data_quadratic_spline[["PID","Timebins","aEEG"]],on="PID")

In [10]:
# NumPy view of the merged frame for positional access in the next cells;
# by position the columns are 0=PID, 1=outcome, 2=Timebins, 3=aEEG.
processed_data_array=processed_data.to_numpy()

In [11]:

# Slope feature: absolute angle, in degrees, of the change in aEEG (column 3)
# between each record and the next one.
aeeg_values=processed_data_array[:,3].astype('float')
# Forward difference; the very last row has no successor, so 0 is used as a
# placeholder (it is overwritten with NaN below when the row count is a
# multiple of Timebins).
next_minus_current=np.append(aeeg_values[1:],0.0)-aeeg_values
# np.degrees converts with the exact value of pi; the earlier hand-written
# factor 180/3.14 overstated every angle by about 0.05 %, which matters because
# the angles are later compared against a fixed threshold.
slope_feature=np.abs(np.degrees(np.arctan(next_minus_current)))
# The last record of each patient has no next record (the difference would mix
# two patients), so mark it as NaN. Uses the shared Timebins constant instead
# of a second hard-coded 21600.
slope_feature[Timebins-1::Timebins]=np.nan
# Append the slope as a new last column (position 4) of the data array.
processed_data_array=np.concatenate((processed_data_array,slope_feature.reshape(-1,1)),axis=1)

In [12]:
SECONDS=3600  ### variable controls the level of aggregation

# One output row per block of SECONDS consecutive input rows; a trailing
# incomplete block is ignored.
n_windows=int(processed_data_array.shape[0]/SECONDS)
processed_data_with_features=np.empty((n_windows,5),dtype='O')
for i in range(n_windows):
    window=processed_data_array[i*SECONDS:(i+1)*SECONDS]
    aeeg_window=window[:,3]
    slope_window=window[:,4]
    # PID, taken from the first row of the window
    processed_data_with_features[i,0]=window[0,0]
    # 1-based index of the aggregated time bin, derived from the first row's Timebins
    processed_data_with_features[i,1]=np.floor((window[0,2]-1)/SECONDS)+1
    # fraction of points above 1.5 * the window median
    processed_data_with_features[i,2]=np.count_nonzero(aeeg_window>np.median(aeeg_window)*1.5)/SECONDS
    # fraction of points above 1.5 * the window mean
    processed_data_with_features[i,3]=np.count_nonzero(aeeg_window>np.mean(aeeg_window)*1.5)/SECONDS
    # fraction of points whose slope angle is greater than 75
    processed_data_with_features[i,4]=np.count_nonzero(slope_window>75)/SECONDS

  del sys.path[0]


In [13]:
# Convert the per-window feature array into a pandas data frame.
# NOTE: this rebinds processed_data_with_features from an ndarray to a
# DataFrame, so the cell is meant to run once after the aggregation cell.
processed_data_with_features=pd.DataFrame(processed_data_with_features,columns=['PID','Timebins','percent_above_1.5*median','percent_above_1.5*mean','steep_slopes'])
# Pivot to one row per patient: every (feature, Timebins) pair becomes a column
# named "<feature><Timebins>".
processed_data_with_features=processed_data_with_features.set_index(["PID","Timebins"]).unstack()
processed_data_with_features.columns=[str(i[0])+str(i[1]) for i in processed_data_with_features.columns]
processed_data_with_features=processed_data_with_features.reset_index()

# The values come from an object array; cast every feature column (all but PID) to float.
processed_data_with_features=processed_data_with_features.astype(
    {column: "float" for column in processed_data_with_features.columns if column!="PID"}
)

# Store as a pickle file. The `with` block guarantees the file is flushed and
# closed; the previous `pickle.dump(..., open(..., "wb"))` left the handle open.
pickle_file="D:/Projects/Prognostic Predictions/Proccessed data/EEG_data_SECONDS_with aditional_features.pickle"
with open(pickle_file, "wb") as pickle_handle:
    pickle.dump(processed_data_with_features, pickle_handle)