In [1]:
import pandas as pd
import numpy as np
from scipy.interpolate import UnivariateSpline
from scipy.interpolate import interp1d
from scipy import interpolate
import pickle
from sklearn.feature_selection import RFECV

In [2]:

######################### Read eeg data ##########################

# Load the long-format EEG trend export. The strings "N/A" and "Insuff. Data"
# are parsed as missing values.
# NOTE(review): absolute local path; consider a configurable data directory.
eeg_data=pd.read_csv("D:/Projects/Prognostic Predictions/ML analysis/20+ trends for manuscript_(2019-07-17T18.13.11.597)/output_all_long_472.csv",
                     skiprows=0,na_values=["N/A","Insuff. Data"])
# Replace spaces in the column names with underscores.
eeg_data.columns=[i.replace(" ","_")for i in eeg_data.columns]
# Number of time bins kept per record; rows with a larger Timebins value are discarded.
Timebins=12
eeg_data=eeg_data[eeg_data["Timebins"]<=Timebins]

# Timebins=48

##### Creating flags to distinguish between NaNs in the middle and ones towards the end #######

eeg_data=eeg_data.astype({"Timebins":"object"},copy=False)  # Timebins is cast to object so that it is not aggregated by the rolling sum below

# Rows are sorted in descending (PID, Timebins) order and a rolling sum with
# min_periods=1 is taken per PID, so a sum is NaN only while every Alpha_sum seen
# so far in that order has been NaN.
# NOTE(review): window=48 is hard-coded although Timebins is 12 above — confirm.
# NOTE(review): the groupby selection uses a bare tuple of column names; newer
# pandas releases expect a list here — confirm against the pandas version in use.
temp1=eeg_data.sort_values(by=["PID","Timebins"], ascending=False).groupby(["PID"])["PID","Timebins","Alpha_sum"]\
                       .rolling(min_periods=1, on="Timebins",window=48).sum().reset_index()  # NaNs remaining after this step are the ones towards the end of a record

temp1.iloc[:,3:]=temp1.iloc[:,3:].apply(lambda x : x.isna()) # Turn every column from position 3 onwards into a boolean "is NaN" flag
temp1.rename(columns={"Alpha_sum":"Na_rows"},inplace=True) 
temp1.drop(columns="level_1",inplace=True)

##### Merging with the original data to get the Na_rows flag #######

# Inner join on (PID, Timebins); columns of temp1 that clash with eeg_data get the "_flag" suffix.
eeg_data=eeg_data.merge(temp1,"inner",on=["PID","Timebins"],suffixes=("","_flag"))
eeg_data=eeg_data.astype({"Timebins":"int"},copy=False) ## Changing Timebins back to int

##### Keeping only PIDs with more than 4 non-missing Alpha_sum values #######

# "count" is the number of non-missing Alpha_sum values per PID.
# NOTE(review): the original heading said "less than 4" but the filter is a strict
# ">4", so PIDs with exactly 4 values are removed as well — confirm the intended threshold.
eeg_row_flag=eeg_data.groupby(["PID"])["Alpha_sum"].agg(["count"])
eeg_row_flag=eeg_row_flag[eeg_row_flag["count"]>4]
eeg_data=eeg_data.merge(eeg_row_flag,'inner',on="PID")
eeg_data.drop(columns="count",inplace=True)
# eeg_data=eeg_data.merge(eeg_data.groupby(["PID"])["Timestamp"].min().reset_index()
# ,"inner",on=["PID"],suffixes=("","_start_Timestamp"))
# eeg_data.drop(columns="Timestamp",inplace=True)
# The Na_rows flag built above is dropped here without having been used in this cell.
eeg_data.drop(columns="Na_rows",inplace=True)

  from ipykernel import kernelapp as app


In [3]:
######################### Data Interpolation ##########################

######################### Interpolation through quadratic spline (k=2) ##########################

# Work on a plain array holding every column from position 2 onwards. Column 0 of
# this array is used as the x axis; columns 1.. are the series that get filled.
# NOTE(review): presumably positions 0 and 1 of eeg_data are PID and Timestamp and
# position 2 is Timebins — confirm against the CSV layout.
# NOTE(review): the row slicing below assumes every record occupies exactly
# `Timebins` consecutive rows — confirm.
arr_to_interpol=eeg_data.iloc[:,2:].values
n_blocks=int(len(eeg_data)/Timebins)
n_columns=int(arr_to_interpol.shape[1])
for block in range(n_blocks):
    rows=slice(block*Timebins,(block+1)*Timebins)
    x_axis=arr_to_interpol[rows,0]
    for column in range(1,n_columns):
        series=arr_to_interpol[rows,column]
        # Fit the spline on the finite observations only ...
        observed=np.isfinite(series)
        spline=interpolate.splrep(x_axis[observed],series[observed],k=2)
        # ... then evaluate it at every x position, overwriting the whole series.
        arr_to_interpol[rows,column]=interpolate.splev(x_axis,spline)

# Copy the frame and write the fitted values back into the columns from position 3 onwards.
eeg_data_quadratic_spline=eeg_data.copy(deep=True)
eeg_data_quadratic_spline.iloc[:,3:]=arr_to_interpol[:,1:]

# We keep the spline for the NaNs that appear between records and build multiple
# datasets using other methods for the NaNs that appear at the end

######################### Interpolation through hot deck ##########################

# Forward-fill every column within each PID: a missing value takes the last
# observed value of the same PID, and leading missing values stay missing.
# group_keys=False keeps the original row index on the result so the column
# assignment aligns row by row; Series.ffill() replaces the deprecated
# fillna(method='ffill') spelling.
eeg_data_hot_deck=pd.DataFrame().reindex_like(eeg_data)
for i in eeg_data.columns:
    eeg_data_hot_deck[i] = eeg_data.groupby("PID", group_keys=False)[i].apply(lambda x : x.ffill())


######################### Interpolation through Random hot deck ##########################

# Fixed seed so that the randomly imputed dataset can be regenerated exactly.
RANDOM_HOT_DECK_SEED=0
random_hot_deck_rng=np.random.RandomState(RANDOM_HOT_DECK_SEED)


def _fill_from_random_donors(values):
    """Fill the missing entries of one PID's column with randomly drawn observed entries.

    Parameters
    ----------
    values : pandas.Series
        One column restricted to the rows of a single PID.

    Returns
    -------
    pandas.Series
        Same index as ``values``. Each missing entry is replaced by a value drawn
        (with replacement) from the non-missing entries of ``values``. The input
        is returned unchanged when nothing is missing or when there is no
        observed value to draw from.
    """
    missing=values.isna()
    donors=values[~missing]
    if not missing.any() or donors.empty:
        return values
    filled=values.copy()
    # Draw exactly one donor per missing row and assign by mask, so the result
    # does not depend on what the frame's index labels look like.
    filled[missing]=random_hot_deck_rng.choice(donors.values, size=int(missing.sum()))
    return filled


eeg_data_random_hot_deck=pd.DataFrame().reindex_like(eeg_data)
for i in eeg_data.columns:
    # group_keys=False keeps the original row index so the assignment aligns row by row.
    eeg_data_random_hot_deck[i] = eeg_data.groupby("PID", group_keys=False)[i].apply(_fill_from_random_donors)


In [4]:
# Store the long-format time-series data (spline and random hot deck) for the RNNs.
pickle_file="D:/Projects/Prognostic Predictions/Proccessed data/EEG_time_series_data.pickle"
# The context manager closes the file even if pickling fails part-way through.
with open(pickle_file, "wb") as pickle_handle:
    pickle.dump([eeg_data_quadratic_spline,eeg_data_random_hot_deck], pickle_handle)

In [5]:
# Every column except the raw Timestamp takes part in the rolling mean.
columns_to_smooth=set(eeg_data_quadratic_spline.columns)-set(["Timestamp"])
# A set has no stable iteration order for strings, so the column order of the
# smoothed frames could change from one kernel session to the next. Selecting
# the columns in the frame's own order keeps the output layout deterministic.
ordered_columns_to_smooth=[c for c in eeg_data_quadratic_spline.columns if c in columns_to_smooth]


def _rolling_mean_per_patient(frame, columns, window=4):
    """Return the per-PID rolling mean of ``columns`` along ``Timebins``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Long-format data containing ``PID``, ``Timebins`` and ``columns``.
    columns : list of str
        Columns selected from the grouped frame, in output order.
    window : int, default 4
        Number of rows in the rolling window; ``min_periods=1`` means the first
        rows of each PID are averaged over the rows available so far.

    Returns
    -------
    pandas.DataFrame
        The rolling means with the group index turned back into columns and the
        helper ``level_1`` column removed.
    """
    return (frame.groupby(["PID"])[columns]
            .rolling(min_periods=1, on="Timebins", window=window).mean()
            .reset_index().drop(columns=["level_1"]))


eeg_data_quadratic_spline_smoothed=_rolling_mean_per_patient(eeg_data_quadratic_spline, ordered_columns_to_smooth)
eeg_data_hot_deck_smoothed=_rolling_mean_per_patient(eeg_data_hot_deck, ordered_columns_to_smooth)
eeg_data_random_hot_deck_smoothed=_rolling_mean_per_patient(eeg_data_random_hot_deck, ordered_columns_to_smooth)

# eeg_data_quadratic_spline=eeg_data_quadratic_spline[["PID","Timebins","Timestamp"]].merge(eeg_data_quadratic_spline_smoothed,"inner",on=["PID","Timebins"],)
# eeg_data_hot_deck=eeg_data_hot_deck[["PID","Timebins","Timestamp"]].merge(eeg_data_hot_deck_smoothed,"inner",on=["PID","Timebins"])
# eeg_data_random_hot_deck=eeg_data_random_hot_deck[["PID","Timebins","Timestamp"]].merge(eeg_data_random_hot_deck_smoothed,"inner",on=["PID","Timebins"])

# From here on the working names refer to the smoothed frames.
eeg_data_quadratic_spline=eeg_data_quadratic_spline_smoothed
eeg_data_hot_deck=eeg_data_hot_deck_smoothed
eeg_data_random_hot_deck=eeg_data_random_hot_deck_smoothed

In [6]:
######################### Pivoting the data ##########################

def _pivot_to_wide(long_frame):
    """Turn a long (PID, Timebins) frame into one row per PID.

    The frame is indexed by ``PID`` and ``Timebins`` and unstacked, the resulting
    two-level column labels are flattened to ``<column><timebin>`` strings, the
    columns at positions 1..47 are removed, and ``PID`` is restored as a column.
    """
    wide=long_frame.set_index(["PID","Timebins"]).unstack()
    wide.columns=[str(name)+str(timebin) for name, timebin in wide.columns]
    # NOTE(review): the positional range 1:48 is hard-coded and looks tied to a
    # 48-bin layout, while Timebins is set to 12 elsewhere in this notebook — confirm
    # which columns are meant to be dropped.
    wide=wide.drop(columns=wide.columns[1:48])
    return wide.reset_index()

######################### quadratic spline dataset ##########################

eeg_data_quadratic_spline=_pivot_to_wide(eeg_data_quadratic_spline)

######################### hot deck dataset ##########################

eeg_data_hot_deck=_pivot_to_wide(eeg_data_hot_deck)

######################### Random hot deck dataset ##########################

eeg_data_random_hot_deck=_pivot_to_wide(eeg_data_random_hot_deck)


In [7]:
# Load the additional per-PID features that were derived from the seconds-level data.
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
# The context manager closes the file once loading is done.
with open("D:/Projects/Prognostic Predictions/Proccessed data/EEG_data_SECONDS_with aditional_features.pickle", "rb") as features_handle:
    features_from_seconds_data=pickle.load(features_handle)

In [8]:
# Attach the seconds-level features to each wide dataset by PID.
# Each dataset is merged from its own frame: the hot deck dataset was previously
# built from the random hot deck frame, which made the two outputs identical.
eeg_data_quadratic_spline=eeg_data_quadratic_spline.merge(features_from_seconds_data,on="PID")
eeg_data_hot_deck=eeg_data_hot_deck.merge(features_from_seconds_data,on="PID")
eeg_data_random_hot_deck=eeg_data_random_hot_deck.merge(features_from_seconds_data,on="PID")

In [9]:
# Store the three wide datasets (spline, hot deck, random hot deck) in one pickle.
pickle_file="D:/Projects/Prognostic Predictions/Proccessed data/EEG_data.pickle"
# The context manager closes the file even if pickling fails part-way through.
with open(pickle_file, "wb") as pickle_handle:
    pickle.dump([eeg_data_quadratic_spline, eeg_data_hot_deck, eeg_data_random_hot_deck], pickle_handle)

In [10]:
# Latest Timestamp value per PID, as a two-column frame (PID, Timestamp).
latest_per_patient=eeg_data.groupby("PID")["Timestamp"].max()
patient_timestamp=latest_per_patient.reset_index()

# Parse the values into datetimes.
parsed_timestamp=pd.to_datetime(patient_timestamp["Timestamp"], format="%Y-%m-%d %H:%M:%S")
patient_timestamp["Timestamp"]=parsed_timestamp

# Convert to US/Eastern and store the result back as formatted text.
# NOTE(review): tz_convert needs timezone-aware values — confirm the source
# timestamps carry a UTC offset, otherwise this step raises.
eastern_timestamp=patient_timestamp["Timestamp"].dt.tz_convert("US/Eastern")
patient_timestamp["Timestamp"]=eastern_timestamp.dt.strftime("%Y-%m-%d %H:%M:%S")

In [11]:
# Store the per-PID latest timestamp table.
pickle_file="D:/Projects/Prognostic Predictions/Proccessed data/patient_timestamp.pickle"
# The context manager closes the file even if pickling fails part-way through.
with open(pickle_file, "wb") as pickle_handle:
    pickle.dump(patient_timestamp, pickle_handle)