In [1]:
import pandas as pd
import utils
from sklearn.preprocessing import StandardScaler
import os
import glob

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# config
NORMALIZATION = True
NORM_ALL_DATA = False
NORM_PHASE_1  = False
NORM_PER_FILE = True

### Load Data

In [3]:
def get_data_overview(df):
    print('Shape of DataFrame', df.shape)
    print('AoP: mean: ', df["AoP" ].mean(), 'std: ', df["AoP" ].std(), 'min: ', df["AoP" ].min(), 'max: ', df["AoP" ].max(), 'median: ', df["AoP" ].median())
    print('VADcurrent: mean: ', df["VADcurrent" ].mean(), 'std: ', df["VADcurrent" ].std(), 'min: ', df["VADcurrent" ].min(), 'max: ', df["VADcurrent" ].max(), 'median: ', df["VADcurrent" ].median())
    print('VadQ: mean: ', df["VadQ" ].mean(), 'std: ', df["VadQ" ].std(), 'min: ', df["VadQ" ].min(), 'max: ', df["VadQ" ].max(), 'median: ', df["VadQ" ].median())
    print('LVP: mean: ', df["LVP" ].mean(), 'std: ', df["LVP" ].std(), 'min: ', df["LVP" ].min(), 'max: ', df["LVP" ].max(), 'median: ', df["LVP" ].median())
    print('LVtot_kalibriert: mean: ', df["LVtot_kalibriert" ].mean(), 'std: ', df["LVtot_kalibriert" ].std(), 'min: ', df["LVtot_kalibriert" ].min(), 'max: ', df["LVtot_kalibriert" ].max(), 'median: ', df["LVtot_kalibriert" ].median())

In [4]:
if not NORMALIZATION:
    path = "/home/johann/Desktop/Uni/Masterarbeit/Cycle_GAN/csv_export_files_alle_Daten/csv_export_files" 
    csv_files = glob.glob(os.path.join(path, "*.csv"))
    
    df = pd.DataFrame()
    scaler = StandardScaler() 
    for f in csv_files:
        df_temp = pd.read_csv(f, sep=";")
        df_temp = utils.drop_cols(df_temp)
        df_temp = df_temp.dropna()
        df_temp = utils.remove_strings(df_temp)  
        df = pd.concat([df, df_temp], axis=0)
        
    get_data_overview(df)

#### Get mean, standard deviation, min, max and median of data

In [6]:
def normalize_by_all_phases(df, scaler):
    '''
    Normalize the data by the whole dataframe
    '''
    cols = df.columns.tolist()
    df = df.to_numpy() 
    scaler.fit(df)
    transformed_data = scaler.transform(df)
    df = pd.DataFrame(transformed_data, columns=cols)  
    return df

def normalize_by_phase1(df, scaler):
    '''
    Normalize the data by the first phase
    '''
    cols = df.columns.tolist()
    phase_1 = df.loc[df['Phasenzuordnung'] == 1]
    phase_1 = phase_1.to_numpy() 
    df = df.to_numpy()  
    scaler.fit(phase_1)
    transformed_data = scaler.transform(df)
    df = pd.DataFrame(transformed_data, columns=cols)  
    return df

def normalize(df, scaler, phase1 = True):
    df_IPA = df[['intervention', 'Phasenzuordnung', 'animal']]
    df_temp = pd.DataFrame()
    NORMALIZE = normalize_by_phase1
    if phase1 == True:
        NORMALIZE = normalize_by_all_phases

    for animal in df['animal'].unique():
        # split df into separate dataframes for each animal
        df_animal = df.loc[df['animal'] == animal]
        df_animal = NORMALIZE(df_animal, scaler) # normalize by phase 1
        # append df_animal to df_temp
        df_temp = pd.concat([df_temp, df_animal], axis=0, ignore_index=True)

    df = df_temp
    df = df.drop(columns=['intervention', 'Phasenzuordnung', 'animal'])
    df.dropna(inplace=True)
    df = df.join(df_IPA)
    return df

In [8]:
if NORMALIZATION == True and NORM_PER_FILE == True:
    path = "/home/johann/Desktop/Uni/Masterarbeit/Cycle_GAN/csv_export_files_alle_Daten/csv_export_files" 
    csv_files = glob.glob(os.path.join(path, "*.csv"))
    
    df = pd.DataFrame()
    scaler = StandardScaler() 
    for f in csv_files:
        df_temp = pd.read_csv(f, sep=";")
        df_temp = utils.drop_cols(df_temp)
        df_temp = df_temp.dropna()
        df_temp = utils.remove_strings(df_temp)  
        df_temp = utils.subsample(df_temp, 10)
        df_temp = normalize(df_temp, scaler, phase1 = True)
        df = pd.concat([df, df_temp], axis=0)
        
    print('Normalized per file')
    get_data_overview(df)

Normalized per file
Shape of DataFrame (6022044, 13)
AoP: mean:  8.476422646032742e-18 std:  1.0000000830282978 min:  -2.7786048713806513 max:  4.808802668585217 median:  -0.1360668248199314
VADcurrent: mean:  4.0852959033440655e-17 std:  1.0000000830282976 min:  -11.981886874581654 max:  17.447844556111704 median:  -0.07346303264848664
VadQ: mean:  -4.832882399519781e-18 std:  1.0000000830282976 min:  -8.201428557907775 max:  3.3115737016978097 median:  -0.08852087878540464
LVP: mean:  1.666022936553206e-18 std:  1.0000000830282978 min:  -2.7810397103291944 max:  3.576518816510218 median:  -0.6391708073508791
LVtot_kalibriert: mean:  -2.1247691955701228e-17 std:  1.0000000830282978 min:  -3.7267938151892253 max:  3.4179702854250453 median:  0.17636830479941046


Normalized per file
Shape of DataFrame (6022044, 13)
AoP: mean:  8.476422646032742e-18 std:  1.0000000830282978 min:  -2.7786048713806513 max:  4.808802668585217 median:  -0.1360668248199314
VADcurrent: mean:  4.0852959033440655e-17 std:  1.0000000830282976 min:  -11.981886874581654 max:  17.447844556111704 median:  -0.07346303264848664
VadQ: mean:  -4.832882399519781e-18 std:  1.0000000830282976 min:  -8.201428557907775 max:  3.3115737016978097 median:  -0.08852087878540464
LVP: mean:  1.666022936553206e-18 std:  1.0000000830282978 min:  -2.7810397103291944 max:  3.576518816510218 median:  -0.6391708073508791
LVtot_kalibriert: mean:  -2.1247691955701228e-17 std:  1.0000000830282978 min:  -3.7267938151892253 max:  3.4179702854250453 median:  0.17636830479941046

In [10]:
path = "/home/johann/Desktop/Uni/Masterarbeit/Cycle_GAN/csv_export_files_alle_Daten/csv_export_files" 
csv_files = glob.glob(os.path.join(path, "*.csv"))

df = pd.DataFrame()
scaler = StandardScaler() 
for f in csv_files:
    df_temp = pd.read_csv(f, sep=";")
    df_temp = utils.drop_cols(df_temp)
    df_temp = df_temp.dropna()
    df_temp = utils.remove_strings(df_temp)  
    df_temp = utils.subsample(df_temp, 10)
    # df_temp = utils.normalize_df(df_temp, scaler) # no normalization per file
    df = pd.concat([df, df_temp], axis=0)

df = df.groupby('animal').filter(lambda x: len(x) > 10)

In [13]:
if NORMALIZATION == True and NORM_PHASE_1 == True:  
    print('Normalized by phase 1')
    df_temp = normalize(df, scaler, phase1 = True)
    get_data_overview(df_temp)

if NORMALIZATION == True and NORM_ALL_DATA == True:
    print('Normalized with all the data')
    df_temp = normalize(df, scaler, phase1 = False)
    get_data_overview(df_temp)

Normalized by phase 1
Shape of DataFrame (11931572, 13)
AoP: mean:  -0.06557444648713584 std:  0.8878459645849367 min:  -3.088167190570084 max:  4.808836670894062 median:  -0.1931671327622163
VADcurrent: mean:  0.32194796372694146 std:  1.0892522152700856 min:  -12.272475323878714 max:  19.724014576100192 median:  0.12902850124052548
VadQ: mean:  0.2397314419752238 std:  0.9637863092666007 min:  -5.488560529279074 max:  4.244330530595154 median:  0.2580906882690985
LVP: mean:  -0.05540577231887373 std:  0.9397205691162268 min:  -2.89908819574177 max:  4.781273048784575 median:  -0.6147855178422799
LVtot_kalibriert: mean:  0.2809593158841215 std:  1.1933054809214665 min:  -4.341492555302559 max:  3.768923461706554 median:  0.19872751888704757


Normalized by phase 1
Shape of DataFrame (11931572, 13)
AoP: mean:  -0.06557444648713584 std:  0.8878459645849367 min:  -3.088167190570084 max:  4.808836670894062 median:  -0.1931671327622163
VADcurrent: mean:  0.32194796372694146 std:  1.0892522152700856 min:  -12.272475323878714 max:  19.724014576100192 median:  0.12902850124052548
VadQ: mean:  0.2397314419752238 std:  0.9637863092666007 min:  -5.488560529279074 max:  4.244330530595154 median:  0.2580906882690985
LVP: mean:  -0.05540577231887373 std:  0.9397205691162268 min:  -2.89908819574177 max:  4.781273048784575 median:  -0.6147855178422799
LVtot_kalibriert: mean:  0.2809593158841215 std:  1.1933054809214665 min:  -4.341492555302559 max:  3.768923461706554 median:  0.19872751888704757

Normalized with all the data
Shape of DataFrame (11931572, 13)
AoP: mean:  0.15143322806935683 std:  1.6548883870195956 min:  -13.965436345696867 max:  12.14236505608361 median:  -0.08535793751574346
VADcurrent: mean:  0.5293288463125844 std:  1.4889912744757337 min:  -14.777772394223327 max:  23.10131774091906 median:  0.35366173081445984
VadQ: mean:  0.24893508799792072 std:  1.4943442918122685 min:  -31.410083873273255 max:  6.632881665829332 median:  0.35879103761717
LVP: mean:  -0.033408512465791164 std:  1.0050075603646809 min:  -2.9248941019933343 max:  5.478443864393581 median:  -0.6269452169984315
LVtot_kalibriert: mean:  0.34259703281089615 std:  1.4799829735049879 min:  -6.974893133985444 max:  4.335202195375505 median:  0.2114695301475898

#### Expected output

AoP: mean:  48.30846696781112 std:  15.156141832600728 min:  15.84199 max:  140.4831 median:  46.1051

VADcurrent: mean:  0.5541030227058161 std:  0.20180199960363357 min:  -0.9118687 max:  5.10126 median:  0.511567

VadQ: mean:  2.177537563636682 std:  0.9466818061058603 min:  -1.511544 max:  6.007812 median:  2.165501

LVP: mean:  27.259501107205093 std:  24.888557742090942 min:  -61.03938 max:  168.5278 median:  11.6282

LVtot_kalibriert: mean:  90.4793272339531 std:  42.87935237198888 min:  -33.8681644353909 max:  320.181386525449 median:  88.4587843298934

#### How much data per phase

In [None]:
print('Size of the dataset with data from phase 1',df.shape)
print('Size of Phase 1: ', df.loc[df['Phasenzuordnung'] == 1].shape)
print('Size of Phase 2: ', df.loc[df['Phasenzuordnung'] == 2].shape)
print('Size of Phase 3: ', df.loc[df['Phasenzuordnung'] == 3].shape)
print('Size of Phase 4: ', df.loc[df['Phasenzuordnung'] == 4].shape)
print('SIze of Phase 5: ', df.loc[df['Phasenzuordnung'] == 5].shape)