In [1]:
import pandas as pd
import utils
from sklearn.preprocessing import StandardScaler
import os
import glob

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# config
# Flags read by the cells below to decide which loading / normalization
# branches are executed.

# Master switch. When False, only the raw (non-subsampled) overview cell runs.
NORMALIZATION = True
# Together with NORMALIZATION: call utils.normalize(..., phase1=False) on the
# combined frame. On its own it already triggers loading of the combined frame.
NORM_ALL_DATA = False
# Together with NORMALIZATION: call utils.normalize(..., phase1=True) on the
# combined frame. On its own it already triggers loading of the combined frame.
NORM_PHASE_1  = False
# Together with NORMALIZATION: load the subsampled files and print the
# "per file" overview. NOTE(review): the per-file normalize call in that cell
# is currently commented out.
NORM_PER_FILE = True

### Load Data

In [3]:
if not NORMALIZATION:
    # Raw-data overview: read every exported CSV, clean it, and print summary
    # statistics of the combined frame (no subsampling, no scaling).
    # NOTE(review): absolute, machine-specific path; consider moving it to the
    # config cell so the notebook can run elsewhere.
    path = "/home/johann/Desktop/Uni/Masterarbeit/Cycle_GAN/csv_export_files_alle_Daten/csv_export_files"
    csv_files = glob.glob(os.path.join(path, "*.csv"))

    # Not used in this cell; kept so the name stays defined as before.
    scaler = StandardScaler()

    frames = []
    for f in csv_files:
        df_temp = pd.read_csv(f, sep=";")
        df_temp = utils.drop_cols(df_temp)
        df_temp = df_temp.dropna()
        df_temp = utils.remove_strings(df_temp)
        frames.append(df_temp)

    # Concatenate once: growing the frame inside the loop re-copies all rows
    # accumulated so far on every iteration. Fall back to an empty frame when
    # no file matched, as the original accumulation did.
    df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    utils.get_data_overview(df)

#### Get mean, standard deviation, min, max and median of data

In [4]:
if NORMALIZATION and NORM_PER_FILE:
    # Load every exported CSV, clean and subsample it, and print summary
    # statistics of the combined frame.
    # NOTE(review): absolute, machine-specific path; consider moving it to the
    # config cell so the notebook can run elsewhere.
    path = "/home/johann/Desktop/Uni/Masterarbeit/Cycle_GAN/csv_export_files_alle_Daten/csv_export_files"
    csv_files = glob.glob(os.path.join(path, "*.csv"))

    # Only needed by the (currently disabled) per-file normalization below.
    scaler = StandardScaler()

    frames = []
    for f in csv_files:
        df_temp = pd.read_csv(f, sep=";")
        df_temp = utils.drop_cols(df_temp)
        df_temp = df_temp.dropna()
        df_temp = utils.remove_strings(df_temp)
        # presumably reduces the sampling rate by a factor of 10 - confirm in utils
        df_temp = utils.subsample(df_temp, 10)
        #df_temp = utils.normalize(df_temp, scaler, phase1 = True)
        frames.append(df_temp)

    # Concatenate once: growing the frame inside the loop re-copies all rows
    # accumulated so far on every iteration. Fall back to an empty frame when
    # no file matched, as the original accumulation did.
    df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    # NOTE(review): the per-file normalize call above is commented out, so the
    # overview printed here describes un-normalized data despite this label.
    # Later cells read columns of this raw frame, so confirm before re-enabling.
    print('Normalized per file')
    utils.get_data_overview(df)

Normalized per file
Shape of DataFrame (6022044, 16)
AoP: mean:  48.30847789525583 std:  15.153152923498263 min:  15.856160000083037 max:  140.38751000000047 median:  46.10740599997371
VADcurrent: mean:  0.5541030138440085 std:  0.20167086814548843 min:  -0.8063268900004914 max:  5.092731500000809 median:  0.5115693499993199
VadQ: mean:  2.1775348983652427 std:  0.9465230017230916 min:  -1.4067543999917689 max:  5.993453799997951 median:  2.1655909499982045
LVP: mean:  27.25951742579371 std:  24.875284239815052 min:  -60.63108600000123 max:  166.53891999996267 median:  11.646741000042311
LVtot_kalibriert: mean:  90.47925277948431 std:  42.87621222411684 min:  -33.8161546293486 max:  320.06734372721985 median:  88.45988649118226


Expected output:

Normalized per file
Shape of DataFrame (6022044, 13)
AoP: mean:  8.476422646032742e-18 std:  1.0000000830282978 min:  -2.7786048713806513 max:  4.808802668585217 median:  -0.1360668248199314
VADcurrent: mean:  4.0852959033440655e-17 std:  1.0000000830282976 min:  -11.981886874581654 max:  17.447844556111704 median:  -0.07346303264848664
VadQ: mean:  -4.832882399519781e-18 std:  1.0000000830282976 min:  -8.201428557907775 max:  3.3115737016978097 median:  -0.08852087878540464
LVP: mean:  1.666022936553206e-18 std:  1.0000000830282978 min:  -2.7810397103291944 max:  3.576518816510218 median:  -0.6391708073508791
LVtot_kalibriert: mean:  -2.1247691955701228e-17 std:  1.0000000830282978 min:  -3.7267938151892253 max:  3.4179702854250453 median:  0.17636830479941046

In [5]:
if NORM_PHASE_1 or NORM_ALL_DATA:
    # Load every exported CSV, clean and subsample it, and build one combined
    # frame that the normalization cell further down operates on.
    # NOTE(review): absolute, machine-specific path; consider moving it to the
    # config cell so the notebook can run elsewhere.
    path = "/home/johann/Desktop/Uni/Masterarbeit/Cycle_GAN/csv_export_files_alle_Daten/csv_export_files"
    csv_files = glob.glob(os.path.join(path, "*.csv"))

    # Passed to utils.normalize by the normalization cell.
    scaler = StandardScaler()

    frames = []
    for f in csv_files:
        df_temp = pd.read_csv(f, sep=";")
        df_temp = utils.drop_cols(df_temp)
        df_temp = df_temp.dropna()
        df_temp = utils.remove_strings(df_temp)
        # presumably reduces the sampling rate by a factor of 10 - confirm in utils
        df_temp = utils.subsample(df_temp, 10)
        # df_temp = utils.normalize_df(df_temp, scaler) # no normalization per file
        frames.append(df_temp)

    # Concatenate once: growing the frame inside the loop re-copies all rows
    # accumulated so far on every iteration. Fall back to an empty frame when
    # no file matched, as the original accumulation did.
    df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    # Keep only animals that contribute more than 10 rows.
    df = df.groupby('animal').filter(lambda x: len(x) > 10)

In [6]:
# Overview of the data before normalization (currently disabled).
# utils.get_data_overview(df_temp)

#### Expected output

AoP: mean:  48.30846696781112 std:  15.156141832600728 min:  15.84199 max:  140.4831 median:  46.1051

VADcurrent: mean:  0.5541030227058161 std:  0.20180199960363357 min:  -0.9118687 max:  5.10126 median:  0.511567

VadQ: mean:  2.177537563636682 std:  0.9466818061058603 min:  -1.511544 max:  6.007812 median:  2.165501

LVP: mean:  27.259501107205093 std:  24.888557742090942 min:  -61.03938 max:  168.5278 median:  11.6282

LVtot_kalibriert: mean:  90.4793272339531 std:  42.87935237198888 min:  -33.8681644353909 max:  320.181386525449 median:  88.4587843298934

In [7]:
# Normalize the combined frame `df` with utils.normalize and print the overview
# of the result, once per enabled mode. Both branches may run in the same
# execution; each one overwrites df_temp.
# NOTE(review): the same scaler object is passed to both calls; whether
# utils.normalize refits it is not visible from this notebook.
if NORMALIZATION and NORM_PHASE_1:
    print('Normalized by phase 1')
    df_temp = utils.normalize(df, scaler, phase1=True)
    utils.get_data_overview(df_temp)

if NORMALIZATION and NORM_ALL_DATA:
    print('Normalized with all the data')
    df_temp = utils.normalize(df, scaler, phase1=False)
    utils.get_data_overview(df_temp)

Expected output:

Normalized by phase 1
Shape of DataFrame (11931572, 13)
AoP: mean:  -0.06557444648713584 std:  0.8878459645849367 min:  -3.088167190570084 max:  4.808836670894062 median:  -0.1931671327622163
VADcurrent: mean:  0.32194796372694146 std:  1.0892522152700856 min:  -12.272475323878714 max:  19.724014576100192 median:  0.12902850124052548
VadQ: mean:  0.2397314419752238 std:  0.9637863092666007 min:  -5.488560529279074 max:  4.244330530595154 median:  0.2580906882690985
LVP: mean:  -0.05540577231887373 std:  0.9397205691162268 min:  -2.89908819574177 max:  4.781273048784575 median:  -0.6147855178422799
LVtot_kalibriert: mean:  0.2809593158841215 std:  1.1933054809214665 min:  -4.341492555302559 max:  3.768923461706554 median:  0.19872751888704757

Normalized with all the data
Shape of DataFrame (11931572, 13)
AoP: mean:  0.15143322806935683 std:  1.6548883870195956 min:  -13.965436345696867 max:  12.14236505608361 median:  -0.08535793751574346
VADcurrent: mean:  0.5293288463125844 std:  1.4889912744757337 min:  -14.777772394223327 max:  23.10131774091906 median:  0.35366173081445984
VadQ: mean:  0.24893508799792072 std:  1.4943442918122685 min:  -31.410083873273255 max:  6.632881665829332 median:  0.35879103761717
LVP: mean:  -0.033408512465791164 std:  1.0050075603646809 min:  -2.9248941019933343 max:  5.478443864393581 median:  -0.6269452169984315
LVtot_kalibriert: mean:  0.34259703281089615 std:  1.4799829735049879 min:  -6.974893133985444 max:  4.335202195375505 median:  0.2114695301475898

#### How much data per phase

In [8]:
# Shape (rows, columns) of the whole frame, followed by the shape of the
# subset belonging to each of the phases 1-5.
print('Size of the dataset with data from phase 1',df.shape)
for phase in range(1, 6):
    # One loop instead of five copied lines keeps the labels consistent
    # (the phase-5 label used to be misspelled).
    print(f'Size of Phase {phase}: ', df.loc[df['Phasenzuordnung'] == phase].shape)

Size of the dataset with data from phase 1 (6022044, 16)
Size of Phase 1:  (751120, 16)
Size of Phase 2:  (1370162, 16)
Size of Phase 3:  (1371092, 16)
Size of Phase 4:  (1369343, 16)
SIze of Phase 5:  (1160327, 16)


In [9]:
# Share of rows that belongs to each phase, printed as a fraction of all rows.
total_rows = len(df)
for phase in range(1, 6):
    rows_in_phase = len(df.loc[df['Phasenzuordnung'] == phase])
    print(f'Percentage of Phase {phase}: ', rows_in_phase / total_rows)

Percentage of Phase 1:  0.12472841447189692
Percentage of Phase 2:  0.22752440865593143
Percentage of Phase 3:  0.2276788412705055
Percentage of Phase 4:  0.2273884083211614
Percentage of Phase 5:  0.19267992728050476


In [10]:
# Count the distinct values in the 'animal' column.
distinct_animals = df['animal'].unique()
print('Number of different animals: ', len(distinct_animals))

Number of different animals:  25


In [11]:
# Re-code the 'intervention' column:
#   * every row of phase 1 gets intervention 0;
#   * outside phase 1, rows with intervention 10 are split by 'contractility'.
# Implemented with boolean masks instead of a row-by-row iterrows() loop, which
# is very slow on a frame of this size. All masks are computed from the values
# as they are before any assignment, which matches the per-row logic of the
# loop (each row was read once and only its own cell was written).
df = df.reset_index(drop=True)

in_phase_1 = df['Phasenzuordnung'] == 1
is_intervention_10 = ~in_phase_1 & (df['intervention'] == 10)

contractility_to_intervention = {
    1.0: 0,    # contractility = 1.0 - could be ignored? - phase 0?
    3.0: 9,    # contractility = 3.0
    4.0: 10,   # contractility = 4.0 (value stays 10)
}

df.loc[in_phase_1, 'intervention'] = 0
for contractility_level, intervention_code in contractility_to_intervention.items():
    matches_level = is_intervention_10 & (df['contractility'] == contractility_level)
    df.loc[matches_level, 'intervention'] = intervention_code

#get unique intervention
print(df['intervention'].unique())

[ 0.  1.  3. 10.  2.  4.  9.]


In [12]:
# Drop the rows whose intervention is 0 (rows are filtered, not columns).
has_intervention = df['intervention'] != 0
df = df[has_intervention]

In [18]:
# Rows with intervention 1, then narrowed to those with afterload == 1; the
# shape of the narrowed frame is the cell output.
# NOTE(review): this step was described as "intervention 1 when intervention 2
# is also present", but the filter is on afterload == 1 - confirm the intent.
is_intervention_1 = df['intervention'] == 1
int_1 = df[is_intervention_1]
Vnormal = int_1[int_1['afterload'] == 1]
Vnormal.shape

(2162838, 16)

In [14]:
# Frequency of each intervention code: number of distinct codes, absolute row
# counts per code, and each count as a fraction of all rows.
intervention_counts = df['intervention'].value_counts()
distinct_interventions = df['intervention'].unique()
print('Number of different interventions: ', len(distinct_interventions))
print('Number of each intervention: ', intervention_counts)
print('Percentage of each intervention: ', intervention_counts / len(df))

Number of different interventions:  6
Number of each intervention:  1.0     2162838
3.0     1532302
10.0    1212463
2.0       34544
4.0       26222
9.0       19533
Name: intervention, dtype: int64
Percentage of each intervention:  1.0     0.433617
3.0     0.307204
10.0    0.243081
2.0     0.006926
4.0     0.005257
9.0     0.003916
Name: intervention, dtype: float64
