In [2]:
import sys
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

sys.path.append('../')
os.chdir("../")

base_folder = "./data/raw/model_data/"

In [3]:
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

def evaluate_col_classes(df):
    for column_name in df.columns:
        if is_string_dtype(df[column_name].dtype):  # the column is probably categorical if correctly encoded
            print('CATEGORICAL: Column name: {0}, dtype: {1}, values: {2}'
                  .format(column_name,
                    df[column_name].dtype,
                    df[column_name].unique()))
        elif is_numeric_dtype(df[column_name].dtype.name): # the column is probably numerical if correctly encoded
            unique_vals = df[column_name].unique()

            if len(unique_vals) > 100: # magic number
                print('NUMERICAL: Column name: {0}, dtype: {1}, #qty unique values: {2}'
                      .format(column_name,
                      df[column_name].dtype,
                      len(unique_vals)))
            else:
                print('NUMERICAL/POSSIBLY CATEGORICAL: Column name: {0}, dtype: {1}, unique values (#qty): {2}, ({3})'
                      .format(column_name,
                      df[column_name].dtype,
                      sorted(unique_vals),
                      len(unique_vals)))

        else: # what kind of type is this?
            print('UNKNOWN: Column name: {0}, dtype: {1}, class: numerical'
                  .format(column_name,
                   df[column_name].dtype))


In [4]:
dim_case_df = pd.read_csv(base_folder + "DIM_FALL.csv", encoding="ISO-8859-1")

evaluate_col_classes(dim_case_df)

  interactivity=interactivity, compiler=compiler, result=result)


CATEGORICAL: Column name: PATIENTID, dtype: object, values: [13831402 13719092 82907641 ... 'P0001605567' '00002621231' '00090912918']
NUMERICAL: Column name: CASEID, dtype: int64, #qty unique values: 2062921
NUMERICAL/POSSIBLY CATEGORICAL: Column name: CASETYP, dtype: int64, unique values (#qty): [1, 2, 3], (3)
CATEGORICAL: Column name: CASESTATUS, dtype: object, values: ['Fall ist abgeschlossen' 'Fall ist aktuell' 'Fall ist im Planstatus']
CATEGORICAL: Column name: FALAR, dtype: object, values: ['ambulanter Fall' 'stationärer Fall' 'teilstationärer Fall']
CATEGORICAL: Column name: BEGDT, dtype: object, values: ['2017-01-12' '2017-01-13' '2017-01-26' ... '2020-05-07' '2020-05-09'
 '2020-05-10']
CATEGORICAL: Column name: ENDDT, dtype: object, values: ['2017-01-13' '1753-01-01' '2018-01-23' ... '2027-05-05' '2020-07-13'
 '2021-03-31']
CATEGORICAL: Column name: PATIENTTYP, dtype: object, values: ['Standard Patient' 'Studie: Patient hat WI-Fall' 'Notfallpatient'
 'bekannter Workaround' 'G

In [5]:
dim_device_df = pd.read_csv(base_folder + "DIM_GERAET.csv", encoding="ISO-8859-1")

evaluate_col_classes(dim_device_df)

NUMERICAL: Column name: dim_geraet_bk, dtype: int64, #qty unique values: 1265
CATEGORICAL: Column name: dim_geraet_name, dtype: object, values: ['Unknown' 'Weiss Neidhart Claudia' 'PLWC' ... 'H5G019-3' 'H5G009-1'
 'H5G019-4']


In [6]:
dim_patient_df = pd.read_csv(base_folder + "DIM_PATIENT.csv", encoding="ISO-8859-1")

evaluate_col_classes(dim_patient_df)

NUMERICAL: Column name: PATIENTID, dtype: int64, #qty unique values: 163798
CATEGORICAL: Column name: GESCHLECHT, dtype: object, values: ['männlich' 'weiblich' nan]
CATEGORICAL: Column name: GEBURTSDATUM, dtype: object, values: ['1972-10-12' '1978-12-11' '1971-12-31' ... '2002-07-23' '2010-03-12'
 '1932-01-30']
CATEGORICAL: Column name: PLZ, dtype: object, values: ['3012' '3027' '1797' ... '95047' '6773' '72072']
CATEGORICAL: Column name: WOHNORT, dtype: object, values: ['Bern' 'Münchenwiler' 'Aarberg' ... 'Bergen' 'Paterno'
 'Prato (Leventina)']
CATEGORICAL: Column name: KANTON, dtype: object, values: ['BE' 'FR' 'SO' 'BS' 'VS' 'NE' 'ZH' 'LU' nan 'TI' 'VD' 'AG' 'BL' 'JU' 'GR'
 'TG' 'OW' 'SZ' 'SG' 'ZG' 'AR' 'TH' 'GE' 'GB' 'UR' 'NW' 'SH' 'HR' '38'
 'GL' 'IT' 'AI' 'MW' 'DE' 'CA' 'HU' 'AE' 'CE' 'LB' 'ES' 'FL' 'AU' 'NL'
 'RO' 'MC' 'ID' '06' 'PL' 'LT' 'BA' 'HK' 'DK' 'NSW' 'CZ' 'AT' 'VA' 'DV'
 'KS' '08' 'IL' '05' 'MI' 'RU' 'FI' 'IR' 'EG' 'MT' '04' '013' 'IE' 'NG'
 '005' 'US' 'NY' '21' '94' 'S

In [16]:
dim_room_df = pd.read_csv(base_folder + "DIM_RAUM.csv", encoding="ISO-8859-1")

evaluate_col_classes(dim_room_df)

#dim_room_df.sort_values(by='RAUMNAME')["RAUMNAME"].to_list()

NUMERICAL: Column name: RAUMID, dtype: int64, #qty unique values: 1374
CATEGORICAL: Column name: RAUMNAME, dtype: object, values: ['Unknown' 'Dermatologie (Eingang 14D)' 'Überwachung_B108b' ... 'Koje_5'
 'Koje_3' 'Koje_4']


In [8]:
dim_appointment_df = pd.read_csv(base_folder + "DIM_TERMIN.csv", encoding="ISO-8859-1")

evaluate_col_classes(dim_appointment_df)

NUMERICAL: Column name: TERMINID, dtype: int64, #qty unique values: 4506281
NUMERICAL/POSSIBLY CATEGORICAL: Column name: IS_DELETED, dtype: bool, unique values (#qty): [False, True], (2)
CATEGORICAL: Column name: TERMINBEZEICHNUNG, dtype: object, values: ['KONS + BE + FIBROSCAN' 'Lucentisinjektion OS  8. IVT'
 'Fuss*\r\r\n Poli Verlaufspatient Röntgen' ...
 'Blutung aus Iliakalgefäss ( whs. zervikal, vaginal)\r\r\nAngiografie in Embolisationsbereitschaft'
 'LISS  Plattenosteosynthese rechts' 'Olecranon Osteosynthese links']
CATEGORICAL: Column name: TERMINART, dtype: object, values: ['KONS + BE + FIBROSCAN' 'Lucentisinjektion OS' 'Fuss Verlauf Röntgen' ...
 "MTT Training 60'" 'Kiefergelenksreposition offen re' 'ABE30K15T300']
CATEGORICAL: Column name: TERMINTYP, dtype: object, values: ['Patiententermin' 'Operation' 'Gruppentermin']
CATEGORICAL: Column name: TERMINDATUM, dtype: object, values: ['2017-03-07' '2018-03-28' '2019-02-05' ... '2022-05-19' '2022-05-08'
 '2022-05-13']
NUMERICAL

In [9]:
fact_drug_df = pd.read_csv(base_folder + "FAKT_MEDIKAMENTE.csv", encoding="ISO-8859-1")

evaluate_col_classes(fact_drug_df)

NUMERICAL: Column name: PATIENTID, dtype: int64, #qty unique values: 119766
NUMERICAL: Column name: CASEID, dtype: int64, #qty unique values: 193067
CATEGORICAL: Column name: DRUG_TEXT, dtype: object, values: ['Novalgin' 'Aspirin Cardio' 'Pantoprazol' ... 'Alges-X' 'Inflectra'
 'Tavor']
CATEGORICAL: Column name: DRUG_ATC, dtype: object, values: ['N02BB02' 'B01AC06' 'A02BC02' ... 'S02DZ' 'A06A' 'B05AA06']
NUMERICAL: Column name: DRUG_QUANTITY, dtype: float64, #qty unique values: 7893
CATEGORICAL: Column name: DRUG_UNIT, dtype: object, values: ['500 mg/1 Stk' '100 mg/1 Stk' '40 mg/1 Stk' ... '1 g/1 g'
 '0.6 ml/1 ml, 0.15 ml/1 ml, 0.25 ml/1 ml' '1.5 mcg/1 Stk']
CATEGORICAL: Column name: DRUG_DISPFORM, dtype: object, values: ['p.o. (peroral)' 'i.v. (intravenös; Injektion)' 'i.v. (intravenös)'
 'i.v. (intravenös; Ernährung)' 'lokal / topisch'
 'p.o. (peroral; Zyto/CMR)' 'rectal (lokal)' 'i.m. (intramuskulär)'
 'rectal (systemisch)' 'i.v. (intravenös; Infusion)'
 'p.o. (peroral; Ernährung)' 

In [4]:
fact_appointment_device_df = pd.read_csv(base_folder + "FAKT_TERMIN_GERAET.csv", encoding="ISO-8859-1")

evaluate_col_classes(fact_appointment_device_df)

NUMERICAL: Column name: TERMINID, dtype: int64, #qty unique values: 3105781
NUMERICAL: Column name: GERAETID, dtype: int64, #qty unique values: 1164
CATEGORICAL: Column name: TERMINSTART_TS, dtype: object, values: ['2007-03-09 09:30:00.0000000' '2007-03-28 09:15:00.0000000'
 '2007-03-26 12:30:00.0000000' ... '2009-12-22 13:35:00.0000000'
 '2009-12-17 12:55:00.0000000' '2010-01-13 12:15:00.0000000']
CATEGORICAL: Column name: TERMINENDE_TS, dtype: object, values: ['2007-03-09 10:00:00.0000000' '2007-03-28 10:00:00.0000000'
 '2007-03-26 13:00:00.0000000' ... '2010-01-12 10:40:00.0000000'
 '2009-12-22 16:34:00.0000000' '2009-12-17 18:53:00.0000000']
NUMERICAL: Column name: DAUERINMIN, dtype: float64, #qty unique values: 1632


In [12]:
fact_appointment_employee_df = pd.read_csv(base_folder + "FAKT_TERMIN_MITARBEITER.csv", encoding="ISO-8859-1")

evaluate_col_classes(fact_appointment_employee_df)

NUMERICAL: Column name: TERMINID, dtype: int64, #qty unique values: 6351344
NUMERICAL: Column name: MITARBEITERID, dtype: int64, #qty unique values: 6474
CATEGORICAL: Column name: TERMINSTART_TS, dtype: object, values: ['2011-01-13 09:15:00.0000000' '2009-12-22 08:30:00.0000000'
 '2010-01-07 14:45:00.0000000' ... '2019-12-17 15:55:00.0000000'
 '2019-11-23 21:52:00.0000000' '2019-12-13 11:42:00.0000000']
CATEGORICAL: Column name: TERMINENDE_TS, dtype: object, values: ['2011-01-13 10:00:00.0000000' '2009-12-22 09:00:00.0000000'
 '2010-01-07 15:15:00.0000000' ... '2019-11-23 18:14:00.0000000'
 '2019-11-24 01:03:00.0000000' '2020-01-08 07:45:00.0000000']
NUMERICAL: Column name: DAUERINMIN, dtype: float64, #qty unique values: 1331


In [5]:
fact_appointment_patient_df = pd.read_csv(base_folder + "FAKT_TERMIN_PATIENT.csv", encoding="ISO-8859-1")

evaluate_col_classes(fact_appointment_patient_df)

NUMERICAL: Column name: TERMINID, dtype: float64, #qty unique values: 909909
NUMERICAL: Column name: PATIENTID, dtype: float64, #qty unique values: 102582
NUMERICAL: Column name: FALLID, dtype: int64, #qty unique values: 161005


In [14]:
fact_appointment_room_df = pd.read_csv(base_folder + "FAKT_TERMIN_RAUM.csv", encoding="ISO-8859-1")

evaluate_col_classes(fact_appointment_room_df)

NUMERICAL: Column name: TERMINID, dtype: float64, #qty unique values: 2685296
NUMERICAL: Column name: RAUMID, dtype: float64, #qty unique values: 1041
CATEGORICAL: Column name: RAUMNAME, dtype: object, values: ['Blutentnahme F 20' 'Gyn/Geb OP Sectio' 'ANGP Koje 5' ... '61_Raum1'
 'SCTA CT' 'Handtherapie A 432']
CATEGORICAL: Column name: TERMINSTART_TS, dtype: object, values: ['2018-10-30 14:45:00.0000000' '2020-05-07 06:09:00.0000000'
 '2018-03-13 15:30:00.0000000' ... '2017-02-27 17:27:00.0000000'
 '2018-05-31 15:37:00.0000000' '2020-01-06 17:22:00.0000000']
CATEGORICAL: Column name: TERMINENDE_TS, dtype: object, values: ['2018-10-30 15:00:00.0000000' '2020-05-07 06:10:00.0000000'
 '2018-03-13 16:55:00.0000000' ... '2019-08-15 14:02:00.0000000'
 '2018-11-27 10:13:00.0000000' '2020-01-06 17:42:00.0000000']
NUMERICAL: Column name: DAUERINMIN, dtype: float64, #qty unique values: 1085


In [17]:
chop_surgical_procedures_code_df = pd.read_csv(base_folder + "LA_CHOP_FLAT.csv", encoding="ISO-8859-1")

evaluate_col_classes(chop_surgical_procedures_code_df)

CATEGORICAL: Column name: CHOPCODE, dtype: object, values: ['C0' 'C1' 'C10' ... 'ZBB.1O' 'ZBB.1P' 'ZBB.1Q']
NUMERICAL/POSSIBLY CATEGORICAL: Column name: CHOPVERWENDUNGSJAHR, dtype: int64, unique values (#qty): [2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020], (12)
CATEGORICAL: Column name: CHOP, dtype: object, values: ['Massnahmen und Interventionen nicht anderswo Klassifizierbar (00)'
 'Massnahmen und Interventionen nicht anderswo klassifizierbar (00)'
 'Operationen am Nervensystem (01\x9605)' ...
 'Zusatzaufwand in der Rehabilitation, mindestens 601 bis 650 Aufwandspunkte'
 'Zusatzaufwand in der Rehabilitation, mindestens 651 bis 700 Aufwandspunkte'
 'Zusatzaufwand in der Rehabilitation, 701 und mehr Aufwandspunkte']
CATEGORICAL: Column name: CHOPCODELEVEL1, dtype: object, values: ['C0' 'C1' 'C10' 'C11' 'C12' 'C13' 'C14' 'C15' 'C16' 'C17' 'C18' 'C2' 'C3'
 'C4' 'C5' 'C6' 'C7' 'C8' 'C9']
CATEGORICAL: Column name: CHOPLEVEL1, dtype: object, values: ['Massnahmen u

In [19]:
movement_df = pd.read_csv(base_folder + "LA_ISH_NBEW.csv", encoding="ISO-8859-1")

evaluate_col_classes(movement_df)

NUMERICAL: Column name: FALNR, dtype: int64, #qty unique values: 217918
NUMERICAL: Column name: LFDNR, dtype: int64, #qty unique values: 347
NUMERICAL/POSSIBLY CATEGORICAL: Column name: BEWTY, dtype: int64, unique values (#qty): [1, 2, 3, 4, 6, 7], (6)
CATEGORICAL: Column name: BWART, dtype: object, values: ['SB' 'EI' 'WA' 'AK' 'EN' 'EK' 'KK' 'BE' 'BU' 'AU' 'WB' 'GA' 'EV' 'AE'
 'WZ' 'AN' 'BS' 'GE' 'BK' 'UB' 'UE' 'EW' 'EE' 'GN' 'AS' 'EB' 'KU' 'EU'
 'GB' 'BA' 'EA' 'KZ' nan 'KA' 'WK' 'AT' 'ET' 'ES' 'GK' 'KS' 'RA' 'UO' 'WS'
 'RE' 'KV' 'AG' 'BG' 'EX']
CATEGORICAL: Column name: BWIDT, dtype: object, values: ['2017-01-04' '2017-01-17' '2017-01-23' ... '2020-10-08' '2020-09-08'
 '2020-06-19']
CATEGORICAL: Column name: BWIZT, dtype: object, values: ['15:30:00.0000000' '10:00:00.0000000' '11:37:28.0000000' ...
 '05:26:40.0000000' '14:52:10.0000000' '05:49:02.0000000']
NUMERICAL/POSSIBLY CATEGORICAL: Column name: STATU, dtype: float64, unique values (#qty): [30.0, nan, 20.0, 55.0, 70.0], (5)
CATE

In [20]:
diagnosis_df = pd.read_csv(base_folder + "LA_ISH_NDIA.csv", encoding="ISO-8859-1")

evaluate_col_classes(diagnosis_df)

NUMERICAL: Column name: FALNR, dtype: int64, #qty unique values: 201632
CATEGORICAL: Column name: DKEY1, dtype: object, values: ['L02.4' 'B96.2' 'L89.27' ... 'A00.1' 'M14.57' 'K00.8']
NUMERICAL/POSSIBLY CATEGORICAL: Column name: DKAT1, dtype: int64, unique values (#qty): [11, 15, 17, 19], (4)
CATEGORICAL: Column name: DIADT, dtype: object, values: ['2016-12-28' '2017-01-05' '2016-12-31' ... '2020-02-09' '2020-02-26'
 '2020-04-23']
CATEGORICAL: Column name: DRG_CATEGORY, dtype: object, values: ['P' 'S']


In [21]:
drg_tarif_df = pd.read_csv(base_folder + "LA_ISH_NDRG.csv", encoding="ISO-8859-1")

evaluate_col_classes(drg_tarif_df)


NUMERICAL: Column name: PATCASEID, dtype: int64, #qty unique values: 200700
NUMERICAL: Column name: COST_WEIGHT, dtype: float64, #qty unique values: 8179


In [26]:
partner_case_df = pd.read_csv(base_folder + "LA_ISH_NFPZ.csv", encoding="ISO-8859-1")

evaluate_col_classes(partner_case_df)

CATEGORICAL: Column name: EARZT, dtype: object, values: ['H' 'E' 'U' 'B' 'W' 'X' 'A' 'N' 'G' 'O']
NUMERICAL/POSSIBLY CATEGORICAL: Column name: FARZT, dtype: int64, unique values (#qty): [0, 1, 2, 3, 4, 5, 6, 7, 9], (9)
NUMERICAL: Column name: FALNR, dtype: int64, #qty unique values: 169803
NUMERICAL: Column name: LFDNR, dtype: int64, #qty unique values: 401
NUMERICAL: Column name: PERNR, dtype: int64, #qty unique values: 8457
CATEGORICAL: Column name: STORN, dtype: object, values: [nan 'X']


In [22]:
external_partner_df = pd.read_csv(base_folder + "LA_ISH_NGPA.csv", encoding="ISO-8859-1")

evaluate_col_classes(external_partner_df)

NUMERICAL: Column name: GPART, dtype: int64, #qty unique values: 97647
CATEGORICAL: Column name: NAME1, dtype: object, values: [nan 'Swisscare Insurance AG' 'Praxisgemeinschaft' ... 'Tavares Valdez'
 'Bachofner-Weber' 'Nageh']
CATEGORICAL: Column name: NAME2, dtype: object, values: [nan 'HasliPraxis' 'Medizinisches Labor' ... 'Olimpia Ksenia' 'Kathrine'
 'Thuraia']
CATEGORICAL: Column name: NAME3, dtype: object, values: [nan 'Schadenabteilung' 'Spitalfinanzierung' ... 'Chefarzt Stellvertreter'
 'Dr.med. P. Freiburghaus' 'Praxis Dr. K. Geisbühler']
CATEGORICAL: Column name: LAND, dtype: object, values: ['CH' 'LI' 'DE' 'US' 'NL' 'DK' 'FR' 'GB' 'BE' 'PL' 'CZ' 'AT' 'RS' 'FI'
 'HR' 'IL' 'IT' 'SI' 'IE' 'LU' 'AU' 'HU' 'ZA' 'BG' 'GR' 'ES' 'RU' 'AR'
 'MA' 'SG' 'NZ' 'MK' 'CA' 'UY' 'LB' 'PH' 'BA' 'EG' 'UA' 'NO' 'AL' 'KS'
 'IN' 'CN' 'SN' 'BR' 'MY' 'JP' 'AE' 'PT' 'DZ' 'TN' 'IS' 'IR' 'SE' 'TW'
 'ID' 'LT' 'CM' 'HK' 'LV' 'TH' nan]
CATEGORICAL: Column name: PSTLZ, dtype: object, values: [nan '1700' '49

  interactivity=interactivity, compiler=compiler, result=result)


In [23]:
surgery_df = pd.read_csv(base_folder + "LA_ISH_NICP.csv", encoding="ISO-8859-1")

evaluate_col_classes(surgery_df)

NUMERICAL/POSSIBLY CATEGORICAL: Column name: LFDBEW, dtype: int64, unique values (#qty): [0], (1)
NUMERICAL/POSSIBLY CATEGORICAL: Column name: ICPMK, dtype: int64, unique values (#qty): [11, 16, 17, 18, 19, 20], (6)
CATEGORICAL: Column name: ICPML, dtype: object, values: ['37.51.10' '00.93.20' '00.99.10' ... '39.B1.D1' '39.95.41' '92.28.65']
NUMERICAL/POSSIBLY CATEGORICAL: Column name: ANZOP, dtype: int64, unique values (#qty): [0], (1)
CATEGORICAL: Column name: BGDOP, dtype: object, values: ['2016-07-07' '2016-07-10' '2016-07-14' ... '2020-02-22' '2020-02-23'
 '2020-04-23']
CATEGORICAL: Column name: LSLOK, dtype: object, values: [nan 'L' 'R' 'B' 'U' 'EU']
CATEGORICAL: Column name: STORN, dtype: object, values: [nan 'X']
NUMERICAL: Column name: FALNR, dtype: int64, #qty unique values: 168297
CATEGORICAL: Column name: ORGPF, dtype: object, values: ['INEGE 1' 'G NORD' '71WEST P' '71IPS' 'NDYL' 'HGKT' 'NEPH' 'IBME'
 'KK K SU' 'KK HS' 'Q MITTE' '71WEST 2' 'N NORD' '62STAT A' 'R MITTE1'
 'I

In [27]:
tacs_df = pd.read_csv(base_folder + "TACS_DATEN.csv", encoding="ISO-8859-1")

evaluate_col_classes(tacs_df)

  interactivity=interactivity, compiler=compiler, result=result)


NUMERICAL: Column name: patient_patientid, dtype: int64, #qty unique values: 87893
CATEGORICAL: Column name: patient_typ, dtype: object, values: ['Standard Patient']
CATEGORICAL: Column name: patient_status, dtype: object, values: ['aktiv']
NUMERICAL: Column name: fall_nummer, dtype: int64, #qty unique values: 132974
CATEGORICAL: Column name: fall_typ, dtype: object, values: ['Standard Fall']
CATEGORICAL: Column name: fall_status, dtype: object, values: ['aktiv']
CATEGORICAL: Column name: datum_betreuung, dtype: object, values: ['2018-03-07 00:00:00' '2018-01-23 00:00:00' '2018-07-04 00:00:00'
 '2018-03-15 00:00:00' '2018-03-13 00:00:00' '2018-02-24 00:00:00'
 '2018-02-25 00:00:00' '2018-02-11 00:00:00' '2018-11-03 00:00:00'
 '2018-11-20 00:00:00' '2018-04-08 00:00:00' '2018-09-12 00:00:00'
 '2018-04-09 00:00:00' '2019-02-12 00:00:00' '2019-07-07 00:00:00'
 '2018-02-07 00:00:00' '2019-04-30 00:00:00' '2019-06-21 00:00:00'
 '2018-01-17 00:00:00' '2019-04-29 00:00:00' '2019-07-16 00:00:0

In [28]:
diagnosis_norm_df = pd.read_csv(base_folder + "V_LA_ISH_NDIA_NORM.csv", encoding="ISO-8859-1")

evaluate_col_classes(diagnosis_norm_df)

NUMERICAL: Column name: FALNR, dtype: int64, #qty unique values: 201632
CATEGORICAL: Column name: DKEY1, dtype: object, values: ['H34.8' 'Z92.2' 'R73.9' ... 'B97.3' 'B39.3' 'A00.1']
NUMERICAL/POSSIBLY CATEGORICAL: Column name: DKAT1, dtype: int64, unique values (#qty): [11, 15, 17, 19], (4)
CATEGORICAL: Column name: DIADT, dtype: object, values: ['2017-01-03' '2017-01-04' '2017-01-05' ... '2020-02-25' '2020-02-26'
 '2020-04-23']
CATEGORICAL: Column name: DRG_CATEGORY, dtype: object, values: ['S' 'P']


In [29]:
vre_screening_df = pd.read_csv(base_folder + "VRE_SCREENING_DATA.csv", encoding="ISO-8859-1")

evaluate_col_classes(vre_screening_df)

NUMERICAL: Column name: auftrag_nr, dtype: int64, #qty unique values: 9430
CATEGORICAL: Column name: erfassung, dtype: object, values: ['2017-12-30' '2017-12-31' '2018-01-03' '2018-01-04' '2018-01-05'
 '2018-01-06' '2018-01-08' '2018-01-09' '2018-01-10' '2018-01-11'
 '2018-01-12' '2018-01-13' '2018-01-14' '2018-01-15' '2018-01-16'
 '2018-01-17' '2018-01-18' '2018-01-19' '2018-01-20' '2018-01-21'
 '2018-01-22' '2018-01-23' '2018-01-24' '2018-01-25' '2018-01-26'
 '2018-01-27' '2018-01-28' '2018-01-29' '2018-01-30' '2018-01-31'
 '2018-02-01' '2018-02-02' '2018-02-03' '2018-02-04' '2018-02-05'
 '2018-02-06' '2018-02-07' '2018-02-08' '2018-02-09' '2018-02-10'
 '2018-02-11' '2018-02-12' '2018-02-13' '2018-02-14' '2018-02-15'
 '2018-02-16' '2018-02-18' '2018-02-19' '2018-02-20' '2018-02-21'
 '2018-02-22' '2018-02-23' '2018-02-24' '2018-02-25' '2018-02-26'
 '2018-02-27' '2018-02-28' '2018-03-01' '2018-03-02' '2018-03-03'
 '2018-03-05' '2018-03-06' '2018-03-07' '2018-03-08' '2018-03-09'
 '2018-