<a href="https://colab.research.google.com/github/MichaelKru92/Projekt-ML-Modelierung/blob/main/Schritt_7_ETL_Pipeline_komplett.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**ETL Pipeline**

Die komplette ETL-Pipeline aus den Schritten 1-4 und 7 in einem Code-Block.
Das Ergebnis sind aggregierte und transformierte Daten in der SQL-Datenbank synthea.db inklusive einer Faktentabelle namens encounters_final.

Um die Datenbank lokal in Visual Studio Code anzusehen, muss die Extension SQLite Viewer installiert werden.

In [None]:

import pandas as pd
import numpy as np
import sys
import os
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import urllib
from zipfile import ZipFile
import certifi
import ssl
import sqlite3

# Base URL under which the project's files are downloaded.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/MichaelKru92/Projekt-ML-Modelierung/main/"
# Local working directory for the downloaded archive and its extracted content.
ROHDATEN_LOCAL_PATH = os.path.join(".", "Rohdaten")
# Full URL of the zipped raw data.
ROHDATEN_URL = f"{DOWNLOAD_ROOT}Rohdaten(CSV)/Rohdaten.zip"

def fetch_Rohdaten(rohdaten_url=ROHDATEN_URL, rohdaten_local_path=ROHDATEN_LOCAL_PATH):
    """Download the raw-data zip archive and unpack it below the local data directory.

    The archive is stored as ``<rohdaten_local_path>/Rohdaten.zip`` and then
    extracted into ``<rohdaten_local_path>/Rohdaten(CSV)``. Download and
    extraction errors are reported on stdout and not re-raised (best effort).

    Args:
        rohdaten_url: URL of the zip archive to download.
        rohdaten_local_path: Directory that receives the archive and the
            extraction sub-directory; it is created if missing.
    """
    # Import the submodules explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.request`` / ``urllib.error`` are loaded.
    import shutil
    import urllib.error
    import urllib.request

    os.makedirs(rohdaten_local_path, exist_ok=True)
    zip_path = os.path.join(rohdaten_local_path, "Rohdaten.zip")
    try:
        ca_bundle = certifi.where()
        # Still exported through the environment, as before, for code that
        # honours SSL_CERT_FILE.
        os.environ['SSL_CERT_FILE'] = ca_bundle
        # Hand the certifi bundle to the context directly so certificate
        # verification does not depend on how the platform resolves defaults.
        context = ssl.create_default_context(cafile=ca_bundle)
        with urllib.request.urlopen(rohdaten_url, context=context) as response, \
                open(zip_path, 'wb') as f:
            # Stream in chunks instead of holding the whole archive in memory.
            shutil.copyfileobj(response, f)
        print(f"Downloaded to: {zip_path}")

        # Extract into a separate sub-directory next to the archive.
        extract_path = os.path.join(rohdaten_local_path, "Rohdaten(CSV)")
        os.makedirs(extract_path, exist_ok=True)

        with ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_path)
        print(f"Extracted to: {extract_path}")

    except urllib.error.URLError as e:
        print(f"Error downloading: {e}")
    except Exception as e:  # any other failure (file system, corrupt archive, ...)
        print(f"An error occurred: {e}")


# Download and unpack the raw data with the default URL and target directory.
fetch_Rohdaten()


# Directory into which the archive is extracted.
CSV_PATH = os.path.join(ROHDATEN_LOCAL_PATH, "Rohdaten(CSV)")

# File names of the CSV tables processed by this script.
CSV_TABLES = [
    "careplans.csv",
    "conditions.csv",
    "devices.csv",
    "disease.csv",
    "encounters.csv",
    "imaging_studies.csv",
    "immunizations.csv",
    "medications.csv",
    "observations.csv",
    "organizations.csv",
    "patients.csv",
    "payer_transitions.csv",
    "payers.csv",
    "procedures.csv",
    "providers.csv",
    "supplies.csv",
]

def load_csv_to_df_overview(csv_tables=CSV_TABLES, csv_path=CSV_PATH):
    """Print a preview and the column metadata of every CSV table.

    Each file is read with ``pd.read_csv``; its first ten rows are printed,
    followed by the output of ``DataFrame.info()``. Nothing is returned and
    the loaded DataFrames are discarded.

    Args:
        csv_tables: File names of the CSV tables to inspect.
        csv_path: Directory that contains those files.
    """
    for table_file in csv_tables:
        df = pd.read_csv(os.path.join(csv_path, table_file))
        # head() defaults to 5 rows; ask for 10 so the output matches the label.
        print(f"\n\nDie ersten 10 Zeilen von Tabelle {table_file}:", df.head(10))
        print(f"\n\nMetadaten zu Tabelle {table_file}:\n")
        df.info()


load_csv_to_df_overview()


def _read_table(file_name):
    """Load one CSV file from CSV_PATH into a DataFrame."""
    return pd.read_csv(os.path.join(CSV_PATH, file_name))


# Load every table into its own DataFrame (input for the duplicate search).
df_careplans = _read_table("careplans.csv")
df_conditions = _read_table("conditions.csv")
df_devices = _read_table("devices.csv")
df_disease = _read_table("disease.csv")
df_imaging_studies = _read_table("imaging_studies.csv")
df_encounters = _read_table("encounters.csv")
df_patients = _read_table("patients.csv")
df_medications = _read_table("medications.csv")
df_observations = _read_table("observations.csv")
df_immunizations = _read_table("immunizations.csv")
df_procedures = _read_table("procedures.csv")
df_organizations = _read_table("organizations.csv")
df_payer_transitions = _read_table("payer_transitions.csv")
df_payers = _read_table("payers.csv")
df_providers = _read_table("providers.csv")
df_supplies = _read_table("supplies.csv")

def check_for_duplicates(df, column):
    """Report how many values in ``column`` repeat an earlier value.

    Values are compared by their string representation. The number of
    duplicated rows and the first ten of them are printed; nothing is
    returned.

    The DataFrame itself is left untouched: the string conversion happens on
    a temporary Series. Writing it back into ``df`` would turn missing values
    into the text "nan", so a later ``dropna`` on that column could no longer
    find them.

    Args:
        df: DataFrame to inspect.
        column: Name of the column to check.
    """
    as_text = df[column].apply(str)
    # True for every row whose value already occurred further up.
    dup_mask = as_text.duplicated()
    print (f"Anzahl der Duplikate in Spalte {column}:", dup_mask.sum())
    print (df[dup_mask].head(10))

# Duplicate report for the key column of selected tables.
for frame, key_column in (
    (df_patients, "Id"),
    (df_medications, "ENCOUNTER"),
    (df_immunizations, "ENCOUNTER"),
    (df_observations, "ENCOUNTER"),
    (df_procedures, "ENCOUNTER"),
):
    check_for_duplicates(frame, key_column)


# Reason columns that are filtered on or dropped in several tables.
REASON_COLUMNS = ["REASONCODE", "REASONDESCRIPTION"]

# careplans: keep only rows where both reason columns are filled.
df_careplans = df_careplans.dropna(subset=REASON_COLUMNS)
print(df_careplans.isna().sum())

# patients: drop columns with missing values that are not relevant for the
# cost prediction.
df_patients = df_patients.drop(
    columns=["DRIVERS", "PASSPORT", "PREFIX", "SUFFIX", "MAIDEN", "MARITAL"]
)
print(df_patients.isna().sum())
print(len(df_patients))

# encounters: drop the reason columns (many missing values, not relevant for
# the cost prediction).
df_encounters = df_encounters.drop(columns=REASON_COLUMNS)
print(df_encounters.isna().sum())
print(len(df_encounters))

# medications: same treatment as encounters.
df_medications = df_medications.drop(columns=REASON_COLUMNS)
print(df_medications.isna().sum())
print(len(df_medications))

# procedures: same treatment as encounters.
df_procedures = df_procedures.drop(columns=REASON_COLUMNS)
print(df_procedures.isna().sum())
print(len(df_procedures))

# observations: remove rows that are not linked to an encounter.
df_observations = df_observations.dropna(subset=["ENCOUNTER"])
print(df_observations.isna().sum())
print(len(df_observations))

# Date columns that are reformatted wherever a table has them.
COLUMN_LIST = ["START", "STOP"]

# Registry of all DataFrames, keyed by their variable name.
DF_DICT = dict(
    df_patients=df_patients,
    df_organizations=df_organizations,
    df_providers=df_providers,
    df_payers=df_payers,
    df_encounters=df_encounters,
    df_careplans=df_careplans,
    df_conditions=df_conditions,
    df_medications=df_medications,
    df_procedures=df_procedures,
    df_observations=df_observations,
    df_devices=df_devices,
    df_imaging_studies=df_imaging_studies,
    df_immunizations=df_immunizations,
    df_payer_transitions=df_payer_transitions,
    df_disease=df_disease,
    df_supplies=df_supplies,
)

def change_dateformat(df_dict=DF_DICT, cols_list=COLUMN_LIST):
    """Rewrite the listed date columns of every DataFrame as formatted text.

    Each column from ``cols_list`` that exists in a DataFrame is parsed with
    ``pd.to_datetime`` (unparseable values become missing) and then rendered
    as a string in the form ``MM/DD/YYYY, HH:MM:SS``. The DataFrames are
    modified in place. One status line is printed per column and a preview
    per DataFrame.

    Args:
        df_dict: Mapping of DataFrame name to DataFrame.
        cols_list: Names of the date columns to reformat.

    Returns:
        The ``df_dict`` that was passed in.
    """
    for frame_name, frame in df_dict.items():
        for date_col in cols_list:
            if date_col not in frame.columns:
                print(f"Spalte {date_col} ist in DataFrame {frame_name} nicht vorhanden.")
                continue
            frame[date_col] = pd.to_datetime(frame[date_col], errors="coerce")
            frame[date_col] = frame[date_col].dt.strftime("%m/%d/%Y, %H:%M:%S")
            print(f"Datum in Spalte {date_col} in DataFrame {frame_name} erfolgreich formatiert.")
        print (frame.head())
    return df_dict

change_dateformat()


# Aggregation
# DataFrames that feed the per-encounter aggregates.
AGG_DIF_DICT = dict(
    df_medications=df_medications,
    df_procedures=df_procedures,
    df_patients=df_patients,
    df_immunizations=df_immunizations,
    df_observations=df_observations,
    df_conditions=df_conditions,
)

def get_counts(df, group_col, count_col, new_col_name):
    """Count the non-missing ``count_col`` values per ``group_col`` value.

    Args:
        df: Source DataFrame.
        group_col: Column to group by; it is renamed to "Id" in the result.
        count_col: Column whose non-missing entries are counted.
        new_col_name: Name of the count column in the result.

    Returns:
        DataFrame with the columns "Id" and ``new_col_name``.
    """
    per_group = df.groupby(group_col)[count_col].count()
    return per_group.reset_index().rename(
        columns={group_col: "Id", count_col: new_col_name}
    )

# Per-encounter counts of medications, vaccinations, conditions and procedures.
med_counts = get_counts(AGG_DIF_DICT["df_medications"], "ENCOUNTER", "DESCRIPTION", "MEDICATION_COUNT")
vacc_counts = get_counts(AGG_DIF_DICT["df_immunizations"], "ENCOUNTER", "DESCRIPTION", "VACCINATION_COUNT")
cond_counts = get_counts(AGG_DIF_DICT["df_conditions"], "ENCOUNTER", "DESCRIPTION", "CONDITION_COUNT")
procedures_counts = get_counts(AGG_DIF_DICT["df_procedures"], "ENCOUNTER", "DESCRIPTION", "PROCEDURES_COUNT")

COUNT_COLUMNS = ["MEDICATION_COUNT", "VACCINATION_COUNT", "CONDITION_COUNT", "PROCEDURES_COUNT"]

# Fact table: start from a copy of the cleaned encounters and attach the counts.
df_encounters_final = df_encounters.copy()
for counts in (med_counts, vacc_counts, cond_counts, procedures_counts):
    df_encounters_final = pd.merge(df_encounters_final, counts, on='Id', how='left')

# An encounter without a match in a count table has zero events of that kind.
# Only the count columns are filled, so missing values in other encounter
# columns are not silently replaced by 0.
df_encounters_final[COUNT_COLUMNS] = df_encounters_final[COUNT_COLUMNS].fillna(0).astype(int)

# BMI
bmi_observations = df_observations[df_observations["DESCRIPTION"] == "Body Mass Index"].copy()
bmi_observations.rename(columns={"ENCOUNTER" : "Id", "VALUE" : "BMI"}, inplace=True)
# Keep one BMI row per encounter; several rows for the same Id would
# otherwise duplicate that encounter in the left merge.
bmi_observations = bmi_observations.drop_duplicates(subset=["Id"])
# NOTE(review): BMI is taken over from observations.VALUE unchanged - confirm
# whether it should be converted to a numeric dtype here.
df_encounters_final = pd.merge(df_encounters_final, bmi_observations[["Id", "BMI"]], on="Id", how="left")  # only the BMI column is merged

# Gender
gender_patients = df_patients[["Id", "GENDER"]].copy()
gender_patients.rename(columns={"Id" : "PATIENT"}, inplace=True)
df_encounters_final = pd.merge(df_encounters_final, gender_patients[["PATIENT", "GENDER"]], on="PATIENT", how="left")  # only the GENDER column is merged

# Medication cost per encounter; encounters without medication rows get the
# median of the known totals.
med_cost = df_medications.groupby("ENCOUNTER")["TOTALCOST"].sum().reset_index()
med_cost.rename(columns={"ENCOUNTER" : "Id", "TOTALCOST" : "TOTAL_MED_COST"}, inplace=True)
df_encounters_final = pd.merge(df_encounters_final, med_cost[["Id", "TOTAL_MED_COST"]], on="Id", how="left")
df_encounters_final["TOTAL_MED_COST"] = df_encounters_final["TOTAL_MED_COST"].fillna(df_encounters_final["TOTAL_MED_COST"].median())

# Age at encounter start

df_patients_age = df_patients.rename(columns={"Id" : "PATIENT"})
df_encounters_final = pd.merge(df_encounters_final, df_patients_age[["PATIENT", "BIRTHDATE"]], on="PATIENT", how="left")

# START was written as text in this format by change_dateformat(); parse it
# with the explicit format instead of relying on format inference.
ENCOUNTER_DATE_FORMAT = "%m/%d/%Y, %H:%M:%S"
df_encounters_final["START"] = pd.to_datetime(df_encounters_final["START"], format=ENCOUNTER_DATE_FORMAT, errors="coerce")
df_encounters_final["BIRTHDATE"] = pd.to_datetime(df_encounters_final["BIRTHDATE"], errors="coerce")
# NOTE(review): dividing by 365 ignores leap days, and astype(int) raises if
# START or BIRTHDATE could not be parsed - confirm both are acceptable.
df_encounters_final["AGE_AT_ENCOUNTER"] = (df_encounters_final["START"] - df_encounters_final["BIRTHDATE"]).dt.days/365
df_encounters_final["AGE_AT_ENCOUNTER"] = df_encounters_final["AGE_AT_ENCOUNTER"].astype(int)
# Back to the text format used by the other date columns.
df_encounters_final["START"] = df_encounters_final["START"].dt.strftime(ENCOUNTER_DATE_FORMAT)
df_encounters_final.drop(columns="BIRTHDATE", inplace=True)

# Build the SQLite database
db_name = "synthea.db"
db_path = os.path.join(".", db_name)   # database file in the current working directory
conn = sqlite3.connect(db_path)        # opens the database file (SQLite creates it if missing)
cur = conn.cursor()                    # cursor used for the CREATE TABLE statements

# One CREATE TABLE statement per table. Every statement uses IF NOT EXISTS,
# so re-running the script does not fail on an existing database.
#
# NOTE(review): the load step later in this file writes the DataFrames with
# if_exists="replace"; pandas then drops and recreates each table, so the
# column types and key constraints declared here do not survive that load.

sql_table_patients = """
CREATE TABLE IF NOT EXISTS patients (
    Id TEXT PRIMARY KEY,
    BIRTHDATE TEXT,
    DEATHDATE TEXT,
    SSN TEXT,
    FIRST TEXT,
    LAST TEXT,
    SUFFIX TEXT,
    RACE TEXT,
    ETHNICITY TEXT,
    GENDER TEXT,
    BIRTHPLACE TEXT,
    ADDRESS TEXT,
    CITY TEXT,
    STATE TEXT,
    COUNTY TEXT,
    ZIP TEXT,
    LAT REAL,
    LON REAL,
    HEALTHCARE_EXPENSES REAL,
    HEALTHCARE_COVERAGE REAL
);
"""

sql_table_organizations = """
CREATE TABLE IF NOT EXISTS organizations (
    Id TEXT PRIMARY KEY,
    NAME TEXT,
    ADDRESS TEXT,
    CITY TEXT,
    STATE TEXT,
    ZIP TEXT,
    LAT REAL,
    LON REAL,
    PHONE TEXT,
    REVENUE REAL,
    UTILIZATION INTEGER
);
"""

sql_table_providers = """
CREATE TABLE IF NOT EXISTS providers (
    Id TEXT PRIMARY KEY,
    ORGANIZATION TEXT,
    NAME TEXT,
    GENDER TEXT,
    SPECIALITY TEXT,
    FOREIGN KEY (ORGANIZATION)
        REFERENCES organizations (Id)
);
"""

sql_table_payers = """
CREATE TABLE IF NOT EXISTS payers (
    Id TEXT PRIMARY KEY,
    NAME TEXT,
    ADDRESS TEXT,
    CITY TEXT,
    STATE_HEADQUARTERED TEXT,
    ZIP TEXT,
    PHONE TEXT,
    AMOUNT_COVERED REAL,
    AMOUNT_UNCOVERED REAL,
    REVENUE INTEGER,
    COVERED_ENCOUNTERS INTEGER,
    UNCOVERED_ENCOUNTERS INTEGER,
    COVERED_MEDICATIONS INTEGER,
    UNCOVERED_MEDICATIONS INTEGER,
    COVERED_PROCEDURES INTEGER,
    UNCOVERED_PROCEDURES INTEGER,
    COVERED_IMMUNIZATIONS INTEGER,
    UNCOVERED_IMMUNIZATIONS INTEGER,
    UNIQUE_CUSTOMERS INTEGER,
    QOLS_AVG REAL,
    MEMBER_MONTHS INTEGER
);
"""

sql_table_encounters = """
CREATE TABLE IF NOT EXISTS encounters (
    Id TEXT PRIMARY KEY,
    START TEXT,
    STOP TEXT,
    PATIENT TEXT,
    ORGANIZATION TEXT,
    PROVIDER TEXT,
    PAYER TEXT,
    ENCOUNTERCLASS TEXT,
    CODE TEXT,
    DESCRIPTION TEXT,
    BASE_ENCOUNTER_COST REAL,
    TOTAL_CLAIM_COST REAL,
    PAYER_COVERAGE REAL,
    FOREIGN KEY (PATIENT)
        REFERENCES patients (Id),
    FOREIGN KEY (ORGANIZATION)
        REFERENCES organizations (Id),
    FOREIGN KEY (PROVIDER)
        REFERENCES providers (Id),
    FOREIGN KEY (PAYER)
        REFERENCES payers (Id)
);
"""

# careplans is declared once, with TEXT columns like every other table.
sql_table_careplans = """
CREATE TABLE IF NOT EXISTS careplans (
    Id TEXT PRIMARY KEY,
    START TEXT,
    STOP TEXT,
    PATIENT TEXT,
    ENCOUNTER TEXT,
    CODE TEXT,
    DESCRIPTION TEXT,
    REASONCODE TEXT,
    REASONDESCRIPTION TEXT,
    FOREIGN KEY (PATIENT)
        REFERENCES patients (Id),
    FOREIGN KEY (ENCOUNTER)
        REFERENCES encounters (Id)
);
"""

sql_table_conditions = """
CREATE TABLE IF NOT EXISTS conditions (
    START TEXT,
    STOP TEXT,
    PATIENT TEXT,
    ENCOUNTER TEXT,
    CODE TEXT,
    DESCRIPTION TEXT,
    FOREIGN KEY (PATIENT)
        REFERENCES patients (Id),
    FOREIGN KEY (ENCOUNTER)
        REFERENCES encounters (Id)
);
"""

sql_table_medications = """
CREATE TABLE IF NOT EXISTS medications (
    START TEXT,
    STOP TEXT,
    PATIENT TEXT,
    PAYER TEXT,
    ENCOUNTER TEXT,
    CODE TEXT,
    DESCRIPTION TEXT,
    BASE_COST REAL,
    PAYER_COVERAGE REAL,
    DISPENSES REAL,
    TOTALCOST REAL,
    FOREIGN KEY (PATIENT)
        REFERENCES patients (Id),
    FOREIGN KEY (ENCOUNTER)
        REFERENCES encounters (Id)
);
"""

sql_table_procedures = """
CREATE TABLE IF NOT EXISTS procedures (
    DATE DATE,
    PATIENT TEXT,
    ENCOUNTER TEXT,
    CODE TEXT,
    DESCRIPTION TEXT,
    BASE_COST REAL,
    REASONCODE TEXT,
    REASONDESCRIPTION TEXT,
    FOREIGN KEY (PATIENT)
        REFERENCES patients (Id),
    FOREIGN KEY (ENCOUNTER)
        REFERENCES encounters (Id)
);
"""

sql_table_observations = """
CREATE TABLE IF NOT EXISTS observations (
    DATE DATE,
    PATIENT TEXT,
    ENCOUNTER TEXT,
    CODE TEXT,
    DESCRIPTION TEXT,
    VALUE TEXT,
    UNITS TEXT,
    TYPE TEXT,
    FOREIGN KEY (PATIENT)
        REFERENCES patients (Id),
    FOREIGN KEY (ENCOUNTER)
        REFERENCES encounters (Id)
);
"""

sql_table_devices = """
CREATE TABLE IF NOT EXISTS devices (
    START TEXT,
    STOP TEXT,
    PATIENT TEXT,
    ENCOUNTER TEXT,
    CODE TEXT,
    DESCRIPTION TEXT,
    UDI TEXT,
    FOREIGN KEY (PATIENT)
        REFERENCES patients (Id),
    FOREIGN KEY (ENCOUNTER)
        REFERENCES encounters (Id)
);
"""

sql_table_imaging = """
CREATE TABLE IF NOT EXISTS imaging_studies (
    Id TEXT PRIMARY KEY,
    DATE DATE,
    PATIENT TEXT,
    ENCOUNTER TEXT,
    BODYSITE_CODE TEXT,
    BODYSITE_DESCRIPTION TEXT,
    MODALITY_CODE TEXT,
    MODALITY_DESCRIPTION TEXT,
    SOP_CODE TEXT,
    SOP_DESCRIPTION TEXT,
    FOREIGN KEY (PATIENT)
        REFERENCES patients (Id),
    FOREIGN KEY (ENCOUNTER)
        REFERENCES encounters (Id)
);
"""

sql_table_immunizations = """
CREATE TABLE IF NOT EXISTS immunizations (
    DATE DATE,
    PATIENT TEXT,
    ENCOUNTER TEXT,
    CODE TEXT,
    DESCRIPTION TEXT,
    BASE_COST REAL,
    FOREIGN KEY (PATIENT)
        REFERENCES patients (Id),
    FOREIGN KEY (ENCOUNTER)
        REFERENCES encounters (Id)
);
"""

sql_table_payer_transitions = """
CREATE TABLE IF NOT EXISTS payer_transitions (
    PATIENT TEXT,
    START_YEAR INTEGER,
    END_YEAR INTEGER,
    PAYER TEXT,
    OWNERSHIP TEXT,
    FOREIGN KEY (PATIENT)
        REFERENCES patients (Id),
    FOREIGN KEY (PAYER)
        REFERENCES payers (Id)
);
"""

sql_table_disease = """
CREATE TABLE IF NOT EXISTS disease (
    START TEXT,
    STOP TEXT,
    PATIENT TEXT,
    ENCOUNTER TEXT,
    CODE TEXT,
    DESCRIPTION TEXT,
    FOREIGN KEY (PATIENT)
        REFERENCES patients (Id),
    FOREIGN KEY (ENCOUNTER)
        REFERENCES encounters (Id)
);
"""

sql_table_supplies = """
CREATE TABLE IF NOT EXISTS supplies (
    DATE DATE,
    PATIENT TEXT,
    ENCOUNTER TEXT,
    CODE TEXT,
    DESCRIPTION TEXT,
    QUANTITY INTEGER,
    FOREIGN KEY (PATIENT)
        REFERENCES patients (Id),
    FOREIGN KEY (ENCOUNTER)
        REFERENCES encounters (Id)
);
"""

# Fact table. PROCEDURES_COUNT is spelled like the DataFrame column built in
# the aggregation step; BMI and AGE_AT_ENCOUNTER carry explicit types.
sql_table_encounters_final = """
CREATE TABLE IF NOT EXISTS encounters_final (
    Id TEXT PRIMARY KEY,
    START TEXT,
    STOP TEXT,
    PATIENT TEXT,
    ORGANIZATION TEXT,
    PROVIDER TEXT,
    PAYER TEXT,
    ENCOUNTERCLASS TEXT,
    CODE TEXT,
    DESCRIPTION TEXT,
    BASE_ENCOUNTER_COST REAL,
    TOTAL_CLAIM_COST REAL,
    PAYER_COVERAGE REAL,
    MEDICATION_COUNT INTEGER,
    VACCINATION_COUNT INTEGER,
    CONDITION_COUNT INTEGER,
    PROCEDURES_COUNT INTEGER,
    BMI REAL,
    AGE_AT_ENCOUNTER INTEGER,
    GENDER TEXT,
    TOTAL_MED_COST REAL,
    FOREIGN KEY (PATIENT)
        REFERENCES patients (Id),
    FOREIGN KEY (ORGANIZATION)
        REFERENCES organizations (Id),
    FOREIGN KEY (PROVIDER)
        REFERENCES providers (Id),
    FOREIGN KEY (PAYER)
        REFERENCES payers (Id)
);
"""

# Referenced tables are listed before the tables that reference them.
SQL_CREATE_STATEMENTS = [
    sql_table_patients,
    sql_table_organizations,
    sql_table_providers,
    sql_table_payers,
    sql_table_encounters,
    sql_table_careplans,
    sql_table_conditions,
    sql_table_medications,
    sql_table_procedures,
    sql_table_observations,
    sql_table_devices,
    sql_table_imaging,
    sql_table_immunizations,
    sql_table_payer_transitions,
    sql_table_disease,
    sql_table_supplies,
    sql_table_encounters_final,
]

try:
    for create_statement in SQL_CREATE_STATEMENTS:
        cur.execute(create_statement)
    # commit() lives on the connection; a cursor has no commit() method.
    conn.commit()
    print("Alle Tabellen (3NF) wurden erfolgreich angelegt!")
except sqlite3.Error as e:
    print(f"Ein Fehler ist aufgetreten: {e}")
    conn.rollback()


# Maps each DataFrame name to the synthea.db table it is loaded into.
SQL_TABLE_MAPPING = dict(
    df_patients="patients",
    df_organizations="organizations",
    df_providers="providers",
    df_payers="payers",
    df_encounters="encounters",
    df_careplans="careplans",
    df_conditions="conditions",
    df_medications="medications",
    df_procedures="procedures",
    df_observations="observations",
    df_devices="devices",
    df_imaging_studies="imaging_studies",
    df_immunizations="immunizations",
    df_payer_transitions="payer_transitions",
    df_disease="disease",
    df_supplies="supplies",
    df_encounters_final="encounters_final",
)
# Register the fact table in DF_DICT so it is written to synthea.db as well.
DF_DICT.update(df_encounters_final=df_encounters_final)

# Fills synthea.db. Caution (from the original author): the order of the
# entries in DF_DICT and SQL_TABLE_MAPPING matters.
def load_df_to_sql_db(df_dict=DF_DICT, sql_table_mapping=SQL_TABLE_MAPPING, con=conn):
    """Write every mapped DataFrame into its SQL table.

    The DataFrames are processed in the iteration order of ``df_dict``. Each
    one is written with ``if_exists="replace"`` and ``index=True``, so an
    existing table of that name is replaced and the DataFrame index is stored
    as a column. After each table the transaction is committed; if a write
    fails, the error is printed, the transaction is rolled back and the loop
    continues with the next DataFrame.

    Args:
        df_dict: Mapping of DataFrame name to DataFrame.
        sql_table_mapping: Mapping of DataFrame name to target table name.
        con: Open database connection that receives the data.
    """
    for df_name, df in df_dict.items():
        if df_name not in sql_table_mapping:
            print(f"Es konnte keine SQL-Tabelle in synthea.db für den DataFrame '{df_name}' gefunden.")
            continue
        table_name = sql_table_mapping[df_name]
        try:
            # Write through the connection passed in as ``con`` rather than
            # the module-level ``conn``, so a caller-supplied connection is used.
            df.to_sql(table_name, con, if_exists="replace", index=True)
            con.commit()
            print(f"DataFrame '{df_name}' wurde in Tabelle '{table_name}' der synthea.db geladen.")
        except Exception as e:
            print(f"Fehler beim laden des DataFrame '{df_name}': {e}")
            con.rollback()


# Write every DataFrame registered in DF_DICT into synthea.db.
load_df_to_sql_db()

# All writes are done; close the database connection.
conn.close()

