In [None]:
import math
import os
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine

# Connection settings come from environment variables so that no credentials are
# stored in the notebook itself.
db_name = os.getenv("DB_NAME_SEMINAR2")
db_user = os.getenv("DB_USER")
db_host = os.getenv("DB_HOST")
db_pw = os.getenv("DB_PW")
db_port = os.getenv("DB_PORT")

# os.getenv returns None for unset variables; interpolating that into the URL would
# yield the literal text "None" and a confusing failure later on, so fail here instead.
_missing_settings = [
    env_name
    for env_name, value in (
        ("DB_NAME_SEMINAR2", db_name),
        ("DB_USER", db_user),
        ("DB_HOST", db_host),
        ("DB_PORT", db_port),
    )
    if value is None
]
if _missing_settings:
    raise RuntimeError(
        "Missing database environment variable(s): " + ", ".join(_missing_settings)
    )

# User name and password are percent-encoded because characters such as "@", ":" or "/"
# would otherwise be read as URL delimiters. safe="" also encodes "/"; spaces become
# "%20" rather than "+".
_credentials = quote(db_user, safe="")
if db_pw is not None:
    _credentials += ":" + quote(db_pw, safe="")

engine = create_engine(f"postgresql://{_credentials}@{db_host}:{db_port}/{db_name}")

In [None]:
# How many days after admission is the primary diagnosis recorded?
# Loads the day offset of every diagnosis row flagged as primary and shows its
# summary statistics.
df = pd.read_sql(
    sql="""SELECT "CreationTime_day_of_visit" AS creationtime_day_of_visit
FROM student_data.vwd_diagnosen
WHERE "isPrimary" = TRUE;""",
    con=engine,
)

df["creationtime_day_of_visit"].describe()

In [None]:
# Histogram of the day offset with one bar per `bin_width` days.
#
# matplotlib treats every bin as half-open [left, right) except the last one, which is
# closed on both sides. The edges therefore have to extend one full bin past the largest
# value; otherwise the two largest day values end up in the same bar.
# math.floor (instead of int) keeps a negative fractional minimum inside the first bin.
bin_width = 1  # must be a positive integer
first_edge = math.floor(df['creationtime_day_of_visit'].min())
last_day = math.floor(df['creationtime_day_of_visit'].max())
n_bins = (last_day - first_edge) // bin_width + 1
bin_edges = [first_edge + i * bin_width for i in range(n_bins + 1)]

ax = df.plot(kind="hist",
             bins=bin_edges,
             rwidth=0.8,
             color='#86bf91',
             alpha=0.7)
ax.set_xlabel("Days between admission and entry of the primary diagnosis");

In [None]:
# Which cases have their primary diagnosis recorded before admission?
# Fetches every column of the primary-diagnosis rows whose day offset is negative.
df = pd.read_sql(
    sql="""SELECT *
FROM student_data.vwd_diagnosen
WHERE "isPrimary" = TRUE AND "CreationTime_day_of_visit" < 0;""",
    con=engine,
)

df

In [None]:
# TODO: Wie groß ist der Anteil derjenigen Fälle mit einer negativen Hauptdiagnosen-Creation-Time, die über die Notaufnahme kommen? --> Hypothese: Diese Fälle sind größtenteils elektiv.


In [None]:
# What is the sex distribution among lung cancer patients (diagnosis codes starting
# with C34)?
# Counts distinct patients per sex. The case view and the code column are spelled the
# same way as in the other queries of this notebook: `vwd_faelle` and the quoted,
# case-sensitive `"Code"` (an unquoted `code` is folded to lower case by PostgreSQL and
# would name a different column).
# NOTE(review): `patient_oid` and `sex` are assumed to exist in these views — confirm.
df = pd.read_sql("""SELECT p.sex, COUNT(DISTINCT d.patient_oid) AS patient_count
FROM student_data.vwd_diagnosen d
JOIN student_data.vwd_faelle p ON d.patient_oid = p.patient_oid
WHERE d."Code" LIKE 'C34%%'
GROUP BY p.sex;""", engine)

df

In [None]:
# At what time of day are cases admitted?
# For every primary diagnosis the query shifts the hour at which the diagnosis was
# created back by the number of hours elapsed since the start of the visit (modulo 24).
# NOTE(review): the elapsed minutes are converted to hours with CAST(... AS BIGINT) and
# the minute within the creation hour is not used, so `start_hour` is presumably only an
# approximation — confirm.
df = pd.read_sql(
    sql="""
SELECT MOD("CreationTime_hour_of_day" + 24 - MOD(CAST("CreationTime_minutes_since_start" / 60.0 AS BIGINT), 24), 24) AS start_hour
FROM student_data.vwd_diagnosen WHERE "isPrimary" = True;""",
    con=engine,
)

df.head()

In [None]:
# Distribution of the estimated admission hour with one bar per hour of the day.
# The edges 0, 1, ..., 24 give 24 one-hour bins. With the default of 10 bins each bar
# would cover two or three hours, which makes the bars incomparable.
df.plot(kind="hist",
        bins=range(25),
        rwidth=0.8,
        alpha=0.7)

In [None]:
# Summary statistics of the estimated admission hour.
df["start_hour"].describe()

In [None]:
from sqlalchemy import text

In [None]:
# Share of stroke diagnoses (codes starting with I63 or I64) that belong to patients
# whose birth decade is 1980 or later. Only rows where the visit is its own starting
# visit are considered. The query returns a single row with a single percentage.
df = pd.read_sql(
    sql=text("""
SELECT CAST(sum(CASE WHEN birth_decade >= 1980 THEN 1 ELSE 0 END) AS FLOAT) / COUNT(*) * 100 AS percentage_infarct_under_40
FROM student_data.vwd_diagnosen AS d JOIN student_data.vwd_faelle AS f ON d.visit_oid = f.visit_oid
WHERE ("Code" ILIKE 'I64%' OR "Code" ILIKE 'I63%')
AND f.startingvisitoid = f.visit_oid;"""),
    con=engine,
)

share_born_1980_or_later = round(df.iat[0, 0], 2)
print(f"{share_born_1980_or_later}% of all stroke patients are born 1980 or later.")

In [None]:
# Standalone `text()` object of the stroke-share query, kept separate so that its
# compiled SQL can be inspected.
# The birth-decade condition is `>= 1980`, matching the executed stroke-share query and
# its "born 1980 or later" message. `birth_decade` appears to hold decade values, so
# `> 1980` would drop the entire 1980s cohort.
alchemy_object = text("""
SELECT CAST(sum(CASE WHEN birth_decade >= 1980 THEN 1 ELSE 0 END) AS FLOAT) / COUNT(*) * 100 AS percentage_infarct_under_40
FROM student_data.vwd_diagnosen AS d JOIN student_data.vwd_faelle AS f ON d.visit_oid = f.visit_oid
WHERE ("Code" ILIKE 'I64%' OR "Code" ILIKE 'I63%')
AND f.startingvisitoid = f.visit_oid;""")

In [None]:
# Show the SQL string that SQLAlchemy generates for `alchemy_object` when it is
# compiled against `engine`.
compiled_query = alchemy_object.compile(engine)
print(compiled_query)

In [None]:
from src.database import read_sql

In [None]:
# Verify that there is only one primary diagnosis per starting visit.
# Every group is checked: the `having` clause keeps only groups whose number of primary
# diagnoses differs from 1, and the limit merely caps how many violations are displayed.
# (A limit applied before the filter would restrict the check to 10 arbitrary groups.)
# An empty result means the assumption holds.
read_sql("""select f.startingvisitoid, count(*) as primary_count
from student_data.vwd_diagnosen as d
join student_data.vwd_faelle as f on d.visit_oid = f.visit_oid
where d."isPrimary" = true
group by f.startingvisitoid
having count(*) != 1
order by primary_count desc
limit 10""")

In [None]:
# How much time passes until the first lab values of a visit are available?
# (checking only 100 sampled visits & omitting observations before arrival)
# The `> 0` filter is applied to the individual observations before `min()` is taken,
# so each row is the first observation after arrival. Filtering the aggregated minimum
# instead would discard every visit that has any earlier observation and would shrink
# the sample below 100 rows.
df = read_sql("""select min(observationdatetime_minutes_since_start) as time
from student_data.vwd_investigationresults as i join student_data.vwd_faelle as f
on i.patientvisit_oid = f.visit_oid
where observationdatetime_minutes_since_start > 0
group by f.startingvisitoid limit 100""")
df.head()

In [None]:
# How much time passes on average until the first lab values are available?
# Summarises the sample that the previous cell loaded into `df` instead of running the
# identical query a second time. The query has a LIMIT without ORDER BY, so a second
# run is not guaranteed to return the same rows.
# The `mean` row answers the question; judging by the source column name
# (`..._minutes_since_start`) the unit is presumably minutes.
df["time"].describe()

In [None]:
# Plot a histogram for each numeric column of `df` (at this point presumably the
# single `time` column of the first-lab-value sample loaded above).
df.hist()

In [None]:
# What are the top 10 diagnoses patients are hospitalised with at KEVB?
# Counts the primary diagnoses (`"isPrimary" = true`) per code, restricted to rows where
# the visit is its own starting visit (`startingvisitoid = visit_oid`), and returns the
# 10 most frequent codes in descending order of frequency.
read_sql("""select d."Code", count(d."Code") as code_count
from student_data.vwd_diagnosen as d
join student_data.vwd_faelle as f on D.visit_oid = F.visit_oid
where d."isPrimary" = true and F.startingvisitoid = f.visit_oid
group by d."Code"
order by code_count desc
limit 10""")