In [2]:
# %%
import pandas as pd
from pathlib import Path

# Folder that holds the exported Excel workbooks.
DATA_DIR = Path(r"C:\Users\kfq6\Documents\Data")

# Dataset label -> workbook file name; every workbook is named after its label.
FILES = {
    label: f"{label}.xlsx"
    for label in ("Bookinger", "LABKA", "Population", "WHO-5 (PRO)")
}


In [3]:
# %%
# Read the first worksheet of every configured workbook into a dict keyed by
# dataset label, reporting each frame's size as soon as it is loaded.
dfs = {}
for name, fname in FILES.items():
    dfs[name] = pd.read_excel(DATA_DIR / fname, sheet_name=0, engine="openpyxl")
    n_rows, n_cols = dfs[name].shape
    print(f"{name}: {n_rows} rækker, {n_cols} kolonner")


  warn("Workbook contains no default style, apply openpyxl's default")


Bookinger: 207222 rækker, 24 kolonner


  warn("Workbook contains no default style, apply openpyxl's default")


LABKA: 419010 rækker, 22 kolonner
Population: 2837 rækker, 32 kolonner
WHO-5 (PRO): 4330 rækker, 7 kolonner


In [6]:
# %%
# One overview row per loaded dataset: frame size plus CPRNummer coverage.
# The two CPRNummer figures are None for a frame without that column.
summary_rows = []
for name, df in dfs.items():
    has_cpr = "CPRNummer" in df.columns
    n_rows, n_cols = df.shape
    summary_rows.append(
        dict(
            file=name,
            rows=n_rows,
            cols=n_cols,
            unique_CPRNummer=df["CPRNummer"].nunique(dropna=True) if has_cpr else None,
            missing_CPRNummer=df["CPRNummer"].isna().sum() if has_cpr else None,
        )
    )

# Alphabetical by dataset label, with a fresh 0..n-1 index.
summary = (
    pd.DataFrame(summary_rows)
    .sort_values("file")
    .reset_index(drop=True)
)
summary


Unnamed: 0,file,rows,cols,unique_CPRNummer,missing_CPRNummer
0,Bookinger,207222,24,2823,0
1,LABKA,419010,22,2829,0
2,Population,2837,32,2837,0
3,WHO-5 (PRO),4330,7,1872,0


In [5]:
# Inspect the LABKA frame: its column names, the distinct analysis names,
# and the number of rows recorded for each analysis name.
df = dfs["LABKA"]

print(list(df.columns))  # confirm the actual column names before indexing

analysis_names = df["Analysenavn"]

unique_values = analysis_names.unique()
print("Unique text values in 'Analysenavn':", unique_values, sep="\n")

# Rows per analysis name, most frequent first.
analysis_counts = analysis_names.value_counts()
print(analysis_counts)

['CPRNummer', 'DW_EK_Borger', 'DW_SK_Borger1', 'DW_SK_RekvSvar', 'ORDINV_ID', 'REQUISITION_NR', 'Alder_Proevetagningstid', 'Dato_Svartid', 'Klok_Svartid', 'Dato_Proevetagningstid', 'Klok_Proevetagningstid', 'Analysenummer', 'INVER_ID', 'Analysekode', 'Analysenavn', 'NPUKode', 'Svartype_Tekst', 'Internt_Svar_Numerisk', 'Svar', 'Enhed', 'NyesteResultatAnalyse', 'Over70']
Unique text values in 'Analysenavn':
['B-Hæmoglobin'
 'eGFR_x0020__x002F__x0020_1_x002C_73m_x00B2__x0028_CKD-EPI_x0029_'
 'P-Natrium' 'P-Kalium' 'P-Kreatinin' 'P-Triglycerid'
 'U-Albumin_x0020__x002F__x0020_Kreatinin-ratio' 'P-Kolesterol'
 'P-Kolesterol_x0020_HDL' 'P-Kolesterol_x0020_LDL' 'P-Calcium'
 'P-Calcium_x0020__x0028_albuminkorrigeret_x0029_' 'P-Albumin'
 'P-Vitamin_x0020_B12'
 'Hb_x0028_B_x0029_-Hæmoglobin_x0020_A1c_x0020__x0028_IFCC_x0029_'
 'P-25-Hydroxy-Vitamin_x0020_D_x0028_D3_x002B_D2_x0029_']
Analysenavn
P-Kreatinin                                                         49476
eGFR_x0020__x002F__x0020_1_x0

## Typer af kontakt og antal patienter under hver kontakttype

In [6]:
# Contact types in the Population frame and the number of rows for each type.
df = dfs["Population"]

unique_values = df["KontaktType"].unique()
# The printed label names the column that is actually being inspected.
print("Unique text values in 'KontaktType':")
print(unique_values)

# Count how many rows there are for each contact type.
analysis_counts = df["KontaktType"].value_counts()

print(analysis_counts)

Unique text values in 'Analysenavn':
['Virtuel kontakt' 'Ambulant' 'Indlæggelse']
KontaktType
Ambulant           1564
Virtuel kontakt    1261
Indlæggelse          12
Name: count, dtype: int64


In [5]:
# Look for patients in the WHO-5 (PRO) frame that have no WHO-5 score in any row.
df = dfs["WHO-5 (PRO)"]

ID_COL = "DW_EK_Borger"

# Row-level flag: True where WHO5_score is missing.
who_missing = pd.isna(df["WHO5_score"])

# Per-patient verdict broadcast back onto every row:
# True only when all rows of that row's patient are flagged.
all_missing_per_patient = who_missing.groupby(df[ID_COL]).transform("all")

# Distinct patient IDs whose rows are all missing a score.
ids_no_who_in_master = df[ID_COL].loc[all_missing_per_patient].unique()

# Show how many such patients there are, plus the first ten IDs.
(len(ids_no_who_in_master), ids_no_who_in_master[:10])


(0, array([], dtype=int64))

In [7]:
# %%
# Inpatient ("Indlæggelse") contacts and the patients behind them.
# KontaktType is a Population column, so the Population frame is selected
# explicitly instead of relying on whatever `df` was last bound to by an
# earlier cell (which makes the result depend on execution order).
population_df = dfs["Population"]

# Filter rows where KontaktType is 'Indlæggelse'; copy so the result is independent.
inpatient_df = population_df[population_df["KontaktType"] == "Indlæggelse"].copy()

# Unique patient IDs among the inpatient rows.
inpatient_ids = inpatient_df["DW_EK_Borger"].unique()

print(f"Number of inpatient rows: {len(inpatient_df)}")
print(f"Number of unique inpatient patients: {len(inpatient_ids)}\n")

print("Inpatient patient IDs:")
print(inpatient_ids)

Number of inpatient rows: 12
Number of unique inpatient patients: 12

Inpatient patient IDs:
[ 143581  443158  461784  700244  998971 1006451 1082134 1114165 1220750
 1233676 1237827 1268194]


## Hvilket center har ansvaret for patienterne

In [8]:
# Responsible department per contact in the Population frame, with row counts.
df = dfs["Population"]

unique_values = df["ShakAfdKodeTekst_KontaktAnsvar"].unique()
# The printed label names the column that is actually being inspected.
print("Unique text values in 'ShakAfdKodeTekst_KontaktAnsvar':")
print(unique_values)

# Count how many rows there are for each responsible department.
analysis_counts = df["ShakAfdKodeTekst_KontaktAnsvar"].value_counts()

print(analysis_counts)

Unique text values in 'Analysenavn':
['800109 - Alb Endokrinologisk Område']
ShakAfdKodeTekst_KontaktAnsvar
800109 - Alb Endokrinologisk Område    2837
Name: count, dtype: int64


## Type af diagnose

In [9]:
# Diagnosis groups in the Population frame and the number of rows for each group.
df = dfs["Population"]

unique_values = df["Aktionsdiagnosegruppe"].unique()
# The printed label names the column that is actually being inspected.
print("Unique text values in 'Aktionsdiagnosegruppe':")
print(unique_values)

# Count how many rows there are for each diagnosis group.
analysis_counts = df["Aktionsdiagnosegruppe"].value_counts()

print(analysis_counts)

Unique text values in 'Analysenavn':
['Type 2-diabetes' 'Type 1-diabetes' 'Andre former for diabetes'
 'Ikke specificeret diabetes']
Aktionsdiagnosegruppe
Type 1-diabetes               1923
Type 2-diabetes                793
Andre former for diabetes       89
Ikke specificeret diabetes      32
Name: count, dtype: int64


In [10]:
# ============================================================
# ===  POPULATION  ===========================================
# ============================================================
# `pop` is the same object that is stored in `dfs` (no copy is made here).
pop = dfs["Population"]
print("\n=== POPULATION ===")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
pop.info()
# Summary statistics transposed to one row per column; head() keeps the first five.
print(pop.describe(include="all").T.head())




=== POPULATION ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2837 entries, 0 to 2836
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   DW_SK_Kontakt                   2837 non-null   int64         
 1   DW_SK_Forloeb                   2837 non-null   int64         
 2   DW_SK_Helbredsforloeb           2837 non-null   int64         
 3   Encounter_ID                    2837 non-null   int64         
 4   CPRNummer                       2837 non-null   int64         
 5   DW_EK_Borger                    2837 non-null   int64         
 6   DW_SK_Borger                    2837 non-null   int64         
 7   Alder_KontaktStart              2837 non-null   int64         
 8   Dato_ForloebStart               2837 non-null   datetime64[ns]
 9   Dato_ForloebSlut                0 non-null      float64       
 10  Dato_KontaktStart               2837 non-null   date

In [11]:
# ---- Time coverage
# Convert each date column that exists in `pop` to datetime in place
# (unparseable values become NaT) and print its earliest and latest value.
DATE_COLS = ("Dato_ForloebStart", "Dato_ForloebSlut", "Dato_KontaktStart", "Dato_KontaktSlut")
for c in DATE_COLS:
    if c not in pop.columns:
        continue
    pop[c] = pd.to_datetime(pop[c], errors="coerce")
    earliest, latest = pop[c].min(), pop[c].max()
    print(f"{c}: {earliest}  →  {latest}")

Dato_ForloebStart: 2022-03-25 00:00:00  →  2025-10-18 00:00:00
Dato_ForloebSlut: NaT  →  NaT
Dato_KontaktStart: 2022-03-30 00:00:00  →  2025-10-21 00:00:00
Dato_KontaktSlut: 2022-03-30 00:00:00  →  2025-10-21 00:00:00


## How many contacts did each patient have

In [12]:
# Work on a copy of the bookings frame so the frame held in `dfs` stays untouched.
book = dfs["Bookinger"].copy()

# Columns to remove before the frame is written back to disk.
drop_cols = ["CPRNummer"]

# errors="ignore" makes the drop a no-op for any listed column that is absent.
book = book.drop(columns=drop_cols, errors="ignore")

# Quick structural overview of the reduced frame.
book.info()

# Write the reduced frame next to the source workbooks. The target is built
# from DATA_DIR so the data folder is configured in a single place.
out_path = DATA_DIR / "Bookinger_clean.xlsx"
book.to_excel(out_path, index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207222 entries, 0 to 207221
Data columns (total 23 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   DW_SK_BookingAftale               207222 non-null  int64         
 1   DW_SK_Kontakt                     207222 non-null  int64         
 2   DW_SK_Forloeb                     99666 non-null   float64       
 3   Aftale_OID                        207222 non-null  int64         
 4   DW_EK_Borger                      207222 non-null  int64         
 5   DW_SK_Borger                      207222 non-null  int64         
 6   Dato_MoedeDato                    141714 non-null  datetime64[ns]
 7   Klok_MoedeTid                     141714 non-null  float64       
 8   Dato_SenesteAendring              207222 non-null  datetime64[ns]
 9   BookingStatusKode                 207222 non-null  int64         
 10  BookingStatusTekst              

In [16]:
# List every distinct non-missing value of the booking subject column.
books = dfs["Bookinger"].copy()

col = "Angaaende"

# Distinct values, with missing entries removed first.
unique_values = books[col].dropna().unique()

# Header and count, followed by a blank line.
print(f"\n--- Unique values in '{col}' ---\nCount: {len(unique_values)}\n")

# One bullet line per distinct value.
for bullet in map("- {}".format, unique_values):
    print(bullet)



--- Unique values in 'Angaaende' ---
Count: 6484

- Diabeteskontrol
- Øjendryp_x0020__x0028_SAM_x0029_
- Screeningssamtale_x0020_med_x0020_diabetessygeplejerske
- Screening_x0020_ved_x0020_fodterapeut
- Fysioterapeutisk_x0020_undersøgelse_x0020_og_x0020_behandling_x002C__x0020_inkontinens
- Sammedagsscreening_x0020_læge
- KBA-Med_x0020_Blodprøvetagning_x0020_5_x0020_min
- _x0032_4_x0020_timers_x0020_BT_x0020_påmontering
- _x0032_4_x0020_timers_x0020_BT_x0020_aftagning
- Telefonkonsultation
- Sårbehandling_x0020_-_x0020_fremmøde
- Øjenbaggrundsfoto_x0020__x0028_SAM_x0029_
- infliximab-behandling
- KBA-Nord_x0020_Blodprøvetagning_x0020_5_x0020_min
- FOD_x0020_-_x0020_Telefonbesked_x0020_t_x002F_sygeplejerske
- Diabetes_x0020_årsstatus_x0020_ved_x0020_sygeplejerske
- Jernbehandling
- Lægesamtale
- DC-konvertering
- Ekko_x0020_SPL_x0020_subakut
- pace_x002F_ICD-kontrol
- KBA-Nord_x0020_Inhouse_x0020_spor_x0020_10_x0020_min
- Frit_x0020_valg_x0020_App_x002F_fremmøde_x0020_status_x0020_ved_

In [None]:
# Bookings whose meeting date falls after 2025.
# The year is derived from Dato_MoedeDato inside this cell, so the cell does
# not depend on a "year" column having been added to `book` elsewhere.
# Rows without a parseable meeting date yield NaT and never pass the filter.
meeting_dates = pd.to_datetime(book["Dato_MoedeDato"], errors="coerce")
future_bookings = book.loc[
    meeting_dates.dt.year > 2025,
    ["DW_EK_Borger", "Dato_MoedeDato", "BookingStatusTekst"],
]

# Sort descending (furthest in the future first).
future_bookings = future_bookings.sort_values("Dato_MoedeDato", ascending=False)

print(future_bookings.head(20))  # the 20 bookings furthest in the future


        DW_EK_Borger Dato_MoedeDato BookingStatusTekst
21254         563409     2031-05-01             Booket
21255        1077998     2031-05-01             Booket
192318        527558     2030-06-03             Aflyst
196447       1559031     2030-04-01             Booket
122555        156172     2030-02-08             Aflyst
159872        968115     2029-06-01             Aflyst
21253         380414     2028-09-08             Booket
97349         679669     2028-08-31             Booket
86026        1276811     2028-08-31             Aflyst
206726       1107738     2028-06-20             Booket
197818        354675     2028-05-16             Aflyst
192549        303308     2028-04-19             Booket
188286        457511     2028-04-11             Aflyst
189285        534712     2028-04-11             Aflyst
184479        662579     2028-03-14             Aflyst
184476       1255369     2028-03-14             Aflyst
21258         574135     2028-02-04             Booket
107171    

In [None]:
# Number of bookings per booking status, most frequent first.
book.loc[:, "BookingStatusTekst"].value_counts()


BookingStatusTekst
afviklet      125452
slettet        33438
aflyst         25313
booket         11768
huskeliste     11152
anmodet           80
genbooking        19
Name: count, dtype: int64

In [None]:
# Bookings per calendar year of the meeting date.
# NOTE(review): `book_filtered` is not defined in any cell visible here —
# confirm where it is created; if it is a slice of another frame, the column
# assignments below may raise SettingWithCopyWarning.
meeting_dates = pd.to_datetime(book_filtered["Dato_MoedeDato"], errors="coerce")
book_filtered["Dato_MoedeDato"] = meeting_dates
book_filtered["year"] = meeting_dates.dt.year

# Counts ordered chronologically by year rather than by frequency.
book_filtered["year"].value_counts().sort_index()


year
2022    26408
2023    37968
2024    40965
2025    31633
2026      220
2027       17
2028        6
2030        1
2031        2
Name: count, dtype: int64