In [None]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import sys
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Folder with the raw screening exports, relative to the working directory set below.
base_folder = "./data/raw/screening_data/"

# Register the parent directory on the import path only once, so that re-running
# this cell does not append duplicate entries.
if '../' not in sys.path:
    sys.path.append('../')

# Step up one directory so that `base_folder` resolves. The guard keeps the cell
# safe to re-run: once the data folder is reachable the working directory is left
# alone, whereas an unconditional chdir would climb one more level on every run.
if not os.path.isdir(base_folder):
    os.chdir("../")


# Inspect and filter all VRE screenings

## Positive and negative screenings

### File 1

In [None]:
# Load screening file 1. The file is decoded as code page 850; valid codec names
# are listed at https://docs.python.org/3/library/codecs.html#standard-encodings
# Every column is read as a string; the three date columns are passed to parse_dates.
# NOTE(review): no dayfirst/date_format is given — confirm the date layout of the export.
vre_df1 = pd.read_csv(
    f"{base_folder}2018_10_18_sdvre_tot_28_10_18.csv",
    encoding="cp850",
    parse_dates=["Erfassung", "Entnahme", "Geburtsdat."],
    dtype="str",
)
vre_df1

In [None]:
vre_df1.columns

In [None]:
vre_df1.columns = ['Order ID', 'Record Date', 'Measurement Date', 'Patient Number', 'Last Name',
                          'First Name', 'Birth Date', 'Age', 'Gender', 'Zip Code', 'Place of Residence',
                          'Canton', 'Country', 'Patient ID', 'Case ID', 'Requester', 'Cost Unit', 'Material Type',
                          'Transport', 'Pathogen 1 Result', "Pathogen 2 Result"]


In [None]:
vre_df1

In [None]:
# Missing pathogen results count as negative ("nn"). Both result columns are
# filled, as is done for files 3 and 3a: the positivity check in the next cell
# negates `str.contains`, which raises on rows that are still NaN.
# Rebinding instead of `inplace=True` keeps the step explicit.
vre_df1 = vre_df1.fillna({'Pathogen 1 Result': 'nn', 'Pathogen 2 Result': 'nn'})

In [None]:
vre_df1['Pathogen Result'] = np.where(~vre_df1["Pathogen 1 Result"].str.contains("nn") | ~vre_df1["Pathogen 2 Result"].str.contains("nn"),'pp', 'nn')

In [None]:
vre_df1

In [None]:
vre_df1["Patient ID"].isna().sum()

In [None]:
vre_df1[vre_df1["Patient ID"].isna()]

In [None]:
vre_df1 = vre_df1[~vre_df1["Patient ID"].isna()]
vre_df1

In [None]:
vre_df1["Patient ID"].isna().sum()

### File 2

In [None]:
# Load screening file 2, decoded as ISO-8859-1. Every column is read as a
# string; the three date columns are passed to parse_dates.
# NOTE(review): no dayfirst/date_format is given — confirm the date layout of the export.
vre_df2 = pd.read_csv(
    f"{base_folder}2018_10_18_sdvre_tot_28_10_18_2.csv",
    encoding="ISO-8859-1",
    parse_dates=["erfassung", "entnahme", "geburtsdatum"],
    dtype="str",
)
vre_df2

In [None]:
vre_df2.columns = ['Order ID', 'Record Date', 'Measurement Date', 'First Name',
                          'Last Name', 'Birth Date', 'Patient ID', 'Requester', 'Cost Unit', 'Material Type',
                          'Transport', 'Pathogen Result', 'Analysis Method', 'Screening Context']


In [None]:
vre_df2

In [None]:
vre_df2["Patient ID"].isna().sum()

In [None]:
vre_df2[vre_df2["Patient ID"].isna()]

In [None]:
vre_df2 = vre_df2[~vre_df2["Patient ID"].isna()]
vre_df2

In [None]:
vre_df2["Patient ID"].isna().sum()

### File 3

In [None]:
vre_df3 = pd.read_csv(base_folder + "2020_08_24_VREweekly_200824-0926.csv", encoding="ISO-8859-1", parse_dates=["Erfassung", "Entnahme", "Geburtsdat."], dtype="str")
vre_df3

In [None]:
vre_df3.columns

In [None]:
vre_df3.columns = ['Order ID', 'Record Date', 'Measurement Date', 'Patient Number', 'Last Name',
                          'First Name', 'Birth Date', 'Requester', 'Cost Unit', 'Material Type',
                          'Transport', 'Pathogen 1 Result', "Pathogen 2 Result", 'Index', 'Result PCR', 'vanA C(t)', 'vanA Qual',
       'vanB C(t)', 'vanB Qual', 'E.faecium C(t)', 'E.faecium Qual']

In [None]:
vre_df3

In [None]:
vre_df3.fillna({'Pathogen 1 Result': 'nn', 'Pathogen 2 Result': 'nn'}, inplace=True)

In [None]:
vre_df3['Pathogen Result'] = np.where(~vre_df3["Pathogen 1 Result"].str.contains("nn") | ~vre_df3["Pathogen 2 Result"].str.contains("nn"),'pp', 'nn')

In [None]:
vre_df3

The patient numbers in this file are worthless for identifying patients. Continuing with file 3a, in which I matched the data against SAP data to find the corresponding patients.

### File 3a

In [None]:
# Load file 3a. No encoding is passed, so pandas' default decoding applies.
# Every column is read as a string; the three date columns are passed to parse_dates.
# NOTE(review): no dayfirst/date_format is given — confirm the date layout of the export.
vre_df3a = pd.read_csv(
    f"{base_folder}2020_08_24_VREweekly_200824-0926_GC_request_IDSC202101311_MKA.GC_20210302.csv",
    parse_dates=["Erfassung", "Entnahme", "Geburtsdatum"],
    dtype="str",
)
vre_df3a

In [None]:
vre_df3a.drop(columns=["StudieID", "PID", "PatNr", "Index"], inplace=True)

In [None]:
vre_df3a.columns

In [None]:
vre_df3a.columns = ['Order ID', 'Record Date', 'Measurement Date', 'Last Name',
                          'First Name', 'Birth Date', 'Requester', 'Cost Unit', 'Material Type',
                          'Transport', 'Pathogen 1 Result', "Pathogen 2 Result", 'Result PCR', 'vanA C(t)', 'vanA Qual',
       'vanB C(t)', 'vanB Qual', 'E.faecium C(t)', 'E.faecium Qual', 'general_consent', 'matching_status', 'warnings',
       'Patient ID', 'SAP Birth Date', 'SAP First Name', 'SAP Last Name']

In [None]:
vre_df3a

In [None]:
# Let the SAP fields take precedence: `DataFrame.update` overwrites the target
# column with every non-missing value of the matching SAP column.
# NOTE(review): 'SAP Birth Date' is not in parse_dates while the file's own birth
# date column is — confirm that mixing the two representations is acceptable.
for sap_column, target_column in [
    ("SAP Last Name", "Last Name"),
    ("SAP First Name", "First Name"),
    ("SAP Birth Date", "Birth Date"),
]:
    vre_df3a.update(vre_df3a[[sap_column]].rename(columns={sap_column: target_column}))
vre_df3a

In [None]:
vre_df3a.drop(columns=["SAP Birth Date", "SAP First Name", "SAP Last Name", "matching_status", "warnings", "general_consent"], inplace=True)
vre_df3a

In [None]:
vre_df3a.fillna({'Pathogen 1 Result': 'nn', 'Pathogen 2 Result': 'nn'}, inplace=True)
vre_df3a['Pathogen Result'] = np.where(~vre_df3a["Pathogen 1 Result"].str.contains("nn") | ~vre_df3a["Pathogen 2 Result"].str.contains("nn"),'pp', 'nn')
vre_df3a

In [None]:
vre_df3a["Patient ID"].isna().sum()

In [None]:
vre_df3a[vre_df3a["Patient ID"].isna()]

In [None]:
vre_df3a = vre_df3a[~vre_df3a["Patient ID"].isna()]
vre_df3a

In [None]:
vre_df3a["Patient ID"].isna().sum()

### File 6

In [None]:
# Load file 6, which is semicolon-delimited. Every column is read as a string;
# the two date columns are passed to parse_dates.
# NOTE(review): no dayfirst/date_format is given — confirm the date layout of the export.
vre_df6 = pd.read_csv(
    f"{base_folder}vre_screenings.csv",
    delimiter=";",
    parse_dates=["Erfassung", "Entnahme"],
    dtype="str",
)
vre_df6


In [None]:
vre_df6.columns = ['Order ID', 'Record Date', 'Measurement Date', 'Patient Number',
                          'Patient ID', 'Case ID', 'Requester', 'Cost Unit', 'Material Type',
                          'Transport', 'Pathogen Result', 'vreih', 'Analysis Method', 'Screening Context']

In [None]:
vre_df6

In [None]:
vre_df6['vreih'].unique()

In [None]:
vre_df6["Patient ID"].isna().sum()

In [None]:
vre_df6[vre_df6["Patient ID"].isna()]

In [None]:
vre_df6 = vre_df6[~vre_df6["Patient ID"].isna()]
vre_df6

In [None]:
vre_df6["Patient ID"].isna().sum()

## Only VRE positive patients

### File 4

In [None]:
# File 4 lists VRE-positive patients only. Every column is read as a string;
# the three date columns are passed to parse_dates.
# NOTE(review): no dayfirst/date_format is given — confirm the date layout of the export.
vre_df4 = pd.read_csv(
    f"{base_folder}VRE_Pat_191211.csv",
    parse_dates=["MeldeDatum", "EntnahmeDatum", "GebDatum"],
    dtype="str",
)
vre_df4

In [None]:
vre_df4.columns = ['Measurement Date', 'Record Date',  'Screening Type',
                          'Patient ID', 'Last Name', 'First Name', "Birth Date", 'Gender', 'Campus', 'Ward of Measurement', 'Department', 'Type of Resistency', 'MLS Resistency', "Infection", "Explanation"]

In [None]:
vre_df4["Pathogen Result"] = "pp"

In [None]:
vre_df4

In [None]:
vre_df4["Patient ID"].isna().sum()

In [None]:
vre_df4[vre_df4["Patient ID"].isna()]

In [None]:
vre_df4 = vre_df4[~vre_df4["Patient ID"].isna()]
vre_df4

In [None]:
vre_df4["Patient ID"].isna().sum()

### File 5

In [None]:
# File 5 lists VRE-positive patients only. Every column is read as a string;
# the three date columns are passed to parse_dates.
# NOTE(review): no dayfirst/date_format is given — confirm the date layout of the export.
vre_df5 = pd.read_csv(
    f"{base_folder}VRE_Pat_191211_modified.csv",
    parse_dates=["erfassung", "entnahme", "geburtsdatum"],
    dtype="str",
)
vre_df5


In [None]:
vre_df5.columns = ['Order ID', 'Measurement Date', 'Record Date', 'First Name', 'Last Name', 'Birth Date', 'Patient ID', 'Requester', 'Cost Unit', 'Material Type',
                          'Transport', 'Pathogen Result', 'Analysis Method', 'Screening Context']

In [None]:
vre_df5

In [None]:
vre_df5["Patient ID"].isna().sum()

In [None]:
vre_df5[vre_df5["Patient ID"].isna()]

In [None]:
vre_df5 = vre_df5[~vre_df5["Patient ID"].isna()]
vre_df5

In [None]:
vre_df5["Patient ID"].isna().sum()

# Merge df2 and df3a!

In [None]:
# Only files 2 and 3a go into the combined frame. The others are left out because:
# - vre_df1 is a duplicate of vre_df2 and stems from the same file source
# - vre_df3 is a duplicate of vre_df3a, but does not contain any patient IDs
# - vre_df4 and vre_df5 hold positive screenings only and should therefore already be part of the all-screenings files
# - vre_df6 contains neither first/last name nor birth date and is thus hard to match against the others
# vre_df2 and vre_df3a seem to cover the date range well, as the screening plots below show.
selected_frames = [vre_df2, vre_df3a]
vre_df = pd.concat(selected_frames)
vre_df

In [None]:
# Columns of the combined frame.
vre_df.columns

In [None]:
# Distinct result labels present after the merge.
vre_df["Pathogen Result"].unique()

In [None]:
vre_df.drop(columns=['Result PCR',
       'vanA C(t)', 'vanA Qual', 'vanB C(t)', 'vanB Qual', 'E.faecium C(t)',
       'E.faecium Qual', 'Campus', 'Ward of Measurement',
       'Department', 'Type of Resistency', 'MLS Resistency', 'Infection',
       'Explanation', 'vreih', "Screening Type", "Case ID", "Analysis Method", "Age", "Pathogen 1 Result", "Pathogen 2 Result", "Requester", "Cost Unit", "Screening Context", "Material Type", "Transport"], inplace=True, errors="ignore")

In [None]:
vre_df

## Screenings across time

In [None]:
# Bar chart of the number of screenings per (year, month) of the record date.
record_dates = vre_df["Record Date"]
screenings_per_month = vre_df[["Record Date"]].groupby([record_dates.dt.year, record_dates.dt.month]).count()
screenings_per_month.plot(kind="bar", figsize=(16,12), title="VRE Screenings across time")

In [None]:
# Remaining columns of the combined frame.
vre_df.columns

In [None]:
# Number of rows in the combined frame.
len(vre_df)

In [None]:
#vre_df["Patient Number"] = pd.to_numeric(vre_df["Patient Number"], errors='coerce')

In [None]:
#vre_df["Patient ID"] = pd.to_numeric(vre_df["Patient ID"], errors='coerce')

In [None]:
#vre_df[["Patient Number", "Patient ID"]].describe()

`Patient Number` and `Patient ID` are definitely not the same, and `Patient Number` is not simply the patient ID without its checksum digit.

In [None]:
# Number of rows in the combined frame without a record date.
vre_df["Record Date"].isna().sum()

In [None]:
#vre_df[["Patient Number", "Patient ID"]]

In [None]:
v_df1 = vre_df.drop_duplicates(subset=['Order ID', "Record Date"])

In [None]:
v_df1 = v_df1.drop_duplicates(subset=['Record Date', "Pathogen Result", "Patient ID"])

In [None]:
v_df1

In [None]:
v_df1["Patient ID"].isna().sum()

In [None]:
v_df1[v_df1["Patient ID"].isna()]

## Save screenings to file

In [None]:
# Write the de-duplicated screenings to disk, ordered by record date.
# The DataFrame index is written too (to_csv default), as the first CSV column.
output_path = "./data/raw/model_data/VRE_SCREENING_DATA.csv"
v_df1 = v_df1.sort_values(by="Record Date")
v_df1.to_csv(output_path)

In [None]:
v_df1[~v_df1["Pathogen Result"].str.contains("nn")]

In [None]:
pd.set_option('display.max_rows', 2000)
v_df1[~v_df1["Pathogen Result"].str.contains("nn")].sort_values(by=["Record Date"])

In [None]:
len(v_df1[~v_df1["Pathogen Result"].str.contains("nn")].sort_values(by=["Record Date"]))

In [None]:
# Build one row per VRE-positive patient.
# Positive means the result text does not contain "nn".
positive_mask = ~v_df1["Pathogen Result"].str.contains("nn")
# `.copy()` yields an independent frame, so the column assignment below does not
# write into a filtered selection of v_df1 (which raises SettingWithCopyWarning).
v_df_pos = v_df1[positive_mask].copy()
# Collapse every positive result text to the single label "pp" so that
# de-duplicating on (result, patient ID) leaves exactly one row per patient.
v_df_pos["Pathogen Result"] = "pp"
# drop_duplicates keeps each patient's first row in v_df1's current row order.
v_df_pos = v_df_pos.drop_duplicates(subset=["Pathogen Result", "Patient ID"]).sort_values(by=["Record Date"])
v_df_pos

In [None]:
# Number of VRE-positive patients (v_df_pos holds one row per patient ID).
len(v_df_pos)

## Positive screenings over time

In [None]:
# Bar chart of the number of positive patients per (year, month) of the record date.
positive_dates = v_df_pos["Record Date"]
positives_per_month = v_df_pos[["Record Date"]].groupby([positive_dates.dt.year, positive_dates.dt.month]).count()
positives_per_month.plot(kind="bar", figsize=(16,12), title="VRE Positive Screenings across time")