# Plot data statistics for presentation

In [None]:
from matplotlib import ticker
from typing import Any

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

import sys
import os

### Define data paths

In [None]:
# Root folder holding the project data. The location can be overridden with the
# BIOMED_DATA_PATH environment variable so that the notebook also runs on
# machines other than the one it was written on; when the variable is not set,
# the original hard-coded location is used.
data_path = os.environ.get(
    "BIOMED_DATA_PATH",
    "/Users/manuel/Desktop/BiomedDataAnalysisCourse/project/data/",
)
# filled some missing values and corrected mistakes
ps_rx_fname = os.path.join(data_path, "merged_data_processed_corrected.csv")

### Define functions used throughout the analysis

In [None]:
def compute_age(birth_year: int, visit_year: int) -> int:
    """Return the number of years between the birth year and the visit year.

    Both arguments must be ``int`` instances and the birth year must be
    strictly smaller than the visit year; an ``AssertionError`` is raised
    otherwise.
    """
    for year in (birth_year, visit_year):
        assert isinstance(year, int)
    # a visit cannot happen in (or before) the year of birth
    assert birth_year < visit_year
    return visit_year - birth_year

In [None]:
def assign_age_group(age: int) -> int:
    """Map an age in years to the index of its age group.

    Groups (upper bounds inclusive): 0 -> up to 40, 1 -> 41-55, 2 -> 56-70,
    3 -> 71-85, 4 -> above 85. ``age`` must be a strictly positive ``int``;
    an ``AssertionError`` is raised otherwise.
    """
    assert isinstance(age, int)
    assert age > 0
    # inclusive upper bound of every group except the last, open-ended one
    upper_bounds = (40, 55, 70, 85)
    for group, bound in enumerate(upper_bounds):
        if age <= bound:
            return group
    return len(upper_bounds)

In [None]:
def encode_o2_therapies(therapy: str) -> int:
    """Translate an O2 therapy label into its integer code (0-6).

    ``therapy`` must be a ``str`` different from ``"nan"`` (an
    ``AssertionError`` is raised otherwise). A ``ValueError`` is raised for
    any label that is not one of the seven known therapies.
    """
    assert isinstance(therapy, str)
    assert str(therapy) != "nan"  # stringified missing values are rejected
    therapy_codes = {
        "CPAP/NIV": 0,
        "IOT": 1,
        "M. Venturi": 2,
        "M. reservoir": 3,
        "M. semplice": 4,
        "Nasocannule": 5,
        "No (aria ambiente)": 6,
    }
    if therapy not in therapy_codes:
        raise ValueError(f"Forbidden therapy value ({therapy})")
    return therapy_codes[therapy]

In [None]:
def encode_smoke(smoke: str) -> int:
    """Translate a smoke-history label into its integer code (0-3).

    ``smoke`` must be a ``str`` different from ``"nan"`` (an
    ``AssertionError`` is raised otherwise). A ``ValueError`` is raised for
    any label other than "Attivo", "Ex", "Mai" or "Non noto".
    """
    assert isinstance(smoke, str)
    assert str(smoke) != "nan"  # stringified missing values are rejected
    smoke_codes = {"Attivo": 0, "Ex": 1, "Mai": 2, "Non noto": 3}
    if smoke not in smoke_codes:
        raise ValueError(f"Forbidden smoke history value ({smoke})")
    return smoke_codes[smoke]

In [None]:
def encode_alcohol(alcohol: str) -> int:
    """Translate an alcohol-history label into its integer code (0-4).

    Codes: 0 -> any label containing "Frequente", 1 -> "Non consuma alcolici",
    2 -> "Non noto", 3 -> "Quotidiano",
    4 -> "Saltuario (qualche volta in un mese)".

    ``alcohol`` must be a ``str`` different from ``"nan"`` (an
    ``AssertionError`` is raised otherwise). A ``ValueError`` is raised for
    any unrecognised label.
    """
    assert isinstance(alcohol, str)
    assert str(alcohol) != "nan"  # stringified missing values are rejected
    # "Frequente" is matched as a substring, unlike the other labels which
    # must match exactly, so it has to be checked before the exact lookup
    if "Frequente" in alcohol:
        return 0
    alcohol_codes = {
        "Non consuma alcolici": 1,
        "Non noto": 2,
        "Quotidiano": 3,
        "Saltuario (qualche volta in un mese)": 4,
    }
    if alcohol not in alcohol_codes:
        raise ValueError(f"Forbidden alcohol history value ({alcohol})")
    return alcohol_codes[alcohol]

### Compute plots on data statistics

In the current notebook we will plot some statistics regarding the COVID-19 dataset used throughout the analysis.<br>
For consistency with the outcome of the final analysis, the statistics are computed on the final dataset (some `NaN` values were filled and some mistakes were corrected), i.e. the same dataset used during the final analysis.

In [None]:
# the file uses ";" as field separator and "," as decimal mark
ps_rx_df = pd.read_csv(ps_rx_fname, sep=";", decimal=",")
ps_rx_df.head()

Let's begin by exploring the number of visits available by gender and the visits outcome by gender.

#### Visits and outcome by gender

In [None]:
# count the visits recorded for each gender and show them as a bar chart
gender_codes = {"Male": "M", "Female": "F"}
data = {
    label: ps_rx_df[ps_rx_df.GENDER == code].shape[0]
    for label, code in gender_codes.items()
}
gender = list(data)
values = [data[label] for label in gender]
f, ax = plt.subplots(1, 1, figsize=(15,10))
ax.bar(gender, values, color="#104B8E", width=.4)
ax.set_title("ER visits by gender", size=18)
ax.set_xlabel("Gender", size=16)
ax.set_ylabel("Number of visits", size=16)
ax.tick_params(labelsize=14)
plt.show()  # author's reading of the figure: more males than females in the dataset


In [None]:
# left panel: overall number of visits per outcome
# right panel: the same counts split by gender
outcome_codes = {"Dead": 1, "Alive": 0}
data = {
    label: ps_rx_df[ps_rx_df.DEATH == code].shape[0]
    for label, code in outcome_codes.items()
}
outcome = list(data)
values = [data[label] for label in outcome]
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,10))
ax1.bar(outcome, values, color="#104B8E", width=.4)
ax1.set_title("ER visits outcome", size=18)
ax1.set_xlabel("Outcome", size=16)
ax1.set_ylabel("Number of visits", size=16)
ax1.tick_params(labelsize=14)

colors = ["#104B8E", "#DC661A"]
# per outcome: [male count, female count]; this dict is reused by the next cell
data = {
    label: [
        ps_rx_df[(ps_rx_df.DEATH == code) & (ps_rx_df.GENDER == gender_code)].shape[0]
        for gender_code in ("M", "F")
    ]
    for label, code in outcome_codes.items()
}
x = np.arange(2)
width = .2
# one bar series per gender, shifted left/right of each outcome tick
for position, (offset, bar_label) in enumerate(
    ((-width / 2, "Males"), (width / 2, "Females"))
):
    ax2.bar(
        x + offset,
        [data["Dead"][position], data["Alive"][position]],
        width,
        label=bar_label,
        color=colors[position],
    )
ax2.set_title("ER visits outcome by gender", size=18)
ax2.set_xlabel("Outcome", size=16)
ax2.set_ylabel("Number of visits", size=16)
ax2.tick_params(labelsize=14)
ax2.set_xticks(x)
ax2.set_xticklabels(outcome)
# the explicit legend entries take precedence over the bar labels set above
ax2.legend(["Male", "Female"], prop={"size":18})
# author's reading of the figure: most of the patients survived and males
# seem to have higher mortality
plt.show()

Check if there are changes in the percentage of visits with `Alive` or `Dead` outcome between male or female subjects.

In [None]:
# `data` comes from the previous cell: per outcome, [male count, female count]
male_total = data["Alive"][0] + data["Dead"][0]
female_total = data["Alive"][1] + data["Dead"][1]
# values are fractions in [0, 1] (not multiplied by 100), ordered (male, female)
alive_pct = (data["Alive"][0] / male_total, data["Alive"][1] / female_total)
dead_pct = (data["Dead"][0] / male_total, data["Dead"][1] / female_total)
print("Male subjects:\nDead\tAlive\n%.2f\t%.2f\n" % (dead_pct[0], alive_pct[0]))
print("Female subjects:\nDead\tAlive\n%.2f\t%.2f" % (dead_pct[1], alive_pct[1]))

Interestingly the percentages are equal. Therefore, there is no bias in the visit outcome introduced by subjects' gender.

#### Visits and outcome by age 

In [None]:
# compute subjects' age from visit and birth date
# Columns are addressed by position through .iloc: plain integer indexing on a
# label-indexed row (x[1], x[-1]) is deprecated in pandas.
# Assumes the second column holds the birth year and the last column of the
# freshly loaded frame holds the visit date with the year after the final "/"
# (same positions as before) -- TODO confirm against the CSV header and switch
# to column names.
# The guard keeps the cell re-runnable: once AGE has been appended, the last
# column is no longer the visit date.
if "AGE" not in ps_rx_df.columns:
    birth_years = ps_rx_df.iloc[:, 1]
    visit_dates = ps_rx_df.iloc[:, -1]
    ps_rx_df["AGE"] = [
        compute_age(int(birth_year), int(visit_date.split("/")[-1]))
        for birth_year, visit_date in zip(birth_years, visit_dates)
    ]
ps_rx_df.head()

Subjects' age ranges from 23 to 104. It would be useful to group ages in 5 chunks:
- $\leq$ 40
- 41 - 55
- 56 - 70
- 71 - 85
- $>$ 85

Let's create these 5 age groups.

In [None]:
# Derive the age group from the AGE column by name rather than from "the last
# column", so the result does not depend on column order and the cell can be
# re-run after AGE_GROUP has been added.
# int() hands assign_age_group a plain Python int, which its isinstance check
# requires.
ps_rx_df["AGE_GROUP"] = [assign_age_group(int(age)) for age in ps_rx_df["AGE"]]
ps_rx_df.head()

In [None]:
# count the visits in each age group; the position of a label in the list is
# the AGE_GROUP code it stands for
age_group_labels = ["< 40", "41 - 55", "56 - 70", "71 - 85", "> 85"]
data = {
    label: ps_rx_df[ps_rx_df.AGE_GROUP == code].shape[0]
    for code, label in enumerate(age_group_labels)
}
age_group = list(data)
values = [data[label] for label in age_group]
f, ax = plt.subplots(1, 1, figsize=(15,10))
ax.bar(age_group, values, color="#104B8E", width=.4)
ax.set_title("ER visits by age group", size=18)
ax.set_xlabel("Age group", size=16)
ax.set_ylabel("Number of visits", size=16)
ax.tick_params(labelsize=14)
# author's reading of the figure: most of the visits concern subjects in their 70s-80s
plt.show()

In [None]:
# explore visits outcome by age group
# the position of a label in the list is the AGE_GROUP code it stands for
age_group_labels = ["< 40", "41 - 55", "56 - 70", "71 - 85", "> 85"]
data = {
    outcome_label: {
        label: ps_rx_df[
            (ps_rx_df.DEATH == death_flag) & (ps_rx_df.AGE_GROUP == code)
        ].shape[0]
        for code, label in enumerate(age_group_labels)
    }
    for outcome_label, death_flag in (("Dead", 1), ("Alive", 0))
}
# after the transpose: rows are the outcomes, columns are the age groups
data = pd.DataFrame(data).T
data = data[age_group_labels]
ax = data.plot(kind="bar", figsize=(15,10))
# the x axis carries the outcome (Dead / Alive); the age groups are the
# coloured bar series, so they are named in the legend title instead
ax.set_xlabel("Visit outcome", size=16)
ax.set_ylabel("Number of visits", size=16)
ax.set_title("Visits outcome by age group", size=18)
ax.legend(title="Age group")
plt.show()


#### Visits outcome and O2 therapies

Let's now explore if there exists some correlation between the visit outcome and the O2 therapy used to treat the patients during the COVID-19 infection.

There are 7 different therapy values in the analyzed dataset:
- CPAP/NIV
- IOT
- M. Venturi
- M. reservoir
- M. semplice
- Nasocannule
- No (environmental air)

In [None]:
# remove rows with NaN values on O2 therapies column
# .copy() makes the subset an independent frame, so adding a column to it does
# not trigger pandas' SettingWithCopyWarning
o2_column = "FIELDSET_EVOLUTION-O2"
ps_rx_df_o2 = ps_rx_df[ps_rx_df[o2_column].notna()].copy()
# encode O2 therapies: the column is selected by name and encoded value by
# value, instead of locating its position again for every row
ps_rx_df_o2["O2_THERAPY"] = ps_rx_df_o2[o2_column].map(encode_o2_therapies)
ps_rx_df_o2.head()

In [None]:
# count the visits per O2 therapy; the position of a label in the list is the
# O2_THERAPY code it stands for
therapy_labels = [
    "CPAP/NIV",
    "IOT",
    "M. Venturi",
    "M. reservoir",
    "M. semplice",
    "Nasocannule",
    "No (aria ambiente)",
]
data = {
    label: ps_rx_df_o2[ps_rx_df_o2.O2_THERAPY == code].shape[0]
    for code, label in enumerate(therapy_labels)
}
therapies = list(data)
values = [data[label] for label in therapies]
f, ax = plt.subplots(1, 1, figsize=(15,10))
ax.bar(therapies, values, color="#104B8E", width=.4)
ax.set_title("ER visits by O2 therapy", size=18)
ax.set_xlabel("Therapy", size=16)
ax.set_ylabel("Number of visits", size=16)
ax.tick_params(labelsize=14)
# author's reading of the figure: M. reservoir seems to be the most used therapy
plt.show()

In [None]:
# explore visit outcome by O2 therapy
# the position of a label in the list is the O2_THERAPY code it stands for
therapy_labels = [
    "CPAP/NIV",
    "IOT",
    "M. Venturi",
    "M. reservoir",
    "M. semplice",
    "Nasocannule",
    "No (aria ambiente)",
]
data = {
    outcome_label: {
        label: ps_rx_df_o2[
            (ps_rx_df_o2.DEATH == death_flag) & (ps_rx_df_o2.O2_THERAPY == code)
        ].shape[0]
        for code, label in enumerate(therapy_labels)
    }
    for outcome_label, death_flag in (("Dead", 1), ("Alive", 0))
}
# after the transpose: rows are the outcomes, columns are the therapies
data = pd.DataFrame(data).T
# NOTE(review): "IOT" is counted above but is not among the plotted columns,
# exactly as in the previous version of this cell -- confirm the omission is
# intentional
data = data[
    [
        "No (aria ambiente)",
        "CPAP/NIV",
        "M. Venturi",
        "M. reservoir",
        "M. semplice",
        "Nasocannule",
    ]
]
ax = data.plot(kind="bar", figsize=(15,10))
# the x axis carries the outcome (Dead / Alive); the therapies are the
# coloured bar series, so they are named in the legend title instead
ax.set_xlabel("Visit outcome", size=16)
ax.set_ylabel("Number of visits", size=16)
ax.set_title("Visits outcome by O2 therapy", size=18)
ax.legend(title="O2 therapy")
plt.show()


#### Visit outcome by subjects' smoke and alcohol history

In [None]:
# encode subjects' smoke history
# rows without a smoke-history value are dropped; .copy() makes the subset an
# independent frame, so adding a column to it does not trigger pandas'
# SettingWithCopyWarning
smoke_column = "FIELDSET_PHYSIOLOGICAL-SMOKE"
ps_rx_df_smoke = ps_rx_df[ps_rx_df[smoke_column].notna()].copy()
# the column is selected by name and encoded value by value, instead of
# locating its position again for every row
ps_rx_df_smoke["SMOKE"] = ps_rx_df_smoke[smoke_column].map(encode_smoke)
ps_rx_df_smoke.head()


In [None]:
# count the visits per smoke history; the position of a label in the list is
# the SMOKE code it stands for
smoke_labels = ["Active", "Past", "Never", "Unknown"]
data = {
    label: ps_rx_df_smoke[ps_rx_df_smoke["SMOKE"] == code].shape[0]
    for code, label in enumerate(smoke_labels)
}
smoke = list(data)
values = [data[label] for label in smoke]
f, ax = plt.subplots(1, 1, figsize=(15,10))
ax.bar(smoke, values, color="#104B8E", width=.4)
ax.set_title("ER visits by subjects' smoke history", size=18)
ax.set_xlabel("Smoke history", size=16)
ax.set_ylabel("Number of visits", size=16)
ax.tick_params(labelsize=14)
# author's reading of the figure: most subjects have an unknown smoking history
plt.show()

In [None]:
# explore visit outcome by smoking history
# the position of a label in the list is the SMOKE code it stands for
smoke_labels = ["Active", "Past", "Never", "Unknown"]
data = {
    outcome_label: {
        label: ps_rx_df_smoke[
            (ps_rx_df_smoke.DEATH == death_flag) & (ps_rx_df_smoke.SMOKE == code)
        ].shape[0]
        for code, label in enumerate(smoke_labels)
    }
    for outcome_label, death_flag in (("Dead", 1), ("Alive", 0))
}
# after the transpose: rows are the outcomes, columns are the smoke histories
data = pd.DataFrame(data).T
data = data[["Never", "Past", "Active", "Unknown"]]
ax = data.plot(kind="bar", figsize=(15,10))
# the x axis carries the outcome (Dead / Alive); the smoke histories are the
# coloured bar series, so they are named in the legend title instead
ax.set_xlabel("Visit outcome", size=16)
ax.set_ylabel("Number of visits", size=16)
ax.set_title("Visits outcome by subjects' smoke history", size=18)
ax.legend(title="Smoke history")
plt.show()


In [None]:
# encode subjects' alcohol history
# rows without an alcohol-history value are dropped; .copy() makes the subset
# an independent frame, so adding a column to it does not trigger pandas'
# SettingWithCopyWarning
alcohol_column = "FIELDSET_PHYSIOLOGICAL-ALCOHOL"
ps_rx_df_alcohol = ps_rx_df[ps_rx_df[alcohol_column].notna()].copy()
# the column is selected by name and encoded value by value, instead of
# locating its position again for every row
ps_rx_df_alcohol["ALCOHOL"] = ps_rx_df_alcohol[alcohol_column].map(encode_alcohol)
ps_rx_df_alcohol.head()

In [None]:
# count the visits per alcohol history; the position of a label in the list
# is the ALCOHOL code it stands for
alcohol_labels = ["Frequent", "Never", "Unknown", "Daily", "Rare"]
data = {
    label: ps_rx_df_alcohol[ps_rx_df_alcohol["ALCOHOL"] == code].shape[0]
    for code, label in enumerate(alcohol_labels)
}
alcohol = list(data)
values = [data[label] for label in alcohol]
f, ax = plt.subplots(1, 1, figsize=(15,10))
ax.bar(alcohol, values, color="#104B8E", width=.4)
ax.set_title("ER visits by subjects' alcohol history", size=18)
ax.set_xlabel("Alcohol history", size=16)
ax.set_ylabel("Number of visits", size=16)
ax.tick_params(labelsize=14)
# author's reading of the figure: most subjects have an unknown alcohol history
plt.show()

In [None]:
# explore visit outcome by subjects' alcohol history
# the position of a label in the list is the ALCOHOL code it stands for
alcohol_labels = ["Frequent", "Never", "Unknown", "Daily", "Rare"]
data = {
    outcome_label: {
        label: ps_rx_df_alcohol[
            (ps_rx_df_alcohol.DEATH == death_flag) & (ps_rx_df_alcohol.ALCOHOL == code)
        ].shape[0]
        for code, label in enumerate(alcohol_labels)
    }
    for outcome_label, death_flag in (("Dead", 1), ("Alive", 0))
}
# after the transpose: rows are the outcomes, columns are the alcohol histories
data = pd.DataFrame(data).T
data = data[["Never", "Rare", "Frequent", "Daily", "Unknown"]]
ax = data.plot(kind="bar", figsize=(15,10))
# the x axis carries the outcome (Dead / Alive); the alcohol histories are the
# coloured bar series, so they are named in the legend title instead
ax.set_xlabel("Visit outcome", size=16)
ax.set_ylabel("Number of visits", size=16)
ax.set_title("Visits outcome by subjects' alcohol history", size=18)
ax.legend(title="Alcohol history")
plt.show()

Neither alcohol nor smoke history seems to have any impact on the visit outcome.

#### Visit outcomes and blood pressure

Let's now finish our exploratory analysis by checking whether there is any difference between the blood pressures of dead and alive subjects.

In [None]:
# distribution of the systolic pressure for each visit outcome
systolic_column = "FIELDSET_PS-SYSTOLIC"
# rows without a systolic measurement are left out
ps_rx_df_sys = ps_rx_df.dropna(subset=[systolic_column])
f, ax = plt.subplots(1, 1, figsize=(15,10))
sns.boxplot(x="DEATH", y=systolic_column, data=ps_rx_df_sys, ax=ax)
ax.tick_params(labelsize=14)
ax.set_xlabel("Visit outcome", size=16)
ax.set_ylabel("Systolic pressure", size=16)
# presumably the boxes are drawn in the order DEATH == 0, DEATH == 1 -- verify
# that the tick labels below match the plotted order
ax.set_xticklabels(["Alive", "Dead"])
# author's reading of the figure: dead subjects have higher variance in systolic pressure
plt.show()

In [None]:
# distribution of the diastolic pressure for each visit outcome
diastolic_column = "FIELDSET_PS-DIASTOLIC"
# rows without a diastolic measurement are left out
ps_rx_df_dia = ps_rx_df.dropna(subset=[diastolic_column])
# only values below 600 are plotted -- presumably larger values are
# data-entry errors; TODO confirm the threshold
diastolic_plotted = ps_rx_df_dia.loc[ps_rx_df_dia[diastolic_column] < 600]
f, ax = plt.subplots(1, 1, figsize=(15,10))
sns.boxplot(x="DEATH", y=diastolic_column, data=diastolic_plotted, ax=ax)
ax.tick_params(labelsize=14)
ax.set_xlabel("Visit outcome", size=16)
ax.set_ylabel("Diastolic pressure", size=16)
# presumably the boxes are drawn in the order DEATH == 0, DEATH == 1 -- verify
# that the tick labels below match the plotted order
ax.set_xticklabels(["Alive", "Dead"])
# author's reading of the figure: dead subjects have higher variance in diastolic pressure
plt.show()