# Exploratory analysis of dataset

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

import sys
import os

### Define data path

In [None]:
# Root folder holding the course datasets (absolute, machine-specific path).
data_path = "/Users/manuel/Desktop/BiomedDataAnalysisCourse/datasets/"
# Full paths of the ER-unit CSV and of the RX CSV, in that order.
ps_data_path, rx_data_path = (
    os.path.join(data_path, csv_name)
    for csv_name in ("database_base.csv", "database_RX_torace.csv")
)

### Load and explore data

#### ER Unit data

In [None]:
# Read the semicolon-separated ER-unit table.
ps_df = pd.read_csv(ps_data_path, delimiter=";")
# Keep the row at position 1 aside: it holds the description of the values
# each column can take (it is removed from the frame in a later step).
vars_description_ps = ps_df.iloc[1]
ps_df.head()

In [None]:
# The first three rows repeat the header / describe the variables:
# discard them in place and renumber the remaining rows from 0.
ps_df.drop(index=ps_df.index[:3], inplace=True)
ps_df.reset_index(inplace=True, drop=True)
# Columns whose header contains "###" are separators, not variables.
drop_cols = [name for name in ps_df.columns if "###" in name]
ps_df.drop(columns=drop_cols, inplace=True)
ps_df.head()

In [None]:
# Only the last expression of a cell is echoed, so a bare `ps_df.shape`
# here would be silently discarded: print it explicitly.
print(ps_df.shape)  # 788 patients (660 variables)
ps_df.describe()  # ID is the only variable with 788 unique values

#### RX data

In [None]:
# Read the semicolon-separated RX table.
rx_df = pd.read_csv(rx_data_path, delimiter=";")
# Keep the row at position 1 aside: it holds the description of the values
# each column can take (it is removed from the frame in a later step).
vars_description_rx = rx_df.iloc[1]
rx_df.head()

In [None]:
# remove the first three lines (they repeat the header names / describe vars)
rx_df.drop(rx_df.head(3).index, axis=0, inplace=True)
# renumber rows from 0, exactly as done for the ER dataset, so both frames
# share the same index convention after cleaning
rx_df.reset_index(drop=True, inplace=True)
# drop columns whose header contains "###" (separators)
drop_cols = [col for col in rx_df.columns.tolist() if "###" in col]
rx_df.drop(drop_cols, axis=1, inplace=True)
rx_df.head()

In [None]:
# Only the last expression of a cell is echoed, so a bare `rx_df.shape`
# here would be silently discarded: print it explicitly.
print(rx_df.shape)  # 942 patients (59 variables)
rx_df.describe()  # again ID is the only variable with 942 unique values

#### Merge ER and RX datasets

In [None]:
# check how many ER dataset's patients are in the RX dataset
visit_ps = set(ps_df.ID.tolist())
visit_rx = set(rx_df.ID.tolist())
len(visit_ps.intersection(visit_rx))  # 770 --> the additional 18 patients should be removed

In [None]:
# IDs present in both tables; each frame is restricted to them before joining.
shared_ids = visit_ps & visit_rx
ps_df = ps_df.loc[ps_df["ID"].isin(shared_ids)]
rx_df = rx_df.loc[rx_df["ID"].isin(shared_ids)]
# Join the ER and RX tables on ID (default inner join); columns present in
# both tables come out with pandas' default "_x" / "_y" suffixes.
ps_rx_df = pd.merge(ps_df, rx_df, on="ID")
ps_rx_df.head()

In [None]:
# Only the last expression of a cell is echoed, so a bare `ps_rx_df.shape`
# here would be silently discarded: print it explicitly.
print(ps_rx_df.shape)  # merged dataset --> 770 ER accesses and 718 total variables
ps_rx_df.describe()

In [None]:
# Search and remove the columns duplicated by the merge.
# pandas marks a column that existed in both tables with the "_x" (left, ER)
# and "_y" (right, RX) suffixes. A column is treated as duplicated only when
# BOTH suffixed variants exist; a substring test such as `"_x" in col` would
# also catch unrelated names that merely contain "_x" / "_y".
all_cols = set(ps_rx_df.columns)
keep_cols = [
    col for col in ps_rx_df.columns
    if col.endswith("_x") and col[:-2] + "_y" in all_cols
]
drop_cols = keep_cols + [col[:-2] + "_y" for col in keep_cols]
for col in keep_cols:
    # Keep the ER ("_x") copy under the original name. Only the 2-character
    # suffix is stripped, so names containing underscores are preserved
    # (e.g. "SOME_VAR_x" -> "SOME_VAR", not "SOME").
    ps_rx_df[col[:-2]] = ps_rx_df[col]
ps_rx_df.drop(drop_cols, axis=1, inplace=True)
ps_rx_df.shape  # number of variables decreased to 715

In [None]:
# DEAD_DATE is used to classify our patients as DEAD or DISMISSED:
# 1 when a death date is recorded, 0 when the value is missing.
# `notna()` detects every pandas missing-value marker (NaN, None, NaT),
# whereas comparing `str(value)` with "nan" only recognises float NaN.
# The result is kept as a plain list of ints, one entry per row.
final_result = ps_rx_df["DEAD_DATE"].notna().astype(int).tolist()
ps_rx_df["FINAL_OUTCOME"] = final_result
ps_rx_df.head()

In [None]:
# Final variable-cleaning step: remove the five cause-of-death columns
# "DEAD_CAUSE-0" ... "DEAD_CAUSE-4" from the merged frame, in place.
drop_cols = [f"DEAD_CAUSE-{idx}" for idx in range(5)]
ps_rx_df.drop(columns=drop_cols, inplace=True)

#### Plotting some stats on our data

In [None]:
# Bar chart of the final outcome: number of patients with FINAL_OUTCOME == 1
# ("Dead") versus FINAL_OUTCOME == 0 ("Alive").
is_dead = ps_rx_df["FINAL_OUTCOME"] == 1
is_alive = ps_rx_df["FINAL_OUTCOME"] == 0
data = {"Dead": int(is_dead.sum()), "Alive": int(is_alive.sum())}
outcome, values = list(data), list(data.values())

# Draw on the single Axes returned by subplots.
f, ax = plt.subplots(1, 1, figsize=(10, 5))
ax.bar(outcome, values, color="#104B8E", width=0.4)
ax.set_xlabel("Final outcome", size=16)
ax.set_ylabel("Number of patients", size=16)
ax.tick_params(labelsize=14)
plt.show()