In [102]:
import pandas as pd

In [103]:
# Load data
combined_y1 = pd.read_csv(f"../data/curated/combined_Y1.csv")
combined_y2 = pd.read_csv(f"../data/curated/combined_Y2.csv")

## **Dropping Columns**
Column `ProviderID`, `Vendor` and `PCP`, since these are random numerical variables that are random adding unnecessary noise.

In [104]:
# Drop columns ProviderID, Vendor, and PCP
combined_y1.drop(columns=['ProviderID', 'Vendor', 'PCP'], inplace=True)
combined_y2.drop(columns=['ProviderID', 'Vendor', 'PCP'], inplace=True)

## **Handling Missing Numerical Values**

In [105]:
# Replace null values in DSFS columns

# Drop rows with missing values for DSFS_Claims
combined_y1 = combined_y1.dropna(subset=["DSFS_x"])
combined_y2 = combined_y2.dropna(subset=["DSFS_x"])

# Replace missing values in DSFS_Drugs and DSFS_Labs with 12 (maximum value implying no claim was made in the last 12 months)
combined_y1["DSFS_y"] = combined_y1["DSFS_y"].fillna(12)
combined_y2["DSFS_y"] = combined_y2["DSFS_y"].fillna(12)
combined_y1["DSFS"] = combined_y1["DSFS"].fillna(12)
combined_y2["DSFS"] = combined_y2["DSFS"].fillna(12)

In [106]:
# Replace missing values in DrugCount and LabCount with 0 (implies no drugs or labs were claimed)
combined_y1["DrugCount"] = combined_y1["DrugCount"].fillna(0)
combined_y2["DrugCount"] = combined_y2["DrugCount"].fillna(0)
combined_y1["LabCount"] = combined_y1["LabCount"].fillna(0)
combined_y2["LabCount"] = combined_y2["LabCount"].fillna(0)

## **Handling Categorical Variables**
The following section handles preprocessing categorical data from the following columns; `Specialty`, `PlaceSvc`, `PrimaryConditionGroup`, `ProcedureGroup`.
Missing values in `Specialty` and `PlaceSvc` were first mode-imputed.
Column `Specialty` was one-hot-encoded based on whether the specialty was likely to be located in a hospital or not.
Column `PlaceSvc` was one-hot-encoded based on whether the place of service was classified as being in a hospital and the relative level of emergency. Noting that; services provided at a hospital (Outpatient) was not classified as being in a hospital.

In [107]:
# Mode imputation for missing values in Specialty
mode_specialty_y1 = combined_y1['Specialty'].mode()[0]
mode_specialty_y2 = combined_y2['Specialty'].mode()[0]
combined_y1.fillna({'Specialty': mode_specialty_y1}, inplace=True)
combined_y2.fillna({'Specialty': mode_specialty_y2}, inplace=True)

# Mode imputation for missing values in PlaceSvc
mode_placesv_y1 = combined_y1['PlaceSvc'].mode()[0]
mode_placesv_y2 = combined_y2['PlaceSvc'].mode()[0]
combined_y1.fillna({'PlaceSvc': mode_placesv_y1}, inplace=True)
combined_y2.fillna({'PlaceSvc': mode_placesv_y2}, inplace=True)

In [108]:
# One hot encode categroical variables in Specialty
# 1 for specialties in hospitals, 0 for rest
positive = ['Anesthesiology', 'Diagnostic Imaging', 'Emergency', 'Internal', 'Pathology', 'Surgery']
combined_y1['Specialty'] = combined_y1['Specialty'].apply(lambda x: 1 if x in positive else 0)
combined_y2['Specialty'] = combined_y2['Specialty'].apply(lambda x: 1 if x in positive else 0)

# One hot encode categroical variables in PlaceSvc
# 1 for specialties in hospitals, 0 for rest
positive = ['Inpatient Hospital', 'Ambulance', 'Urgent Care']
combined_y1['PlaceSvc'] = combined_y1['PlaceSvc'].apply(lambda x: 1 if x in positive else 0)
combined_y2['PlaceSvc'] = combined_y2['PlaceSvc'].apply(lambda x: 1 if x in positive else 0)

In [109]:
# Encode Sex column with normalised values
combined_y1['Sex'] = combined_y1['Sex'].replace({'M': 0, 'N': 0.5, 'F': 1})
combined_y2['Sex'] = combined_y2['Sex'].replace({'M': 0, 'N': 0.5, 'F': 1})

  combined_y1['Sex'] = combined_y1['Sex'].replace({'M': 0, 'N': 0.5, 'F': 1})
  combined_y2['Sex'] = combined_y2['Sex'].replace({'M': 0, 'N': 0.5, 'F': 1})


In [110]:
# Drop rows with missing values since there are only a few
subset = ['ProcedureGroup', 'PrimaryConditionGroup']
combined_y1.dropna(subset=subset, inplace=True)
combined_y2.dropna(subset=subset, inplace=True)

`ProcedureGroup` was encoded using the following table, where each procedure was assigned a rank based on the expected hospital stay, ranked on a scale of 0 to 5, where a rank of 5 is a procedure expected to result in the longest stay.
Rank assignments are based off of the conditions listed for each `ProcedureGroup` in https://foreverdata.org/1015/content/Data_Dictionary_release3.pdf and informal knowledge pertaining to the severity of the procedure.

| ProcedureGroup | Description                             | Rank (0-5) |
|----------------|-----------------------------------------|------------|
| PL             | Pathology and Laboratory                | 0          |
| RAD            | Radiology                               | 0          |
| MED            | Medicine                                | 0          |
| ANES           | Anesthesia                              | 1          |
| EM             | Evaluation and Management               | 1          |
| SEOA           | Surgery-Eye and Ocular Adnexa           | 1          |
| SAS            | Surgery-Auditory System                 | 2          |
| SIS            | Surgery-Integumentary System            | 2          |
| SMCD           | Surgery-Maternity Care and Delivery     | 3          |
| SO             | Surgery-Other                           | 3          |
| SGS            | Surgery-Genital System                  | 3          |
| SUS            | Surgery-Urinary System                  | 4          |
| SDS            | Surgery-Digestive System                | 4          |
| SMS            | Surgery-Musculoskeletal System          | 4          |
| SRS            | Surgery-Respiratory System              | 5          |
| SNS            | Surgery-Nervous System                  | 5          |
| SCS            | Surgery-Cardiovascular System           | 5          |

In [111]:
r0 = ['PL', 'RAD', 'MED']
r1 = ['ANES', 'EM', 'SEOA']
r2 = ['SAS', 'SIS']
r3 = ['SMCD', 'SO', 'SGS']
r4 = ['SUS', 'SDS', 'SMS']
r5 = ['SRS', 'SNS', 'SCS']

ranks = [r0, r1, r2, r3, r4, r5]

for i in range(len(ranks)):
    combined_y1['ProcedureGroup'] = combined_y1['ProcedureGroup'].apply(lambda x: i if x in ranks[i] else x)
    combined_y2['ProcedureGroup'] = combined_y2['ProcedureGroup'].apply(lambda x: i if x in ranks[i] else x)

`PrimaryConditionGroup` was encoded using the following table, where each procedure was assigned a rank based on the expected hospital stay, ranked on a scale of 1 to 5, where a rank of 5 is a procedure expected to result in the longest stay.
Rank assignments are based off of the conditions listed for each `PrimaryConditionGroup` in https://foreverdata.org/1015/content/Data_Dictionary_release3.pdf and informal knowledge pertaining to the severity of the condition.

| PrimaryConditionGroup | Description                                  | Rank (1-5) |
|-----------------------|----------------------------------------------|------------|
| GYNEC1                | Gynecology                                   | 1          |
| ODaBNCA               | Ingestions and benign tumors                 | 1          |
| PRGNCY                | Pregnancy                                    | 1          |
| UTI                   | Urinary tract infections                     | 2          |
| ARTHSPIN              | Arthropathies                                | 2          |
| INFEC4                | All other infections                         | 2          |
| SKNAUT                | Skin and autoimmune disorders                | 2          |
| MISCL1                | Miscellaneous 1                              | 2          |
| MSC2a3                | Miscellaneous 2                              | 2          |
| MISCL5                | Miscellaneous 3                              | 2          |
| HEMTOL                | Non-malignant hematologic                    | 2          |
| ROAMI                 | Chest pain                                   | 2          |
| HEART2                | Other cardiac conditions                     | 3          |
| APPCHOL               | Appendicitis                                 | 3          |
| SEIZURE               | Seizure                                      | 3          |
| RENAL3                | Other renal disorders                        | 3          |
| MISCHRT               | Miscellaneous cardiac                        | 3          |
| METAB3                | Other metabolic disorders                    | 3          |
| NEUMENT               | Other neurological disorders                 | 3          |
| HEART4                | Atherosclerosis and peripheral vascular disease | 3       |
| PERVALV               | Pericarditis                                 | 3          |
| FLaELEC               | Fluid and electrolyte disorders              | 3          |
| FXDISLC               | Fractures and dislocations                   | 3          |
| PERINTL               | Perinatal period                             | 4          |
| HIPFX                 | Hip fracture                                 | 4          |
| GIOBSENT              | Gastrointestinal, IBD, and obstruction       | 4          |
| GIBLEED               | Gastrointestinal bleeding                    | 4          |
| LIVERDZ               | Liver disorders                              | 4          |
| METAB1                | Diabetic ketoacidosis                        | 4          |
| PNCRDZ                | Pancreatic disorders                         | 4          |
| PNEUM                 | Pneumonia                                    | 4          |
| COPD                  | Chronic obstructive pulmonary disorder       | 4          |
| RENAL2                | Chronic renal failure                        | 4          |
| RESPR4                | Acute respiratory disorders                  | 4          |
| CHF                   | Congestive heart failure                     | 4          |
| TRAUMA                | All other trauma                             | 4          |
| CATAST                | Catastrophic conditions                      | 5          |
| CANCRA                | Cancer A                                     | 5          |
| CANCRB                | Cancer B                                     | 5          |
| RENAL1                | Acute renal failure                          | 5          |
| SEPSIS                | Sepsis                                       | 5          |
| GYNECA                | Gynecologic cancers                          | 5          |
| CANCRM                | Ovarian and metastatic cancer                | 5          |
| STROKE                | Stroke                                       | 5          |
| AMI                   | Acute myocardial infarction                  | 5          |

In [112]:
r1 = ['GYNEC1', 'ODaBNCA', 'PRGNCY']
r2 = ['UTI', 'ARTHSPIN', 'INFEC4', 'SKNAUT', 'MISCL1', 'HEMTOL', 'MSC2a3', 'ROAMI', 'MISCL5']
r3 = ['HEART2', 'APPCHOL','SEIZURE', 'RENAL3', 'MISCHRT', 'METAB3', 'NEUMENT', 'HEART4', 'PERVALV', 'FLaELEC', 'FXDISLC']
r4 = ['PERINTL', 'HIPFX', 'GIOBSENT', 'GIBLEED' , 'LIVERDZ', 'METAB1', 'PNCRDZ', 'PNEUM', 'COPD', 'RENAL2', 'RESPR4', 'CHF', 'TRAUMA']
r5 = ['CATAST', 'CANCRA', 'CANCRB', 'RENAL1', 'SEPSIS', 'GYNECA', 'CANCRM', 'STROKE', 'AMI']

ranks = [r1, r2, r3, r4, r5]

for i in range(1,len(ranks)+1):
    combined_y1['PrimaryConditionGroup'] = combined_y1['PrimaryConditionGroup'].apply(lambda x: i if x in ranks[i-1] else x)
    combined_y2['PrimaryConditionGroup'] = combined_y2['PrimaryConditionGroup'].apply(lambda x: i if x in ranks[i-1] else x)

## **Normalization**
This section is intended to normalize the data in `LengthOfStay` column, however, this has been moved to within the cross validation loops since the normalizer should be fit with only training data (and not also validation/testing data).

## **Exporting DataFrames**

In [113]:
combined_y1.rename(columns={'DSFS_x': 'DSFS_Claims', 'DSFS_y': 'DSFS_Drugs', 'DSFS': 'DSFS_Lab'}, inplace=True)
combined_y2.rename(columns={'DSFS_x': 'DSFS_Claims', 'DSFS_y': 'DSFS_Drugs', 'DSFS': 'DSFS_Lab'}, inplace=True)
combined_y1.rename(columns={'DaysInHospital': 'DaysInHospitalY2'}, inplace=True)
combined_y2.rename(columns={'DaysInHospital': 'DaysInHospitalY3'}, inplace=True)

combined_y1.to_csv("../data/preprocessed/prp_combined_Y1.csv", index=False)
combined_y2.to_csv("../data/preprocessed/prp_combined_Y2.csv", index=False)