In [76]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


In [93]:
# Load the curated Year 1 / Year 2 tables.
# Paths are relative to the notebook's working directory.
# (Plain string literals: the paths contain no placeholders, so no f-prefix is needed.)

# Combined per-member feature tables
combined_y1 = pd.read_csv("../data/curated/combined_Y1.csv")
combined_y2 = pd.read_csv("../data/curated/combined_Y2.csv")

# Drug count tables
drugs_y1 = pd.read_csv("../data/curated/DrugCount_Y1.csv")
drugs_y2 = pd.read_csv("../data/curated/DrugCount_Y2.csv")

# Lab count tables
lab_y1 = pd.read_csv("../data/curated/LabCount_Y1.csv")
lab_y2 = pd.read_csv("../data/curated/LabCount_Y2.csv")

# Claims tables
claims_y1 = pd.read_csv("../data/curated/Claims_Y1.csv")
claims_y2 = pd.read_csv("../data/curated/Claims_Y2.csv")

# Display the Year 1 combined table as the cell output
combined_y1

Unnamed: 0,MemberID,ProviderID,Vendor,PCP,Specialty,PlaceSvc,DSFS_x,PrimaryConditionGroup,ProcedureGroup,SupLOS,...,LengthOfStay,CharlsonIndex,AgeAtFirstClaim,Sex,DSFS_y,DrugCount,DSFS,LabCount,ClaimsTruncated,DaysInHospital
0,210,8448244.0,122401.0,37508.0,Internal,Office,1.0,GYNEC1,MED,0.0,...,0.250000,0.000000,35.0,N,3.333333,5.0,2.000000,2.0,0,0
1,3197,1367098.0,122401.0,47016.0,Pediatrics,Office,12.0,RESPR4,EM,0.2,...,0.000000,0.000000,5.0,F,7.750000,5.0,,,0,0
2,3889,7053364.0,5166.0,37796.0,Emergency,Urgent Care,1.0,MSC2a3,EM,0.0,...,0.230769,0.615385,45.0,F,6.000000,30.0,1.000000,10.0,0,0
3,4187,8883983.0,887998.0,45901.0,Internal,Office,1.0,ARTHSPIN,EM,0.0,...,0.000000,0.000000,55.0,F,5.500000,61.0,,,0,0
4,9063,5244762.0,791272.0,78718.0,Internal,Office,1.0,ARTHSPIN,EM,0.0,...,0.000000,0.000000,65.0,F,1.500000,2.0,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76033,99995554,8511459.0,64764.0,1303.0,Internal,Office,7.0,SKNAUT,MED,0.0,...,0.000000,0.000000,45.0,M,7.500000,8.0,6.333333,11.0,1,0
76034,99996214,4410262.0,628773.0,13742.0,General Practice,Office,1.0,ARTHSPIN,EM,0.0,...,0.000000,0.000000,45.0,M,,,,,0,0
76035,99997485,9149087.0,140343.0,27585.0,General Practice,Office,1.0,ODaBNCA,EM,0.0,...,0.000000,0.000000,15.0,M,,,,,0,0
76036,99997895,321261.0,152610.0,91687.0,Laboratory,Independent Lab,9.0,ARTHSPIN,EM,0.0,...,0.000000,0.000000,45.0,M,,,6.500000,10.0,0,0


## **Handling Categorical Variables**
This section preprocesses the categorical columns `Specialty`, `PlaceSvc`, `PrimaryConditionGroup`, and `ProcedureGroup`.
Missing values in `Specialty` and `PlaceSvc` were first mode-imputed.
Column `Specialty` was binary-encoded (1/0) based on whether the specialty is likely to be located in a hospital.
Column `PlaceSvc` was binary-encoded (1/0) based on whether the place of service is classified as being in a hospital and on its relative level of emergency. Note that services provided at a hospital (Outpatient) were not classified as being in a hospital.

In [95]:
# Most frequent Specialty / PlaceSvc value, computed separately for each year.
# mode() returns the modal values as a Series; position 0 is taken as the fill value.
mode_specialty_y1 = combined_y1['Specialty'].mode().iloc[0]
mode_specialty_y2 = combined_y2['Specialty'].mode().iloc[0]
mode_placesv_y1 = combined_y1['PlaceSvc'].mode().iloc[0]
mode_placesv_y2 = combined_y2['PlaceSvc'].mode().iloc[0]

# Fill the missing entries of both columns in place, each year with its own modes
combined_y1.fillna(
    {'Specialty': mode_specialty_y1, 'PlaceSvc': mode_placesv_y1},
    inplace=True,
)
combined_y2.fillna(
    {'Specialty': mode_specialty_y2, 'PlaceSvc': mode_placesv_y2},
    inplace=True,
)

In [99]:
# Binary-encode the categorical variables Specialty and PlaceSvc.
# Each column becomes a single 0/1 indicator (this is not a true one-hot encoding).

# Specialty: 1 for specialties listed here (treated as hospital-based), 0 for the rest
HOSPITAL_SPECIALTIES = ['Anesthesiology', 'Diagnostic Imaging', 'Emergency', 'Internal', 'Pathology', 'Surgery']

# PlaceSvc: 1 for places of service listed here (treated as hospital/emergency), 0 for the rest
HOSPITAL_PLACES_OF_SERVICE = ['Inpatient Hospital', 'Ambulance', 'Urgent Care']


def encode_membership(values, positive_labels):
    """Return a 0/1 integer Series flagging which entries are in ``positive_labels``.

    Parameters
    ----------
    values : pd.Series
        Column holding the raw category labels.
    positive_labels : list
        Labels that should be encoded as 1; every other value becomes 0.

    Returns
    -------
    pd.Series
        ``int64`` indicator Series. If ``values`` is already numeric it is
        returned unchanged, so re-running this cell does not re-encode the
        0/1 column (which would otherwise turn every value into 0).
    """
    if pd.api.types.is_numeric_dtype(values):
        # Already encoded by a previous run of this cell: leave it alone.
        return values
    return values.isin(positive_labels).astype('int64')


for frame in (combined_y1, combined_y2):
    frame['Specialty'] = encode_membership(frame['Specialty'], HOSPITAL_SPECIALTIES)
    frame['PlaceSvc'] = encode_membership(frame['PlaceSvc'], HOSPITAL_PLACES_OF_SERVICE)

In [96]:
# Rows lacking a procedure group or a primary condition group are removed
# rather than imputed, since there are only a few of them.
subset = ['ProcedureGroup', 'PrimaryConditionGroup']
for frame in (combined_y1, combined_y2):
    # In-place drop so the existing combined_y1 / combined_y2 objects are updated
    frame.dropna(subset=subset, inplace=True)

## **Normalization**
This section normalizes the continuous features, specifically the `LengthOfStay` column.

In [97]:
# Min-max scale LengthOfStay.
# The scaler is fitted on the Year 1 column only; the same fitted scaler is then
# applied to Year 2.
# TODO: discuss whether this should be within cross validation loops (when do we fit the scaler?)
length_of_stay_scaler = MinMaxScaler()

# Fit on Y1 and scale Y1 in one step
combined_y1['LengthOfStay'] = length_of_stay_scaler.fit_transform(combined_y1[['LengthOfStay']])

# Reuse the Y1-fitted scaler for Y2 (no refit)
combined_y2['LengthOfStay'] = length_of_stay_scaler.transform(combined_y2[['LengthOfStay']])