#### IMPORTING REQUIRED LIBRARIES

In [31]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore") 
import xgboost as xgb
#import Scoring metric
from sklearn.metrics import accuracy_score,recall_score,f1_score,fbeta_score,classification_report 

In [32]:
import pandas as pd

# Read the training set; the path is resolved relative to the working directory.
data = pd.read_csv("TrainingData.csv")

### Data cleaning and preprocessing

In [33]:
# Swap the two class labels (0 becomes 1 and 1 becomes 0).
# Caution: running this cell a second time swaps them back, so run it exactly once.
data['Result'] = data['Result'].replace({0: 1, 1: 0})

# Tally the labels once, then read off the count for each class.
label_counts = data['Result'].value_counts()
count_0 = label_counts[0]
count_1 = label_counts[1]

print("Count of 0s: ", count_0)
print("Count of 1s: ", count_1)

Count of 0s:  209734
Count of 1s:  629126


In [34]:
# Most frequent Area_Service value within each class (label 0 first, then label 1).
for label in (0, 1):
    print(data.loc[data['Result'] == label, 'Area_Service'].mode())

0    Hudson Valley
Name: Area_Service, dtype: object
0    Hudson Valley
Name: Area_Service, dtype: object


In [35]:
# Fill missing Area_Service values with the placeholder 0.0.
# NOTE(review): this step was previously described as a mode fill, but the value
# written is 0.0, not the mode. The placeholder is what produces the
# `Area_Service_0.0` dummy column after one-hot encoding. Confirm which is intended.
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `data['Area_Service']`: mutating a selected column in
# place is chained assignment, which recent pandas versions warn about and which
# leaves `data` unchanged when copy-on-write is enabled.
data['Area_Service'] = data['Area_Service'].fillna(value=0.0)

In [36]:
# Count of Area_Service entries that are still missing.
data['Area_Service'].isna().sum()

0

In [37]:
# Most frequent Hospital County value within each class (label 0 first, then label 1).
for label in (0, 1):
    print(data.loc[data['Result'] == label, 'Hospital County'].mode())

0    Erie
Name: Hospital County, dtype: object
0    Erie
Name: Hospital County, dtype: object


In [38]:
# Fill missing Hospital County values with the placeholder 0.0.
# NOTE(review): this step was previously described as a mode fill, but the value
# written is 0.0, not the mode. The placeholder is what produces the
# `Hospital County_0.0` dummy column after one-hot encoding. Confirm which is intended.
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `data['Hospital County']`: mutating a selected column in
# place is chained assignment, which recent pandas versions warn about and which
# leaves `data` unchanged when copy-on-write is enabled.
data['Hospital County'] = data['Hospital County'].fillna(value=0.0)

In [39]:
# Count of Hospital County entries that are still missing.
data['Hospital County'].isna().sum()

0

In [40]:
# Most frequent Mortality risk value within each class (label 0 first, then label 1).
for label in (0, 1):
    print(data.loc[data['Result'] == label, 'Mortality risk'].mode())

0    1.0
Name: Mortality risk, dtype: float64
0    1.0
Name: Mortality risk, dtype: float64


In [41]:
# Fill missing Mortality risk values with 0.0.
# NOTE(review): this step was previously described as a mode fill, but the value
# written is 0.0, not the mode. Confirm which is intended.
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `data['Mortality risk']`: mutating a selected column in
# place is chained assignment, which recent pandas versions warn about and which
# leaves `data` unchanged when copy-on-write is enabled.
data['Mortality risk'] = data['Mortality risk'].fillna(value=0.0)

In [42]:
# Count of Mortality risk entries that are still missing.
data['Mortality risk'].isna().sum()

0

In [43]:
# Display the column labels of the training frame.
data.columns

Index(['Area_Service', 'Hospital County', 'Hospital Id', 'Age', 'Gender',
       'Cultural_group', 'ethnicity', 'Days_spend_hsptl', 'Admission_type',
       'Home or self care,', 'ccs_diagnosis_code', 'ccs_procedure_code',
       'apr_drg_description', 'Code_illness', 'Mortality risk',
       'Surg_Description', 'Weight_baby', 'Abortion', 'Emergency dept_yes/No',
       'Tot_charg', 'Tot_cost', 'ratio_of_total_costs_to_total_charges',
       'Payment_Typology', 'Result'],
      dtype='object')

In [44]:
# Remove the 'Hospital Id' column and show the labels that remain.
# Re-running this cell raises a KeyError because the column is already gone.
data = data.drop(columns=['Hospital Id'])
data.columns

Index(['Area_Service', 'Hospital County', 'Age', 'Gender', 'Cultural_group',
       'ethnicity', 'Days_spend_hsptl', 'Admission_type', 'Home or self care,',
       'ccs_diagnosis_code', 'ccs_procedure_code', 'apr_drg_description',
       'Code_illness', 'Mortality risk', 'Surg_Description', 'Weight_baby',
       'Abortion', 'Emergency dept_yes/No', 'Tot_charg', 'Tot_cost',
       'ratio_of_total_costs_to_total_charges', 'Payment_Typology', 'Result'],
      dtype='object')

## MODEL FITTING

### Taking out Numerical Variables from dataset

In [45]:
# Keep only the int/float columns, then discard the two CCS code columns.
# 'Result' (the target) is numeric, so it stays in this frame for now.
# NOTE(review): per the recorded output, 'Days_spend_hsptl' is not selected here,
# and it is not in the one-hot list either, so it never reaches the model.
# Confirm that is intended.
numerical_Variables = (
    data
    .select_dtypes(include=['int', 'float'])
    .drop(columns=["ccs_diagnosis_code", "ccs_procedure_code"])
)
numerical_Variables.columns

Index(['Code_illness', 'Mortality risk', 'Weight_baby', 'Tot_charg',
       'Tot_cost', 'ratio_of_total_costs_to_total_charges', 'Payment_Typology',
       'Result'],
      dtype='object')

In [46]:

# Frequency-encode 'apr_drg_description': each category is replaced by the
# fraction of rows that carry it.
# NOTE(review): `numerical_Variables` was built before this cell runs and the
# column is not in the one-hot list, so as written the encoded column is not part
# of the feature matrix that gets concatenated later. Confirm whether it should be.
cat_var = data['apr_drg_description']

# Relative frequency of every category (fractions sum to 1 over non-missing rows).
freq = cat_var.value_counts(normalize=True)

# Plain dict lookup table: category -> relative frequency.
freq_dict = freq.to_dict()

# Overwrite the original text column with its frequency-encoded values.
data['apr_drg_description'] = cat_var.map(freq_dict)


### One-Hot Encoding of Categorical Variables

In [47]:
# Columns expanded into one indicator column per distinct value.
categorical_columns = [
    'Area_Service',
    'Hospital County',
    'Age',
    'Gender',
    'Cultural_group',
    'ethnicity',
    'Admission_type',
    'Home or self care,',
    'Surg_Description',
    'Abortion',
    'Emergency dept_yes/No',
]
OHE_Categorical_Variables = pd.get_dummies(data[categorical_columns])
OHE_Categorical_Variables

Unnamed: 0,Area_Service_0.0,Area_Service_Capital/Adirond,Area_Service_Central NY,Area_Service_Finger Lakes,Area_Service_Hudson Valley,Area_Service_New York City,Area_Service_Southern Tier,Area_Service_Western NY,Hospital County_0.0,Hospital County_Albany,...,"Home or self care,_Psychiatric Hospital or Unit of Hosp","Home or self care,_Short-term Hospital","Home or self care,_Skilled Nursing Home",Surg_Description_Medical,Surg_Description_Not Applicable,Surg_Description_Surgical,Abortion_N,Abortion_Y,Emergency dept_yes/No_N,Emergency dept_yes/No_Y
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,1
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,1,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
838855,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
838856,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
838857,0,0,0,0,1,0,0,0,0,0,...,0,0,1,1,0,0,1,0,0,1
838858,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0


In [48]:
# Combine the numeric columns (including the target) with the one-hot encoded
# categorical columns, side by side, keeping only row labels present in both.
join = [numerical_Variables, OHE_Categorical_Variables]
Data = pd.concat(objs=join, axis='columns', join='inner')
Data.shape

(838860, 115)

### Assigning X as predictors and y as target

In [49]:
# Target is the 'Result' column; every other column is a predictor.
y_train = Data["Result"]
X_train = Data.drop(columns=["Result"])

In [50]:
# Display (rows, columns) of the predictor matrix.
X_train.shape

(838860, 114)

### Train/validation split and class balancing using random oversampling

In [51]:
from sklearn.model_selection import train_test_split

# First cut: hold out 20% of the rows, stratified on the target.
X_train_test, X_test, y_train_test, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

# Second cut: 25% of the remaining 80% becomes the validation set, i.e. 60/20/20
# overall. This rebinds X_train / y_train, so re-running the cell without
# re-running the previous ones splits the already-reduced training set again.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_test,
    y_train_test,
    test_size=0.25,
    random_state=42,
    stratify=y_train_test,
)


In [52]:
from imblearn.over_sampling import RandomOverSampler

# Balance the classes by randomly duplicating minority-class rows of the training
# split only (the validation and test splits are left untouched).
# A fixed random_state makes the resampled set, and everything trained on it,
# reproducible across runs; 42 matches the seed used for the splits.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [53]:
# Rebind the training names to the oversampled data, each wrapped in a DataFrame.
X_train, y_train = pd.DataFrame(X_resampled), pd.DataFrame(y_resampled)

In [54]:
# Display the number of rows per class after oversampling.
y_train.value_counts()

Result
0         377476
1         377476
dtype: int64

### Import Test CSV

In [55]:
# Read the held-out test set; the path is resolved relative to the working directory.
data1 = pd.read_csv("TestingData.csv")
data1

Unnamed: 0,Area_Service,Hospital County,Hospital Id,Age,Gender,Cultural_group,ethnicity,Days_spend_hsptl,Admission_type,"Home or self care,",...,Mortality risk,Surg_Description,Weight_baby,Abortion,Emergency dept_yes/No,Tot_charg,Tot_cost,ratio_of_total_costs_to_total_charges,Payment_Typology,Result
0,Western NY,Erie,3067.0,70 or Older,F,White,Not Span/Hispanic,1,Emergency,Home or Self Care,...,1.0,Medical,0,N,Y,6942.70,2574.25,0.370786,2,0
1,Western NY,Erie,213.0,70 or Older,F,White,Not Span/Hispanic,3,Emergency,Inpatient Rehabilitation Facility,...,3.0,Medical,0,N,Y,22240.86,12706.03,0.571292,2,0
2,Central NY,Tompkins,977.0,18 to 29,F,White,Not Span/Hispanic,3,Emergency,Home or Self Care,...,1.0,Medical,0,N,Y,4874.54,2975.84,0.610486,1,0
3,Central NY,Oneida,599.0,70 or Older,M,White,Not Span/Hispanic,7,Emergency,Home or Self Care,...,2.0,Medical,0,N,Y,38027.53,15137.02,0.398054,2,1
4,Hudson Valley,Dutchess,192.0,70 or Older,F,White,Not Span/Hispanic,3,Elective,Home or Self Care,...,1.0,Surgical,0,N,N,30128.46,11771.46,0.390709,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209710,Central NY,Onondaga,630.0,18 to 29,F,White,Not Span/Hispanic,4,Urgent,Short-term Hospital,...,1.0,Medical,0,N,N,6640.68,3160.84,0.475982,2,1
209711,Western NY,Niagara,585.0,70 or Older,F,White,Not Span/Hispanic,5,Emergency,Home or Self Care,...,2.0,Medical,0,N,Y,8546.01,3962.58,0.463676,2,1
209712,New York City,Bronx,1165.0,30 to 49,M,Black/African American,Not Span/Hispanic,2,Emergency,Home or Self Care,...,1.0,Medical,0,N,Y,22126.54,18320.42,0.827984,1,0
209713,Capital/Adirond,Albany,1.0,0 to 17,F,White,Not Span/Hispanic,3,Newborn,Home or Self Care,...,1.0,Medical,2900,N,N,5103.00,1085.09,0.212637,4,0


### Data cleaning and preprocessing

In [56]:
# Swap the two class labels (0 becomes 1 and 1 becomes 0), as done for the training data.
# Caution: running this cell a second time swaps them back, so run it exactly once.
data1['Result'] = data1['Result'].replace({0: 1, 1: 0})

# Tally the labels once, then read off the count for each class.
label_counts = data1['Result'].value_counts()
count_0 = label_counts[0]
count_1 = label_counts[1]

print("Count of 0s: ", count_0)
print("Count of 1s: ", count_1)

Count of 0s:  52434
Count of 1s:  157281


In [57]:
# Most frequent Area_Service value within each class (label 0 first, then label 1).
for label in (0, 1):
    print(data1.loc[data1['Result'] == label, 'Area_Service'].mode())

0    Hudson Valley
Name: Area_Service, dtype: object
0    Hudson Valley
Name: Area_Service, dtype: object


In [58]:
# Fill missing Area_Service values with the placeholder 0.0.
# NOTE(review): this step was previously described as a mode fill, but the value
# written is 0.0, not the mode. The placeholder is what produces the
# `Area_Service_0.0` dummy column after one-hot encoding. Confirm which is intended.
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `data1['Area_Service']`: mutating a selected column in
# place is chained assignment, which recent pandas versions warn about and which
# leaves `data1` unchanged when copy-on-write is enabled.
data1['Area_Service'] = data1['Area_Service'].fillna(value=0.0)

In [59]:
# Count of Area_Service entries that are still missing.
data1['Area_Service'].isna().sum()

0

In [60]:
# Most frequent Hospital County value within each class (label 0 first, then label 1).
for label in (0, 1):
    print(data1.loc[data1['Result'] == label, 'Hospital County'].mode())

0    Erie
Name: Hospital County, dtype: object
0    Erie
Name: Hospital County, dtype: object


In [61]:
# Fill missing Hospital County values with the placeholder 0.0.
# NOTE(review): this step was previously described as a mode fill, but the value
# written is 0.0, not the mode. The placeholder is what produces the
# `Hospital County_0.0` dummy column after one-hot encoding. Confirm which is intended.
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `data1['Hospital County']`: mutating a selected column in
# place is chained assignment, which recent pandas versions warn about and which
# leaves `data1` unchanged when copy-on-write is enabled.
data1['Hospital County'] = data1['Hospital County'].fillna(value=0.0)

In [62]:
# Count of Hospital County entries that are still missing.
data1['Hospital County'].isna().sum()

0

In [63]:
# Most frequent Mortality risk value within each class (label 0 first, then label 1).
for label in (0, 1):
    print(data1.loc[data1['Result'] == label, 'Mortality risk'].mode())

0    1.0
Name: Mortality risk, dtype: float64
0    1.0
Name: Mortality risk, dtype: float64


In [64]:
# Fill missing Mortality risk values with 0.0.
# NOTE(review): this step was previously described as a mode fill, but the value
# written is 0.0, not the mode. Confirm which is intended.
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `data1['Mortality risk']`: mutating a selected column in
# place is chained assignment, which recent pandas versions warn about and which
# leaves `data1` unchanged when copy-on-write is enabled.
data1['Mortality risk'] = data1['Mortality risk'].fillna(value=0.0)

In [65]:
# Count of Mortality risk entries that are still missing.
data1['Mortality risk'].isna().sum()

0

In [66]:
# Display the column labels of the test frame.
data1.columns

Index(['Area_Service', 'Hospital County', 'Hospital Id', 'Age', 'Gender',
       'Cultural_group', 'ethnicity', 'Days_spend_hsptl', 'Admission_type',
       'Home or self care,', 'ccs_diagnosis_code', 'ccs_procedure_code',
       'apr_drg_description', 'Code_illness', 'Mortality risk',
       'Surg_Description', 'Weight_baby', 'Abortion', 'Emergency dept_yes/No',
       'Tot_charg', 'Tot_cost', 'ratio_of_total_costs_to_total_charges',
       'Payment_Typology', 'Result'],
      dtype='object')

In [67]:
# Remove the 'Hospital Id' column and show the labels that remain.
# Re-running this cell raises a KeyError because the column is already gone.
data1 = data1.drop(columns=['Hospital Id'])
data1.columns

Index(['Area_Service', 'Hospital County', 'Age', 'Gender', 'Cultural_group',
       'ethnicity', 'Days_spend_hsptl', 'Admission_type', 'Home or self care,',
       'ccs_diagnosis_code', 'ccs_procedure_code', 'apr_drg_description',
       'Code_illness', 'Mortality risk', 'Surg_Description', 'Weight_baby',
       'Abortion', 'Emergency dept_yes/No', 'Tot_charg', 'Tot_cost',
       'ratio_of_total_costs_to_total_charges', 'Payment_Typology', 'Result'],
      dtype='object')

## MODEL FITTING

### Taking out Numerical Variables from dataset

In [68]:
# Keep only the int/float columns of the test frame, then discard the two CCS
# code columns. This rebinds `numerical_Variables`, which previously held the
# training columns.
# NOTE(review): per the recorded output, 'Days_spend_hsptl' is not selected here,
# and it is not in the one-hot list either, so it never reaches the model.
# Confirm that is intended.
numerical_Variables = (
    data1
    .select_dtypes(include=['int', 'float'])
    .drop(columns=["ccs_diagnosis_code", "ccs_procedure_code"])
)
numerical_Variables.columns

Index(['Code_illness', 'Mortality risk', 'Weight_baby', 'Tot_charg',
       'Tot_cost', 'ratio_of_total_costs_to_total_charges', 'Payment_Typology',
       'Result'],
      dtype='object')

In [69]:

# Frequency-encode 'apr_drg_description' in the test frame: each category is
# replaced by the fraction of test rows that carry it (the frequencies are
# computed from `data1` itself, not from the training data).
# NOTE(review): `numerical_Variables` was built before this cell runs and the
# column is not in the one-hot list, so as written the encoded column is not part
# of the feature matrix that gets concatenated later. Confirm whether it should be.
cat_var = data1['apr_drg_description']

# Relative frequency of every category (fractions sum to 1 over non-missing rows).
freq = cat_var.value_counts(normalize=True)

# Plain dict lookup table: category -> relative frequency.
freq_dict = freq.to_dict()

# Overwrite the original text column with its frequency-encoded values.
data1['apr_drg_description'] = cat_var.map(freq_dict)


### One-Hot Encoding of Categorical Variables

In [70]:
# Columns expanded into one indicator column per distinct value found in the
# test frame. The indicator columns are derived from `data1` alone.
categorical_columns = [
    'Area_Service',
    'Hospital County',
    'Age',
    'Gender',
    'Cultural_group',
    'ethnicity',
    'Admission_type',
    'Home or self care,',
    'Surg_Description',
    'Abortion',
    'Emergency dept_yes/No',
]
OHE_Categorical_Variables = pd.get_dummies(data1[categorical_columns])
OHE_Categorical_Variables

Unnamed: 0,Area_Service_0.0,Area_Service_Capital/Adirond,Area_Service_Central NY,Area_Service_Finger Lakes,Area_Service_Hudson Valley,Area_Service_New York City,Area_Service_Southern Tier,Area_Service_Western NY,Hospital County_0.0,Hospital County_Albany,...,"Home or self care,_Psychiatric Hospital or Unit of Hosp","Home or self care,_Short-term Hospital","Home or self care,_Skilled Nursing Home",Surg_Description_Medical,Surg_Description_Not Applicable,Surg_Description_Surgical,Abortion_N,Abortion_Y,Emergency dept_yes/No_N,Emergency dept_yes/No_Y
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,1
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,1
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209710,0,0,1,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,0,1,0
209711,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,1
209712,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
209713,0,1,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,1,0,1,0


In [71]:
# Combine the test-set numeric columns (including the target) with the one-hot
# encoded categorical columns, side by side, keeping only row labels present in both.
join = [numerical_Variables, OHE_Categorical_Variables]
Data1 = pd.concat(objs=join, axis='columns', join='inner')
Data1.shape

(209715, 115)

### Assigning X as predictors and y as target

In [None]:
# Target is the 'Result' column of the external test set; the rest are predictors.
# This rebinds X_test / y_test, replacing the hold-out split created earlier.
y_test = Data1["Result"]

# The training and test frames were one-hot encoded independently, so their
# indicator columns can differ in membership or order. Align the test predictors
# to the training columns: indicators missing from the test set are added as
# all-zero columns, columns unseen in training are dropped, and the order matches
# X_train. When the columns already match this is a no-op.
X_test = (
    Data1
    .drop(columns=["Result"])
    .reindex(columns=X_train.columns, fill_value=0)
)

### XGBoost Classifier - Test data

In [None]:
# Build the gradient-boosted classifier with fixed hyperparameters.
# NOTE(review): the training labels were already balanced by oversampling, yet
# scale_pos_weight additionally up-weights the positive class. Confirm that both
# are wanted.
clf = xgb.XGBClassifier(
    learning_rate=0.26424315526454,
    max_depth=3,
    n_estimators=83,
    scale_pos_weight=2.9987774512317094,
)

# Train on the oversampled training data.
clf.fit(X_train, y_train)

# Predict labels for the test data.
pred_test = clf.predict(X_test)

# Score the predictions; F2 is the F-beta score with beta=2.
accuracy = accuracy_score(y_test, pred_test)
f1 = f1_score(y_test, pred_test)
recall = recall_score(y_test, pred_test)
f2 = fbeta_score(y_test, pred_test, beta=2)

# Per-class report followed by a one-column summary table of the headline metrics.
print(classification_report(y_test, pred_test))
XGB = pd.DataFrame(
    {'XGBoost_Model_1': [accuracy, f1, recall, f2]},
    index=['Accuracy', 'F1_score', 'Recall', 'F2_score'],
)
print(XGB)
