This notebook covers feature engineering and modeling.
I have kept enough of the cleaning code for the notebook to be followed on its own, but if you'd like the step-by-step cleaning process or my EDA, please refer to _____ (TODO: add link).


#### Feature Engineering


#### Modeling

In [1]:
# Import the basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier


In [2]:
# Load the train and test splits, then preview the first training rows
stay_df, stay_test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_data.csv', 'data/test_data.csv')
)
stay_df.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


In [3]:
# Report (rows, columns) for both splits, then list the distinct target labels
print(*(frame.shape for frame in (stay_df, stay_test_df)))
stay_df['Stay'].unique()

(318438, 18) (137057, 17)


array(['0-10', '41-50', '31-40', '11-20', '51-60', '21-30', '71-80',
       'More than 100 Days', '81-90', '61-70', '91-100'], dtype=object)

In [4]:
# Column dtypes and non-null counts for the training set
stay_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318438 entries, 0 to 318437
Data columns (total 18 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   case_id                            318438 non-null  int64  
 1   Hospital_code                      318438 non-null  int64  
 2   Hospital_type_code                 318438 non-null  object 
 3   City_Code_Hospital                 318438 non-null  int64  
 4   Hospital_region_code               318438 non-null  object 
 5   Available Extra Rooms in Hospital  318438 non-null  int64  
 6   Department                         318438 non-null  object 
 7   Ward_Type                          318438 non-null  object 
 8   Ward_Facility_Code                 318438 non-null  object 
 9   Bed Grade                          318325 non-null  float64
 10  patientid                          318438 non-null  int64  
 11  City_Code_Patient                  3139

In [5]:
# Summary statistics for the numeric columns of the training set
stay_df.describe()

Unnamed: 0,case_id,Hospital_code,City_Code_Hospital,Available Extra Rooms in Hospital,Bed Grade,patientid,City_Code_Patient,Visitors with Patient,Admission_Deposit
count,318438.0,318438.0,318438.0,318438.0,318325.0,318438.0,313906.0,318438.0,318438.0
mean,159219.5,18.318841,4.771717,3.197627,2.625807,65747.579472,7.251859,3.284099,4880.749392
std,91925.276847,8.633755,3.102535,1.168171,0.873146,37979.93644,4.745266,1.764061,1086.776254
min,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1800.0
25%,79610.25,11.0,2.0,2.0,2.0,32847.0,4.0,2.0,4186.0
50%,159219.5,19.0,5.0,3.0,3.0,65724.5,8.0,3.0,4741.0
75%,238828.75,26.0,7.0,4.0,3.0,98470.0,8.0,4.0,5409.0
max,318438.0,32.0,13.0,24.0,4.0,131624.0,38.0,32.0,11008.0


In [6]:
# Missing values per column in the training set
stay_df.isna().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                             113
patientid                               0
City_Code_Patient                    4532
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
Stay                                    0
dtype: int64

In [7]:
# Missing values per column in the test set
stay_test_df.isna().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                              35
patientid                               0
City_Code_Patient                    2157
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
dtype: int64

In [8]:
# Clean up column names first by replacing spaces with underscores.
# One rename with a callable handles every column in a single pass and returns
# a new frame, instead of issuing one in-place rename per column from inside a
# throwaway list comprehension. Re-running the cell is harmless.
stay_df = stay_df.rename(columns=lambda name: name.replace(" ", "_"))
stay_test_df = stay_test_df.rename(columns=lambda name: name.replace(" ", "_"))

stay_df.columns, stay_test_df.columns

(Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
        'Hospital_region_code', 'Available_Extra_Rooms_in_Hospital',
        'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed_Grade',
        'patientid', 'City_Code_Patient', 'Type_of_Admission',
        'Severity_of_Illness', 'Visitors_with_Patient', 'Age',
        'Admission_Deposit', 'Stay'],
       dtype='object'),
 Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
        'Hospital_region_code', 'Available_Extra_Rooms_in_Hospital',
        'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed_Grade',
        'patientid', 'City_Code_Patient', 'Type_of_Admission',
        'Severity_of_Illness', 'Visitors_with_Patient', 'Age',
        'Admission_Deposit'],
       dtype='object'))

In [9]:
# Fill missing Bed_Grade with the most frequent grade in the training set.
# The mode is computed once, from the training data only, and reused for the
# test set so both splits are imputed with the same value.
bed_grade_mode = stay_df['Bed_Grade'].mode()[0]

# Assign the result back instead of calling fillna(inplace=True) on the column
# accessor: with pandas Copy-on-Write the in-place call no longer updates the
# parent frame, which would leave NaNs behind and make the int cast fail.
stay_df['Bed_Grade'] = stay_df['Bed_Grade'].fillna(bed_grade_mode).astype('int')
stay_test_df['Bed_Grade'] = stay_test_df['Bed_Grade'].fillna(bed_grade_mode).astype('int')

In [10]:
# Where the patient's city code is missing, fall back to the city code of the
# hospital on the same row, then store the column as integers.
stay_df['City_Code_Patient'] = (
    stay_df['City_Code_Patient']
    .fillna(stay_df['City_Code_Hospital'])
    .astype('int')
)

stay_test_df['City_Code_Patient'] = (
    stay_test_df['City_Code_Patient']
    .fillna(stay_test_df['City_Code_Hospital'])
    .astype('int')
)

In [11]:
# Sanity check: the number of distinct case_id values should equal the row count
len(stay_df) == stay_df['case_id'].nunique()

True

In [14]:
# List the current column names of the training set
stay_df.columns

Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available_Extra_Rooms_in_Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed_Grade',
       'patientid', 'City_Code_Patient', 'Type_of_Admission',
       'Severity_of_Illness', 'Visitors_with_Patient', 'Age',
       'Admission_Deposit', 'Stay'],
      dtype='object')

## Feature engineering

For each of: 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital', 'Hospital_region_code', 'Department', 'Ward_Type', 'Ward_Facility_Code',
create a count feature for each of: 'patientid', 'Available_Extra_Rooms_in_Hospital', 'Bed_Grade', 'Severity_of_Illness', 'Visitors_with_Patient'.


Create a per-hospital ratio feature for each of: 'patientid', 'Type_of_Admission', 'Visitors_with_Patient', 'Age', 'Severity_of_Illness'.

Create a total_medical_cost feature per 'patientid'.

In [None]:
# Hospital-side columns that the count features are grouped by
hosp_descriptor = [
    'Hospital_code',
    'Hospital_type_code',
    'City_Code_Hospital',
    'Hospital_region_code',
    'Department',
    'Ward_Type',
    'Ward_Facility_Code',
]

# Columns to be counted within each descriptor.
# NOTE(review): the written plan also lists 'Severity_of_Illness', which is
# not included here - confirm whether the omission is intentional.
hosp_features = [
    'patientid',
    'Available_Extra_Rooms_in_Hospital',
    'Bed_Grade',
    'Visitors_with_Patient',
]

In [None]:
def new_count_feature(to_count, to_name, df=None, descriptors=None, agg='count'):
    """Add one per-group count column for every hospital descriptor.

    For each descriptor column, rows are grouped by that descriptor, the
    ``to_count`` column is aggregated within each group, and the group-level
    result is broadcast back onto every row of the group. The new column is
    named ``<descriptor><to_name>``.

    Parameters
    ----------
    to_count : str
        Name of the column to aggregate within each group.
    to_name : str
        Suffix appended to the descriptor name to form the new column name.
    df : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``stay_df``.
    descriptors : iterable of str, optional
        Grouping columns. Defaults to the notebook-level ``hosp_descriptor``.
    agg : str, optional
        Aggregation passed to ``GroupBy.transform``. The default ``'count'``
        counts non-null values; pass ``'nunique'`` to count distinct values.
        NOTE(review): confirm which of the two the feature plan intends.

    Returns
    -------
    pandas.DataFrame
        The same frame that was modified (new columns are added in place).
    """
    frame = stay_df if df is None else df
    group_cols = hosp_descriptor if descriptors is None else descriptors

    for feature in group_cols:
        # transform keeps the original row index, so the result aligns with frame
        frame[feature + to_name] = frame.groupby(feature)[to_count].transform(agg)

    return frame

In [None]:
# Number of non-null patientid rows recorded for each Hospital_code, broadcast
# back onto every row of that hospital.
# NOTE(review): intent inferred from the column name; use 'nunique' instead of
# 'count' if distinct patients per hospital is what is wanted.
stay_df['hosp_code_patient_count'] = stay_df.groupby('Hospital_code')['patientid'].transform('count')

In [49]:

 # Number of rows per Hospital_code
 # NOTE(review): the statement below starts with a stray leading space; the
 # cell ran in the notebook, but the line would not parse as a plain script.
 stay_df.groupby('Hospital_code').size()


Hospital_code
1      5249
2      5102
3      7116
4      1240
5      5261
6     20425
7      1306
8      3663
9     11510
10     9435
11    17328
12    11297
13     5236
14    17328
15     9257
16     3671
17     5501
18     3630
19    21219
20     1405
21     8150
22     4277
23    26566
24     7992
25     9834
26    33076
27    14244
28    17137
29    11311
30     5002
31     3967
32    10703
dtype: int64

In [53]:
# Number of rows for each (Available_Extra_Rooms_in_Hospital, Hospital_code) pair
stay_df.groupby(['Available_Extra_Rooms_in_Hospital', 'Hospital_code']).size()

Available_Extra_Rooms_in_Hospital  Hospital_code
0                                  10               1
                                   11               1
                                   12               1
                                   15               1
                                   23               2
                                                   ..
13                                 27               2
14                                 12               1
20                                 26               2
21                                 26               3
24                                 27               1
Length: 302, dtype: int64

In [48]:
# Number of rows per patientid
stay_df.groupby('patientid').size()

patientid
1         4
2         2
4         2
6         1
7         4
         ..
131620    9
131621    3
131622    4
131623    2
131624    3
Length: 92017, dtype: int64