<a href="https://colab.research.google.com/github/Geralberrio/Proyecto_Analitica/blob/master/Hospitalizacion_clasificacion_RF_GB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Regresión con Random Forest y Gradient Boosting


In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.decomposition import PCA

from matplotlib import pyplot as plt

In [2]:
# Seed value kept in a single place so that results are reproducible.
random_state = 42

# Preprocesamiento de datos

In [5]:
# Load the training data from a CSV in the working directory; header=0 takes
# the first row of the file as the column names.
data_train = pd.read_csv('./Hospitalizacion_train_data.csv', header=0)
# Bare last expression so the notebook renders a preview of the frame.
data_train

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210982,210983,10,e,1,X,4,gynecology,Q,E,1.0,123564,2.0,Trauma,Extreme,4,51-60,4087.0,21-30
210983,210984,28,b,11,X,2,gynecology,R,F,4.0,123564,2.0,Trauma,Extreme,4,51-60,4683.0,51-60
210984,210985,11,b,2,Y,4,gynecology,Q,D,2.0,123564,2.0,Trauma,Extreme,4,51-60,4570.0,21-30
210985,210986,6,a,6,X,2,gynecology,Q,F,2.0,123564,2.0,Trauma,Extreme,4,51-60,4416.0,21-30


In [3]:
# Test data: these rows do not carry the target variable, because the goal is
# to measure how the model behaves on new, unseen cases.
data_test = pd.read_csv('./Hospitalizacion_test_data.csv', header=0)
# Bare last expression so the notebook renders a preview of the frame.
data_test

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit
0,318439,21,c,3,Z,3,gynecology,S,A,2.0,17006,2.0,Emergency,Moderate,2,71-80,3095.0
1,318440,29,a,4,X,2,gynecology,S,F,2.0,17006,2.0,Trauma,Moderate,4,71-80,4018.0
2,318441,26,b,2,Y,3,gynecology,Q,D,4.0,17006,2.0,Emergency,Moderate,3,71-80,4492.0
3,318442,6,a,6,X,3,gynecology,Q,F,2.0,17006,2.0,Trauma,Moderate,3,71-80,4173.0
4,318443,28,b,11,X,2,gynecology,R,F,2.0,17006,2.0,Trauma,Moderate,4,71-80,4161.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137052,455491,11,b,2,Y,4,anesthesia,Q,D,3.0,41160,3.0,Emergency,Minor,4,41-50,6313.0
137053,455492,25,e,1,X,2,radiotherapy,R,E,4.0,30985,7.0,Emergency,Moderate,2,0-10,3510.0
137054,455493,30,c,3,Z,2,anesthesia,R,A,4.0,81811,12.0,Urgent,Minor,2,0-10,7190.0
137055,455494,5,a,1,X,2,anesthesia,R,E,4.0,57021,10.0,Trauma,Minor,2,41-50,5435.0


In [6]:
# Summary statistics of the training data; include='all' also reports the
# non-numeric columns (unique / top / freq) next to the numeric ones.
data_train.describe(include='all')

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
count,210987.0,210987.0,210987,210987.0,210987,210987.0,210987,210987,210987,210917.0,210987.0,208436.0,210987,210987,210987.0,210987,210987.0,210986
unique,,,7,,3,,5,6,6,,,,3,3,,10,,11
top,,,a,,X,,gynecology,R,F,,,,Trauma,Moderate,,31-40,,21-30
freq,,,95354,,88036,,163712,85157,74843,,,,107036,119849,,42636,,59393
mean,105494.0,18.554925,,4.794731,,3.16182,,,,2.615375,65838.446914,7.055624,,,3.28979,,4911.492618,
std,60906.84496,8.615863,,3.09514,,1.147128,,,,0.870744,37996.420844,4.494443,,,1.7738,,1068.180261,
min,1.0,1.0,,1.0,,0.0,,,,1.0,2.0,1.0,,,0.0,,45.0,
25%,52747.5,11.0,,2.0,,2.0,,,,2.0,32937.5,4.0,,,2.0,,4226.0,
50%,105494.0,19.0,,5.0,,3.0,,,,3.0,65807.0,8.0,,,3.0,,4778.0,
75%,158240.5,26.0,,7.0,,4.0,,,,3.0,98520.5,8.0,,,4.0,,5435.0,


In [7]:
# Summary statistics of the test data; include='all' also reports the
# non-numeric columns (unique / top / freq) next to the numeric ones.
data_test.describe(include='all')

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit
count,137057.0,137057.0,137057,137057.0,137057,137057.0,137057,137057,137057,137022.0,137057.0,134900.0,137057,137057,137057.0,137057,137057.0
unique,,,7,,3,,5,6,6,,,,3,3,,10,
top,,,a,,X,,gynecology,R,F,,,,Trauma,Moderate,,41-50,
freq,,,61305,,57513,,107202,54992,48717,,,,65411,75722,,27746,
mean,386967.0,18.343747,,4.758692,,3.192686,,,,2.634489,65877.903515,7.243996,,,3.284531,,4869.731097
std,39565.092259,8.634694,,3.102245,,1.16425,,,,0.869295,37942.997623,4.790625,,,1.77727,,1080.766723
min,318439.0,1.0,,1.0,,0.0,,,,1.0,3.0,1.0,,,0.0,,1800.0
25%,352703.0,11.0,,2.0,,2.0,,,,2.0,32945.0,4.0,,,2.0,,4178.0
50%,386967.0,19.0,,5.0,,3.0,,,,3.0,65786.0,8.0,,,3.0,,4731.0
75%,421231.0,26.0,,7.0,,4.0,,,,3.0,98851.0,8.0,,,4.0,,5398.0


In [8]:
# Count the missing values in each column of the training data.
data_train.isna().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                              70
patientid                               0
City_Code_Patient                    2551
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
Stay                                    1
dtype: int64

In [9]:
# Count the missing values in each column of the test data.
data_test.isna().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                              35
patientid                               0
City_Code_Patient                    2157
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
dtype: int64

In [10]:
# Discard, in place, the patient's city code, the number of visitors and the
# case id: the authors consider that these columns add no value to the model.
# NOTE(review): `patientid` is not dropped here and stays in the frame as a
# feature - confirm that this is intended.
data_train.drop(
    labels=['City_Code_Patient', 'Visitors with Patient', 'case_id'],
    axis='columns',
    inplace=True,
)

In [11]:
# Discard, in place, the same three columns that are removed from the
# training data: patient's city code, number of visitors and case id.
data_test.drop(
    labels=['City_Code_Patient', 'Visitors with Patient', 'case_id'],
    axis='columns',
    inplace=True,
)

In [12]:
# Remove, in place, every row that has at least one missing value. This is
# broader than "rows with an empty bed grade": a row whose `Stay` is missing
# is removed as well.
data_train.dropna(axis=0, how='any', inplace=True)

In [13]:
# Remove, in place, every test row that has at least one missing value.
# NOTE(review): the dropped rows will get no prediction - confirm that
# discarding test rows (rather than imputing) is acceptable.
data_test.dropna(axis=0, how='any', inplace=True)

In [14]:
# Confirm that no missing values remain in the training data.
data_train.isna().sum()

Hospital_code                        0
Hospital_type_code                   0
City_Code_Hospital                   0
Hospital_region_code                 0
Available Extra Rooms in Hospital    0
Department                           0
Ward_Type                            0
Ward_Facility_Code                   0
Bed Grade                            0
patientid                            0
Type of Admission                    0
Severity of Illness                  0
Age                                  0
Admission_Deposit                    0
Stay                                 0
dtype: int64

In [15]:
# Confirm that no missing values remain in the test data.
data_test.isna().sum()

Hospital_code                        0
Hospital_type_code                   0
City_Code_Hospital                   0
Hospital_region_code                 0
Available Extra Rooms in Hospital    0
Department                           0
Ward_Type                            0
Ward_Facility_Code                   0
Bed Grade                            0
patientid                            0
Type of Admission                    0
Severity of Illness                  0
Age                                  0
Admission_Deposit                    0
dtype: int64

In [16]:
# One-hot encode the hospital department, the type of admission and the
# severity of illness in the training data.
# The empty prefix and separator keep every indicator column named after the
# bare category value; the encoded columns are appended after the remaining
# ones, in the order listed here.
columnas = ['Department', 'Type of Admission', 'Severity of Illness']
data_train = pd.get_dummies(data_train, columns=columnas, prefix='', prefix_sep='')

In [17]:
# Apply the same one-hot encoding (hospital department, type of admission and
# severity of illness) to the test data, so that it can be fed to the model
# without problems at validation time.
# The empty prefix and separator keep every indicator column named after the
# bare category value.
columnas_test = ['Department', 'Type of Admission', 'Severity of Illness']
data_test = pd.get_dummies(data_test, columns=columnas_test, prefix='', prefix_sep='')

In [18]:
# One-hot encode the columns listed below in the training data. Here each
# indicator column is named "<column>_<value>" (the get_dummies default),
# because the raw category values on their own say little about which
# variable they belong to.
columnas = [
    'Ward_Type',
    'Ward_Facility_Code',
    'Hospital_type_code',
    'Hospital_region_code',
    'Hospital_code',
    'City_Code_Hospital',
    'Bed Grade',
    'Age',
]
data_train = pd.get_dummies(data_train, columns=columnas)

In [19]:
# One-hot encode the columns listed below in the test data. Each indicator
# column is named "<column>_<value>" (the get_dummies default), because the
# raw category values on their own say little about which variable they
# belong to.
columnas = [
    'Ward_Type',
    'Ward_Facility_Code',
    'Hospital_type_code',
    'Hospital_region_code',
    'Hospital_code',
    'City_Code_Hospital',
    'Bed Grade',
    'Age',
]
data_test = pd.get_dummies(data_test, columns=columnas)

# Train and test are encoded independently, so a category that appears in only
# one of them would leave the two frames with different columns. Align the test
# frame to the training feature columns (everything except the target `Stay`):
# indicators missing from the test data are added filled with 0, and columns
# the training data does not have are dropped.
columnas_modelo = data_train.columns.drop('Stay', errors='ignore')
data_test = data_test.reindex(columns=columnas_modelo, fill_value=0)

In [20]:
# Cast the admission deposit of the training data to an integer dtype; every
# other column keeps its dtype.
data_train = data_train.astype({'Admission_Deposit': 'int'})

In [21]:
# Cast the admission deposit of the test data to an integer dtype; every
# other column keeps its dtype.
data_test = data_test.astype({'Admission_Deposit': 'int'})

In [22]:
# Inspect which values the target variable takes and how often each occurs.
data_train['Stay'].value_counts()

21-30                 59372
11-20                 49233
31-40                 37338
51-60                 24531
0-10                  13749
41-50                  7744
71-80                  7231
More than 100 Days     4508
81-90                  3442
91-100                 1922
61-70                  1846
Name: Stay, dtype: int64

In [23]:
# Ordinal-encode the target: each of the 11 stay-length labels is replaced by
# its position in the list below (0 for '0-10' ... 10 for 'More than 100 Days').
stay_codes = {
    '0-10': 0,
    '11-20': 1,
    '21-30': 2,
    '31-40': 3,
    '41-50': 4,
    '51-60': 5,
    '61-70': 6,
    '71-80': 7,
    '81-90': 8,
    '91-100': 9,
    'More than 100 Days': 10,
}
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `data_train['Stay']`: that chained in-place
# call acts on a temporary Series and does not update `data_train` when pandas
# Copy-on-Write is active.
# Values that are not a known label are passed through unchanged, so running
# the cell again on already-encoded integers is a no-op; the final cast raises
# if a missing value or an unknown label is still present.
data_train['Stay'] = (
    data_train['Stay']
    .map(lambda etiqueta: stay_codes.get(etiqueta, etiqueta))
    .astype('int64')
)

In [24]:
# Check the distribution of the target again after the ordinal encoding.
data_train['Stay'].value_counts()

2     59372
1     49233
3     37338
5     24531
0     13749
4      7744
7      7231
10     4508
8      3442
9      1922
6      1846
Name: Stay, dtype: int64

In [25]:
# Final training dataset after preprocessing; bare expression so the notebook
# renders a preview of it.
data_train

Unnamed: 0,Available Extra Rooms in Hospital,patientid,Admission_Deposit,Stay,TB & Chest disease,anesthesia,gynecology,radiotherapy,surgery,Emergency,Trauma,Urgent,Extreme,Minor,Moderate,Ward_Type_P,Ward_Type_Q,Ward_Type_R,Ward_Type_S,Ward_Type_T,Ward_Type_U,Ward_Facility_Code_A,Ward_Facility_Code_B,Ward_Facility_Code_C,Ward_Facility_Code_D,Ward_Facility_Code_E,Ward_Facility_Code_F,Hospital_type_code_a,Hospital_type_code_b,Hospital_type_code_c,Hospital_type_code_d,Hospital_type_code_e,Hospital_type_code_f,Hospital_type_code_g,Hospital_region_code_X,Hospital_region_code_Y,Hospital_region_code_Z,Hospital_code_1,Hospital_code_2,Hospital_code_3,...,Hospital_code_18,Hospital_code_19,Hospital_code_20,Hospital_code_21,Hospital_code_22,Hospital_code_23,Hospital_code_24,Hospital_code_25,Hospital_code_26,Hospital_code_27,Hospital_code_28,Hospital_code_29,Hospital_code_30,Hospital_code_31,Hospital_code_32,City_Code_Hospital_1,City_Code_Hospital_2,City_Code_Hospital_3,City_Code_Hospital_4,City_Code_Hospital_5,City_Code_Hospital_6,City_Code_Hospital_7,City_Code_Hospital_9,City_Code_Hospital_10,City_Code_Hospital_11,City_Code_Hospital_13,Bed Grade_1.0,Bed Grade_2.0,Bed Grade_3.0,Bed Grade_4.0,Age_0-10,Age_11-20,Age_21-30,Age_31-40,Age_41-50,Age_51-60,Age_61-70,Age_71-80,Age_81-90,Age_91-100
0,3,31397,4911,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
1,2,31397,5954,4,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2,2,31397,4745,3,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
3,2,31397,7272,4,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
4,2,31397,5558,4,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210981,4,123564,5279,2,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
210982,4,123564,4087,2,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
210983,2,123564,4683,5,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
210984,4,123564,4570,2,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
