<a href="https://colab.research.google.com/github/Geralberrio/Proyecto_Analitica/blob/master/Hospitalizacion_regresion_logistica.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Regresión Logística


In [1]:
import numpy as np
import pandas as pd
from scipy.stats import reciprocal
from matplotlib import pyplot as plt

from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, plot_confusion_matrix
from sklearn.decomposition import PCA

In [2]:
# Fix the seed shared by the stochastic steps so the results are reproducible:
random_state = 42

# Preprocesamiento de datos

In [3]:
# Load the training data (the first CSV row is used as the header) and
# display it:
data_train = pd.read_csv(
    './Hospitalizacion_train_data.csv',
    header=0,
)
data_train

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318433,318434,6,a,6,X,3,radiotherapy,Q,F,4.0,86499,23.0,Emergency,Moderate,3,41-50,4144.0,11-20
318434,318435,24,a,1,X,2,anesthesia,Q,E,4.0,325,8.0,Urgent,Moderate,4,81-90,6699.0,31-40
318435,318436,7,a,4,X,3,gynecology,R,F,4.0,125235,10.0,Emergency,Minor,3,71-80,4235.0,11-20
318436,318437,11,b,2,Y,3,anesthesia,Q,D,3.0,91081,8.0,Trauma,Minor,5,11-20,3761.0,11-20


In [4]:
# Load the test data. It does not include the target variable: the idea is
# to measure how the model behaves on these new cases.
data_test = pd.read_csv(
    './Hospitalizacion_test_data.csv',
    header=0,
)
data_test

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit
0,318439,21,c,3,Z,3,gynecology,S,A,2.0,17006,2.0,Emergency,Moderate,2,71-80,3095.0
1,318440,29,a,4,X,2,gynecology,S,F,2.0,17006,2.0,Trauma,Moderate,4,71-80,4018.0
2,318441,26,b,2,Y,3,gynecology,Q,D,4.0,17006,2.0,Emergency,Moderate,3,71-80,4492.0
3,318442,6,a,6,X,3,gynecology,Q,F,2.0,17006,2.0,Trauma,Moderate,3,71-80,4173.0
4,318443,28,b,11,X,2,gynecology,R,F,2.0,17006,2.0,Trauma,Moderate,4,71-80,4161.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137052,455491,11,b,2,Y,4,anesthesia,Q,D,3.0,41160,3.0,Emergency,Minor,4,41-50,6313.0
137053,455492,25,e,1,X,2,radiotherapy,R,E,4.0,30985,7.0,Emergency,Moderate,2,0-10,3510.0
137054,455493,30,c,3,Z,2,anesthesia,R,A,4.0,81811,12.0,Urgent,Minor,2,0-10,7190.0
137055,455494,5,a,1,X,2,anesthesia,R,E,4.0,57021,10.0,Trauma,Minor,2,41-50,5435.0


In [5]:
# Summary statistics of the training set for every column, numeric or not:
train_summary = data_train.describe(include='all')
train_summary

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
count,318438.0,318438.0,318438,318438.0,318438,318438.0,318438,318438,318438,318325.0,318438.0,313906.0,318438,318438,318438.0,318438,318438.0,318438
unique,,,7,,3,,5,6,6,,,,3,3,,10,,11
top,,,a,,X,,gynecology,R,F,,,,Trauma,Moderate,,41-50,,21-30
freq,,,143425,,133336,,249486,127947,112753,,,,152261,175843,,63749,,87491
mean,159219.5,18.318841,,4.771717,,3.197627,,,,2.625807,65747.579472,7.251859,,,3.284099,,4880.749392,
std,91925.276847,8.633755,,3.102535,,1.168171,,,,0.873146,37979.93644,4.745266,,,1.764061,,1086.776254,
min,1.0,1.0,,1.0,,0.0,,,,1.0,1.0,1.0,,,0.0,,1800.0,
25%,79610.25,11.0,,2.0,,2.0,,,,2.0,32847.0,4.0,,,2.0,,4186.0,
50%,159219.5,19.0,,5.0,,3.0,,,,3.0,65724.5,8.0,,,3.0,,4741.0,
75%,238828.75,26.0,,7.0,,4.0,,,,3.0,98470.0,8.0,,,4.0,,5409.0,


In [6]:
# Summary statistics of the test set for every column, numeric or not:
test_summary = data_test.describe(include='all')
test_summary

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit
count,137057.0,137057.0,137057,137057.0,137057,137057.0,137057,137057,137057,137022.0,137057.0,134900.0,137057,137057,137057.0,137057,137057.0
unique,,,7,,3,,5,6,6,,,,3,3,,10,
top,,,a,,X,,gynecology,R,F,,,,Trauma,Moderate,,41-50,
freq,,,61305,,57513,,107202,54992,48717,,,,65411,75722,,27746,
mean,386967.0,18.343747,,4.758692,,3.192686,,,,2.634489,65877.903515,7.243996,,,3.284531,,4869.731097
std,39565.092259,8.634694,,3.102245,,1.16425,,,,0.869295,37942.997623,4.790625,,,1.77727,,1080.766723
min,318439.0,1.0,,1.0,,0.0,,,,1.0,3.0,1.0,,,0.0,,1800.0
25%,352703.0,11.0,,2.0,,2.0,,,,2.0,32945.0,4.0,,,2.0,,4178.0
50%,386967.0,19.0,,5.0,,3.0,,,,3.0,65786.0,8.0,,,3.0,,4731.0
75%,421231.0,26.0,,7.0,,4.0,,,,3.0,98851.0,8.0,,,4.0,,5398.0


In [7]:
# Count the missing values in each column of the training set:
data_train.isnull().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                             113
patientid                               0
City_Code_Patient                    4532
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
Stay                                    0
dtype: int64

In [8]:
# Count the missing values in each column of the test set:
data_test.isnull().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                              35
patientid                               0
City_Code_Patient                    2157
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
dtype: int64

In [9]:
# Remove the patient's city code, the number of visitors and the case id
# from the training set, since they are considered to add no value to the
# model.
# NOTE(review): `patientid` is kept and therefore ends up as a numeric
# feature — confirm that this is intended.
train_columns_to_drop = ['City_Code_Patient', 'Visitors with Patient', 'case_id']
data_train = data_train.drop(columns=train_columns_to_drop)

In [10]:
# Remove the same three columns from the test set so that it keeps the same
# raw features as the training set:
test_columns_to_drop = ['City_Code_Patient', 'Visitors with Patient', 'case_id']
data_test = data_test.drop(columns=test_columns_to_drop)

In [11]:
# Discard the training rows that still contain a missing value. According to
# the NaN counts shown earlier, once City_Code_Patient is gone the only
# column with gaps is 'Bed Grade'.
data_train = data_train.dropna()

In [12]:
# Discard the test rows that still contain a missing value:
data_test = data_test.dropna()

In [13]:
# Check that the training set has no missing values left:
data_train.isnull().sum()

Hospital_code                        0
Hospital_type_code                   0
City_Code_Hospital                   0
Hospital_region_code                 0
Available Extra Rooms in Hospital    0
Department                           0
Ward_Type                            0
Ward_Facility_Code                   0
Bed Grade                            0
patientid                            0
Type of Admission                    0
Severity of Illness                  0
Age                                  0
Admission_Deposit                    0
Stay                                 0
dtype: int64

In [14]:
# Check that the test set has no missing values left:
data_test.isnull().sum()

Hospital_code                        0
Hospital_type_code                   0
City_Code_Hospital                   0
Hospital_region_code                 0
Available Extra Rooms in Hospital    0
Department                           0
Ward_Type                            0
Ward_Facility_Code                   0
Bed Grade                            0
patientid                            0
Type of Admission                    0
Severity of Illness                  0
Age                                  0
Admission_Deposit                    0
dtype: int64

In [15]:
# One-hot encode the hospital department, the type of admission and the
# severity of illness. No prefix is used, so each new column is named after
# the category value itself.
columnas = ['Department', 'Type of Admission', 'Severity of Illness']
train_dummies = [pd.get_dummies(data_train[name]) for name in columnas]
# The original columns are removed and the indicator columns are appended at
# the end, in the same order as `columnas`.
data_train = pd.concat([data_train.drop(columns=columnas), *train_dummies], axis=1)

In [16]:
# Apply the same un-prefixed one-hot encoding (department, type of admission
# and severity of illness) to the test set, so that it has the same kind of
# columns as the training set when the model is evaluated on it.
columnas_test = ['Department', 'Type of Admission', 'Severity of Illness']
test_dummies = [pd.get_dummies(data_test[name]) for name in columnas_test]
# The original columns are removed and the indicator columns are appended at
# the end, in the same order as `columnas_test`.
data_test = pd.concat([data_test.drop(columns=columnas_test), *test_dummies], axis=1)

In [17]:
# One-hot encode the columns listed below, prefixing every new column with
# the name of its source column: the raw values of these variables (codes,
# grades, ranges) say little on their own.
columnas = [
    'Ward_Type',
    'Ward_Facility_Code',
    'Hospital_type_code',
    'Hospital_region_code',
    'Hospital_code',
    'City_Code_Hospital',
    'Bed Grade',
    'Age',
]
train_prefixed_dummies = [
    pd.get_dummies(data_train[name], prefix=name) for name in columnas
]
# The original columns are removed and the indicator columns are appended at
# the end, in the same order as `columnas`.
data_train = pd.concat(
    [data_train.drop(columns=columnas), *train_prefixed_dummies],
    axis=1,
)

In [18]:
# One-hot encode the columns listed below in the test set, prefixing every
# new column with the name of its source column: the raw values of these
# variables (codes, grades, ranges) say little on their own.
columnas = [
    'Ward_Type',
    'Ward_Facility_Code',
    'Hospital_type_code',
    'Hospital_region_code',
    'Hospital_code',
    'City_Code_Hospital',
    'Bed Grade',
    'Age',
]
test_prefixed_dummies = [
    pd.get_dummies(data_test[name], prefix=name) for name in columnas
]
data_test = pd.concat(
    [data_test.drop(columns=columnas), *test_prefixed_dummies],
    axis=1,
)

# Train and test are encoded independently, so a category that appears in
# only one of them would leave the two frames with different columns (or a
# different column order), and a model fitted on the training features could
# not score the test set. Align the test frame to the training features:
# indicator columns missing from the test set are added as all-zero columns
# and columns unknown to the training set are discarded.
# This relies on `data_train` having already been fully encoded by the
# preceding cells (the notebook is meant to be run top to bottom).
train_feature_columns = data_train.columns.drop('Stay', errors='ignore')
data_test = data_test.reindex(columns=train_feature_columns, fill_value=0)

In [19]:
# Cast the admission deposit of the training set to an integer dtype:
data_train = data_train.astype({'Admission_Deposit': 'int'})

In [20]:
# Cast the admission deposit of the test set to an integer dtype:
data_test = data_test.astype({'Admission_Deposit': 'int'})

In [21]:
# Inspect which values the target variable takes and how often each occurs:
stay_counts = data_train['Stay'].value_counts()
stay_counts

21-30                 87454
11-20                 78120
31-40                 55137
51-60                 35005
0-10                  23602
41-50                 11735
71-80                 10250
More than 100 Days     6681
81-90                  4837
91-100                 2764
61-70                  2740
Name: Stay, dtype: int64

In [22]:
# Ordinal-encode the target variable: the 11 length-of-stay ranges are mapped
# to the integers 0..10, following the order of the list below.
stay_labels = [
    '0-10', '11-20', '21-30', '31-40', '41-50', '51-60',
    '61-70', '71-80', '81-90', '91-100', 'More than 100 Days',
]
stay_codes = {label: code for code, label in enumerate(stay_labels)}

# The encoded column is assigned back explicitly. Calling an inplace method
# on `data_train['Stay']` is chained assignment: with pandas Copy-on-Write it
# modifies a temporary object and leaves `data_train` untouched.
# Values that are not one of the known labels (for instance the integer codes
# when this cell is run a second time) pass through unchanged; the final cast
# fails loudly if an unexpected non-numeric label is present.
data_train['Stay'] = (
    data_train['Stay']
    .map(lambda value: stay_codes.get(value, value))
    .astype('int64')
)


In [23]:
# Class frequencies of the target after the ordinal encoding:
encoded_stay_counts = data_train['Stay'].value_counts()
encoded_stay_counts

2     87454
1     78120
3     55137
5     35005
0     23602
4     11735
7     10250
10     6681
8      4837
9      2764
6      2740
Name: Stay, dtype: int64

In [24]:
# Split the labelled data into a training part and a held-out part (20% of
# the rows). The split is stratified on the target so both parts keep the
# same class proportions, and seeded so it is reproducible.
features = data_train.drop(columns='Stay')
target = data_train['Stay']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=random_state,
)

In [25]:
# Base model as a scikit-learn pipeline: polynomial feature expansion (the
# degree is tuned by the search), standardisation and a logistic regression
# fitted with the 'saga' solver.
# NOTE(review): with degree 2 the expansion multiplies the number of one-hot
# columns considerably; check that the expanded matrix fits in memory.
model_base = Pipeline([
    ('poly', PolynomialFeatures(include_bias=False)),
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(
        solver='saga',
        # `max_iter` is documented as an integer; a float such as 1e3 is
        # rejected by the parameter validation of recent scikit-learn
        # releases.
        max_iter=1000,
        # 'saga' shuffles the data, so it is seeded to keep the results
        # reproducible.
        random_state=random_state,
    )),
])

In [26]:
# Parameter distributions explored by the randomized search (both are
# candidates for further tuning):
#   - poly__degree: degree of the polynomial expansion, 1 or 2.
#   - classifier__C: inverse regularisation strength, sampled from a
#     reciprocal (log-uniform) distribution between 1e-3 and 1e3.
param_distributions = dict(
    poly__degree=[1, 2],
    classifier__C=reciprocal(1e-3, 1e3),
)

In [27]:
# Definamos nuestro modelo mediante RandomizedSearchCV:
model = RandomizedSearchCV(
    model_base,
    cv=3,  #Parámetro entre 3 y 5 (posible mover)
    param_distributions=param_distributions,
    n_iter=50,
    random_state=random_state
)

In [None]:
%%time
# Fit the randomized search on the training split; `%%time` reports how long
# the whole cell takes.
# NOTE(review): this cell is saved without an execution count, so no
# completed run is recorded in the notebook — expect a long running time.
model.fit(X_train, y_train)