In [1]:
import numpy as np
import pandas as pd

import urllib.request
from PIL import Image
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import h2o
from h2o.automl import H2OAutoML

In [2]:
df = pd.read_csv("hospital_train.csv", index_col=0)
df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
161528,6,a,6,X,2,gynecology,R,F,4.0,45810,2.0,Urgent,Moderate,2,21-30,2817.0,0-10
159472,23,a,6,X,4,gynecology,Q,F,2.0,128565,15.0,Trauma,Moderate,4,51-60,4498.0,21-30
309765,2,c,5,Z,2,anesthesia,S,F,3.0,46565,5.0,Urgent,Moderate,2,71-80,4573.0,11-20
279614,32,f,9,Y,3,gynecology,S,B,4.0,124546,6.0,Emergency,Moderate,4,11-20,7202.0,51-60
147791,14,a,1,X,3,gynecology,S,E,2.0,22729,8.0,Urgent,Moderate,2,51-60,3398.0,51-60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237869,12,a,9,Y,3,gynecology,R,B,3.0,82914,3.0,Emergency,Moderate,6,51-60,3966.0,More than 100 Days
254763,28,b,11,X,2,gynecology,R,F,2.0,40026,5.0,Urgent,Moderate,3,21-30,4005.0,51-60
69788,6,a,6,X,3,gynecology,Q,F,3.0,92346,2.0,Trauma,Minor,2,31-40,5215.0,31-40
204442,32,f,9,Y,2,gynecology,S,B,4.0,113798,15.0,Trauma,Moderate,3,41-50,5092.0,11-20


In [3]:
class HospitalEncoder:

    @staticmethod
    def encode_admission(x):
        data = {'Urgent': 2, 'Trauma': 1, 'Emergency': 3}
        return data[x]

    @staticmethod
    def encode_severity(x):
        data = {'Moderate': 2, 'Extreme': 3, 'Minor': 1}
        return data[x]

    @staticmethod
    def encode_age(x):
        data = {'21-30': 2, '51-60': 5, '71-80': 7, '11-20': 1, 
        '31-40': 3, '0-10': 0, '61-70': 6, '41-50': 4, '81-90': 4, '91-100': 9}
        return data[x]

In [4]:
# Quitamos las columnas no relevantes
filtered = df.drop(['1', '3', '4', '7', '8', '10', '11','14','16'], axis=1)

# Aplicamos los encoders
filtered['12'] = filtered['12'].apply(HospitalEncoder.encode_admission)
filtered['13'] = filtered['13'].apply(HospitalEncoder.encode_severity)
filtered['15'] = filtered['15'].apply(HospitalEncoder.encode_age)

# Get dummies
features = filtered[['2', '6']]
features = pd.get_dummies(features)
filtered.drop(['2', '6'], axis=1, inplace=True)
train = pd.concat([filtered, features], axis=1)

# Sustituimos Nans por la media
inputer = SimpleImputer()
train['9'] = inputer.fit_transform(np.array(train['9']).reshape(-1,1))

In [5]:
h2o.init()
h2train = h2o.H2OFrame(train)
columnas = [a for a in h2train.columns if a != "17"][1:]
x = columnas
y = "17"

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM 18.9 (build 11.0.11+9-LTS-194, mixed mode)
  Starting server from C:\Users\jgnsa\AppData\Local\Programs\Python\Python37\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\jgnsa\AppData\Local\Temp\tmpphjxva1g
  JVM stdout: C:\Users\jgnsa\AppData\Local\Temp\tmpphjxva1g\h2o_Jorge_started_from_python.out
  JVM stderr: C:\Users\jgnsa\AppData\Local\Temp\tmpphjxva1g\h2o_Jorge_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/Paris
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.3
H2O_cluster_version_age:,1 month and 6 days
H2O_cluster_name:,H2O_from_python_Jorge_phg4j8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.965 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [6]:
automl = H2OAutoML(max_models=100, seed=42, max_runtime_secs=600, sort_metric='mean_per_class_error')
automl.train(x=x, y=y, training_frame=h2train)

AutoML progress: |
10:47:39.745: AutoML: XGBoost is not available; skipping it.

████████████████████████████████████████████████████████| 100%


In [7]:
print('[INFO] Models leader board:')
leader_board = automl.leaderboard
leader_board.head()

[INFO] Models leader board:


model_id,mean_per_class_error,logloss,rmse,mse,auc,aucpr
DRF_1_AutoML_20210626_104739,0.888526,3.92699,0.809687,0.655593,,
StackedEnsemble_AllModels_AutoML_20210626_104739,0.888602,1.84642,0.806049,0.649716,,
GBM_grid__1_AutoML_20210626_104739_model_5,0.888841,1.85955,0.811415,0.658394,,
GBM_4_AutoML_20210626_104739,0.889149,1.92316,0.83515,0.697475,,
GBM_2_AutoML_20210626_104739,0.889359,1.89137,0.82656,0.683202,,
GBM_grid__1_AutoML_20210626_104739_model_4,0.889564,1.85151,0.810234,0.656479,,
StackedEnsemble_BestOfFamily_AutoML_20210626_104739,0.889658,1.848,0.806584,0.650577,,
GBM_3_AutoML_20210626_104739,0.890224,1.89968,0.829092,0.687394,,
GBM_1_AutoML_20210626_104739,0.890251,1.8885,0.826007,0.682287,,
GBM_5_AutoML_20210626_104739,0.890634,1.95009,0.84275,0.710228,,




In [8]:
h2o.save_model(model=automl.leader, path='h2o_models', force=True)

'D:\\Documentos\\TheBridge\\bridge_datascience_JorgeGarcia\\Kaggle_Hospital\\h2o_models\\DRF_1_AutoML_20210626_104739'

In [9]:
test = pd.read_csv('hospital_test.csv', index_col=0)
# Operaciones de transformación.
# Quitamos las columnas no relevantes
filtered = test.drop(['1', '3', '4', '7', '8', '10', '11', '14','16'], axis=1)

# Aplicamos los encoders
filtered['12'] = filtered['12'].apply(HospitalEncoder.encode_admission)
filtered['13'] = filtered['13'].apply(HospitalEncoder.encode_severity)
filtered['15'] = filtered['15'].apply(HospitalEncoder.encode_age)

# Get dummies
features = filtered[['2', '6']]
features = pd.get_dummies(features)
filtered.drop(['2', '6'], axis=1, inplace=True)
test = pd.concat([filtered, features], axis=1)

# Creamos X
inputer = SimpleImputer()
test['9'] = inputer.fit_transform(np.array(test['9']).reshape(-1,1))
test

Unnamed: 0_level_0,5,9,12,13,15,2_a,2_b,2_c,2_d,2_e,2_f,2_g,6_TB & Chest disease,6_anesthesia,6_gynecology,6_radiotherapy,6_surgery
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
314114,4,2.0,3,1,4,1,0,0,0,0,0,0,0,0,1,0,0
208989,3,2.0,1,2,3,0,0,1,0,0,0,0,0,0,1,0,0
305872,4,4.0,3,1,7,0,0,0,0,1,0,0,0,0,1,0,0
266099,4,2.0,2,3,3,0,0,1,0,0,0,0,1,0,0,0,0
13228,4,1.0,3,1,2,1,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318155,6,3.0,2,1,5,0,0,0,1,0,0,0,0,1,0,0,0
144850,2,2.0,3,2,6,1,0,0,0,0,0,0,0,0,1,0,0
180676,3,3.0,1,1,3,0,0,1,0,0,0,0,0,0,1,0,0
39933,4,2.0,1,3,3,0,1,0,0,0,0,0,0,0,1,0,0


In [10]:
h2test = h2o.H2OFrame(test)
predictions = automl.leader.predict(h2test)
results = pd.DataFrame({'id':test.index, 'days':predictions.as_data_frame()['predict']})

Parse progress: |█████████████████████████████████████████████████████████| 100%
drf prediction progress: |████████████████████████████████████████████████| 100%


In [11]:
def chequeator(df_to_submit):
    """
    Esta función se asegura de que tu submission tenga la forma requerida por Kaggle.
    
    Si es así, se guardará el dataframe en un `csv` y estará listo para subir a Kaggle.
    
    Si no, LEE EL MENSAJE Y HAZLE CASO.
    
    Si aún no:
    - apaga tu ordenador, 
    - date una vuelta, 
    - enciendelo otra vez, 
    - abre este notebook y 
    - leelo todo de nuevo. 
    Todos nos merecemos una segunda oportunidad. También tú.
    """
    sample = pd.read_csv("sample_submission.csv")
    if df_to_submit.shape == sample.shape:
        if df_to_submit.columns.all() == sample.columns.all():
            if df_to_submit.id.all() == sample.id.all():
                print("You're ready to submit!")
                df_to_submit.to_csv("submission.csv", index = False) #muy importante el index = False
                urllib.request.urlretrieve("https://i.kym-cdn.com/photos/images/facebook/000/747/556/27a.jpg", "gfg.png")     
                img = Image.open("gfg.png")
                img.show()   
            else:
                print("Check the ids and try again")
        else:
            print("Check the names of the columns and try again")
    else:
        print("Check the number of rows and/or columns and try again")
        print("\nMensaje secreto de Clara: No me puedo creer que después de todo este notebook hayas hecho algún cambio en las filas de `diamonds_test.csv`. Lloro.")

In [12]:
chequeator(results)

You're ready to submit!


URLError: <urlopen error [Errno 11001] getaddrinfo failed>