In [1]:
import numpy as np
import pandas as pd

import urllib.request
from PIL import Image
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("hospital_train.csv", index_col=0)
df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
161528,6,a,6,X,2,gynecology,R,F,4.0,45810,2.0,Urgent,Moderate,2,21-30,2817.0,0-10
159472,23,a,6,X,4,gynecology,Q,F,2.0,128565,15.0,Trauma,Moderate,4,51-60,4498.0,21-30
309765,2,c,5,Z,2,anesthesia,S,F,3.0,46565,5.0,Urgent,Moderate,2,71-80,4573.0,11-20
279614,32,f,9,Y,3,gynecology,S,B,4.0,124546,6.0,Emergency,Moderate,4,11-20,7202.0,51-60
147791,14,a,1,X,3,gynecology,S,E,2.0,22729,8.0,Urgent,Moderate,2,51-60,3398.0,51-60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237869,12,a,9,Y,3,gynecology,R,B,3.0,82914,3.0,Emergency,Moderate,6,51-60,3966.0,More than 100 Days
254763,28,b,11,X,2,gynecology,R,F,2.0,40026,5.0,Urgent,Moderate,3,21-30,4005.0,51-60
69788,6,a,6,X,3,gynecology,Q,F,3.0,92346,2.0,Trauma,Minor,2,31-40,5215.0,31-40
204442,32,f,9,Y,2,gynecology,S,B,4.0,113798,15.0,Trauma,Moderate,3,41-50,5092.0,11-20


In [3]:
class HospitalEncoder:

    @staticmethod
    def encode_admission(x):
        data = {'Urgent': 2, 'Trauma': 1, 'Emergency': 3}
        return data[x]

    @staticmethod
    def encode_severity(x):
        data = {'Moderate': 2, 'Extreme': 3, 'Minor': 1}
        return data[x]

    @staticmethod
    def encode_age(x):
        data = {'21-30': 2, '51-60': 5, '71-80': 7, '11-20': 1, 
        '31-40': 3, '0-10': 0, '61-70': 6, '41-50': 4, '81-90': 4, '91-100': 9}
        return data[x]

In [7]:
df['17'].unique()

array(['0-10', '21-30', '11-20', '51-60', '31-40', '71-80',
       'More than 100 Days', '41-50', '81-90', '61-70', '91-100'],
      dtype=object)

In [13]:
# Quitamos las columnas no relevantes
filtered = df.drop(['1', '3', '4', '7', '8', '10', '11','14','16'], axis=1)

# Quitamos las filas donde la columna 9 tiene valor nan (33 registros)
#filtered.dropna(inplace=True)

# Aplicamos los encoders
filtered['12'] = filtered['12'].apply(HospitalEncoder.encode_admission)
filtered['13'] = filtered['13'].apply(HospitalEncoder.encode_severity)
filtered['15'] = filtered['15'].apply(HospitalEncoder.encode_age)

# Get dummies
features = filtered[['2', '6']]
features = pd.get_dummies(features)
filtered.drop(['2', '6'], axis=1, inplace=True)
train = pd.concat([filtered, features], axis=1)

# Creamos X e y
X = np.array(train.drop(['17'], axis=1))
y = np.array(train['17'])

# Conjuntos train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [17]:
def percentage_null(df):
    data = {'Column':[], '%_Null':[]}
    for col in df.columns:
        data['Column'].append(col)
        data['%_Null'].append((df[col].isnull().sum())*100/len(df))
    return pd.DataFrame(data)

nulls = percentage_null(df)

In [55]:
nulls[nulls['%_Null'] != 0]

Unnamed: 0,Column,%_Null
8,9,0.033
10,11,1.483


In [11]:
meaning = pd.read_csv('columns_meaning.csv')
meaning

Unnamed: 0,Column,Description
0,0,Case_ID registered in Hospital
1,1,Unique code for the Hospital
2,2,Unique code for the type of Hospital
3,3,City Code of the Hospital
4,4,Region Code of the Hospital
5,5,Number of Extra rooms available in the Hospital
6,6,Department overlooking the case
7,7,Code for the Ward type
8,8,Code for the Ward Facility
9,9,Condition of Bed in the Ward


In [5]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [98]:
rfc = Pipeline(steps=[
    ('inputer', SimpleImputer()),
    ('rfc', RandomForestClassifier(random_state=42))
])

rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.3267480244073222


In [93]:
grad = Pipeline(steps=[
    ('inputer', SimpleImputer()),
    ('grad', GradientBoostingClassifier(random_state=42))
])

grad.fit(X_train, y_train)
y_pred = grad.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.30804241272381716


In [94]:
wildcat = Pipeline(steps=[
    ('inputer', SimpleImputer()),
    ('cat', CatBoostClassifier(logging_level='Silent'))
])

wildcat.fit(X_train, y_train)
y_pred = wildcat.predict(X_test)
print(accuracy_score(y_test, y_pred))

9698	total: 39.9s	remaining: 25.2s
613:	learn: 1.7348063	total: 40s	remaining: 25.1s
614:	learn: 1.7346965	total: 40.1s	remaining: 25.1s
615:	learn: 1.7345934	total: 40.1s	remaining: 25s
616:	learn: 1.7344809	total: 40.2s	remaining: 24.9s
617:	learn: 1.7344224	total: 40.3s	remaining: 24.9s
618:	learn: 1.7343597	total: 40.3s	remaining: 24.8s
619:	learn: 1.7342650	total: 40.4s	remaining: 24.7s
620:	learn: 1.7341711	total: 40.4s	remaining: 24.7s
621:	learn: 1.7341189	total: 40.5s	remaining: 24.6s
622:	learn: 1.7340408	total: 40.6s	remaining: 24.5s
623:	learn: 1.7339093	total: 40.6s	remaining: 24.5s
624:	learn: 1.7338392	total: 40.7s	remaining: 24.4s
625:	learn: 1.7337924	total: 40.8s	remaining: 24.4s
626:	learn: 1.7337020	total: 40.8s	remaining: 24.3s
627:	learn: 1.7335835	total: 40.9s	remaining: 24.2s
628:	learn: 1.7334501	total: 41s	remaining: 24.2s
629:	learn: 1.7333836	total: 41s	remaining: 24.1s
630:	learn: 1.7333215	total: 41.1s	remaining: 24s
631:	learn: 1.7332057	total: 41.1s	rema

In [95]:
xgb = Pipeline(steps=[
    ('inputer', SimpleImputer()),
    ('xgb', XGBClassifier())
])

xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.30614184255276583


In [11]:
R = Pipeline(steps=[
    ('inputer', SimpleImputer()),
    ('rfc', RandomForestClassifier(random_state=42))
])

G = Pipeline(steps=[
    ('inputer', SimpleImputer()),
    ('grad', GradientBoostingClassifier(random_state=42))
])

X = Pipeline(steps=[
    ('inputer', SimpleImputer()),
    ('xgb', XGBClassifier())
])

C = Pipeline(steps=[
    ('inputer', SimpleImputer()),
    ('cat', CatBoostClassifier(logging_level='Silent'))
])


estimators = [('G', G), ('X', X), ('C', C)]
voting = VotingClassifier(estimators, voting='soft')

voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.30989296789036713


In [130]:
y_test.shape

(19994, 1)

In [131]:
X_test.shape

(19994, 17)

In [133]:
y_pred1 = voting.predict(X_test)
print(accuracy_score(y_test, y_pred1))

  return array(a, dtype, copy=False, order=order)


ValueError: could not broadcast input array from shape (19994,1) into shape (19994)

In [17]:
def chequeator(df_to_submit):
    """
    Esta función se asegura de que tu submission tenga la forma requerida por Kaggle.
    
    Si es así, se guardará el dataframe en un `csv` y estará listo para subir a Kaggle.
    
    Si no, LEE EL MENSAJE Y HAZLE CASO.
    
    Si aún no:
    - apaga tu ordenador, 
    - date una vuelta, 
    - enciendelo otra vez, 
    - abre este notebook y 
    - leelo todo de nuevo. 
    Todos nos merecemos una segunda oportunidad. También tú.
    """
    sample = pd.read_csv("sample_submission.csv")
    if df_to_submit.shape == sample.shape:
        if df_to_submit.columns.all() == sample.columns.all():
            if df_to_submit.id.all() == sample.id.all():
                print("You're ready to submit!")
                df_to_submit.to_csv("submission.csv", index = False) #muy importante el index = False
                urllib.request.urlretrieve("https://i.kym-cdn.com/photos/images/facebook/000/747/556/27a.jpg", "gfg.png")     
                img = Image.open("gfg.png")
                img.show()   
            else:
                print("Check the ids and try again")
        else:
            print("Check the names of the columns and try again")
    else:
        print("Check the number of rows and/or columns and try again")
        print("\nMensaje secreto de Clara: No me puedo creer que después de todo este notebook hayas hecho algún cambio en las filas de `diamonds_test.csv`. Lloro.")

filepath = 'hospital_test.csv'

def prepare_test(model, filepath):
    df = pd.read_csv(filepath, index_col=0)
    # Operaciones de transformación.
    # Quitamos las columnas no relevantes
    filtered = df.drop(['1', '3', '4', '7', '8', '10', '11', '14','16'], axis=1)

    # Aplicamos los encoders
    filtered['12'] = filtered['12'].apply(HospitalEncoder.encode_admission)
    filtered['13'] = filtered['13'].apply(HospitalEncoder.encode_severity)
    filtered['15'] = filtered['15'].apply(HospitalEncoder.encode_age)

    # Get dummies
    features = filtered[['2', '6']]
    features = pd.get_dummies(features)
    filtered.drop(['2', '6'], axis=1, inplace=True)
    test = pd.concat([filtered, features], axis=1)

    # Creamos X
    X = np.array(test)

    # Cambiamos Nan por la media
    sim = SimpleImputer()
    X = sim.fit_transform(X)

    # Cogemos índice de sample_submission.csv
    sample = pd.read_csv('sample_submission.csv')

    # Preparamos dataframe de test.
    predictions_submit = model.predict(X)
    submission = pd.DataFrame({"id": sample.id, "days": predictions_submit.ravel()})
    
    return submission

In [16]:
import pickle
'''
filename = 'hospital_cat_grad_xgb.model'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(voting, archivo_salida)
'''
filepath = 'hospital_test.csv'
sim = SimpleImputer()
X_processed = sim.fit_transform(X)
voting.fit(X_processed, y)
submission = prepare_test(voting, filepath)
chequeator(submission)

You're ready to submit!


In [111]:
submission = prepare_test(wildcat, filepath)
chequeator(submission)

You're ready to submit!


In [14]:
sim = SimpleImputer()
sim.fit_transform(X_train)

array([[3., 3., 1., ..., 0., 1., 0.],
       [2., 2., 1., ..., 1., 0., 0.],
       [3., 2., 2., ..., 1., 0., 0.],
       ...,
       [3., 3., 1., ..., 1., 0., 0.],
       [4., 2., 1., ..., 1., 0., 0.],
       [3., 2., 3., ..., 1., 0., 0.]])

In [15]:
wildcat = Pipeline(steps=[
    ('inputer', SimpleImputer()),
    ('cat', CatBoostClassifier())
])

wildcat.fit(X_train, y_train)
y_pred = wildcat.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.3125


# H2O

In [18]:
from sklearn.preprocessing import LabelEncoder
import h2o
from h2o.automl import H2OAutoML

In [24]:
inputer = SimpleImputer()
train['9'] = inputer.fit_transform(np.array(train['9']).reshape(-1,1))
train.head()

Unnamed: 0_level_0,5,9,12,13,15,17,2_a,2_b,2_c,2_d,2_e,2_f,2_g,6_TB & Chest disease,6_anesthesia,6_gynecology,6_radiotherapy,6_surgery
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
161528,2,4.0,2,2,2,0-10,1,0,0,0,0,0,0,0,0,1,0,0
159472,4,2.0,1,2,5,21-30,1,0,0,0,0,0,0,0,0,1,0,0
309765,2,3.0,2,2,7,11-20,0,0,1,0,0,0,0,0,1,0,0,0
279614,3,4.0,3,2,1,51-60,0,0,0,0,0,1,0,0,0,1,0,0
147791,3,2.0,2,2,5,51-60,1,0,0,0,0,0,0,0,0,1,0,0


In [25]:
h2o.init()
h2train = h2o.H2OFrame(train)
columnas = [a for a in h2train.columns if a != "17"][1:]
x = columnas
y = "17"

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM 18.9 (build 11.0.11+9-LTS-194, mixed mode)
  Starting server from C:\Users\jgnsa\AppData\Local\Programs\Python\Python37\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\jgnsa\AppData\Local\Temp\tmpcea8s21m
  JVM stdout: C:\Users\jgnsa\AppData\Local\Temp\tmpcea8s21m\h2o_Jorge_started_from_python.out
  JVM stderr: C:\Users\jgnsa\AppData\Local\Temp\tmpcea8s21m\h2o_Jorge_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/Paris
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.3
H2O_cluster_version_age:,1 month and 6 days
H2O_cluster_name:,H2O_from_python_Jorge_x0h9ig
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.965 Gb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [48]:
automl = H2OAutoML(max_models=50, seed=42, max_runtime_secs=300, sort_metric='mean_per_class_error')
automl.train(x=x, y=y, training_frame=h2train)

AutoML progress: |
22:05:37.554: AutoML: XGBoost is not available; skipping it.

████████████████████████████████████████████████████████| 100%


In [49]:
print('[INFO] Models leader board:')
leader_board = automl.leaderboard
leader_board.head()

[INFO] Models leader board:


model_id,mean_per_class_error,logloss,rmse,mse,auc,aucpr
DRF_1_AutoML_20210625_220537,0.88766,4.36615,0.810293,0.656574,,
GBM_grid__1_AutoML_20210625_220537_model_4,0.888217,2.20092,0.886738,0.786304,,
StackedEnsemble_AllModels_AutoML_20210625_220537,0.889321,1.84695,0.806156,0.649887,,
GBM_4_AutoML_20210625_220537,0.889737,2.05132,0.863671,0.745928,,
GBM_2_AutoML_20210625_220537,0.889796,2.01377,0.856825,0.734148,,
GBM_grid__1_AutoML_20210625_220537_model_1,0.889832,1.852,0.810218,0.656454,,
GBM_grid__1_AutoML_20210625_220537_model_2,0.890044,1.89918,0.82803,0.685633,,
GBM_5_AutoML_20210625_220537,0.890322,2.08404,0.869738,0.756444,,
GBM_1_AutoML_20210625_220537,0.890749,1.99433,0.852916,0.727465,,
StackedEnsemble_BestOfFamily_AutoML_20210625_220537,0.890752,1.85188,0.807425,0.651935,,




In [40]:
test = pd.read_csv('hospital_test.csv', index_col=0)
# Operaciones de transformación.
# Quitamos las columnas no relevantes
filtered = test.drop(['1', '3', '4', '7', '8', '10', '11', '14','16'], axis=1)

# Aplicamos los encoders
filtered['12'] = filtered['12'].apply(HospitalEncoder.encode_admission)
filtered['13'] = filtered['13'].apply(HospitalEncoder.encode_severity)
filtered['15'] = filtered['15'].apply(HospitalEncoder.encode_age)

# Get dummies
features = filtered[['2', '6']]
features = pd.get_dummies(features)
filtered.drop(['2', '6'], axis=1, inplace=True)
test = pd.concat([filtered, features], axis=1)

# Creamos X
inputer = SimpleImputer()
test['9'] = inputer.fit_transform(np.array(test['9']).reshape(-1,1))
test

Unnamed: 0_level_0,5,9,12,13,15,2_a,2_b,2_c,2_d,2_e,2_f,2_g,6_TB & Chest disease,6_anesthesia,6_gynecology,6_radiotherapy,6_surgery
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
314114,4,2.0,3,1,4,1,0,0,0,0,0,0,0,0,1,0,0
208989,3,2.0,1,2,3,0,0,1,0,0,0,0,0,0,1,0,0
305872,4,4.0,3,1,7,0,0,0,0,1,0,0,0,0,1,0,0
266099,4,2.0,2,3,3,0,0,1,0,0,0,0,1,0,0,0,0
13228,4,1.0,3,1,2,1,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318155,6,3.0,2,1,5,0,0,0,1,0,0,0,0,1,0,0,0
144850,2,2.0,3,2,6,1,0,0,0,0,0,0,0,0,1,0,0
180676,3,3.0,1,1,3,0,0,1,0,0,0,0,0,0,1,0,0
39933,4,2.0,1,3,3,0,1,0,0,0,0,0,0,0,1,0,0


In [41]:
h2test = h2o.H2OFrame(test)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [50]:
predictions = automl.leader.predict(h2test)

drf prediction progress: |████████████████████████████████████████████████| 100%


In [43]:
predictions

predict,0-10,11-20,21-30,31-40,41-50,51-60,61-70,71-80,81-90,91-100,More than 100 Days
11-20,0.097719,0.243708,0.207902,0.119254,0.0569011,0.0649143,0.0406576,0.0444299,0.0420169,0.0406512,0.0418469
21-30,0.064805,0.116454,0.266677,0.153936,0.0668626,0.0971287,0.0434597,0.0579094,0.0438132,0.0440445,0.0449095
11-20,0.105562,0.180165,0.157403,0.155248,0.0608578,0.0931876,0.045579,0.0529422,0.0512261,0.0516718,0.0461566
21-30,0.0796335,0.136689,0.242062,0.121854,0.0500898,0.117185,0.0425033,0.0608948,0.0538979,0.0407633,0.0544266
11-20,0.166062,0.223186,0.183456,0.100832,0.0565383,0.0715933,0.0392016,0.0433106,0.0389993,0.0384742,0.0383462
11-20,0.0776222,0.192272,0.182939,0.145463,0.0578198,0.0940424,0.0439316,0.0592519,0.0512658,0.0437337,0.0516598
21-30,0.0782714,0.140636,0.256238,0.136611,0.0629971,0.0899258,0.0434048,0.0577149,0.0444525,0.0449515,0.0447964
11-20,0.111688,0.241189,0.213193,0.0958381,0.0596201,0.0698285,0.0398717,0.0426497,0.0407053,0.0400877,0.0453298
11-20,0.0678554,0.161468,0.150132,0.137066,0.0636075,0.112252,0.0565984,0.0746397,0.0528309,0.0530824,0.0704685
11-20,0.106201,0.195854,0.175788,0.128701,0.0564147,0.09617,0.0453129,0.055687,0.0496899,0.0440256,0.0461567




In [51]:
results = pd.DataFrame({'id':test.index, 'days':predictions.as_data_frame()['predict']})
results

Unnamed: 0,id,days
0,314114,11-20
1,208989,21-30
2,305872,31-40
3,266099,11-20
4,13228,21-30
...,...,...
133739,318155,0-10
133740,144850,11-20
133741,180676,21-30
133742,39933,21-30


In [52]:
chequeator(results)

You're ready to submit!


In [53]:
h2o.save_model(model=automl.leader, path='h2o_models', force=True)

'D:\\Documentos\\TheBridge\\bridge_datascience_JorgeGarcia\\Kaggle_Hospital\\h2o_models\\DRF_1_AutoML_20210625_220537'