# Base Transformations

In [1]:
import numpy as np
import pandas as pd

import urllib.request
from PIL import Image
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv("hospital_train.csv", index_col=0)

In [3]:
df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
161528,6,a,6,X,2,gynecology,R,F,4.0,45810,2.0,Urgent,Moderate,2,21-30,2817.0,0-10
159472,23,a,6,X,4,gynecology,Q,F,2.0,128565,15.0,Trauma,Moderate,4,51-60,4498.0,21-30
309765,2,c,5,Z,2,anesthesia,S,F,3.0,46565,5.0,Urgent,Moderate,2,71-80,4573.0,11-20
279614,32,f,9,Y,3,gynecology,S,B,4.0,124546,6.0,Emergency,Moderate,4,11-20,7202.0,51-60
147791,14,a,1,X,3,gynecology,S,E,2.0,22729,8.0,Urgent,Moderate,2,51-60,3398.0,51-60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237869,12,a,9,Y,3,gynecology,R,B,3.0,82914,3.0,Emergency,Moderate,6,51-60,3966.0,More than 100 Days
254763,28,b,11,X,2,gynecology,R,F,2.0,40026,5.0,Urgent,Moderate,3,21-30,4005.0,51-60
69788,6,a,6,X,3,gynecology,Q,F,3.0,92346,2.0,Trauma,Minor,2,31-40,5215.0,31-40
204442,32,f,9,Y,2,gynecology,S,B,4.0,113798,15.0,Trauma,Moderate,3,41-50,5092.0,11-20


In [4]:
class HospitalEncoder:

    @staticmethod
    def encode_admission(x):
        data = {'Urgent': 2, 'Trauma': 1, 'Emergency': 3}
        return data[x]

    @staticmethod
    def encode_severity(x):
        data = {'Moderate': 2, 'Extreme': 3, 'Minor': 1}
        return data[x]

    @staticmethod
    def encode_age(x):
        data = {'21-30': 2, '51-60': 5, '71-80': 7, '11-20': 1, 
        '31-40': 3, '0-10': 0, '61-70': 6, '41-50': 4, '81-90': 4, '91-100': 9}
        return data[x]

In [5]:
# Quitamos las columnas no relevantes
filtered = df.drop(['1', '3', '4', '7', '8','9', '10', '11','14','16'], axis=1)

# Quitamos las filas donde la columna 9 tiene valor nan (33 registros)
#filtered.dropna(inplace=True)

# Aplicamos los encoders
filtered['12'] = filtered['12'].apply(HospitalEncoder.encode_admission)
filtered['13'] = filtered['13'].apply(HospitalEncoder.encode_severity)
filtered['15'] = filtered['15'].apply(HospitalEncoder.encode_age)

# Get dummies
features = filtered[['2', '6']]
features = pd.get_dummies(features)
filtered.drop(['2', '6'], axis=1, inplace=True)
train = pd.concat([filtered, features], axis=1)

# Creamos X e y
X = np.array(train.drop(['17'], axis=1))
y = np.array(train['17'])

# Conjuntos train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

## 1. Gradient Boosting

In [7]:
grad = Pipeline(steps=[
    ('inputer', SimpleImputer()),
    ('standard', StandardScaler()),
    ('grad', GradientBoostingClassifier())
])

grad_params = {
    'grad__random_state': [42, 1, 10, 18122003],
    'grad__loss': ['deviance', 'exponential'],
    'grad__learning_rate': [0.1, 0.01, 0.05, 0.2],
    'grad__n_estimators': [100, 200, 500, 1000],
    'grad__min_samples_split': [2, 3, 5],
    'grad__min_samples_leaf': [1, 2],
    'grad__max_depth': [2, 3, 4, 5]
}

grid = GridSearchCV(grad,
                    grad_params,
                    cv=5,
                    scoring='accuracy',
                    verbose=10)

In [6]:
grad = Pipeline(steps=[
    ('inputer', SimpleImputer()),
    ('grad', GradientBoostingClassifier(learning_rate=0.01, n_estimators=200, verbose=10))
])

In [7]:
grad.fit(X_train, y_train)

y_pred = grad.predict(X_test)
print('Test:', accuracy_score(y_test, y_pred))
y_pred_train = grad.predict(X_train)
print('Train:', accuracy_score(y_train, y_pred_train))

      Iter       Train Loss   Remaining Time 
         1           1.9029            3.92m
         2           1.9020            3.71m
         3           1.9011            3.56m
         4           1.9003            3.60m
         5           1.8995            3.44m
         6           1.8987            3.31m
         7           1.8979            3.24m
         8           1.8971            3.20m
         9           1.8964            3.25m
        10           1.8956            3.23m
        11           1.8949            3.21m
        12           1.8942            3.19m
        13           1.8935            3.17m
        14           1.8928            3.15m
        15           1.8922            3.13m
        16           1.8915            3.11m
        17           1.8909            3.10m
        18           1.8903            3.09m
        19           1.8897            3.07m
        20           1.8891            3.05m
        21           1.8885            3.03m
        2

In [8]:

print('Test:', accuracy_score(y_test, y_pred))

print('Train:', accuracy_score(y_train, y_pred_train))

Test: 0.30735
Train: 0.302775


In [None]:
import pickle

filename = 'hospital_grad_final.model'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(grad, archivo_salida)

## 2. Cat Boosting

In [7]:
wildcat = Pipeline(steps=[
    ('inputer', SimpleImputer()),
    ('grad', CatBoostClassifier(depth=6, learning_rate=0.1, l2_leaf_reg=5, iterations=1200))
])

In [27]:
wildcat.fit(X_train, y_train)
y_pred = wildcat.predict(X_test)
print(accuracy_score(y_test, y_pred))

l: 44.7s	remaining: 21s
817:	learn: 1.7291348	total: 44.8s	remaining: 20.9s
818:	learn: 1.7290584	total: 44.8s	remaining: 20.9s
819:	learn: 1.7289881	total: 44.9s	remaining: 20.8s
820:	learn: 1.7289280	total: 44.9s	remaining: 20.7s
821:	learn: 1.7288317	total: 45s	remaining: 20.7s
822:	learn: 1.7287742	total: 45.1s	remaining: 20.6s
823:	learn: 1.7286936	total: 45.1s	remaining: 20.6s
824:	learn: 1.7286019	total: 45.2s	remaining: 20.5s
825:	learn: 1.7285196	total: 45.2s	remaining: 20.5s
826:	learn: 1.7284575	total: 45.3s	remaining: 20.4s
827:	learn: 1.7283746	total: 45.4s	remaining: 20.4s
828:	learn: 1.7283021	total: 45.4s	remaining: 20.3s
829:	learn: 1.7282188	total: 45.5s	remaining: 20.3s
830:	learn: 1.7281640	total: 45.5s	remaining: 20.2s
831:	learn: 1.7280733	total: 45.6s	remaining: 20.2s
832:	learn: 1.7279583	total: 45.6s	remaining: 20.1s
833:	learn: 1.7278547	total: 45.7s	remaining: 20.1s
834:	learn: 1.7277705	total: 45.8s	remaining: 20s
835:	learn: 1.7276851	total: 45.8s	remaining

In [None]:
import pickle

filename = 'hospital_cat_final.model'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(wildcat, archivo_salida)

## 3. XGBoosting

In [12]:
xgb = Pipeline(steps=[
    ('inputer', SimpleImputer()),
    ('grad', XGBClassifier(eta=0.1, min_child_weight=10))
])

In [28]:
xgb.fit(X_train, y_train)
y_pred = wildcat.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.3117


## 4. Random Forest

In [15]:
forest_gump = Pipeline(steps=[
    ('inputer', SimpleImputer()),
    ('rfc', RandomForestClassifier(n_estimators=100, min_samples_split=10, random_state=42, min_samples_leaf=2, max_depth=20))
])

In [16]:
forest_gump.fit(X_train, y_train)
y_pred = forest_gump.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.2943


## 6. Voting Classifier

In [31]:
estimators = [('grad', grad), ('cat', wildcat), ('xgb', xgb)]
voting = VotingClassifier(estimators=estimators, voting='soft')

In [33]:
voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)
print(accuracy_score(y_test, y_pred))

280733	total: 45.2s	remaining: 20s
832:	learn: 1.7279583	total: 45.2s	remaining: 19.9s
833:	learn: 1.7278547	total: 45.3s	remaining: 19.9s
834:	learn: 1.7277705	total: 45.3s	remaining: 19.8s
835:	learn: 1.7276851	total: 45.4s	remaining: 19.8s
836:	learn: 1.7275815	total: 45.4s	remaining: 19.7s
837:	learn: 1.7274873	total: 45.5s	remaining: 19.6s
838:	learn: 1.7273559	total: 45.5s	remaining: 19.6s
839:	learn: 1.7272660	total: 45.6s	remaining: 19.5s
840:	learn: 1.7272140	total: 45.6s	remaining: 19.5s
841:	learn: 1.7271330	total: 45.7s	remaining: 19.4s
842:	learn: 1.7271033	total: 45.8s	remaining: 19.4s
843:	learn: 1.7270310	total: 45.8s	remaining: 19.3s
844:	learn: 1.7269776	total: 45.9s	remaining: 19.3s
845:	learn: 1.7269194	total: 45.9s	remaining: 19.2s
846:	learn: 1.7268354	total: 46s	remaining: 19.2s
847:	learn: 1.7266822	total: 46s	remaining: 19.1s
848:	learn: 1.7266109	total: 46.1s	remaining: 19.1s
849:	learn: 1.7265313	total: 46.1s	remaining: 19s
850:	learn: 1.7264516	total: 46.2s	

## 7. Final Transformations

In [9]:
def chequeator(df_to_submit):
    """
    Esta función se asegura de que tu submission tenga la forma requerida por Kaggle.
    
    Si es así, se guardará el dataframe en un `csv` y estará listo para subir a Kaggle.
    
    Si no, LEE EL MENSAJE Y HAZLE CASO.
    
    Si aún no:
    - apaga tu ordenador, 
    - date una vuelta, 
    - enciendelo otra vez, 
    - abre este notebook y 
    - leelo todo de nuevo. 
    Todos nos merecemos una segunda oportunidad. También tú.
    """
    sample = pd.read_csv("sample_submission.csv")
    if df_to_submit.shape == sample.shape:
        if df_to_submit.columns.all() == sample.columns.all():
            if df_to_submit.id.all() == sample.id.all():
                print("You're ready to submit!")
                df_to_submit.to_csv("submission.csv", index = False) #muy importante el index = False
                urllib.request.urlretrieve("https://i.kym-cdn.com/photos/images/facebook/000/747/556/27a.jpg", "gfg.png")     
                img = Image.open("gfg.png")
                img.show()   
            else:
                print("Check the ids and try again")
        else:
            print("Check the names of the columns and try again")
    else:
        print("Check the number of rows and/or columns and try again")
        print("\nMensaje secreto de Clara: No me puedo creer que después de todo este notebook hayas hecho algún cambio en las filas de `diamonds_test.csv`. Lloro.")

filepath = 'hospital_test.csv'

def prepare_test(model, filepath):
    df = pd.read_csv(filepath, index_col=0)
    # Operaciones de transformación.
    # Quitamos las columnas no relevantes
    filtered = df.drop(['1', '3', '4', '7', '8', '9','10', '11', '14','16'], axis=1)

    # Aplicamos los encoders
    filtered['12'] = filtered['12'].apply(HospitalEncoder.encode_admission)
    filtered['13'] = filtered['13'].apply(HospitalEncoder.encode_severity)
    filtered['15'] = filtered['15'].apply(HospitalEncoder.encode_age)

    # Get dummies
    features = filtered[['2', '6']]
    features = pd.get_dummies(features)
    filtered.drop(['2', '6'], axis=1, inplace=True)
    test = pd.concat([filtered, features], axis=1)

    # Creamos X
    X = np.array(test)

    # Cambiamos Nan por la media
    sim = SimpleImputer()
    X = sim.fit_transform(X)

    # Cogemos índice de sample_submission.csv
    sample = pd.read_csv('sample_submission.csv')

    # Preparamos dataframe de test.
    predictions_submit = model.predict(X)
    submission = pd.DataFrame({"id": sample.id, "days": predictions_submit.ravel()})
    
    return submission

In [10]:
grad.fit(X, y)
submission = prepare_test(grad, filepath)
chequeator(submission)

      Iter       Train Loss   Remaining Time 
         1           1.8996            3.71m
         2           1.8987            3.56m
         3           1.8979            3.49m
         4           1.8970            3.51m
         5           1.8962            3.46m
         6           1.8954            3.41m
         7           1.8947            3.38m
         8           1.8939            3.35m
         9           1.8931            3.32m
        10           1.8924            3.30m
        11           1.8917            3.27m
        12           1.8910            3.25m
        13           1.8903            3.23m
        14           1.8896            3.20m
        15           1.8890            3.18m
        16           1.8883            3.16m
        17           1.8877            3.14m
        18           1.8871            3.13m
        19           1.8865            3.11m
        20           1.8859            3.09m
        21           1.8853            3.07m
        2

URLError: <urlopen error [WinError 10060] Se produjo un error durante el intento de conexión ya que la parte conectada no respondió adecuadamente tras un periodo de tiempo, o bien se produjo un error en la conexión establecida ya que el host conectado no ha podido responder>

In [81]:
submission = prepare_test(grad, filepath)
chequeator(submission)

You're ready to submit!


In [9]:
wildcat.fit(X, y)
submission = prepare_test(wildcat, filepath)
chequeator(submission)

otal: 55.5s	remaining: 25.7s
820:	learn: 1.7416068	total: 55.6s	remaining: 25.6s
821:	learn: 1.7415192	total: 55.6s	remaining: 25.6s
822:	learn: 1.7414533	total: 55.7s	remaining: 25.5s
823:	learn: 1.7413479	total: 55.7s	remaining: 25.4s
824:	learn: 1.7413008	total: 55.8s	remaining: 25.4s
825:	learn: 1.7412532	total: 55.9s	remaining: 25.3s
826:	learn: 1.7411332	total: 55.9s	remaining: 25.2s
827:	learn: 1.7410443	total: 56s	remaining: 25.2s
828:	learn: 1.7409631	total: 56.1s	remaining: 25.1s
829:	learn: 1.7409158	total: 56.1s	remaining: 25s
830:	learn: 1.7408286	total: 56.2s	remaining: 25s
831:	learn: 1.7407665	total: 56.3s	remaining: 24.9s
832:	learn: 1.7407211	total: 56.3s	remaining: 24.8s
833:	learn: 1.7406648	total: 56.4s	remaining: 24.7s
834:	learn: 1.7405666	total: 56.4s	remaining: 24.7s
835:	learn: 1.7405095	total: 56.5s	remaining: 24.6s
836:	learn: 1.7404500	total: 56.6s	remaining: 24.5s
837:	learn: 1.7403568	total: 56.6s	remaining: 24.5s
838:	learn: 1.7403139	total: 56.7s	remain

NameError: name 'prepare_test' is not defined

In [11]:
submission = prepare_test(wildcat, filepath)
chequeator(submission)

You're ready to submit!


In [14]:
xgb.fit(X, y)
submission = prepare_test(xgb, filepath)
chequeator(submission)

You're ready to submit!


In [17]:
forest_gump.fit(X, y)
submission = prepare_test(forest_gump, filepath)
chequeator(submission)

You're ready to submit!


In [34]:
voting.fit(X, y)
submission = prepare_test(voting, filepath)
chequeator(submission)

	remaining: 32.5s
841:	learn: 1.7401449	total: 1m 16s	remaining: 32.4s
842:	learn: 1.7401066	total: 1m 16s	remaining: 32.3s
843:	learn: 1.7400677	total: 1m 16s	remaining: 32.2s
844:	learn: 1.7399974	total: 1m 16s	remaining: 32.1s
845:	learn: 1.7399590	total: 1m 16s	remaining: 32s
846:	learn: 1.7399252	total: 1m 16s	remaining: 31.9s
847:	learn: 1.7398809	total: 1m 16s	remaining: 31.8s
848:	learn: 1.7398101	total: 1m 16s	remaining: 31.7s
849:	learn: 1.7397571	total: 1m 16s	remaining: 31.6s
850:	learn: 1.7397135	total: 1m 16s	remaining: 31.5s
851:	learn: 1.7396315	total: 1m 16s	remaining: 31.4s
852:	learn: 1.7395712	total: 1m 17s	remaining: 31.4s
853:	learn: 1.7394877	total: 1m 17s	remaining: 31.3s
854:	learn: 1.7394419	total: 1m 17s	remaining: 31.2s
855:	learn: 1.7393575	total: 1m 17s	remaining: 31.1s
856:	learn: 1.7392992	total: 1m 17s	remaining: 31s
857:	learn: 1.7392508	total: 1m 17s	remaining: 30.9s
858:	learn: 1.7391718	total: 1m 17s	remaining: 30.8s
859:	learn: 1.7391190	total: 1m 