In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
import h2o
from h2o.automl import H2OAutoML

In [2]:
df = pd.read_csv('hospital_train.csv')
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,161528,6,a,6,X,2,gynecology,R,F,4.0,45810,2.0,Urgent,Moderate,2,21-30,2817.0,0-10
1,159472,23,a,6,X,4,gynecology,Q,F,2.0,128565,15.0,Trauma,Moderate,4,51-60,4498.0,21-30
2,309765,2,c,5,Z,2,anesthesia,S,F,3.0,46565,5.0,Urgent,Moderate,2,71-80,4573.0,11-20
3,279614,32,f,9,Y,3,gynecology,S,B,4.0,124546,6.0,Emergency,Moderate,4,11-20,7202.0,51-60
4,147791,14,a,1,X,3,gynecology,S,E,2.0,22729,8.0,Urgent,Moderate,2,51-60,3398.0,51-60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,237869,12,a,9,Y,3,gynecology,R,B,3.0,82914,3.0,Emergency,Moderate,6,51-60,3966.0,More than 100 Days
99996,254763,28,b,11,X,2,gynecology,R,F,2.0,40026,5.0,Urgent,Moderate,3,21-30,4005.0,51-60
99997,69788,6,a,6,X,3,gynecology,Q,F,3.0,92346,2.0,Trauma,Minor,2,31-40,5215.0,31-40
99998,204442,32,f,9,Y,2,gynecology,S,B,4.0,113798,15.0,Trauma,Moderate,3,41-50,5092.0,11-20


In [3]:
colummns = pd.read_csv('columns_meaning.csv')

In [4]:
df.columns = colummns.iloc[:,1]

In [5]:
df

Description,Case_ID registered in Hospital,Unique code for the Hospital,Unique code for the type of Hospital,City Code of the Hospital,Region Code of the Hospital,Number of Extra rooms available in the Hospital,Department overlooking the case,Code for the Ward type,Code for the Ward Facility,Condition of Bed in the Ward,Unique Patient Id,City Code for the patient,Admission Type registered by the Hospital,Severity of the illness recorded at the time of admission,Number of Visitors with the patient,Age of the patient,Deposit at the Admission Time,Stay Days by the patient
0,161528,6,a,6,X,2,gynecology,R,F,4.0,45810,2.0,Urgent,Moderate,2,21-30,2817.0,0-10
1,159472,23,a,6,X,4,gynecology,Q,F,2.0,128565,15.0,Trauma,Moderate,4,51-60,4498.0,21-30
2,309765,2,c,5,Z,2,anesthesia,S,F,3.0,46565,5.0,Urgent,Moderate,2,71-80,4573.0,11-20
3,279614,32,f,9,Y,3,gynecology,S,B,4.0,124546,6.0,Emergency,Moderate,4,11-20,7202.0,51-60
4,147791,14,a,1,X,3,gynecology,S,E,2.0,22729,8.0,Urgent,Moderate,2,51-60,3398.0,51-60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,237869,12,a,9,Y,3,gynecology,R,B,3.0,82914,3.0,Emergency,Moderate,6,51-60,3966.0,More than 100 Days
99996,254763,28,b,11,X,2,gynecology,R,F,2.0,40026,5.0,Urgent,Moderate,3,21-30,4005.0,51-60
99997,69788,6,a,6,X,3,gynecology,Q,F,3.0,92346,2.0,Trauma,Minor,2,31-40,5215.0,31-40
99998,204442,32,f,9,Y,2,gynecology,S,B,4.0,113798,15.0,Trauma,Moderate,3,41-50,5092.0,11-20


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 18 columns):
 #   Column                                                     Non-Null Count   Dtype  
---  ------                                                     --------------   -----  
 0   Case_ID registered in Hospital                             100000 non-null  int64  
 1   Unique code for the Hospital                               100000 non-null  int64  
 2   Unique code for the type of Hospital                       100000 non-null  object 
 3   City Code of the Hospital                                  100000 non-null  int64  
 4   Region Code of the Hospital                                100000 non-null  object 
 5   Number of Extra rooms available in the Hospital            100000 non-null  int64  
 6   Department overlooking the case                            100000 non-null  object 
 7   Code for the Ward type                                     100000 non-null  object 


In [7]:
df = df.drop(['Unique Patient Id', 'Case_ID registered in Hospital'], axis= 1)

In [8]:
df = df.dropna()

In [9]:
df

Description,Unique code for the Hospital,Unique code for the type of Hospital,City Code of the Hospital,Region Code of the Hospital,Number of Extra rooms available in the Hospital,Department overlooking the case,Code for the Ward type,Code for the Ward Facility,Condition of Bed in the Ward,City Code for the patient,Admission Type registered by the Hospital,Severity of the illness recorded at the time of admission,Number of Visitors with the patient,Age of the patient,Deposit at the Admission Time,Stay Days by the patient
0,6,a,6,X,2,gynecology,R,F,4.0,2.0,Urgent,Moderate,2,21-30,2817.0,0-10
1,23,a,6,X,4,gynecology,Q,F,2.0,15.0,Trauma,Moderate,4,51-60,4498.0,21-30
2,2,c,5,Z,2,anesthesia,S,F,3.0,5.0,Urgent,Moderate,2,71-80,4573.0,11-20
3,32,f,9,Y,3,gynecology,S,B,4.0,6.0,Emergency,Moderate,4,11-20,7202.0,51-60
4,14,a,1,X,3,gynecology,S,E,2.0,8.0,Urgent,Moderate,2,51-60,3398.0,51-60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,12,a,9,Y,3,gynecology,R,B,3.0,3.0,Emergency,Moderate,6,51-60,3966.0,More than 100 Days
99996,28,b,11,X,2,gynecology,R,F,2.0,5.0,Urgent,Moderate,3,21-30,4005.0,51-60
99997,6,a,6,X,3,gynecology,Q,F,3.0,2.0,Trauma,Minor,2,31-40,5215.0,31-40
99998,32,f,9,Y,2,gynecology,S,B,4.0,15.0,Trauma,Moderate,3,41-50,5092.0,11-20


In [10]:
df.describe()

Description,Unique code for the Hospital,City Code of the Hospital,Number of Extra rooms available in the Hospital,Condition of Bed in the Ward,City Code for the patient,Number of Visitors with the patient,Deposit at the Admission Time
count,98484.0,98484.0,98484.0,98484.0,98484.0,98484.0,98484.0
mean,18.339578,4.783894,3.195707,2.620446,7.2269,3.291966,4880.696976
std,8.625392,3.104605,1.169959,0.873974,4.717302,1.783924,1082.590128
min,1.0,1.0,0.0,1.0,1.0,0.0,1800.0
25%,11.0,2.0,2.0,2.0,4.0,2.0,4189.0
50%,19.0,5.0,3.0,3.0,8.0,3.0,4741.0
75%,26.0,7.0,4.0,3.0,8.0,4.0,5411.0
max,32.0,13.0,24.0,4.0,38.0,32.0,10842.0


In [11]:
df['Number of Extra rooms available in the Hospital'].unique()

array([ 2,  4,  3,  5,  6,  7,  1,  8, 10,  9,  0, 11, 14, 21, 12, 24],
      dtype=int64)

In [12]:
for x in df.columns:
    print(x)
    print(df[x].unique())
    print('############')

Unique code for the Hospital
[ 6 23  2 32 14 15 12 19 11 21 26 27 29  9  8 28 24 10 17 25 30 18 16 22
  5  1  7 31  4 13  3 20]
Unique code for the type of Hospital
['a' 'c' 'f' 'b' 'd' 'e' 'g']
City Code of the Hospital
[ 6  5  9  1  7  2  3  4 11 13 10]
Region Code of the Hospital
['X' 'Z' 'Y']
Number of Extra rooms available in the Hospital
[ 2  4  3  5  6  7  1  8 10  9  0 11 14 21 12 24]
Department overlooking the case
['gynecology' 'anesthesia' 'radiotherapy' 'TB & Chest disease' 'surgery']
Code for the Ward type
['R' 'Q' 'S' 'P' 'T' 'U']
Code for the Ward Facility
['F' 'B' 'E' 'C' 'D' 'A']
Condition of Bed in the Ward
[4. 2. 3. 1.]
City Code for the patient
[ 2. 15.  5.  6.  8.  7.  4.  1.  9. 23. 31. 12. 22.  3. 30. 16. 10. 32.
 27. 20. 14. 13. 21. 26. 19. 18. 28. 24. 25. 11. 34. 35. 37. 33. 29. 36.
 38.]
Admission Type registered by the Hospital
['Urgent' 'Trauma' 'Emergency']
Severity of the illness recorded at the time of admission
['Moderate' 'Extreme' 'Minor']
Number of Vi

In [13]:
df['City Code for the patient'].dtype

dtype('float64')

In [14]:
for x in df.columns:
    if df[x].dtype == object:
        df[x] = LabelEncoder().fit_transform(df[x])


In [15]:
df

Description,Unique code for the Hospital,Unique code for the type of Hospital,City Code of the Hospital,Region Code of the Hospital,Number of Extra rooms available in the Hospital,Department overlooking the case,Code for the Ward type,Code for the Ward Facility,Condition of Bed in the Ward,City Code for the patient,Admission Type registered by the Hospital,Severity of the illness recorded at the time of admission,Number of Visitors with the patient,Age of the patient,Deposit at the Admission Time,Stay Days by the patient
0,6,0,6,0,2,2,2,5,4.0,2.0,2,2,2,2,2817.0,0
1,23,0,6,0,4,2,1,5,2.0,15.0,1,2,4,5,4498.0,2
2,2,2,5,2,2,1,3,5,3.0,5.0,2,2,2,7,4573.0,1
3,32,5,9,1,3,2,3,1,4.0,6.0,0,2,4,1,7202.0,5
4,14,0,1,0,3,2,3,4,2.0,8.0,2,2,2,5,3398.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,12,0,9,1,3,2,2,1,3.0,3.0,0,2,6,5,3966.0,10
99996,28,1,11,0,2,2,2,5,2.0,5.0,2,2,3,2,4005.0,5
99997,6,0,6,0,3,2,1,5,3.0,2.0,1,1,2,3,5215.0,3
99998,32,5,9,1,2,2,3,1,4.0,15.0,1,2,3,4,5092.0,1


In [16]:
x = np.array(df.drop(['Stay Days by the patient'], axis= 1))
x.shape

(98484, 15)

In [17]:
y = np.array(df['Stay Days by the patient'])
y.shape


(98484,)

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size= 0.2, random_state= 42)

In [19]:
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [24]:
log_reg.score(x_test,y_test)

0.3119764431131644

In [20]:
rand_forest = RandomForestClassifier()
rand_forest.fit(x_train, y_train)

RandomForestClassifier()

In [21]:
rand_forest.score(x_test, y_test)

0.37208711986596943

In [25]:
rand_forest.score(x_train, y_train)

0.999809613261071

In [23]:
cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)
clf = GridSearchCV(estimator=pipe, param_grid=search_space, cv=cv, verbose=0, n_jobs=-1)

In [27]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()
ada = AdaBoostClassifier()

Voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf), ('Adab', ada)], voting= 'hard')

In [28]:
Voting_clf.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC()),
                             ('Adab', AdaBoostClassifier())])

In [29]:
Voting_clf.score(x_test, y_test)

0.35127176727420417

In [30]:
Voting_clf.score(x_train, y_train)

0.48378539606787924

In [31]:
Voting_clf.fit(x, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC()),
                             ('Adab', AdaBoostClassifier())])

In [119]:
x_pred = pd.read_csv('hospital_test.csv')
x_pred

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,314114,19,a,7,Y,4,gynecology,S,C,2.0,59278,8.0,Emergency,Minor,2,41-50,4778.0
1,208989,15,c,5,Z,3,gynecology,S,F,2.0,102253,15.0,Trauma,Moderate,3,31-40,5734.0
2,305872,17,e,1,X,4,gynecology,R,E,4.0,5828,4.0,Emergency,Minor,3,71-80,5064.0
3,266099,3,c,3,Z,4,TB & Chest disease,R,A,2.0,56642,9.0,Urgent,Extreme,4,31-40,3254.0
4,13228,6,a,6,X,4,gynecology,R,F,1.0,116266,8.0,Emergency,Minor,3,21-30,4639.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133739,318155,18,d,13,Y,6,anesthesia,Q,B,3.0,123269,25.0,Urgent,Minor,4,51-60,5089.0
133740,144850,27,a,7,Y,2,gynecology,S,C,2.0,1293,8.0,Emergency,Moderate,3,61-70,6713.0
133741,180676,16,c,3,Z,3,gynecology,R,A,3.0,112921,5.0,Trauma,Minor,3,31-40,5326.0
133742,39933,28,b,11,X,4,gynecology,R,F,2.0,585,2.0,Trauma,Extreme,2,31-40,7072.0


In [122]:
ind_list = [x for x in x_pred['0']]


In [123]:

for i, x in enumerate(colummns.iloc[:-1,1]):
    print(i, x)


0 Case_ID registered in Hospital
1 Unique code for the Hospital
2 Unique code for the type of Hospital
3 City Code of the Hospital
4 Region Code of the Hospital
5 Number of Extra rooms available in the Hospital
6 Department overlooking the case
7 Code for the Ward type
8 Code for the Ward Facility
9 Condition of Bed in the Ward
10 Unique Patient Id
11 City Code for the patient
12 Admission Type registered by the Hospital
13 Severity of the illness recorded at the time of admission
14 Number of Visitors with the patient
15 Age of the patient
16 Deposit at the Admission Time


In [79]:
x_pred = x_pred.drop(['0', '10'], axis= 1)


In [80]:
for x in x_pred.columns:
    if x_pred[x].dtype == object:
        x_pred[x] = LabelEncoder().fit_transform(x_pred[x])

In [81]:
x_pred

Unnamed: 0,1,2,3,4,5,6,7,8,9,11,12,13,14,15,16
0,19,0,7,1,4,2,3,2,2.0,8.0,0,1,2,4,4778.0
1,15,2,5,2,3,2,3,5,2.0,15.0,1,2,3,3,5734.0
2,17,4,1,0,4,2,2,4,4.0,4.0,0,1,3,7,5064.0
3,3,2,3,2,4,0,2,0,2.0,9.0,2,0,4,3,3254.0
4,6,0,6,0,4,2,2,5,1.0,8.0,0,1,3,2,4639.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133739,18,3,13,1,6,1,1,1,3.0,25.0,2,1,4,5,5089.0
133740,27,0,7,1,2,2,3,2,2.0,8.0,0,2,3,6,6713.0
133741,16,2,3,2,3,2,2,0,3.0,5.0,1,1,3,3,5326.0
133742,28,1,11,0,4,2,2,5,2.0,2.0,1,0,2,3,7072.0


In [82]:
x_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133744 entries, 0 to 133743
Data columns (total 15 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   1       133744 non-null  int64  
 1   2       133744 non-null  int32  
 2   3       133744 non-null  int64  
 3   4       133744 non-null  int32  
 4   5       133744 non-null  int64  
 5   6       133744 non-null  int32  
 6   7       133744 non-null  int32  
 7   8       133744 non-null  int32  
 8   9       133704 non-null  float64
 9   11      131927 non-null  float64
 10  12      133744 non-null  int32  
 11  13      133744 non-null  int32  
 12  14      133744 non-null  int64  
 13  15      133744 non-null  int32  
 14  16      133744 non-null  float64
dtypes: float64(3), int32(8), int64(4)
memory usage: 11.2 MB


In [84]:
x_pred.loc[:,'9'].fillna(x_pred.loc[:,'9'].mode()[0], inplace = True)

In [88]:
x_pred.loc[:,'11'].fillna(x_pred.loc[:,'11'].mode()[0], inplace = True)

In [89]:
x_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133744 entries, 0 to 133743
Data columns (total 15 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   1       133744 non-null  int64  
 1   2       133744 non-null  int32  
 2   3       133744 non-null  int64  
 3   4       133744 non-null  int32  
 4   5       133744 non-null  int64  
 5   6       133744 non-null  int32  
 6   7       133744 non-null  int32  
 7   8       133744 non-null  int32  
 8   9       133744 non-null  float64
 9   11      133744 non-null  float64
 10  12      133744 non-null  int32  
 11  13      133744 non-null  int32  
 12  14      133744 non-null  int64  
 13  15      133744 non-null  int32  
 14  16      133744 non-null  float64
dtypes: float64(3), int32(8), int64(4)
memory usage: 11.2 MB


In [90]:
predictions_submit = Voting_clf.predict(x_pred)

In [None]:
y_encoded = La

In [95]:
predictions_submit

array([1, 2, 1, ..., 2, 2, 2])

In [97]:
new_df = pd.read_csv('hospital_train.csv')

In [99]:
new_df.columns = colummns.iloc[:,1]

In [100]:
y_encoder = LabelEncoder()
y_encoder.fit_transform(new_df['Stay Days by the patient'])

array([0, 2, 1, ..., 3, 1, 4])

In [103]:
predictions_submit = y_encoder.inverse_transform(predictions_submit)
predictions_submit

array(['11-20', '21-30', '11-20', ..., '21-30', '21-30', '21-30'],
      dtype=object)

In [124]:
submission = pd.DataFrame({"id": ind_list, "days": predictions_submit})

In [125]:
submission

Unnamed: 0,id,days
0,314114,11-20
1,208989,21-30
2,305872,11-20
3,266099,31-40
4,13228,21-30
...,...,...
133739,318155,11-20
133740,144850,21-30
133741,180676,21-30
133742,39933,21-30


In [117]:
sample = pd.read_csv("sample_submission.csv")

In [118]:
sample

Unnamed: 0,id,days
0,314114,11-20
1,208989,31-40
2,305872,81-90
3,266099,21-30
4,13228,31-40
...,...,...
133739,318155,41-50
133740,144850,21-30
133741,180676,11-20
133742,39933,21-30


In [113]:
def chequeator(df_to_submit):
    """
    Esta función se asegura de que tu submission tenga la forma requerida por Kaggle.
    
    Si es así, se guardará el dataframe en un `csv` y estará listo para subir a Kaggle.
    
    Si no, LEE EL MENSAJE Y HAZLE CASO.
    
    Si aún no:
    - apaga tu ordenador, 
    - date una vuelta, 
    - enciendelo otra vez, 
    - abre este notebook y 
    - leelo todo de nuevo. 
    Todos nos merecemos una segunda oportunidad. También tú.
    """
    if df_to_submit.shape == sample.shape:
        if df_to_submit.columns.all() == sample.columns.all():
            if df_to_submit.id.all() == sample.id.all():
                print("You're ready to submit!")
                submission.to_csv("submission_12.csv", index = False) #muy importante el index = False
                urllib.request.urlretrieve("https://i.kym-cdn.com/photos/images/facebook/000/747/556/27a.jpg", "gfg.png")     
                img = Image.open("gfg.png")
                img.show()   
            else:
                print("Check the ids and try again")
        else:
            print("Check the names of the columns and try again")
    else:
        print("Check the number of rows and/or columns and try again")
        print("\nMensaje secreto de Clara: No me puedo creer que después de todo este notebook hayas hecho algún cambio en las filas de `diamonds_test.csv`. Lloro.")

In [126]:
chequeator(submission)

You're ready to submit!


NameError: name 'urllib' is not defined