## Install h2o
- Java https://www.oracle.com/java/technologies/javase-jdk13-downloads.html (Install v13 or below)
- http://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html#install-in-python
 - (Run in Terminal) pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o

# Import Libraries

In [3]:
#Basic Packages
import pandas as pd
import numpy as numpy

#H2O
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

#Evaluation Packages
from sklearn import metrics
from sklearn.metrics import roc_auc_score

In [5]:
#Initialize H2O
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 13.0.2+8, mixed mode, sharing)
  Starting server from C:\Users\DELL\Anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\DELL\AppData\Local\Temp\tmp1ee1jf21
  JVM stdout: C:\Users\DELL\AppData\Local\Temp\tmp1ee1jf21\h2o_DELL_started_from_python.out
  JVM stderr: C:\Users\DELL\AppData\Local\Temp\tmp1ee1jf21\h2o_DELL_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,Asia/Kolkata
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.1.2
H2O cluster version age:,11 days
H2O cluster name:,H2O_from_python_DELL_83xmvp
H2O cluster total nodes:,1
H2O cluster free memory:,3.965 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


# Import train and test Datasets

In [6]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

## Check for Missing Values

In [7]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

# Treat Missing Values

In [9]:
all = pd.concat([train, test], sort = False)
all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
PassengerId    1309 non-null int64
Survived       891 non-null float64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null object
Age            1046 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1308 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


In [10]:
#Fill Missing numbers with median for Age and Fare
all['Age'] = all['Age'].fillna(value=all['Age'].median())
all['Fare'] = all['Fare'].fillna(value=all['Fare'].median())

#Treat Embarked
all['Embarked'] = all['Embarked'].fillna(value=all['Embarked'].mode()[0])

#Bin Age
all.loc[ all['Age'] <= 16, 'Age'] = 0
all.loc[(all['Age'] > 16) & (all['Age'] <= 32), 'Age'] = 1
all.loc[(all['Age'] > 32) & (all['Age'] <= 48), 'Age'] = 2
all.loc[(all['Age'] > 48) & (all['Age'] <= 64), 'Age'] = 3
all.loc[ all['Age'] > 64, 'Age'] = 4 

#Cabin
all['Cabin'] = all['Cabin'].fillna('Missing')
all['Cabin'] = all['Cabin'].str[0]

#Family Size & Alone 
all['Family_Size'] = all['SibSp'] + all['Parch'] + 1
all['IsAlone'] = 0
all.loc[all['Family_Size']==1, 'IsAlone'] = 1

In [11]:
all.isnull().sum()

PassengerId      0
Survived       418
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         0
Family_Size      0
IsAlone          0
dtype: int64

## Extra Features: Title

In [12]:
#Extract Title from Name
all['Title'] = all['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

In [13]:
all['Title'].value_counts()

Mr          757
Miss        260
Mrs         197
Master       61
Rev           8
Dr            8
Col           4
Mlle          2
Ms            2
Major         2
Dona          1
Sir           1
Mme           1
Capt          1
Countess      1
Lady          1
Jonkheer      1
Don           1
Name: Title, dtype: int64

In [14]:
#We will combine a few categories, since few of them are unique 
all['Title'] = all['Title'].replace(['Capt', 'Dr', 'Major', 'Rev'], 'Officer')
all['Title'] = all['Title'].replace(['Lady', 'Countess', 'Don', 'Sir', 'Jonkheer', 'Dona'], 'Royal')
all['Title'] = all['Title'].replace(['Mlle', 'Ms'], 'Miss')
all['Title'] = all['Title'].replace(['Mme'], 'Mrs')
all['Title'].value_counts()

Mr         757
Miss       264
Mrs        198
Master      61
Officer     19
Royal        6
Col          4
Name: Title, dtype: int64

In [15]:
#Drop unwanted variables
all = all.drop(['Name', 'Ticket'], axis = 1)
all.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Family_Size,IsAlone,Title
0,1,0.0,3,male,1.0,1,0,7.25,M,S,2,0,Mr
1,2,1.0,1,female,2.0,1,0,71.2833,C,C,2,0,Mrs
2,3,1.0,3,female,1.0,0,0,7.925,M,S,1,1,Miss
3,4,1.0,1,female,2.0,1,0,53.1,C,S,2,0,Mrs
4,5,0.0,3,male,2.0,0,0,8.05,M,S,1,1,Mr


## Create Dummy Values
We will drop one of them using drop_first = True

In [16]:
all_dummies = pd.get_dummies(all, drop_first = True)
all_dummies.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Family_Size,IsAlone,Sex_male,...,Cabin_M,Cabin_T,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royal
0,1,0.0,3,1.0,1,0,7.25,2,0,1,...,1,0,0,1,0,0,1,0,0,0
1,2,1.0,1,2.0,1,0,71.2833,2,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,1.0,3,1.0,0,0,7.925,1,1,0,...,1,0,0,1,0,1,0,0,0,0
3,4,1.0,1,2.0,1,0,53.1,2,0,0,...,0,0,0,1,0,0,0,1,0,0
4,5,0.0,3,2.0,0,0,8.05,1,1,1,...,1,0,0,1,0,0,1,0,0,0


## Covert Pandas Dataframe to H2O Frame

In [17]:
all_train = h2o.H2OFrame(all_dummies[all_dummies['Survived'].notna()])
all_test = h2o.H2OFrame(all_dummies[all_dummies['Survived'].isna()])

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


# Train/Test Split

In [18]:
#Get columns names for Building H2O Models
target = 'Survived'
predictors = [f for f in all_train.columns if f not in ['Survived','PassengerId']]

### Diving the dataset into Train, Validation and Test
- **Train:** will be used to build model
- **Validation** is used to help improve the evaluation metric (We will not use this in this kernel)
- **Test** is used to help us evaluate the model we built

In [19]:
train_df, valid_df, test_df = all_train.split_frame(ratios=[0.7, 0.15], seed=2018)

In [20]:
#Covert dtype to factor as per H2O implementation
train_df[target] = train_df[target].asfactor()
valid_df[target] = valid_df[target].asfactor()
test_df[target] = test_df[target].asfactor()

# Build Model

In [21]:
#Check X Variables
predictors

['Pclass',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Family_Size',
 'IsAlone',
 'Sex_male',
 'Cabin_B',
 'Cabin_C',
 'Cabin_D',
 'Cabin_E',
 'Cabin_F',
 'Cabin_G',
 'Cabin_M',
 'Cabin_T',
 'Embarked_Q',
 'Embarked_S',
 'Title_Master',
 'Title_Miss',
 'Title_Mr',
 'Title_Mrs',
 'Title_Officer',
 'Title_Royal']

In [22]:
# initialize the H2O GBM 
gbm = H2OGradientBoostingEstimator()

# train with the initialized model
gbm.train(x=predictors, y=target, training_frame=train_df)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [27]:
#Predict on Test Frame to evaluate how well our model performed
#as_data_frame() converts the data to Pandas DataFrame
test_pred_val = gbm.predict(test_df[predictors])[0].as_data_frame()
test_pred_val

gbm prediction progress: |████████████████████████████████████████████████| 100%


Unnamed: 0,predict
0,1
1,1
2,0
3,0
4,1
...,...
110,1
111,0
112,1
113,0


# Check Accuracy

In [30]:
#Test Accuracy
test_true_val = (test_df[target]).as_data_frame()
prediction_auc = roc_auc_score(test_pred_val, test_true_val)
print(f'Test : {prediction_auc:.3f}')

Test : 0.813


In [31]:
#Predict Train
train_pred_val = gbm.predict(train_df[predictors])[0].as_data_frame()
train_pred_val

#Check Train Accuracy 
train_true_val = (train_df[target]).as_data_frame()
prediction_auc = roc_auc_score(train_pred_val, train_true_val)
print(f'Train : {prediction_auc:.3f}')

gbm prediction progress: |████████████████████████████████████████████████| 100%
Train : 0.915


# Final Predictions for Competition

In [32]:
#Get X Variables from Competition Test Dataset
TestForPred = all_test.drop(['PassengerId', 'Survived'], axis = 1)

In [33]:
#Predict
fin_pred = gbm.predict(TestForPred[predictors])[0].as_data_frame()

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [34]:
#Get Competition Test Ids
PassengerId = all_test['PassengerId'].as_data_frame()

In [35]:
#Make Submission File
h2o_Sub = pd.DataFrame({'PassengerId': PassengerId['PassengerId'].tolist(), 'Survived':fin_pred['predict'].tolist() })
h2o_Sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [36]:
#Export Submission File
h2o_Sub.to_csv("1_h2o_Submission.csv", index = False)