Imports

In [1]:
# imports
import warnings
import pandas as pd
import yaml
import mlflow

import h2o
from h2o.automl import H2OAutoML
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch
from src.preprocess import Preprocessor

# Notebook-wide setup.
# Connect to a running H2O instance or start a local one (this prints the cluster summary).
h2o.init()
# Silence UserWarning messages so they do not clutter cell outputs.
warnings.simplefilter('ignore', category=UserWarning)
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.20.1" 2023-08-24; OpenJDK Runtime Environment (build 11.0.20.1+1-post-Ubuntu-0ubuntu120.04); OpenJDK 64-Bit Server VM (build 11.0.20.1+1-post-Ubuntu-0ubuntu120.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.8/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpw_px8y74
  JVM stdout: /tmp/tmpw_px8y74/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpw_px8y74/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Etc/GMT
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.3
H2O_cluster_version_age:,18 days
H2O_cluster_name:,H2O_from_python_unknownUser_3non7r
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.742 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


Load config

In [2]:
# Read the pipeline settings from the YAML config file.
with open('config.yml', 'r') as config_file:
    CONFIG = yaml.safe_load(config_file)

# Pull the commonly used values out of the MAIN_CONFIG section.
# Note: the variable names below are spelled "H20" (digit zero) while the
# config keys are spelled "H2O" (letter O); the names are kept as-is because
# other cells refer to them.
_main_cfg = CONFIG['MAIN_CONFIG']
PROJECT_NAME = _main_cfg['PROJECT_NAME']
PIPELINE_VERSION = _main_cfg['VERSION']
RANDOM_STATE = _main_cfg['RANDOM_STATE']
H20_MAX_MODELS = _main_cfg['H2O_MAX_MODELS']
H20_MAX_RUNTIME = _main_cfg['H2O_MAX_RUNTIME']

Configure mlflow

In [3]:
# Point MLflow at the tracking server.
mlflow_url = "http://172.28.0.3:5000" # adjust this ip number, or use localhost:5000
mlflow.set_tracking_uri(mlflow_url)

# NOTE(review): is_tracking_uri_set() appears to report only that a tracking
# URI has been configured, not that the server is reachable - confirm before
# relying on the "connection established" message.
if not mlflow.is_tracking_uri_set():
    print("Failed to connect to the MLflow server.")
else:
    print(f"The connection to the MLflow server has been established. {mlflow.get_tracking_uri()}")

# Close any run that is still active, select the experiment, and open a new run.
mlflow.end_run()
mlflow.set_experiment(PROJECT_NAME)
mlflow.start_run()

# Log every entry of both config sections as a run parameter,
# prefixed with the section it came from (MAIN.* first, then PREPROCESS.*).
for prefix, section in (('MAIN', 'MAIN_CONFIG'), ('PREPROCESS', 'PREPROCESS_CONFIG')):
    for key, value in CONFIG[section].items():
        mlflow.log_param(f'{prefix}.{key.lower()}', value)

# Attach the raw config file to the run as well.
mlflow.log_artifact('config.yml')

The connection to the MLflow server has been established. http://172.28.0.3:5000


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Load and split data

In [4]:
# Read the labelled training set and separate the target column from the features.
train_data = pd.read_csv('data/train.csv')
y = train_data['Transported']
X = train_data.drop('Transported', axis=1)

# Read the unlabelled test set.
X_test = pd.read_csv('data/test.csv')

# Report the dimensions of each object.
for label, frame in (("X", X), ("y", y), ("X_test", X_test)):
    print(f"{label} shape:", frame.shape)

X shape: (8693, 13)
y shape: (8693,)
X_test shape: (4277, 13)


### Preprocessing

In [5]:
# preprocess train data
# Build the project's Preprocessor from the loaded CONFIG and run it on the
# training features. Per the cell output, process() prints the input shape,
# an NA-fill log, a missing-value summary and the resulting shape.
# NOTE(review): whether process() learns/stores state from this call is not
# visible here - confirm, since the same instance is reused on the test data.
preprocessor = Preprocessor(CONFIG)
X_train_processed = preprocessor.process(X)

df input shape: (8693, 13)

na fill log:

                  action  filled_values_num  remaining_na_in_df
0  VIP_FROM_AGE_TRESHOLD                197                2325

Missing summary:

              Total Missing  Percent (%)
CryoSleep               217     2.496261
ShoppingMall            208     2.392730
HomePlanet              201     2.312205
CabinSide               199     2.289198
CabinNum                199     2.289198
CabinDeck               199     2.289198
VRDeck                  188     2.162660
Spa                     183     2.105142
FoodCourt               183     2.105142
Destination             182     2.093639
RoomService             181     2.082135
Age                     179     2.059128
VIP                       6     0.069021
GroupNum                  0     0.000000
GroupSize                 0     0.000000
Alone                     0     0.000000
WithFamily                0     0.000000
df after preprocessing shape: (8693, 17)


In [6]:
# preprocess test data
# Reuses the Preprocessor instance created for the training data so that both
# sets go through the same process() call.
X_test_processed = preprocessor.process(X_test)  

df input shape: (4277, 13)



na fill log:

                  action  filled_values_num  remaining_na_in_df
0  VIP_FROM_AGE_TRESHOLD                 90                1133

Missing summary:

              Total Missing  Percent (%)
FoodCourt               106     2.478373
Spa                     101     2.361468
CabinDeck               100     2.338087
CabinSide               100     2.338087
CabinNum                100     2.338087
ShoppingMall             98     2.291326
CryoSleep                93     2.174421
Destination              92     2.151040
Age                      91     2.127660
HomePlanet               87     2.034136
RoomService              82     1.917232
VRDeck                   80     1.870470
VIP                       3     0.070143
GroupNum                  0     0.000000
GroupSize                 0     0.000000
Alone                     0     0.000000
WithFamily                0     0.000000
df after preprocessing shape: (4277, 17)


### Find best model with h2o automl

In [7]:
# Convert the preprocessed pandas frames to H2O frames.
# The target Series `y` is appended to the training features so that H2O
# receives it as a regular column of the training frame.
hf_train = h2o.H2OFrame(pd.concat([X_train_processed, y], axis=1))
hf_test = h2o.H2OFrame(X_test_processed)

# Define the response and predictor column names.
# Dedicated names are used instead of rebinding `X` / `y`: overwriting those
# pandas objects with a list / string made this cell (and the preprocessing
# cell) fail when re-run, because `pd.concat` above still needs the Series `y`.
response_col = 'Transported'
predictor_cols = [col for col in hf_train.columns if col != response_col]

# Set up the AutoML parameters
aml = H2OAutoML(max_models=H20_MAX_MODELS,
                seed=RANDOM_STATE,
                # nfolds=H20_NFOLDS, 
                project_name=PROJECT_NAME,
                )

# Train the model
aml.train(x=predictor_cols, y=response_col, training_frame=hf_train)

# Export the leaderboard to HTML and attach it to the MLflow run.
lb = aml.leaderboard
lb_df = lb.as_data_frame()
lb_df.to_html('output/h2o_Transported_leaderboard.html')
mlflow.log_artifact('output/h2o_Transported_leaderboard.html')

# Display the top of the leaderboard as the cell output.
lb.head()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
StackedEnsemble_AllModels_1_AutoML_1_20230910_154812,0.904785,0.379939,0.91746,0.191556,0.351232,0.123364
StackedEnsemble_BestOfFamily_1_AutoML_1_20230910_154812,0.904455,0.3814,0.916558,0.188116,0.351756,0.123732
GBM_2_AutoML_1_20230910_154812,0.901378,0.389684,0.91413,0.19658,0.355013,0.126034
GBM_1_AutoML_1_20230910_154812,0.900877,0.389001,0.914494,0.191859,0.355229,0.126187
GBM_grid_1_AutoML_1_20230910_154812_model_1,0.900045,0.392144,0.912759,0.194423,0.356712,0.127244
GBM_3_AutoML_1_20230910_154812,0.899358,0.39198,0.912735,0.195389,0.356628,0.127183
GBM_5_AutoML_1_20230910_154812,0.899309,0.393064,0.912373,0.195537,0.356981,0.127435
XGBoost_3_AutoML_1_20230910_154812,0.898576,0.393856,0.911878,0.197692,0.357928,0.128112
GBM_grid_1_AutoML_1_20230910_154812_model_2,0.897381,0.398254,0.911287,0.194733,0.359514,0.12925
GBM_4_AutoML_1_20230910_154812,0.897099,0.397304,0.911119,0.199563,0.359319,0.12911


### Log h2o results

In [8]:
# Take the first leaderboard row (the leading model) as a {column: value} dict.
metrics_dict = lb_df.head(1).to_dict(orient='records')[0]

# Numeric leaderboard columns are logged as MLflow metrics; everything else
# (e.g. the model_id string) is logged as a parameter.
# An explicit type check replaces the previous bare `except:`, which treated
# any failure of log_metric - including tracking-server errors or a
# KeyboardInterrupt - as "not a number" and silently retried with log_param.
for metric, value in metrics_dict.items():
    if pd.api.types.is_number(value):
        mlflow.log_metric(f'h2o_Transported_{metric}', value)
    else:
        mlflow.log_param(f'h2o_Transported_{metric}', value)

### Generate H2O kaggle submission file

In [9]:
# Score the test frame with the AutoML leader and bring the result into pandas.
test_predictions_h2o = aml.leader.predict(hf_test)
test_predictions_df = h2o.as_list(test_predictions_h2o)

# Start from a one-column template holding the passenger ids of the raw test set.
passenger_ids = X_test['PassengerId']
submission_template_df = pd.DataFrame({'PassengerId': passenger_ids})

# Attach the predicted label (positionally) to a copy of the template.
submission_h2o = submission_template_df.assign(
    Transported=test_predictions_df['predict'].values
)

# Write the submission CSV; the file name carries the pipeline version.
predictions_filename_h2o = f'output/predictions_H20_v{PIPELINE_VERSION}.csv'
submission_h2o.to_csv(
    predictions_filename_h2o,
    columns=['PassengerId', 'Transported'],
    index=False,
)

# Attach the submission file to the MLflow run.
mlflow.log_artifact(predictions_filename_h2o)


stackedensemble prediction progress: |

███████████████████████████████████████████| (done) 100%


End the MLflow run

In [10]:
# Close the MLflow run that was started in the MLflow configuration cell.
mlflow.end_run()

TODO:

In [11]:
# TODO:
# - Use H2O AutoML / grid search to predict (fill in) missing values: combine the test and train sets, and check feature importance for each column
# - Add an option to run only GridSearch or only AutoML
# - Train a NN on the data filled in with AutoML

# - Fill the test target with the predictions that have the best confidence, then retrain the NN / AutoML
