In [10]:
import kaggle
import pandas as pd
import numpy as np
import h2o
from h2o.automl import H2OAutoML
import mlflow

In [None]:
!kaggle competitions download -c spaceship-titanic

In [None]:
h2o.init()

In [None]:
trainRawDF = pd.read_csv('train.csv')
testRawDF = pd.read_csv('test.csv')
def curate(df):
    df[['Deck', 'CabinNum', 'Side']] = df['Cabin'].str.split('/', expand=True, n=3)
    df['CabinNumLen'] = df['CabinNum'].str.len()
    df['CabinNum'] = pd.to_numeric(df['CabinNum'], errors='coerce')
    df['CabinRegion'] = pd.qcut(df['CabinNum'], q=7)
    df['AgeDecile'] = pd.qcut(df['Age'], q=10)

    df[['FirstName', 'LastName']] = df['Name'].str.split(' ', expand=True, n=2)
    df['GroupNum'] = df['PassengerId'].apply(lambda x: x.split('_')[0]).astype(int)

    df['FamilySize'] = df.groupby(['LastName'])['LastName'].transform('size')
    df['GroupSize'] = df.groupby(['GroupNum'])['GroupNum'].transform('size')
    df['CabinSize'] = df.groupby(['CabinNum'])['CabinNum'].transform('size')

    df['GroupSize'] = df.groupby(['GroupNum'])['GroupNum'].transform('size')
    df['CabinSize'] = df.groupby(['CabinNum'])['CabinNum'].transform('size') 

    df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].fillna(0, inplace=True)
    df['Expenditure'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']
    df['LogExpenditure'] = np.log(df['Expenditure'] + 1)
    df['ZeroExpense'] = df['Expenditure'] == 0
    return df

trainProcessedDF = curate(trainRawDF)
testProcessedDF = curate(testRawDF)

log.info(trainProcessedDF.shape)
trainProcessedDF.describe()

In [None]:
trainHF = h2o.H2OFrame(trainProcessedDF)
testHF = h2o.H2OFrame(testProcessedDF)

trainHF.describe()
x = trainHF.columns
y = "Transported"
trainHF[y] = trainHF[y].asfactor()
x.remove(y)

In [None]:
params = {
    "max_models": 10,  # Number of models to train
    "seed": 42,  # Random seed
    "max_runtime_secs": 7200,  # Time in seconds
    "sort_metric": "accuracy"
}

aml = H2OAutoML(**params)
aml.train(x=x, y=y, training_frame=trainHF)
log.info(aml.leaderboard)
model_path = h2o.save_model(model=aml.leader, path="/tmp/mymodel", force=True)
log.info(model_path)

# Raw: Accuracy 79.1 - 80.1%
# Curated: Accuracy 79.4 - 80.9%

In [20]:
log.info(aml.leaderboard)
model_path = h2o.save_model(model=aml.leader, path="/tmp/mymodel", force=True)
log.info(model_path)

model_id                                       accuracy       auc    logloss     aucpr    mean_per_class_error      rmse       mse
GLM_1_AutoML_1_20250225_170628                 0.792822  0.882312   0.429498  0.892951                0.216531  0.37431   0.140108
DRF_1_AutoML_1_20250225_170628                 0.799494  0.887478   0.431241  0.899816                0.212823  0.368555  0.135833
DeepLearning_1_AutoML_1_20250225_170628        0.800299  0.887558   0.426185  0.899028                0.209818  0.371594  0.138082
XRT_1_AutoML_1_20250225_170628                 0.80237   0.885706   0.426571  0.898413                0.20144   0.369724  0.136696
GBM_5_AutoML_1_20250225_170628                 0.803405  0.897829   0.396356  0.910413                0.19707   0.359067  0.128929
GBM_grid_1_AutoML_1_20250225_170628_model_1    0.80352   0.894891   0.402798  0.908003                0.200435  0.361557  0.130723
GBM_1_AutoML_1_20250225_170628                 0.805131  0.894592   0.402693  0.908

In [8]:
model = h2o.load_model(model_path)

In [None]:
log.info(model.varimp(use_pandas=True))
model.varimp_plot()

In [None]:
test = curate(pd.read_csv('test.csv'))
test = h2o.H2OFrame(test)
test.describe()
preds = aml.leader.predict(test)
preds.describe()
fullPredsHF = test.cbind(preds)

In [None]:
fullPredsDF = fullPredsHF.as_data_frame()
fullPredsDF = fullPredsDF[['PassengerId', 'predict']]
fullPredsDF = fullPredsDF.rename(columns={'predict': 'Transported'})
fullPredsDF.to_csv('submission.csv', index=False)

In [19]:
# BEFORE RUNNING THIS CELL. In terminal, run "mlflow UI". Can check "http://localhost:5000" to inspect state.
mlflow.set_tracking_uri("http://localhost:5000")
experiment = mlflow.set_experiment("Titanic4")
log.info(f"Experiment_id: {experiment.experiment_id}")
log.info(f"Artifact Location: {experiment.artifact_location}")
log.info(f"Lifecycle_stage: {experiment.lifecycle_stage}")
log.info(f"Tracking uri: {mlflow.get_tracking_uri()}")

acc = model.accuracy()
if isinstance(model.accuracy(), list):
    acc = acc[0][1]

with mlflow.start_run():
    mlflow.log_param("max_models", params["max_models"])
    mlflow.log_param("seed", params["seed"])
    mlflow.log_param("max_runtime_secs", params["max_runtime_secs"])
    mlflow.log_metric("logloss", model.logloss())
    mlflow.log_metric("auc",model.auc())
    mlflow.log_metric("accuracy", acc)
    mlflow.h2o.log_model(model, "model", pip_requirements="../requirements.txt")

    model_uri = mlflow.get_artifact_uri("model")
    log.info(f'AutoML best model saved in {model_uri}')


Experiment_id: 605466925374621565
Artifact Location: mlflow-artifacts:/605466925374621565
Lifecycle_stage: active
Tracking uri: http://localhost:5000




AutoML best model saved in mlflow-artifacts:/605466925374621565/04aaabe067dd443b9ad7e909f6c4a88e/artifacts/model
🏃 View run aged-ape-271 at: http://localhost:5000/#/experiments/605466925374621565/runs/04aaabe067dd443b9ad7e909f6c4a88e
🧪 View experiment at: http://localhost:5000/#/experiments/605466925374621565


In [None]:
!kaggle competitions submit -c spaceship-titanic -f submission.csv -m "First Pass with H2O"