<a href="https://colab.research.google.com/github/Kaguya2906/Anomaly_Detection_Weekly_Project/blob/main/10.%20H2O.ai%20GBM%2CXGB%2CDeepLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>




# Libraries, Dataset

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [None]:
# Load the raw loan dataset from the working directory and preview the first rows.
df = pd.read_csv("XYZloan_default_selected_vars.csv")
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,loan_default,AP001,AP002,AP003,AP004,AP005,AP006,...,CD162,CD164,CD166,CD167,CD169,CD170,CD172,CD173,MB005,MB007
0,0,1,1,1,31,2,1,12,2017/7/6 10:21,ios,...,13.0,13.0,0.0,0.0,1449.0,1449.0,2249.0,2249.0,7.0,IPHONE7
1,1,2,2,0,27,1,1,12,2017/4/6 12:51,h5,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,,WEB
2,2,3,3,0,33,1,4,12,2017/7/1 14:11,h5,...,3.0,2.0,33.0,0.0,33.0,0.0,143.0,110.0,8.0,WEB
3,3,4,4,0,34,2,4,12,2017/7/7 10:10,android,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,OPPO
4,4,5,5,0,47,2,1,12,2017/7/6 14:37,h5,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,,WEB


In [None]:
# (rows, columns) of the raw frame.
df.shape

(80000, 89)

Drop the columns that are not useful.

In [None]:
# Drop the two 'Unnamed' columns. errors='ignore' keeps the cell re-runnable:
# without it, running the cell a second time raises KeyError because the
# columns are already gone.
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')
df.head(2)

Unnamed: 0,id,loan_default,AP001,AP002,AP003,AP004,AP005,AP006,AP007,AP008,...,CD162,CD164,CD166,CD167,CD169,CD170,CD172,CD173,MB005,MB007
0,1,1,31,2,1,12,2017/7/6 10:21,ios,3,3,...,13.0,13.0,0.0,0.0,1449.0,1449.0,2249.0,2249.0,7.0,IPHONE7
1,2,0,27,1,1,12,2017/4/6 12:51,h5,5,4,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,,WEB


Check the dtypes and convert the non-numeric columns where needed.

In [None]:
# Inspect the dtype of every column.
df.dtypes

id                int64
loan_default      int64
AP001             int64
AP002             int64
AP003             int64
                 ...   
CD170           float64
CD172           float64
CD173           float64
MB005           float64
MB007            object
Length: 87, dtype: object

In [None]:
def filter_non_numeric_columns(df):
    """Return the names of the columns of ``df`` whose dtype is not numeric.

    A column counts as numeric when ``pd.api.types.is_numeric_dtype`` says so;
    the names are returned in the frame's column order.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    names = []
    for name in df.columns:
        if is_numeric(df[name]):
            continue
        names.append(name)
    return names

# Names of the columns that still need encoding before modelling.
filtered_columns = filter_non_numeric_columns(df)

print(filtered_columns)

['AP005', 'AP006', 'MB007']


AP005: split datetime into year and month

In [None]:
# Replace the AP005 timestamp with two numeric columns: Year and Month.
from datetime import datetime

# Parse the raw strings once, then derive both parts from the parsed series.
application_time = pd.to_datetime(df['AP005'])

# Remove the original column and append the derived ones at the end.
df = df.drop(columns=['AP005']).assign(
    Year=application_time.dt.year,
    Month=application_time.dt.month,
)

df.head(2)

Unnamed: 0,id,loan_default,AP001,AP002,AP003,AP004,AP006,AP007,AP008,AP009,...,CD166,CD167,CD169,CD170,CD172,CD173,MB005,MB007,Year,Month
0,1,1,31,2,1,12,ios,3,3,1,...,0.0,0.0,1449.0,1449.0,2249.0,2249.0,7.0,IPHONE7,2017,7
1,2,0,27,1,1,12,h5,5,4,0,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,,WEB,2017,4


AP006: Perform dummy encoding

In [None]:
# List the distinct AP006 values before encoding them.
df['AP006'].unique()

array(['ios', 'h5', 'android', 'api'], dtype=object)

In [None]:
# One-hot encode AP006 into AP006_<value> indicator columns.
# dtype is pinned to uint8 (0/1): the get_dummies default became bool in
# pandas 2.0, and bool columns are not matched by select_dtypes(include='number'),
# which the numeric/categorical split further down relies on.
df = pd.get_dummies(df, columns=['AP006'], dtype='uint8')
df.head(2)

Unnamed: 0,id,loan_default,AP001,AP002,AP003,AP004,AP007,AP008,AP009,TD001,...,CD172,CD173,MB005,MB007,Year,Month,AP006_android,AP006_api,AP006_h5,AP006_ios
0,1,1,31,2,1,12,3,3,1,1,...,2249.0,2249.0,7.0,IPHONE7,2017,7,0,0,0,1
1,2,0,27,1,1,12,5,4,0,2,...,-99.0,-99.0,,WEB,2017,4,0,0,1,0


MB007: FeatureHasher

In [None]:
# List the distinct MB007 values before hashing them.
df['MB007'].unique()

array(['IPHONE7', 'WEB', 'OPPO', 'IPHONE8', 'IPHONE9', 'Noinfo', 'HUAWEI',
       'XIAOMI', 'GIONEE', 'VIVO', 'MEIZU', 'IPHONE6', 'IPHONE5', 'LEECO',
       'HONOR', 'SAMSUNG', '360', 'LETV', 'NUBIA', 'COOLPAD', 'IPAD4',
       'ZTE', 'IPAD6', 'IPHONE4', 'MEITU', 'ONEPLUS', 'LENOVO', 'IPAD3',
       'QIKU', 'LGE', 'PROTRULY', 'IPAD5', 'ZUK', 'DOOV', 'HISENSE',
       'SMARTISAN', 'XIAOLAJIAO', 'YUFLY', 'GOOGLE', 'CMDC', 'HTC', 'AND',
       'AMOI', 'KOPO', 'IVVI', 'NOKIA', 'YEPEN', 'SUGAR', 'SONY', 'ALPS',
       'CMCC', 'MANN', 'CHANGHONG', 'DAQ', 'IPOD7', 'PHILIPS', 'MOTOROLA',
       'ASUS', 'IPAD2', 'KOOBEE', 'SKYHON', 'SM-W2016', 'OWWO', 'VERIZON',
       '4G', 'MYTEL', 'YU-FLY', 'FOPAD', 'UOOGOU', 'SGMSGMS', 'KONKA',
       'BIFER', 'IPHONE3', 'TCL', 'GO', 'CHINAMOBILE', 'KINGSUN', 'KDDI',
       'ANDROID', 'ZUOKU', 'HMI', 'MLLED', 'HONGLAJIAO', 'VEGA', 'BIRD',
       'LINGWIN', 'TINAI', 'APPLE', 'VETAS', 'VOLTE', 'LEPHONE', 'REDGOO',
       'RAMOS', 'CONGMETAL', 'YTONE_L985', 'B

In [None]:
from sklearn.feature_extraction import FeatureHasher

n_features = 10
hasher = FeatureHasher(n_features=n_features, input_type='string')

# With input_type='string' each sample must be an iterable of string tokens.
# Passing a dict such as {'MB007': value} makes the hasher iterate over the
# dict KEYS, so every row hashes the constant token 'MB007' and all rows get
# the same vector. Wrap each value in a one-element list so that the MB007
# value itself is what gets hashed.
MB007 = [[value] for value in df['MB007'].astype(str)]

# Perform feature hashing on MB007
hashed_features = hasher.transform(MB007)

# Convert the hashed features into a DataFrame; reuse df's index so the
# column-wise concat below aligns row for row.
hashed_df = pd.DataFrame(hashed_features.toarray(), index=df.index)

# Append the hashed feature columns (named 0 .. n_features-1) to df
df = pd.concat([df, hashed_df], axis=1)

df.head(2)

Unnamed: 0,id,loan_default,AP001,AP002,AP003,AP004,AP007,AP008,AP009,TD001,...,0,1,2,3,4,5,6,7,8,9
0,1,1,31,2,1,12,3,3,1,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0,27,1,1,12,5,4,0,2,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


Use FastKNN to Impute Null Values

In [None]:
# Columns that contain at least one missing value.
columns_null = df.columns[df.isnull().any()]
columns_null

Index(['TD022', 'TD023', 'TD024', 'TD025', 'TD026', 'TD027', 'TD028', 'TD029',
       'TD044', 'TD048', 'TD051', 'TD054', 'TD055', 'TD061', 'TD062', 'PA022',
       'PA023', 'PA028', 'PA029', 'PA030', 'PA031', 'CD008', 'CD018', 'CD071',
       'CD072', 'CD088', 'CD100', 'CD101', 'CD106', 'CD107', 'CD108', 'CD113',
       'CD114', 'CD115', 'CD117', 'CD118', 'CD120', 'CD121', 'CD123', 'CD130',
       'CD131', 'CD132', 'CD133', 'CD135', 'CD136', 'CD137', 'CD152', 'CD153',
       'CD160', 'CD162', 'CD164', 'CD166', 'CD167', 'CD169', 'CD170', 'CD172',
       'CD173', 'MB005'],
      dtype='object')

In [None]:
!pip install impyute

Collecting impyute
  Downloading impyute-0.0.8-py2.py3-none-any.whl (31 kB)
Installing collected packages: impyute
Successfully installed impyute-0.0.8


In [None]:
from impyute.imputation.cs import fast_knn
from sklearn.preprocessing import LabelEncoder

# Split the column names by dtype: numeric vs. everything else.
numeric_cols = df.select_dtypes(include='number').columns.tolist()
categorical_cols = df.select_dtypes(exclude='number').columns.tolist()

# NOTE: the KNN imputation step is currently disabled (kept for reference):
#numeric_data = df[numeric_cols].values
#imputed_numeric_data = fast_knn(numeric_data, k=30)
#imputed_df_numeric = pd.DataFrame(imputed_numeric_data, columns=numeric_cols)

# Encode every non-numeric column as integer labels, keeping each fitted
# encoder so the original values can be restored afterwards.
label_encoders = {}
for name in categorical_cols:
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(df[name].astype(str))
    df[name] = encoded
    label_encoders[name] = encoder

In [None]:
# NOTE: merging the imputed numeric frame back is disabled together with the
# KNN imputation step (kept for reference):
#df = pd.concat([df[categorical_cols], imputed_df_numeric], axis=1)

# Restore the label-encoded columns to their original string values.
for name in label_encoders:
    df[name] = label_encoders[name].inverse_transform(df[name])

Check null values

In [None]:
# Columns that still contain missing values. The list is displayed so that the
# check is actually visible; an empty list means no nulls remain.
columns_with_null = df.columns[df.isnull().any()]
columns_with_null.tolist()

Now we are clear. Display the new df, and convert it into a new csv file for future reference.

In [None]:
# Persist the processed frame. The index is written as well (to_csv default),
# which is why the reload step drops an 'Unnamed: 0' column.
df.to_csv("MortgageDefault_Cleaned.csv")

In [None]:
# Reload the saved frame and discard the index column that to_csv wrote.
df = pd.read_csv("MortgageDefault_Cleaned.csv").drop(columns=['Unnamed: 0'])

## New df

In [None]:
# Display the reloaded frame.
df

Unnamed: 0,id,loan_default,AP001,AP002,AP003,AP004,AP007,AP008,AP009,TD001,...,0,1,2,3,4,5,6,7,8,9
0,1.0,1.0,31.0,2.0,1.0,12.0,3.0,3.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,0.0,27.0,1.0,1.0,12.0,5.0,4.0,0.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,0.0,33.0,1.0,4.0,12.0,4.0,2.0,0.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,34.0,2.0,4.0,12.0,5.0,5.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.0,0.0,47.0,2.0,1.0,12.0,4.0,4.0,1.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79995,79996.0,0.0,39.0,2.0,1.0,12.0,5.0,3.0,0.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
79996,79997.0,0.0,31.0,1.0,4.0,12.0,5.0,4.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
79997,79998.0,0.0,37.0,1.0,1.0,12.0,3.0,3.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
79998,79999.0,0.0,38.0,2.0,1.0,12.0,3.0,2.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


# ROC_AUC func

In [None]:
def ROC_AUC(my_result,df,target):
    """Plot the ROC curve and the precision-recall curve of a fitted model.

    Parameters
    ----------
    my_result
        Fitted model. ``my_result.predict(df).as_data_frame()`` must contain a
        ``p1`` column, which is used as the score of the positive class.
    df
        Frame to score. It must support ``df[target].as_data_frame()`` and be
        accepted by ``my_result.predict`` (the H2O frames built in this notebook).
    target : str
        Name of the binary response column in ``df``.

    Returns
    -------
    None
        The two-panel figure is rendered with ``plt.show()``.
    """
    from sklearn.metrics import roc_curve,auc
    from sklearn.metrics import average_precision_score
    from sklearn.metrics import precision_recall_curve
    import matplotlib.pyplot as plt

    # Pull labels and scores out of the frames. The label column is looked up
    # through `target` instead of a hard-coded name, so the helper works for
    # any response column.
    y_true = df[target].as_data_frame()[target]
    y_score = my_result.predict(df).as_data_frame()['p1']

    # ROC
    fpr,tpr,_ = roc_curve(y_true,y_score)
    roc_auc = auc(fpr,tpr)

    # Precision-Recall
    average_precision = average_precision_score(y_true,y_score)
    precision,recall,_ = precision_recall_curve(y_true,y_score)

    # plotting: ROC on the left, precision-recall on the right
    fig,(ax_roc,ax_pr) = plt.subplots(1,2,figsize=(10,4))

    # ROC
    ax_roc.plot(fpr,tpr,color='darkorange',lw=2,label='ROC curve (area=%0.2f)' % roc_auc)
    # Diagonal = performance of a random classifier.
    ax_roc.plot([0,1],[0,1],color='navy',lw=3,linestyle='--')
    ax_roc.set_xlim([0.0,1.0])
    ax_roc.set_ylim([0.0,1.05])
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title('Receiver operating characteristic: AUC={0:0.4f}'.format(roc_auc))
    ax_roc.legend(loc='lower right')

    # Precision-Recall
    ax_pr.step(recall,precision,color='b',alpha=0.2,where='post')
    ax_pr.fill_between(recall,precision,step='post',alpha=0.2,color='b')
    ax_pr.set_xlabel('Recall')
    ax_pr.set_ylabel('Precision')
    ax_pr.set_ylim([0.0,1.05])
    ax_pr.set_xlim([0.0,1.0])
    ax_pr.set_title('Precision-Recall curve: PR={0:0.4f}'.format(average_precision))
    plt.show()

# H2O Model Preparations

H2O is a fully open-source, distributed in-memory machine-learning platform with linear scalability. H2O supports the most widely used statistical & machine learning algorithms including gradient-boosted machines, generalized linear models, deep learning, and more. H2O also has an industry-leading AutoML functionality that automatically runs through all the algorithms and their hyperparameters to produce a leaderboard of the best models. The H2O platform is used by over 18,000 organizations globally and is extremely popular in both the R & Python communities.

## libs

In [None]:
pip install h2o

Collecting h2o
  Downloading h2o-3.42.0.2.tar.gz (249.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.1/249.1 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.42.0.2-py2.py3-none-any.whl size=249153908 sha256=9618b682e9de63532cbd7d4e6f00a5bdc70f20b7d0d511a6bb7894b0da143728
  Stored in directory: /root/.cache/pip/wheels/31/f7/e0/e32942d9f76cb1cb14c949b7772eb78979d2e0132aae6c6780
Successfully built h2o
Installing collected packages: h2o
Successfully installed h2o-3.42.0.2


## train test

In [None]:
import h2o
# Connect to a running H2O instance, or start a local server if none is found.
h2o.init()

from sklearn.model_selection import train_test_split
# 60/40 train/test split of the pandas frame; the fixed random_state makes the
# split reproducible.
train,test = train_test_split(df,test_size=0.4,random_state=1234)
# Convert the full, train and test pandas frames into H2O frames.
df_hex = h2o.H2OFrame(df)
train_hex = h2o.H2OFrame(train)
test_hex = h2o.H2OFrame(test)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.19" 2023-04-18; OpenJDK Runtime Environment (build 11.0.19+7-post-Ubuntu-0ubuntu122.04.1); OpenJDK 64-Bit Server VM (build 11.0.19+7-post-Ubuntu-0ubuntu122.04.1, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.10/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpzonmjq9t
  JVM stdout: /tmp/tmpzonmjq9t/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpzonmjq9t/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.2
H2O_cluster_version_age:,2 days
H2O_cluster_name:,H2O_from_python_unknownUser_fb74xm
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.170 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


## predictors and target

In [None]:
target = 'loan_default'  # name of the response column

X = df.drop(columns=[target]) # every column except the response
y = df[target] # response values

# 'id' is a row identifier, not a property of the loan, so it is left out of
# the model inputs; X itself is kept unchanged.
non_feature_cols = ['id']
predictors = [col for col in X.columns.tolist() if col not in non_feature_cols]

# (A)H2O Gradient Boosting Machine (GBM)

For GBM, the two hyperparameters with the largest impact on model performance are the learning rate (0.01 / 0.1) and ntrees (1000 / 2000), so I test the combinations of these values, that is, 4 groups of parameter combinations, and compare the resulting AUC scores. I chose these values based on the dataset size.

Here is a brief explanation of the two parameters in the GBM model. For anomaly detection using GBM, the most critical hyperparameter to tune is the learning rate (also known as the shrinkage rate or step size). The learning rate controls the contribution of each tree to the overall ensemble and helps prevent overfitting. Lower values (e.g., 0.01 - 0.1) make the model more robust, but they require more trees for convergence.

Ntrees represents the maximum number of weak learners to be added to the ensemble. A higher number of trees can improve model performance, but it increases computational cost. The optimal value depends on the size and complexity of the dataset and should be chosen carefully during cross-validation.

The model reports two AUC scores: one on the training data and one on the cross-validation data. We rely on the cross-validation score, because it is measured on held-out folds and is therefore less affected by overfitting than the training score.

In [None]:
# Set 1: ntrees = 2000, learning_rate = 0.1, AUC Score = 0.6955361069446807
from h2o.estimators import H2OGradientBoostingEstimator

# Convert the response column to a factor
train_hex[target] = train_hex[target].asfactor()

# Model definition (training happens in the next cell).
# model_id is the key the model is stored under in the H2O cluster, so every
# parameter set needs its own id; reusing one id makes a later model replace
# this one on the cluster.
GBM_modl1 = H2OGradientBoostingEstimator(
        model_id = 'GBM_modl1',
        ntrees = 2000,
        nfolds=10,
        min_rows=100,
        learn_rate=0.1,
        seed=1234)

In [None]:
# Fit the Set 1 GBM on the training frame.
GBM_modl1.train(
    x=predictors,
    y=target,
    training_frame=train_hex,
)

gbm Model Build progress: |



██████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,2000.0,2000.0,469729.0,5.0,5.0,5.0,7.0,32.0,14.049

Unnamed: 0,0,1,Error,Rate
0,32714.0,5979.0,0.1545,(5979.0/38693.0)
1,3812.0,5495.0,0.4096,(3812.0/9307.0)
Total,36526.0,11474.0,0.204,(9791.0/48000.0)

metric,threshold,value,idx
max f1,0.2658993,0.5288485,186.0
max f2,0.1518883,0.650197,275.0
max f0point5,0.3693585,0.542981,125.0
max accuracy,0.3746348,0.8349167,122.0
max precision,0.8822211,1.0,0.0
max recall,0.0298745,1.0,387.0
max specificity,0.8822211,1.0,0.0
max absolute_mcc,0.2735477,0.404681,181.0
max min_per_class_accuracy,0.211101,0.7315018,224.0
max mean_per_class_accuracy,0.211101,0.7336998,224.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.01,0.5854572,4.6953906,4.6953906,0.9104167,0.6437991,0.9104167,0.6437991,0.0469539,0.0469539,369.5390566,369.5390566,0.0458426
2,0.02,0.5300553,4.0077361,4.3515633,0.7770833,0.5568176,0.84375,0.6003084,0.0400774,0.0870313,300.7736113,335.1563339,0.0831546
3,0.03,0.4936043,3.8143333,4.1724867,0.7395833,0.5113688,0.8090278,0.5706619,0.0381433,0.1251746,281.4333298,317.2486659,0.1180674
4,0.04,0.4664132,3.4919953,4.0023638,0.6770833,0.4795072,0.7760417,0.5478732,0.03492,0.1600946,249.1995272,300.2363812,0.1489814
5,0.05,0.4460495,3.3200817,3.8659074,0.64375,0.4561447,0.7495833,0.5295275,0.0332008,0.1932954,232.0081659,286.5907382,0.1777628
6,0.1,0.371769,2.7699581,3.3179327,0.5370833,0.4057174,0.6433333,0.4676224,0.1384979,0.3317933,176.9958096,231.7932739,0.2875475
7,0.15,0.3233646,2.1166864,2.9175173,0.4104167,0.3466076,0.5656944,0.4272841,0.1058343,0.4376276,111.6686365,191.7517281,0.3568119
8,0.2,0.2879317,1.8244332,2.6442463,0.35375,0.3051154,0.5127083,0.396742,0.0912217,0.5288493,82.4433222,164.4246266,0.4079488
9,0.3,0.2343819,1.4247341,2.2377422,0.27625,0.2597785,0.4338889,0.3510875,0.1424734,0.6713227,42.4734071,123.7742201,0.4606386
10,0.4,0.1960626,1.0218115,1.9337595,0.198125,0.2144037,0.3749479,0.3169165,0.1021812,0.7735038,2.181154,93.3759536,0.4633444

Unnamed: 0,0,1,Error,Rate
0,24638.0,14055.0,0.3632,(14055.0/38693.0)
1,3306.0,6001.0,0.3552,(3306.0/9307.0)
Total,27944.0,20056.0,0.3617,(17361.0/48000.0)

metric,threshold,value,idx
max f1,0.1912835,0.4087457,241.0
max f2,0.1034513,0.577311,322.0
max f0point5,0.303965,0.368694,156.0
max accuracy,0.6307684,0.8065417,20.0
max precision,0.6837348,0.640625,12.0
max recall,0.0075278,1.0,399.0
max specificity,0.8157412,0.9999742,0.0
max absolute_mcc,0.2441753,0.2294244,199.0
max min_per_class_accuracy,0.1925136,0.6399484,240.0
max mean_per_class_accuracy,0.1844122,0.641284,247.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.01,0.5596356,2.5357258,2.5357258,0.4916667,0.6183918,0.4916667,0.6183918,0.0253573,0.0253573,153.5725798,153.5725798,0.0190512
2,0.02,0.5094293,2.2993446,2.4175352,0.4458333,0.532602,0.46875,0.5754969,0.0229934,0.0483507,129.9344579,141.7535189,0.03517
3,0.03,0.4763399,2.4067906,2.4139537,0.4666667,0.490984,0.4680556,0.5473259,0.0240679,0.0724186,140.6790588,141.3953655,0.0526217
4,0.04,0.4517541,2.2133878,2.3638122,0.4291667,0.4639137,0.4583333,0.5264729,0.0221339,0.0945525,121.3387773,136.3812184,0.0676742
5,0.05,0.430414,2.181154,2.3272805,0.4229167,0.4404432,0.45125,0.509267,0.0218115,0.116364,118.115397,132.7280542,0.0823269
6,0.1,0.3600555,1.9512195,2.13925,0.3783333,0.3915746,0.4147917,0.4504208,0.097561,0.213925,95.1219512,113.9250027,0.1413279
7,0.15,0.3162171,1.7019448,1.9934816,0.33,0.3373721,0.3865278,0.4127379,0.0850972,0.2990222,70.1944773,99.3481609,0.1848672
8,0.2,0.2832957,1.5278822,1.8770818,0.29625,0.2991953,0.3639583,0.3843522,0.0763941,0.3754164,52.7882239,87.7081766,0.21761
9,0.3,0.234522,1.3785323,1.7108986,0.2672917,0.2574966,0.3317361,0.342067,0.1378532,0.5132696,37.8532288,71.0898607,0.2645683
10,0.4,0.1964592,1.1013216,1.5585044,0.2135417,0.2147829,0.3021875,0.310246,0.1101322,0.6234017,10.1321586,55.8504352,0.2771376

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
accuracy,0.640549,0.0467026,0.6753465,0.5452818,0.6898984,0.6060041,0.6344871,0.6679005,0.6972268,0.6100965,0.65948,0.6197687
auc,0.6955969,0.0077551,0.6942143,0.6880588,0.702419,0.6952683,0.6898885,0.7026318,0.6995039,0.6800302,0.7046995,0.6992542
err,0.359451,0.0467026,0.3246535,0.4547182,0.3101016,0.3939958,0.3655129,0.3320995,0.3027732,0.3899035,0.34052,0.3802313
err_count,1724.8,219.26736,1546.0,2154.0,1495.0,1903.0,1753.0,1615.0,1463.0,1900.0,1611.0,1808.0
f0point5,0.3409308,0.0170186,0.3455154,0.3204154,0.3608317,0.327909,0.3333333,0.3464923,0.3674632,0.3148072,0.3520626,0.3404779
f1,0.4136518,0.0091442,0.4099237,0.4133987,0.4185142,0.4128355,0.4091675,0.4125136,0.4196747,0.3925831,0.4244373,0.4234694
f2,0.528563,0.0302443,0.5038469,0.5824125,0.4981481,0.5571286,0.5296684,0.5096171,0.4891807,0.5213994,0.5342687,0.5599595
lift_top_group,2.4006348,0.3267525,1.8677095,2.7066193,2.7127059,2.7826896,2.169743,1.9044863,2.3576732,2.4862244,2.5711956,2.4473014
logloss,0.4540221,0.0050278,0.4491447,0.465153,0.4518232,0.4493976,0.4541995,0.4502813,0.4594684,0.4550283,0.4511,0.454625
max_per_class_error,0.4153309,0.0454226,0.4053156,0.5186065,0.4294804,0.4223587,0.3713549,0.3955224,0.4501039,0.4032381,0.3543478,0.4029812

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
,2023-07-28 01:39:42,1:29:47.804,0.0,0.3953483,0.4918229,0.5,0.1938958,1.0,0.8061042
,2023-07-28 01:39:43,1:29:49.037,1.0,0.3933200,0.4867574,0.6605053,0.3085469,2.4294797,0.3690833
,2023-07-28 01:39:44,1:29:49.589,2.0,0.3916418,0.4826303,0.6669602,0.3160750,2.6301069,0.3810625
,2023-07-28 01:39:44,1:29:49.887,3.0,0.3902012,0.4791284,0.6733778,0.3218052,2.6974291,0.374375
,2023-07-28 01:39:45,1:29:50.265,4.0,0.3889445,0.4761023,0.6764770,0.3262540,2.6826080,0.3610625
,2023-07-28 01:39:45,1:29:50.540,5.0,0.3878778,0.4735156,0.6787174,0.3289882,2.6999075,0.37975
,2023-07-28 01:39:45,1:29:50.818,6.0,0.3869065,0.4711842,0.6814258,0.3332346,2.8663893,0.3585
,2023-07-28 01:39:45,1:29:51.120,7.0,0.3859873,0.4690315,0.6860639,0.3387525,2.9332760,0.3391667
,2023-07-28 01:39:46,1:29:51.401,8.0,0.3851779,0.4670428,0.6894433,0.3424199,2.9655098,0.3928542
,2023-07-28 01:39:46,1:29:51.672,9.0,0.3844054,0.4652842,0.6936915,0.3466833,2.8890047,0.3754167

variable,relative_importance,scaled_importance,percentage
TD044,4179.7070312,1.0,0.2756516
TD051,1582.9155273,0.3787145,0.1043932
TD054,1299.8426514,0.3109889,0.0857246
TD048,707.8044434,0.1693431,0.0466797
TD013,703.4257202,0.1682955,0.0463909
AP003,521.1041260,0.1246748,0.0343668
AP004,515.8940430,0.1234283,0.0340232
MB005,256.8387451,0.0614490,0.0169385
CR015,223.4551239,0.0534619,0.0147369
TD009,213.5543671,0.0510931,0.0140839


In [None]:
# Set 2: ntrees = 2000, learning_rate = 0.01, AUC score = 0.6988537610508462
from h2o.estimators import H2OGradientBoostingEstimator

# Convert the response column to a factor
train_hex[target] = train_hex[target].asfactor()

# Model Training
# model_id is the key the model is stored under in the H2O cluster, so every
# parameter set needs its own id; reusing one id makes this model replace the
# earlier one on the cluster.
GBM_modl2 = H2OGradientBoostingEstimator(
        model_id = 'GBM_modl2',
        ntrees = 2000,
        nfolds=10,
        min_rows=100,
        learn_rate=0.01,
        seed=1234)

GBM_modl2.train(x = predictors,
              y = target,
              training_frame = train_hex)

gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,2000.0,2000.0,698426.0,5.0,5.0,5.0,7.0,32.0,22.9405

Unnamed: 0,0,1,Error,Rate
0,32212.0,6481.0,0.1675,(6481.0/38693.0)
1,3681.0,5626.0,0.3955,(3681.0/9307.0)
Total,35893.0,12107.0,0.2117,(10162.0/48000.0)

metric,threshold,value,idx
max f1,0.2583596,0.5254506,190.0
max f2,0.1643626,0.649416,268.0
max f0point5,0.341886,0.5350963,137.0
max accuracy,0.3876461,0.8332083,113.0
max precision,0.8362319,1.0,0.0
max recall,0.0243955,1.0,391.0
max specificity,0.8362319,1.0,0.0
max absolute_mcc,0.2706273,0.3987251,182.0
max min_per_class_accuracy,0.2125968,0.7302031,227.0
max mean_per_class_accuracy,0.2069299,0.7318294,232.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.01,0.5713188,4.5986892,4.5986892,0.8916667,0.6298435,0.8916667,0.6298435,0.0459869,0.0459869,359.8689159,359.8689159,0.044643
2,0.02,0.5198234,3.9432685,4.2709788,0.7645833,0.5448979,0.828125,0.5873707,0.0394327,0.0854196,294.3268508,327.0978833,0.0811552
3,0.03,0.484334,3.7283765,4.0901114,0.7229167,0.5011888,0.7930556,0.5586434,0.0372838,0.1227033,272.8376491,309.0111386,0.1150017
4,0.04,0.4593081,3.2985925,3.8922317,0.6395833,0.4716476,0.7546875,0.5368944,0.0329859,0.1556893,229.8592457,289.2231654,0.1435165
5,0.05,0.4386629,3.1696572,3.7477168,0.6145833,0.4492572,0.7266667,0.519367,0.0316966,0.1873858,216.9657247,274.7716772,0.1704319
6,0.1,0.368775,2.7807027,3.2642097,0.5391667,0.3998346,0.6329167,0.4596008,0.1390351,0.326421,178.0702697,226.4209735,0.280883
7,0.15,0.3214849,2.1123885,2.8802693,0.4095833,0.3440488,0.5584722,0.4210835,0.1056194,0.4320404,111.2388525,188.0269331,0.3498808
8,0.2,0.2864633,1.8265821,2.6168475,0.3541667,0.3031764,0.5073958,0.3916067,0.0913291,0.5233695,82.6582142,161.6847534,0.401151
9,0.3,0.2355587,1.4139895,2.2158948,0.2741667,0.2595683,0.4296528,0.3475939,0.1413989,0.6647685,41.398947,121.5894846,0.4525078
10,0.4,0.197316,1.0594176,1.9267755,0.2054167,0.2155599,0.3735938,0.3145854,0.1059418,0.7707102,5.9417643,92.6775545,0.4598788

Unnamed: 0,0,1,Error,Rate
0,25663.0,13030.0,0.3368,(13030.0/38693.0)
1,3507.0,5800.0,0.3768,(3507.0/9307.0)
Total,29170.0,18830.0,0.3445,(16537.0/48000.0)

metric,threshold,value,idx
max f1,0.2006669,0.4122685,231.0
max f2,0.1090706,0.5800926,316.0
max f0point5,0.2759982,0.3725439,170.0
max accuracy,0.5689444,0.8065833,31.0
max precision,0.8070054,1.0,0.0
max recall,0.0150714,1.0,397.0
max specificity,0.8070054,1.0,0.0
max absolute_mcc,0.2380253,0.2323463,200.0
max min_per_class_accuracy,0.194627,0.6422048,236.0
max mean_per_class_accuracy,0.2006669,0.6432167,231.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.01,0.5430425,2.6001934,2.6001934,0.5041667,0.6008765,0.5041667,0.6008765,0.0260019,0.0260019,160.0193403,160.0193403,0.019851
2,0.02,0.4997718,2.2456216,2.4229075,0.4354167,0.5187339,0.4697917,0.5598052,0.0224562,0.0484581,124.5621575,142.2907489,0.0353033
3,0.03,0.4672896,2.2671108,2.3709753,0.4395833,0.4826595,0.4597222,0.53409,0.0226711,0.0711293,126.7110777,137.0975252,0.0510223
4,0.04,0.4429536,2.3315784,2.361126,0.4520833,0.4548408,0.4578125,0.5142777,0.0233158,0.094445,133.1578382,136.1126034,0.067541
5,0.05,0.4224588,2.0844526,2.3057913,0.4041667,0.4327114,0.4470833,0.4979644,0.0208445,0.1152896,108.4452563,130.579134,0.080994
6,0.1,0.3568409,2.0479209,2.1768561,0.3970833,0.3860013,0.4220833,0.4419829,0.102396,0.2176856,104.792092,117.685613,0.1459931
7,0.15,0.3129101,1.7019448,2.0185523,0.33,0.3336326,0.3913889,0.4058661,0.0850972,0.3027829,70.1944773,101.8552344,0.1895324
8,0.2,0.2814415,1.5644139,1.9050177,0.3033333,0.2963973,0.369375,0.3784989,0.0782207,0.3810035,56.4413882,90.5017729,0.2245411
9,0.3,0.2340343,1.3527452,1.7209269,0.2622917,0.2566472,0.3336806,0.3378817,0.1352745,0.5162781,35.2745246,72.0926901,0.2683004
10,0.4,0.1976448,1.1507467,1.5783819,0.223125,0.2149514,0.3060417,0.3071491,0.1150747,0.6313527,15.074675,57.8381863,0.2870011

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
accuracy,0.6550863,0.0376411,0.7074758,0.5691366,0.6872018,0.6449276,0.6567973,0.6748921,0.6307947,0.6429305,0.6683577,0.6683491
auc,0.6989539,0.007496,0.6970599,0.6926264,0.7031861,0.6985549,0.6947504,0.7039656,0.6995184,0.684126,0.711181,0.7045704
err,0.3449137,0.0376411,0.2925242,0.4308634,0.3127982,0.3550725,0.3432027,0.325108,0.3692053,0.3570696,0.3316424,0.3316509
err_count,1655.4,178.18542,1393.0,2041.0,1508.0,1715.0,1646.0,1581.0,1784.0,1740.0,1569.0,1577.0
f0point5,0.3457662,0.0128504,0.359001,0.3242583,0.3611441,0.3374509,0.3440772,0.3520776,0.3417573,0.3280543,0.356094,0.3537474
f1,0.4152748,0.0072396,0.4094956,0.4126619,0.4208909,0.4120672,0.415483,0.4172503,0.4181344,0.4,0.4259056,0.4208593
f2,0.5213555,0.0236119,0.4765194,0.5673366,0.5043254,0.5290493,0.5242875,0.5120318,0.5384744,0.5123675,0.5297651,0.5193981
lift_top_group,2.570789,0.3381929,2.3071706,2.6025186,2.8170407,3.3178222,2.169743,2.2219007,2.7677033,2.4862244,2.4640625,2.5537057
logloss,0.4523442,0.00504,0.4477079,0.4628219,0.4513217,0.447598,0.452075,0.4495572,0.4593593,0.4521257,0.4482696,0.4526056
max_per_class_error,0.3979494,0.0427034,0.4651163,0.4776986,0.4188759,0.3568688,0.3648208,0.3965885,0.3780362,0.3695652,0.3673913,0.3845327

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
,2023-07-28 03:21:16,1:27:22.030,0.0,0.3953483,0.4918229,0.5,0.1938958,1.0,0.8061042
,2023-07-28 03:21:16,1:27:22.320,1.0,0.3951382,0.4912922,0.6605053,0.3085469,2.4294797,0.3690833
,2023-07-28 03:21:17,1:27:22.556,2.0,0.3949323,0.4907735,0.6607746,0.3088951,2.4294797,0.3570625
,2023-07-28 03:21:17,1:27:22.792,3.0,0.3947299,0.4902645,0.6638480,0.3110741,2.4294797,0.3824583
,2023-07-28 03:21:17,1:27:23.022,4.0,0.3945317,0.4897674,0.6638899,0.3105013,2.4294797,0.3703333
,2023-07-28 03:21:17,1:27:23.264,5.0,0.3943371,0.4892803,0.6638556,0.3105414,2.4294797,0.3771667
,2023-07-28 03:21:18,1:27:23.506,6.0,0.3941453,0.4888005,0.6649597,0.3149574,2.6301069,0.3703333
,2023-07-28 03:21:18,1:27:23.736,7.0,0.3939579,0.4883331,0.6647471,0.3143096,2.6301069,0.3771667
,2023-07-28 03:21:18,1:27:23.967,8.0,0.3937733,0.4878727,0.6649856,0.3150571,2.6301069,0.3703333
,2023-07-28 03:21:18,1:27:24.206,9.0,0.3935927,0.4874236,0.6652170,0.3151883,2.6301069,0.3774792

variable,relative_importance,scaled_importance,percentage
TD013,6548.5439453,1.0,0.0947079
AP003,5008.4497070,0.7648188,0.0724344
AP004,4887.3374023,0.7463243,0.0706828
MB005,2449.7849121,0.3740961,0.0354299
CR015,2135.2419434,0.3260636,0.0308808
TD009,1965.6937256,0.3001726,0.0284287
TD005,1734.9910889,0.2649430,0.0250922
CD123,1576.0002441,0.2406642,0.0227928
TD014,1557.4163818,0.2378264,0.0225240
CD114,1532.4775391,0.2340181,0.0221634


In [None]:
# Set 3: ntrees = 1000, learning_rate = 0.1, AUC Score = 0.6955370469202277
from h2o.estimators import H2OGradientBoostingEstimator

# Convert the response column to a factor
train_hex[target] = train_hex[target].asfactor()

# Model Training
# model_id is the key the model is stored under in the H2O cluster, so every
# parameter set needs its own id; reusing one id makes this model replace the
# earlier ones on the cluster.
GBM_modl3 = H2OGradientBoostingEstimator(
        model_id = 'GBM_modl3',
        ntrees = 1000,
        nfolds=10,
        min_rows=100,
        learn_rate=0.1,
        seed=1234)

GBM_modl3.train(x = predictors,
              y = target,
              training_frame = train_hex)

gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,1000.0,1000.0,247788.0,5.0,5.0,5.0,7.0,32.0,15.098

Unnamed: 0,0,1,Error,Rate
0,32800.0,5893.0,0.1523,(5893.0/38693.0)
1,3842.0,5465.0,0.4128,(3842.0/9307.0)
Total,36642.0,11358.0,0.2028,(9735.0/48000.0)

metric,threshold,value,idx
max f1,0.2671814,0.5289136,185.0
max f2,0.150713,0.6500047,277.0
max f0point5,0.3693906,0.5428837,123.0
max accuracy,0.3745895,0.8349583,120.0
max precision,0.8822211,1.0,0.0
max recall,0.0298751,1.0,386.0
max specificity,0.8822211,1.0,0.0
max absolute_mcc,0.2671814,0.4045395,185.0
max min_per_class_accuracy,0.2111263,0.7316052,227.0
max mean_per_class_accuracy,0.218741,0.7337091,221.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.01,0.5854574,4.6953906,4.6953906,0.9104167,0.643796,0.9104167,0.643796,0.0469539,0.0469539,369.5390566,369.5390566,0.0458426
2,0.02,0.5300556,4.0077361,4.3515633,0.7770833,0.5568217,0.84375,0.6003088,0.0400774,0.0870313,300.7736113,335.1563339,0.0831546
3,0.03,0.4937211,3.8035887,4.1689051,0.7375,0.5113716,0.8083333,0.5706631,0.0380359,0.1250672,280.3588697,316.8905125,0.1179341
4,0.04,0.4664138,3.5027399,4.0023638,0.6791667,0.4795068,0.7760417,0.547874,0.0350274,0.1600946,250.2739873,300.2363812,0.1489814
5,0.05,0.4460501,3.3200817,3.8659074,0.64375,0.4561456,0.7495833,0.5295283,0.0332008,0.1932954,232.0081659,286.5907382,0.1777628
6,0.1,0.371766,2.7699581,3.3179327,0.5370833,0.4057177,0.6433333,0.467623,0.1384979,0.3317933,176.9958096,231.7932739,0.2875475
7,0.15,0.3233646,2.1166864,2.9175173,0.4104167,0.3466083,0.5656944,0.4272848,0.1058343,0.4376276,111.6686365,191.7517281,0.3568119
8,0.2,0.287928,1.8244332,2.6442463,0.35375,0.3051156,0.5127083,0.3967425,0.0912217,0.5288493,82.4433222,164.4246266,0.4079488
9,0.3,0.234381,1.4258085,2.2381004,0.2764583,0.2597776,0.4339583,0.3510875,0.1425809,0.6714301,42.5808531,123.8100355,0.4607718
10,0.4,0.1960605,1.0207371,1.9337595,0.1979167,0.2144035,0.3749479,0.3169165,0.1020737,0.7735038,2.073708,93.3759536,0.4633444

Unnamed: 0,0,1,Error,Rate
0,24640.0,14053.0,0.3632,(14053.0/38693.0)
1,3305.0,6002.0,0.3551,(3305.0/9307.0)
Total,27945.0,20055.0,0.3616,(17358.0/48000.0)

metric,threshold,value,idx
max f1,0.1913042,0.4088277,241.0
max f2,0.1034658,0.5773556,322.0
max f0point5,0.303779,0.3687475,156.0
max accuracy,0.6307682,0.8065417,20.0
max precision,0.6841689,0.640625,12.0
max recall,0.0087681,1.0,398.0
max specificity,0.8157415,0.9999742,0.0
max absolute_mcc,0.2441878,0.2294751,199.0
max min_per_class_accuracy,0.1925136,0.639841,240.0
max mean_per_class_accuracy,0.1844126,0.6412412,247.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.01,0.5598961,2.5357258,2.5357258,0.4916667,0.6184641,0.4916667,0.6184641,0.0253573,0.0253573,153.5725798,153.5725798,0.0190512
2,0.02,0.509378,2.2993446,2.4175352,0.4458333,0.5326365,0.46875,0.5755503,0.0229934,0.0483507,129.9344579,141.7535189,0.03517
3,0.03,0.4761419,2.4067906,2.4139537,0.4666667,0.4910073,0.4680556,0.5473693,0.0240679,0.0724186,140.6790588,141.3953655,0.0526217
4,0.04,0.4517536,2.234877,2.3691845,0.4333333,0.4638642,0.459375,0.526493,0.0223488,0.0947674,123.4876974,136.9184485,0.0679408
5,0.05,0.4302581,2.1704094,2.3294295,0.4208333,0.4403708,0.4516667,0.5092686,0.0217041,0.1164715,117.0409369,132.9429462,0.0824602
6,0.1,0.3600413,1.9512195,2.1403245,0.3783333,0.3915814,0.415,0.450425,0.097561,0.2140324,95.1219512,114.0324487,0.1414612
7,0.15,0.3162633,1.7019448,1.9941979,0.33,0.3373456,0.3866667,0.4127318,0.0850972,0.2991297,70.1944773,99.4197916,0.1850005
8,0.2,0.283298,1.5257333,1.8770818,0.2958333,0.2992015,0.3639583,0.3843493,0.0762867,0.3754164,52.5733319,87.7081766,0.21761
9,0.3,0.2345235,1.3785323,1.7108986,0.2672917,0.257488,0.3317361,0.3420622,0.1378532,0.5132696,37.8532288,71.0898607,0.2645683
10,0.4,0.1964242,1.1002471,1.5582357,0.2133333,0.2147829,0.3021354,0.3102423,0.1100247,0.6232943,10.0247126,55.8235737,0.2770043

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
accuracy,0.6406332,0.0466618,0.6753465,0.5452818,0.6898984,0.6060041,0.6344871,0.6679005,0.6972268,0.6100965,0.65948,0.6206099
auc,0.6956038,0.0077585,0.6942147,0.6880573,0.702419,0.6952607,0.6898968,0.7026318,0.6994886,0.6800288,0.7046965,0.6993435
err,0.3593668,0.0466618,0.3246535,0.4547182,0.3101016,0.3939958,0.3655129,0.3320995,0.3027732,0.3899035,0.34052,0.3793901
err_count,1724.4,219.10231,1546.0,2154.0,1495.0,1903.0,1753.0,1615.0,1463.0,1900.0,1611.0,1804.0
f0point5,0.34094,0.0170183,0.3455154,0.3204154,0.3608317,0.327909,0.3333333,0.3464923,0.3674632,0.3148072,0.3520626,0.34057
f1,0.4136322,0.0091211,0.4099237,0.4133987,0.4185142,0.4128355,0.4091675,0.4125136,0.4196747,0.3925831,0.4244373,0.4232737
f2,0.5284698,0.030138,0.5038469,0.5824125,0.4981481,0.5571286,0.5296684,0.5096171,0.4891807,0.5213994,0.5342687,0.5590272
lift_top_group,2.4219158,0.3369018,1.8677095,2.7066193,2.7127059,2.7826896,2.169743,1.9044863,2.3576732,2.4862244,2.5711956,2.66011
logloss,0.4540093,0.0050258,0.4491447,0.465153,0.4518232,0.4493977,0.4541995,0.4502813,0.4594684,0.4549873,0.4511,0.4545387
max_per_class_error,0.415174,0.0454726,0.4053156,0.5186065,0.4294804,0.4223587,0.3713549,0.3955224,0.4501039,0.4032381,0.3543478,0.4014121

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
,2023-07-28 04:13:10,43 min 2.100 sec,0.0,0.3953483,0.4918229,0.5,0.1938958,1.0,0.8061042
,2023-07-28 04:13:10,43 min 2.425 sec,1.0,0.3933200,0.4867574,0.6605053,0.3085469,2.4294797,0.3690833
,2023-07-28 04:13:11,43 min 2.898 sec,2.0,0.3916418,0.4826303,0.6669602,0.3160750,2.6301069,0.3810625
,2023-07-28 04:13:11,43 min 3.306 sec,3.0,0.3902012,0.4791284,0.6733778,0.3218052,2.6974291,0.374375
,2023-07-28 04:13:12,43 min 3.799 sec,4.0,0.3889445,0.4761023,0.6764770,0.3262540,2.6826080,0.3610625
,2023-07-28 04:13:12,43 min 4.298 sec,5.0,0.3878778,0.4735156,0.6787174,0.3289882,2.6999075,0.37975
,2023-07-28 04:13:13,43 min 4.581 sec,6.0,0.3869065,0.4711842,0.6814258,0.3332346,2.8663893,0.3585
,2023-07-28 04:13:13,43 min 4.847 sec,7.0,0.3859873,0.4690315,0.6860639,0.3387525,2.9332760,0.3391667
,2023-07-28 04:13:13,43 min 5.122 sec,8.0,0.3851779,0.4670428,0.6894433,0.3424199,2.9655098,0.3928542
,2023-07-28 04:13:13,43 min 5.395 sec,9.0,0.3844054,0.4652842,0.6936915,0.3466833,2.8890047,0.3754167

variable,relative_importance,scaled_importance,percentage
TD044,1868.1754150,1.0,0.1720904
TD013,703.4257202,0.3765309,0.0647974
TD051,690.8513794,0.3698001,0.0636390
AP003,521.1041260,0.2789375,0.0480025
AP004,515.8940430,0.2761486,0.0475225
TD048,465.0143433,0.2489136,0.0428356
TD054,438.9985657,0.2349879,0.0404392
MB005,256.8387451,0.1374811,0.0236592
CR015,223.4551239,0.1196114,0.0205840
TD009,213.5543671,0.1143117,0.0196720


In [None]:
# Set 4: ntrees = 1000, learning_rate = 0.01, AUC Score = 0.6996308709085041(The best)
from h2o.estimators import H2OGradientBoostingEstimator

# Convert the response column to a factor so that H2O fits a classifier.
# Re-applying asfactor() to the same column keeps this cell runnable on its own.
train_hex[target] = train_hex[target].asfactor()

# Model Training
# model_id is the key under which the model is stored on the H2O cluster.
# Every experiment gets its own id: re-using one id for several sets makes the
# most recently trained model replace the earlier ones on the cluster.
GBM_modl4 = H2OGradientBoostingEstimator(
        model_id = 'GBM_modl4',
        ntrees = 1000,      # number of boosting iterations
        nfolds=10,          # 10-fold cross-validation
        min_rows=100,       # minimum number of observations per leaf
        learn_rate=0.01,    # shrinkage applied to each tree
        seed=1234)          # fixed seed for repeatable runs

GBM_modl4.train(x = predictors,
              y = target,
              training_frame = train_hex)

gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,1000.0,1000.0,382849.0,5.0,5.0,5.0,12.0,32.0,25.653

Unnamed: 0,0,1,Error,Rate
0,30854.0,7839.0,0.2026,(7839.0/38693.0)
1,3854.0,5453.0,0.4141,(3854.0/9307.0)
Total,34708.0,13292.0,0.2436,(11693.0/48000.0)

metric,threshold,value,idx
max f1,0.243885,0.4825877,190.0
max f2,0.1508194,0.6251348,276.0
max f0point5,0.3279215,0.4775966,130.0
max accuracy,0.4143028,0.8190833,83.0
max precision,0.7968941,1.0,0.0
max recall,0.0249322,1.0,392.0
max specificity,0.7968941,1.0,0.0
max absolute_mcc,0.2528615,0.3408477,183.0
max min_per_class_accuracy,0.2080249,0.7017299,220.0
max mean_per_class_accuracy,0.1967985,0.7041207,230.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.01,0.5361623,4.0936929,4.0936929,0.79375,0.5902262,0.79375,0.5902262,0.0409369,0.0409369,309.3692919,309.3692919,0.0383783
2,0.02,0.4912677,3.4060385,3.7498657,0.6604167,0.5115956,0.7270833,0.5509109,0.0340604,0.0749973,240.6038466,274.9865692,0.0682261
3,0.03,0.4603811,3.0514666,3.517066,0.5916667,0.4754602,0.6819444,0.5257607,0.0305147,0.105512,205.1466638,251.7066008,0.0936752
4,0.04,0.4361134,2.9547652,3.3764908,0.5729167,0.4479802,0.6546875,0.5063156,0.0295477,0.1350596,195.476523,237.6490813,0.1179248
5,0.05,0.4175299,2.7076394,3.2427205,0.525,0.4264568,0.62875,0.4903438,0.0270764,0.162136,170.7639411,224.2720533,0.1391086
6,0.1,0.3524522,2.5013431,2.8720318,0.485,0.3816218,0.556875,0.4359828,0.1250672,0.2872032,150.1343075,187.2031804,0.232232
7,0.15,0.3113694,2.019985,2.5880162,0.3916667,0.3306332,0.5018056,0.4008663,0.1009992,0.3882024,101.9984958,158.8016189,0.2954983
8,0.2,0.2809455,1.723434,2.3718706,0.3341667,0.2957442,0.4598958,0.3745858,0.0861717,0.4743741,72.3433974,137.1870635,0.3403706
9,0.3,0.2339772,1.3967981,2.0468465,0.2708333,0.2562662,0.396875,0.3351459,0.1396798,0.6140539,39.6798109,104.684646,0.3895947
10,0.4,0.1990167,1.1464489,1.8217471,0.2222917,0.2158659,0.3532292,0.3053259,0.1146449,0.7286988,14.6448909,82.1747072,0.4077622

Unnamed: 0,0,1,Error,Rate
0,24898.0,13795.0,0.3565,(13795.0/38693.0)
1,3308.0,5999.0,0.3554,(3308.0/9307.0)
Total,28206.0,19794.0,0.3563,(17103.0/48000.0)

metric,threshold,value,idx
max f1,0.1956511,0.4122882,237.0
max f2,0.1115422,0.5804672,320.0
max f0point5,0.2851495,0.3751288,164.0
max accuracy,0.574962,0.8067083,24.0
max precision,0.6838907,0.68,6.0
max recall,0.0192418,1.0,397.0
max specificity,0.7646074,0.9999742,0.0
max absolute_mcc,0.2150532,0.2327445,219.0
max min_per_class_accuracy,0.1956511,0.6434756,237.0
max mean_per_class_accuracy,0.1956511,0.6440221,237.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.01,0.526241,2.5787042,2.5787042,0.5,0.5811757,0.5,0.5811757,0.025787,0.025787,157.8704201,157.8704201,0.0195844
2,0.02,0.4836873,2.396046,2.4873751,0.4645833,0.5039555,0.4822917,0.5425656,0.0239605,0.0497475,139.6045987,148.7375094,0.0369028
3,0.03,0.4559522,2.2563662,2.4103721,0.4375,0.4694824,0.4673611,0.5182045,0.0225637,0.0723112,125.6366176,141.0372121,0.0524885
4,0.04,0.4310888,2.4067906,2.4094767,0.4666667,0.4433714,0.4671875,0.4994962,0.0240679,0.0963791,140.6790588,140.9476738,0.0699402
5,0.05,0.4126537,2.0951972,2.3466208,0.40625,0.421704,0.455,0.4839378,0.020952,0.117331,109.5197163,134.6620823,0.0835265
6,0.1,0.349563,2.0006447,2.1736327,0.3879167,0.3778407,0.4214583,0.4308892,0.1000322,0.2173633,100.0644676,117.363275,0.1455932
7,0.15,0.3087705,1.7083915,2.0185523,0.33125,0.3279913,0.3913889,0.3965899,0.0854196,0.3027829,70.8391533,101.8552344,0.1895324
8,0.2,0.278673,1.6267326,1.9205974,0.3154167,0.2928646,0.3723958,0.3706586,0.0813366,0.3841195,62.6732567,92.05974,0.2284066
9,0.3,0.2334678,1.3140647,1.7184198,0.2547917,0.2549341,0.3331944,0.3320838,0.1314065,0.5155259,31.4064682,71.8419827,0.2673674
10,0.4,0.1987942,1.1550446,1.577576,0.2239583,0.2152501,0.3058854,0.3028754,0.1155045,0.6310304,15.504459,57.7576018,0.2866012

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
accuracy,0.6549271,0.0328171,0.6629567,0.5780029,0.7040033,0.6476191,0.6457465,0.679416,0.6419702,0.6739175,0.6609596,0.6546793
auc,0.6996969,0.0081554,0.69554,0.6936017,0.7048988,0.6982487,0.694771,0.7055289,0.7000878,0.6845247,0.7129531,0.7068142
err,0.3450729,0.0328171,0.3370433,0.421997,0.2959967,0.352381,0.3542535,0.320584,0.3580298,0.3260825,0.3390404,0.3453207
err_count,1655.6,148.6713,1605.0,1999.0,1427.0,1702.0,1699.0,1559.0,1730.0,1589.0,1604.0,1642.0
f0point5,0.3457283,0.012689,0.3393275,0.3250745,0.3708151,0.3381368,0.3401514,0.3551602,0.3456681,0.3373016,0.3540085,0.3516395
f1,0.4157848,0.0088025,0.4070927,0.4111929,0.4238999,0.4118867,0.4147434,0.4193669,0.4190732,0.3987893,0.4267334,0.42507
f2,0.5226796,0.0217629,0.508678,0.5593845,0.494723,0.526781,0.531239,0.5119113,0.53206,0.487692,0.5370637,0.5372632
lift_top_group,2.548763,0.3344687,2.526901,2.9148207,2.8170407,2.9967427,2.061256,2.116096,2.7677033,2.270031,2.3569293,2.66011
logloss,0.4518853,0.0048335,0.4481241,0.4620761,0.4508367,0.4481303,0.4521217,0.4490112,0.4588227,0.451299,0.4474797,0.4509514
max_per_class_error,0.3889434,0.0423628,0.3898117,0.4615994,0.4432662,0.3528773,0.356129,0.3997868,0.3596899,0.4271739,0.3510869,0.3480129

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
,2023-07-28 05:00:49,43 min 25.321 sec,0.0,0.3953483,0.4918229,0.5,0.1938958,1.0,0.8061042
,2023-07-28 05:00:49,43 min 25.655 sec,1.0,0.3951382,0.4912922,0.6605053,0.3085469,2.4294797,0.3690833
,2023-07-28 05:00:50,43 min 25.893 sec,2.0,0.3949323,0.4907735,0.6607746,0.3088951,2.4294797,0.3570625
,2023-07-28 05:00:50,43 min 26.129 sec,3.0,0.3947299,0.4902645,0.6638480,0.3110741,2.4294797,0.3824583
,2023-07-28 05:00:50,43 min 26.358 sec,4.0,0.3945317,0.4897674,0.6638899,0.3105013,2.4294797,0.3703333
,2023-07-28 05:00:50,43 min 26.590 sec,5.0,0.3943371,0.4892803,0.6638556,0.3105414,2.4294797,0.3771667
,2023-07-28 05:00:51,43 min 26.829 sec,6.0,0.3941453,0.4888005,0.6649597,0.3149574,2.6301069,0.3703333
,2023-07-28 05:00:51,43 min 27.060 sec,7.0,0.3939579,0.4883331,0.6647471,0.3143096,2.6301069,0.3771667
,2023-07-28 05:00:51,43 min 27.293 sec,8.0,0.3937733,0.4878727,0.6649856,0.3150571,2.6301069,0.3703333
,2023-07-28 05:00:51,43 min 27.521 sec,9.0,0.3935927,0.4874236,0.6652170,0.3151883,2.6301069,0.3774792

variable,relative_importance,scaled_importance,percentage
TD013,6365.3789062,1.0,0.1168081
AP003,4968.4614258,0.7805445,0.0911739
AP004,4862.4741211,0.7638939,0.0892290
MB005,2163.3994141,0.3398697,0.0396995
CR015,2000.1376953,0.3142213,0.0367036
TD009,1760.8281250,0.2766258,0.0323121
TD005,1504.7496338,0.2363959,0.0276130
TD014,1409.4892578,0.2214305,0.0258649
CD123,1360.9567871,0.2138061,0.0249743
CD114,1338.9670410,0.2103515,0.0245708


Set 4 (1000 trees, learning rate 0.01) reaches the highest AUC score among the four (about 0.6996), slightly ahead of Set 3 (1000 trees, learning rate 0.1, about 0.6955). It also requires less training time because of the number of trees.

# (B)H2O Extreme Gradient Boosting (XGB)

XGBoost (XGB) is an optimized implementation of gradient boosting. It fits faster and has many built-in features, such as L1 and L2 regularization and categorical feature handling.

In [None]:
# Name of the response column; everything else in df is treated as a predictor.
target = 'loan_default'

# Split the pandas frame into the feature matrix and the response series.
X = df.drop(columns=[target])
y = df[target]

# Column names handed to the H2O estimators as the `x` argument.
predictors = X.columns.tolist()

In [None]:
from h2o.estimators import H2OXGBoostEstimator

# The response must be a factor for the estimator to be trained as a classifier.
train_hex[target] = train_hex[target].asfactor()

# Model 1 nTrees = 1000, learning_rate = 0.01, AUC Score = 0.7001708222976339
XGB_modl1 = H2OXGBoostEstimator(
    model_id='XGB_modl',
    seed=1234,
    nfolds=10,
    ntrees=1000,
    learn_rate=0.01,
    min_rows=100,
)

# Fit on the training frame; cross-validation is driven by nfolds above.
XGB_modl1.train(x=predictors, y=target, training_frame=train_hex)

xgboost Model Build progress: |



██████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees
,1000.0

Unnamed: 0,0,1,Error,Rate
0,30266.0,8427.0,0.2178,(8427.0/38693.0)
1,3728.0,5579.0,0.4006,(3728.0/9307.0)
Total,33994.0,14006.0,0.2532,(12155.0/48000.0)

metric,threshold,value,idx
max f1,0.2400101,0.4786171,199.0
max f2,0.153736,0.6212498,275.0
max f0point5,0.3421318,0.4708047,125.0
max accuracy,0.3804749,0.8183958,102.0
max precision,0.753851,1.0,0.0
max recall,0.0292232,1.0,391.0
max specificity,0.753851,1.0,0.0
max absolute_mcc,0.2709819,0.3334589,175.0
max min_per_class_accuracy,0.2074975,0.696465,226.0
max mean_per_class_accuracy,0.2083794,0.6967684,225.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.01,0.5320454,3.9432685,3.9432685,0.7645833,0.5843786,0.7645833,0.5843786,0.0394327,0.0394327,294.3268508,294.3268508,0.0365123
2,0.02,0.4863407,3.2556141,3.5994413,0.63125,0.5066369,0.6979167,0.5455077,0.0325561,0.0719888,225.5614054,259.9441281,0.0644939
3,0.03,0.459485,3.0514666,3.4167831,0.5916667,0.4724534,0.6625,0.5211563,0.0305147,0.1025035,205.1466638,241.6783067,0.0899431
4,0.04,0.4362179,2.7506178,3.2502418,0.5333333,0.4480268,0.6302083,0.5028739,0.0275062,0.1300097,175.0617815,225.0241754,0.1116601
5,0.05,0.4187086,2.879553,3.176104,0.5583333,0.4274676,0.6158333,0.4877927,0.0287955,0.1588052,187.9553025,217.6104008,0.1349766
6,0.1,0.3548145,2.4970452,2.8365746,0.4841667,0.3828169,0.55,0.4353048,0.1248523,0.2836575,149.7045235,183.6574621,0.2278334
7,0.15,0.3132569,1.9963468,2.5564987,0.3870833,0.332741,0.4956944,0.4011169,0.0998173,0.3834748,99.6346836,155.6498693,0.2896335
8,0.2,0.2821297,1.7105405,2.3450091,0.3316667,0.2969992,0.4546875,0.3750875,0.085527,0.4690018,71.0540453,134.5009133,0.333706
9,0.3,0.2356975,1.3978726,2.0292969,0.2710417,0.2581845,0.3934722,0.3361198,0.1397873,0.6087891,39.7872569,102.9296945,0.3830635
10,0.4,0.2006126,1.0701622,1.7895133,0.2075,0.2175305,0.3469792,0.3064725,0.1070162,0.7158053,7.0162243,78.951327,0.3917674

Unnamed: 0,0,1,Error,Rate
0,25790.0,12903.0,0.3335,(12903.0/38693.0)
1,3548.0,5759.0,0.3812,(3548.0/9307.0)
Total,29338.0,18662.0,0.3427,(16451.0/48000.0)

metric,threshold,value,idx
max f1,0.2043265,0.4118131,227.0
max f2,0.1127327,0.5814885,318.0
max f0point5,0.2985181,0.375613,151.0
max accuracy,0.56734,0.80675,24.0
max precision,0.6757038,0.5909091,5.0
max recall,0.0192736,1.0,398.0
max specificity,0.7382383,0.9999742,0.0
max absolute_mcc,0.214124,0.2327144,218.0
max min_per_class_accuracy,0.1962068,0.641775,234.0
max mean_per_class_accuracy,0.2006778,0.6428117,230.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.01,0.5262918,2.5894488,2.5894488,0.5020833,0.5801526,0.5020833,0.5801526,0.0258945,0.0258945,158.9448802,158.9448802,0.0197177
2,0.02,0.4848253,2.4067906,2.4981197,0.4666667,0.5037702,0.484375,0.5419614,0.0240679,0.0499624,140.6790588,149.8119695,0.0371694
3,0.03,0.4568761,2.3530676,2.449769,0.45625,0.4698189,0.475,0.5179139,0.0235307,0.0734931,135.3067584,144.9768991,0.0539547
4,0.04,0.4332438,2.3100892,2.414849,0.4479167,0.4447171,0.4682292,0.4996147,0.0231009,0.096594,131.008918,141.4849038,0.0702068
5,0.05,0.4144634,2.2241324,2.3767057,0.43125,0.4235082,0.4608333,0.4843934,0.0222413,0.1188353,122.4132373,137.6705705,0.0853925
6,0.1,0.3511264,2.0070914,2.1918986,0.3891667,0.3792713,0.425,0.4318324,0.1003546,0.2191899,100.7091437,119.1898571,0.1478591
7,0.15,0.3117668,1.7298807,2.0378926,0.3354167,0.3302713,0.3951389,0.3979787,0.086494,0.3056839,72.9880735,103.7892626,0.1931312
8,0.2,0.2817946,1.5751585,1.9222091,0.3054167,0.2960679,0.3727083,0.372501,0.0787579,0.3844418,57.5158483,92.220909,0.2288064
9,0.3,0.2353376,1.3237348,1.7227177,0.2566667,0.2573128,0.3340278,0.3341049,0.1323735,0.5168153,32.3734823,72.2717668,0.2689669
10,0.4,0.1998127,1.1335554,1.5754271,0.2197917,0.2170841,0.3054688,0.3048497,0.1133555,0.6301708,13.3555388,57.5427098,0.2855349

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
accuracy,0.6606132,0.0261902,0.6551869,0.6012244,0.6560879,0.6540373,0.67598,0.6864076,0.6779801,0.6412888,0.6666667,0.6912723
auc,0.7002127,0.0068706,0.6961915,0.6985347,0.7081116,0.6985987,0.6960114,0.7041354,0.6980374,0.6867502,0.7093515,0.7064042
err,0.3393868,0.0261902,0.3448131,0.3987756,0.3439121,0.3459627,0.32402,0.3135924,0.3220199,0.3587113,0.3333333,0.3087277
err_count,1628.8,122.44255,1642.0,1889.0,1658.0,1671.0,1554.0,1525.0,1556.0,1748.0,1577.0,1468.0
f0point5,0.3463885,0.0124859,0.3343777,0.3316733,0.3476841,0.3413957,0.351022,0.3544137,0.3578745,0.326602,0.3536466,0.3651951
f1,0.4142369,0.0080211,0.403343,0.4135362,0.4182456,0.4138899,0.4157895,0.4136871,0.4181002,0.3984859,0.4229784,0.4243137
f2,0.5160081,0.0153745,0.5081487,0.5490519,0.5247403,0.525472,0.5098654,0.4967682,0.5026978,0.5109425,0.5261242,0.5062699
lift_top_group,2.5792608,0.4801423,2.526901,2.6025186,3.4430497,2.5686367,1.8442816,2.010291,2.870211,2.1619344,2.785462,2.9793234
logloss,0.4515784,0.0044617,0.4475892,0.460028,0.4489223,0.4486349,0.4521412,0.4491549,0.4591788,0.4507291,0.448216,0.4511894
max_per_class_error,0.394309,0.0262908,0.3853821,0.4241224,0.3679746,0.359392,0.3995657,0.4264392,0.4189189,0.3706522,0.3717391,0.4189044

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
,2023-07-28 06:01:23,43 min 59.663 sec,0.0,0.5,0.6931472,0.5,0.1938958,1.0,0.8061042
,2023-07-28 06:01:24,44 min 0.772 sec,1.0,0.4979498,0.6890551,0.6721442,0.3217525,2.6483989,0.3712083
,2023-07-28 06:01:24,44 min 1.232 sec,2.0,0.4959325,0.6850447,0.6725398,0.3225011,2.6483989,0.3745417
,2023-07-28 06:01:25,44 min 1.717 sec,3.0,0.4939489,0.6811163,0.6736908,0.3232925,2.6483989,0.4004583
,2023-07-28 06:01:25,44 min 2.043 sec,4.0,0.4919948,0.6772608,0.6735995,0.3234084,2.6483989,0.4125833
,2023-07-28 06:01:26,44 min 2.533 sec,5.0,0.4900740,0.6734841,0.6740365,0.3237004,2.6483989,0.4018542
,2023-07-28 06:01:26,44 min 2.948 sec,6.0,0.4881820,0.6697768,0.6740217,0.3239534,2.6483989,0.3999583
,2023-07-28 06:01:26,44 min 3.236 sec,7.0,0.4863223,0.6661446,0.6742108,0.3241119,2.6483989,0.39975
,2023-07-28 06:01:26,44 min 3.509 sec,8.0,0.4844926,0.6625823,0.6741803,0.3239743,2.6483989,0.3961875
,2023-07-28 06:01:27,44 min 3.798 sec,9.0,0.4826905,0.6590841,0.6741799,0.3240257,2.6483989,0.3914375

variable,relative_importance,scaled_importance,percentage
TD013,40650.6718750,1.0,0.1225665
AP004,31903.7050781,0.7848260,0.0961934
AP003,28178.8300781,0.6931947,0.0849624
MB005,12742.6855469,0.3134680,0.0384207
CR015,12441.1972656,0.3060515,0.0375116
CD123,9301.6757812,0.2288197,0.0280456
TD005,8259.1494141,0.2031737,0.0249023
TD014,8134.7065430,0.2001125,0.0245271
TD009,7351.5151367,0.1808461,0.0221657
AP006_android,5840.6513672,0.1436791,0.0176102


Hyperparameters [ntrees = 1000, nfolds=10, min_rows=100, learn_rate=0.01] reach the best result, with an AUC of 0.7001708222976339. Compared to GBM, the whole process is faster, and the result is also better than all four of the GBM configurations.

# (C)H2O Deep Learning

Deep learning's parameters are not intuitive and can be more complicated than the previous ones. Instead of just modifying the learning rate and ntrees as before, there are many hyperparameters that need to be taken care of. I tried to write a grid search to do hyperparameter tuning, but unfortunately not all of the parameters are compatible with the model. I deleted the grid search part and chose the configuration that can run on the GPU, which is shown below.

The resulting AUC score is actually the worst of the three models, yet still better than last week's RF (which was about 0.67).

In [None]:
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

# Build the estimator with its effective settings in one place.
# The earlier version constructed the model with one set of values and then
# overwrote most of them through attribute assignments; only the final values
# were ever used for training, so they are passed directly here.
#
# The former `DL_modl.learning_rate = 0.01` assignment is intentionally gone:
# H2O Deep Learning has no `learning_rate` parameter, so that line only created
# an unused Python attribute and never influenced training. A fixed learning
# rate would be requested with `adaptive_rate=False, rate=0.01`; that is NOT
# enabled here so the trained model stays the same as before.
#
# NOTE(review): "tweedie" is a regression distribution, while earlier cells
# convert the response with asfactor() -- confirm which response type
# train_hex actually holds when this cell runs.
DL_modl = H2ODeepLearningEstimator(
    distribution="tweedie",
    activation="Tanh",           # activation function of the hidden layers
    hidden=[100, 200, 100],      # three hidden layers and their neuron counts
    input_dropout_ratio=0.2,     # dropout applied to the input layer
    sparse=False,                # sparse data handling turned off
    l1=1e-5,                     # L1 regularization
    l2=0.001,                    # L2 regularization
    epochs=20                    # passes over the training data
)

# NOTE(review): test_hex is used both as the validation frame during training
# and as the evaluation set below, so the reported score may be optimistic --
# consider a separate validation split.
DL_modl.train(
    x=predictors,
    y=target,
    training_frame=train_hex,
    validation_frame=test_hex
)

y_pred = DL_modl.predict(test_hex)
y_actual = test_hex[target].as_data_frame()

# Convert H2OFrame predictions to a pandas DataFrame
y_pred_df = y_pred.as_data_frame()
y_pred_series = y_pred_df['predict']  # Extract the 'predict' column as Series

# Concatenate the actual and predicted values
DL_modl_actual_predict = pd.concat([y_actual, y_pred_series], axis=1)
DL_modl_actual_predict.columns = ['actual', 'pred']

# Calculate the ROC AUC value
# (roc_auc_score is expected to be imported from sklearn.metrics by an earlier cell.)
DL_modl_roc_auc_value = roc_auc_score(DL_modl_actual_predict['actual'], DL_modl_actual_predict['pred'])
print("DL_modl ROC AUC:", DL_modl_roc_auc_value)

deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%
DL_modl ROC AUC: 0.6860183726845477




# Compare the models ROC AUC scores

In [None]:
# Summary table: one row per model family, holding the ROC AUC of its
# best-performing hyperparameter combination from the sections above.
import pandas as pd

roc_auc_scores = dict([
    ('GBM', 0.6996308709085041),
    ('XGB', 0.7001708222976339),
    ('Deep Learning', 0.6860183726845477),
])

# Model names become the index, the scores the single column.
roc_auc_df = pd.DataFrame(
    list(roc_auc_scores.values()),
    index=list(roc_auc_scores.keys()),
    columns=['ROC_AUC_Score'],
)

roc_auc_df

Unnamed: 0,ROC_AUC_Score
GBM,0.699631
XGB,0.700171
Deep Learning,0.686018


From the above result, XGB beats the other two.