In [2]:
#Imports
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, confusion_matrix, precision_score, accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from Utilities import utils

## Data Processsing

In [3]:
# Load the raw MAFLD data.
data_dir = "Data/NTCMRC_all.xlsx"
df = pd.read_excel(data_dir)

In [4]:
# Create a copy of df as df1
df1 = df.copy()

# Replace '\\N' with NaN
df1 = df1.replace('\\N', np.nan)

# Specify the columns to be converted to FLOAT
columns_to_convert1 = ['BMI', 'Triglyceride_y', 'gamgt', 'waist_y', 'mst', 'egfrn', 'Estimated_GFR_x', 'Alb_Cre_ratio', 'HOMA_IR', 'HS_CRP', \
                       'LDL_C_direct', 'LDL_C_HDL_C', 'Adiponectin', 'Leptin', 'Uric_Acid','Insulin', 'ALT_GPT']

# Specify the columns to be converted to INT
columns_to_convert2 = ['脂肪肝 fatty Liver (0:正常  1:mild 2:moderate 3:severe)', 'smoke', 'smoke_q', \
                       'sex', 'w', 'coffee', 'betel']

# Convert the specified columns to float and fill missing/unconvertible values with NaN
for column in columns_to_convert1:
    df1[column] = pd.to_numeric(df1[column], errors='coerce')

# Convert the specified columns to int and fill missing/unconvertible values with NaN
for column in columns_to_convert2:
    df1[column] = pd.to_numeric(df1[column], errors='coerce').astype(pd.Int64Dtype())

# Calculate FLI using the formula and defined as df2
df2 = df1.copy()
df2['FLI'] = (np.exp(0.953 * np.log(df2['Triglyceride_y']) + 0.139 * df2['BMI'] + 0.718 * np.log(df2['gamgt']) \
     + 0.053 * df2['waist_y'] - 15.745)) / (1 + np.exp(0.953 * np.log(df2['Triglyceride_y']) \
    + 0.139 * df2['BMI'] + 0.718 * np.log(df2['gamgt']) + 0.053 * df2['waist_y'] - 15.745)) * 100

# Derive FL_echo based on ultrasound results column
df2['FL_echo'] = df2['脂肪肝 fatty Liver (0:正常  1:mild 2:moderate 3:severe)']
df2['FL_echo'] = df2['FL_echo'].replace('<NA>', np.nan)
df2['fl_status'] = df2.apply(utils.derive_fl_status, axis=1)

#Derive homa_ir_check, hs_crp_check, and mst_total to determine MAFLD risk factors
df2['homa_ir_check'] = df2['HOMA_IR'].apply(lambda x: 1 if x >= 2.5 else 0)
df2['hs_crp_check'] = df2['HS_CRP'].apply(lambda x: 1 if x > 2 else 0)
df2['mst_total'] = df2[['w', 'hyper', 'HDL', 'fg', 'trig', 'homa_ir_check', 'hs_crp_check']].sum(axis=1)

df3 = utils.derive_MAFLD_with_multi_label(utils.derive_MAFLD(df2))
df4 = utils.derive_CKD(df3)

columns_to_extract = ['HBsAg_x', 'Anti_HCV_x']
df5 = df4.copy()
for column in columns_to_extract:
    new_column_name = column + '_num'
    df5[new_column_name] = df5[column].apply(utils.extract_numeric_value)

# Filtering - for patients with same CMRC)id and year_come has more than two records
count_2_or_more_year_come = df5.groupby(['CMRC_id', 'year_come']).filter(lambda x: len(x) >= 2)
unique_patients = count_2_or_more_year_come['CMRC_id'].nunique()
patients_to_remove = df5.groupby(['CMRC_id', 'year_come']).filter(lambda x: len(x) >= 2)['CMRC_id'].unique()
df5_filtered = df5[~df5['CMRC_id'].isin(patients_to_remove)]

df6 = utils.sliding_window_multi_label_data(df5_filtered, input_window_size=1, target_window_size=1)

# Filtering - filter df with first year patient with NON-MAFLD and target year(2nd year) MAFLD status valid
filtered_df1 = df6[(df6['t1_MAFLD_0'] == 1) & (df6['t2_MAFLD_0'] != -1)]

In [5]:
# Feature Selection - manually selected by domain expert

# Drop ID relevant cols in the dataset
columns_to_drop = ['CMRC_id','t1_CMRC_id', 't1_sid','t1_P_Number']
df8 = filtered_df1.drop(columns=columns_to_drop)

#Select key columns for conventional machine learning models
columns = ["sex", "age", "waist_y", "Glucose_AC_y", "Triglyceride_y", "HDL_C_y", "AST_GOT", "ALT_GPT", \
          "gamgt", "Insulin", "T_Cholesterol", "LDL_C_direct", "VLDL_C", "Non_HDL_C", "T_CHOL_HDL_C", \
          "LDL_C_HDL_C", "HS_CRP", "Hb_A1c", "Uric_Acid", "HBsAg_x", "Anti_HCV_x", "HOMA_IR", "Adiponectin", \
           "Leptin", "TotalVitaminD", "smoke", "smoke_q", "coffee", "betel", "BMI", "DM_determine", "w", "hyper", \
           "fg", "HDL", "trig", "sarcf", "ms2", "MNA", "AUDIT", "HBV_", "HCV_", "MAFLD", "CKD", \
           'HBsAg_x_num', 'Anti_HCV_x_num', \
           'MAFLD_0', 'MAFLD_Obesity', 'MAFLD_MD', 'MAFLD_Diabetes', \
           'year_come']
# prefixes = ["t1_", "t2_"]
prefixes = ["t1_"]
renamed_columns = utils.add_prefix(columns, prefixes)

df9 = df8[renamed_columns]
# df9['t2_MAFLD'] = df8['t2_MAFLD']
df9['t2_MAFLD_0'] = df8['t2_MAFLD_0']
df9['t2_MAFLD_Obesity'] = df8['t2_MAFLD_Obesity']
df9['t2_MAFLD_MD'] = df8['t2_MAFLD_MD']
df9['t2_MAFLD_Diabetes'] = df8['t2_MAFLD_Diabetes']

# drop these cols as those been derived for numeric cols, remain alias *_num, and MAFLD status for 1 year
cols_to_drop_only_MAFLD = ['t1_HBsAg_x',  't1_Anti_HCV_x', 't1_MAFLD', 't1_MAFLD_0',\
                           't1_MAFLD_Obesity', 't1_MAFLD_MD', 't1_MAFLD_Diabetes', \
                           ]

df9_processed = df9.drop(cols_to_drop_only_MAFLD, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df9['t2_MAFLD_0'] = df8['t2_MAFLD_0']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df9['t2_MAFLD_Obesity'] = df8['t2_MAFLD_Obesity']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df9['t2_MAFLD_MD'] = df8['t2_MAFLD_MD']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using 

## Data set split for train-validation-test

In [6]:
# 前4年 training， 4-5 testing
# filter for for training set
filtered_df_train = df9_processed[df9_processed['t1_year_come'] <= 3]

# filter for year4 for test set
filtered_df_test = df9_processed[df9_processed['t1_year_come'] >= 4]

# df9_processed.drop('t1_MAFLD_0', axis=1, inplace=True)
features = df9_processed.columns.drop(['t2_MAFLD_0', 't2_MAFLD_Obesity', 't2_MAFLD_Diabetes', 't2_MAFLD_MD'])

categorical_features = ['t1_sex', 't1_w', 't1_smoke', 't1_smoke_q', 't1_coffee', 't1_betel', 't1_DM_determine', 't1_CKD']
numeric_features = df9_processed.columns.drop(categorical_features).drop(['t2_MAFLD_0', 't2_MAFLD_Obesity', 't2_MAFLD_Diabetes', 't2_MAFLD_MD'])
X_categorical = df9_processed[categorical_features]
X_numeric = df9_processed[numeric_features]

# multi-label for target variable
y = df9_processed[['t2_MAFLD_0', 't2_MAFLD_Obesity', 't2_MAFLD_Diabetes', 't2_MAFLD_MD']]

# Scaling
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)

# Missing value handling
imputer = SimpleImputer(strategy='median')
X_numeric_scaled_imputed = imputer.fit_transform(X_numeric_scaled)

#dummy var
X_categorical_str = X_categorical.astype(str)
X_categorical_encoded = pd.get_dummies(X_categorical_str, drop_first=True)

# concat
X_numeric_scaled_imputed = pd.DataFrame(X_numeric_scaled_imputed, columns=X_numeric.columns)
X_numeric_scaled_imputed.reset_index(drop=True, inplace=True)
X_categorical_encoded.reset_index(drop=True, inplace=True)
X_combined = pd.concat([X_numeric_scaled_imputed, X_categorical_encoded], axis=1)

In [7]:
# 前4年 training， 4-5 testing
# filter for for training set
df_train = df9_processed[df9_processed['t1_year_come'] <= 3]

# filter for year4 for test set
df_test = df9_processed[df9_processed['t1_year_come'] >= 4]

# Separate data with Train, validate, test set
# df_validation = df_test_and_validation.sample(n=500, random_state=1)
# df_test = df_test_and_validation.drop(df_validation.index)

print(f"The shape of the training set is {df_train.shape}")
# print(f"The shape of the validation set is {df_validation.shape}")
print(f"The shape of the testing set is {df_test.shape}")

The shape of the training set is (8671, 48)
The shape of the testing set is (2527, 48)


In [8]:
# Usage
# feature processing for scaling and missing imputation
X_train, y_train = utils.preprocess_features_and_target(df_train)
# X_val, y_val = utils.preprocess_features_and_target(df_validation)
X_test, y_test = utils.preprocess_features_and_target(df_test)

## Experiment and Modeling

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, hamming_loss, f1_score
from sklearn.model_selection import GridSearchCV, PredefinedSplit

In [10]:
# List of target variables - aim to apply 4 classfier for each target label
targets = ['t2_MAFLD_0', 't2_MAFLD_Obesity', 't2_MAFLD_Diabetes', 't2_MAFLD_MD']

### Logistic Regression

In [11]:
# Dictionary to store AUC scores for each target
auc_scores = {}

# Initialize an empty DataFrame for predictions
y_pred_df = pd.DataFrame()

# Define the parameter grid for hyperparameter tuning
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

print("---Starting Training Model with Logistic Regression---")
# Train a separate Logistic Regression model for each target
for target in targets:
    # Initialize Logistic Regression and GridSearchCV
    lr = LogisticRegression(solver='liblinear', random_state=2023)
    grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='roc_auc')

    # Perform hyperparameter tuning
    grid_search.fit(X_train, y_train[target])

    # Best model and parameters after hyperparameter tuning
    best_lr = grid_search.best_estimator_
    best_params = grid_search.best_params_
    print(f"Best hyperparameters for {target}: {best_params}")

    # Predict on the test set
    y_pred_prob = best_lr.predict_proba(X_test)[:, 1]
    
    # Calculate AUC on the test set and store in the dictionary
    auc_score = roc_auc_score(y_test[target], y_pred_prob)
    auc_scores[target] = auc_score

    # Add predictions to the DataFrame
    y_pred_df[target] = best_lr.predict(X_test)

# Calculate accuracy on the test set
print("\n---Apply Best Hyper Param for Test Set---")
accuracy = accuracy_score(y_test[targets], y_pred_df)
print(f"Overall Test Accuracy: {accuracy}")

# Print AUC for each target
print("AUC Scores for Each Target:")
for target, score in auc_scores.items():
    print(f"{target}: {score}")

---Starting Training Model with Logistic Regression---


Best hyperparameters for t2_MAFLD_0: {'C': 0.1}
Best hyperparameters for t2_MAFLD_Obesity: {'C': 0.1}
Best hyperparameters for t2_MAFLD_Diabetes: {'C': 1}
Best hyperparameters for t2_MAFLD_MD: {'C': 1}

---Apply Best Hyper Param for Test Set---
Overall Test Accuracy: 0.8737633557578156
AUC Scores for Each Target:
t2_MAFLD_0: 0.795542583445592
t2_MAFLD_Obesity: 0.8170088656249396
t2_MAFLD_Diabetes: 0.9282339367634536
t2_MAFLD_MD: 0.7903016380343588


### SVM

In [11]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.svm import SVC
# from sklearn.metrics import roc_auc_score, accuracy_score, hamming_loss, f1_score
# import pandas as pd
# import numpy as np

# # Initialize DataFrames for storing probabilities and predictions
# y_prob_df = pd.DataFrame()
# y_pred_df = pd.DataFrame()

# # Define the parameter grid for hyperparameter tuning
# param_grid = {'C': [0.1, 1, 10], 'gamma': ['scale', 'auto'], 'kernel': ['rbf', 'linear']}

# print("---Starting Training Model with SVM---")
# # Loop through each target variable for model training and evaluation
# for target in targets:
#     # Initialize SVC and GridSearchCV
#     svm = SVC(probability=True)
#     grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='roc_auc')

#     # Perform hyperparameter tuning
#     grid_search.fit(X_train, y_train[target])

#     # Best model and parameters after hyperparameter tuning
#     best_svm = grid_search.best_estimator_
#     print(f"Best hyperparameters for {target}: {grid_search.best_params_}")

#     # Predict probabilities and classes on the test set
#     print("\n---Apply Best Hyper Param for Test Set---")
#     y_prob = best_svm.predict_proba(X_test)[:, 1]
#     y_pred = best_svm.predict(X_test)

#     # Store the probabilities and predictions
#     y_prob_df[target] = y_prob
#     y_pred_df[target] = y_pred

#     # Calculate and print AUC for the target
#     print("AUC Scores for Each Target:")
#     auc_score = roc_auc_score(y_test[target], y_prob)
#     print(f"{target}: {auc_score:.4f}")

# # Calculate overall performance metrics
# true_labels_flat = np.hstack([y_test[t] for t in targets])
# pred_probs_flat = np.hstack([y_prob_df[t] for t in targets])
# pred_labels_flat = np.hstack([y_pred_df[t] for t in targets])

# accuracy = accuracy_score(true_labels_flat, pred_labels_flat)

# # Print overall results
# print(f"Overall Test Accuracy: {accuracy}")

---Starting Training Model with SVM---


### Decision Tree

In [12]:
# Initialize DataFrames for storing probabilities and predictions
y_prob_df = pd.DataFrame()
y_pred_df = pd.DataFrame()

# Define the parameter grid for hyperparameter tuning
param_grid = {'max_depth': [None, 10, 20, 30], 
              'min_samples_split': [2, 5, 10], 
              'min_samples_leaf': [1, 2, 4]}

# Loop through each target variable for model training and evaluation
for target in targets:
    # Initialize DecisionTreeClassifier and GridSearchCV
    dt_clf = DecisionTreeClassifier(random_state=2023)
    grid_search = GridSearchCV(dt_clf, param_grid, cv=5, scoring='roc_auc')

    # Perform hyperparameter tuning
    grid_search.fit(X_train, y_train[target])

    # Best model and parameters after hyperparameter tuning
    best_dt = grid_search.best_estimator_
    print(f"Best hyperparameters for {target}: {grid_search.best_params_}")

    # Predict probabilities and classes on the test set
    y_prob = best_dt.predict_proba(X_test)[:, 1]
    y_pred = best_dt.predict(X_test)

    # Store the probabilities and predictions
    y_prob_df[target] = y_prob
    y_pred_df[target] = y_pred

    # Calculate and print AUC for the target
    auc_score = roc_auc_score(y_test[target], y_prob)
    print(f"Decision Tree AUC for {target}: {auc_score:.4f}")

# Calculate overall performance metrics
true_labels_flat = np.hstack([y_test[t] for t in targets])
pred_probs_flat = np.hstack([y_prob_df[t] for t in targets])
pred_labels_flat = np.hstack([y_pred_df[t] for t in targets])

accuracy = accuracy_score(true_labels_flat, pred_labels_flat)
auc_micro = roc_auc_score(true_labels_flat, pred_probs_flat)
auc_macro = np.mean([roc_auc_score(y_test[t], y_prob_df[t]) for t in targets])

# Print overall results
print("---Apply Best Hyper Param for Test Set---")
print(f"Overall Test Accuracy: {accuracy:.4f}")
print(f"AUC Scores for Each Target:")
for target in targets:
    print(f"{target}: {roc_auc_score(y_test[target], y_prob_df[target]):.4f}")


Best hyperparameters for t2_MAFLD_0: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Decision Tree AUC for t2_MAFLD_0: 0.6680
Best hyperparameters for t2_MAFLD_Obesity: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Decision Tree AUC for t2_MAFLD_Obesity: 0.6740
Best hyperparameters for t2_MAFLD_Diabetes: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Decision Tree AUC for t2_MAFLD_Diabetes: 0.7176
Best hyperparameters for t2_MAFLD_MD: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5}
Decision Tree AUC for t2_MAFLD_MD: 0.7355
---Apply Best Hyper Param for Test Set---
Overall Test Accuracy: 0.8766
AUC Scores for Each Target:
t2_MAFLD_0: 0.6680
t2_MAFLD_Obesity: 0.6740
t2_MAFLD_Diabetes: 0.7176
t2_MAFLD_MD: 0.7355


### Emsemble method - Adaboost

In [13]:
# Initialize DataFrames for storing probabilities and predictions
y_prob_df = pd.DataFrame()
y_pred_df = pd.DataFrame()

# Define the parameter grid for hyperparameter tuning
param_grid = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1]}

# Loop through each target variable for model training and evaluation
for target in targets:
    # Initialize AdaBoost and GridSearchCV
    ada_clf = AdaBoostClassifier(random_state=2023)
    grid_search = GridSearchCV(ada_clf, param_grid, cv=5, scoring='roc_auc')

    # Perform hyperparameter tuning
    grid_search.fit(X_train, y_train[target])

    # Best model and parameters after hyperparameter tuning
    best_ada = grid_search.best_estimator_
    print(f"Best hyperparameters for {target}: {grid_search.best_params_}")

    # Predict probabilities and classes on the test set
    y_prob = best_ada.predict_proba(X_test)[:, 1]
    y_pred = best_ada.predict(X_test)

    # Store the probabilities and predictions
    y_prob_df[target] = y_prob
    y_pred_df[target] = y_pred

    # Calculate and print AUC for the target
    auc_score = roc_auc_score(y_test[target], y_prob)
    print(f"AdaBoost AUC for {target}: {auc_score:.4f}")

# Calculate overall performance metrics
true_labels_flat = np.hstack([y_test[t] for t in targets])
pred_probs_flat = np.hstack([y_prob_df[t] for t in targets])
pred_labels_flat = np.hstack([y_pred_df[t] for t in targets])

accuracy = accuracy_score(true_labels_flat, pred_labels_flat)
auc_micro = roc_auc_score(true_labels_flat, pred_probs_flat)
auc_macro = np.mean([roc_auc_score(y_test[t], y_prob_df[t]) for t in targets])

# Print overall results
print("---Apply Best Hyper Param for Test Set---")
print(f"Overall Test Accuracy: {accuracy:.4f}")
print(f"AUC Scores for Each Target:")
for target in targets:
    print(f"{target}: {roc_auc_score(y_test[target], y_prob_df[target]):.4f}")


Best hyperparameters for t2_MAFLD_0: {'learning_rate': 0.1, 'n_estimators': 100}
AdaBoost AUC for t2_MAFLD_0: 0.7754
Best hyperparameters for t2_MAFLD_Obesity: {'learning_rate': 0.1, 'n_estimators': 100}
AdaBoost AUC for t2_MAFLD_Obesity: 0.8123
Best hyperparameters for t2_MAFLD_Diabetes: {'learning_rate': 0.1, 'n_estimators': 50}
AdaBoost AUC for t2_MAFLD_Diabetes: 0.9330
Best hyperparameters for t2_MAFLD_MD: {'learning_rate': 0.01, 'n_estimators': 200}
AdaBoost AUC for t2_MAFLD_MD: 0.8425
---Apply Best Hyper Param for Test Set---
Overall Test Accuracy: 0.9348
AUC Scores for Each Target:
t2_MAFLD_0: 0.7754
t2_MAFLD_Obesity: 0.8123
t2_MAFLD_Diabetes: 0.9330
t2_MAFLD_MD: 0.8425


### Emsemble method - Bagging

In [14]:
# Initialize DataFrames for storing probabilities and predictions
y_prob_df = pd.DataFrame()
y_pred_df = pd.DataFrame()

# Define the parameter grid for hyperparameter tuning
param_grid = {'n_estimators': [10, 50, 100], 'max_samples': [0.5, 1.0], 'max_features': [0.5, 1.0]}

# Loop through each target variable for model training and evaluation
for target in targets:
    # Initialize BaggingClassifier and GridSearchCV
    bagging_clf = BaggingClassifier(random_state=2023)
    grid_search = GridSearchCV(bagging_clf, param_grid, cv=5, scoring='roc_auc')

    # Perform hyperparameter tuning
    grid_search.fit(X_train, y_train[target])

    # Best model and parameters after hyperparameter tuning
    best_bagging = grid_search.best_estimator_
    print(f"Best hyperparameters for {target}: {grid_search.best_params_}")

    # Predict probabilities and classes on the test set
    y_prob = best_bagging.predict_proba(X_test)[:, 1]
    y_pred = best_bagging.predict(X_test)

    # Store the probabilities and predictions
    y_prob_df[target] = y_prob
    y_pred_df[target] = y_pred

    # Calculate and print AUC for the target
    auc_score = roc_auc_score(y_test[target], y_prob)
    print(f"Bagging AUC for {target}: {auc_score:.4f}")

# Calculate overall performance metrics
true_labels_flat = np.hstack([y_test[t] for t in targets])
pred_probs_flat = np.hstack([y_prob_df[t] for t in targets])
pred_labels_flat = np.hstack([y_pred_df[t] for t in targets])

accuracy = accuracy_score(true_labels_flat, pred_labels_flat)
auc_micro = roc_auc_score(true_labels_flat, pred_probs_flat)
auc_macro = np.mean([roc_auc_score(y_test[t], y_prob_df[t]) for t in targets])

# Print overall results
print("---Apply Best Hyper Param for Test Set---")
print(f"Overall Test Accuracy: {accuracy:.4f}")
print(f"AUC Scores for Each Target:")
for target in targets:
    print(f"{target}: {roc_auc_score(y_test[target], y_prob_df[target]):.4f}")


Best hyperparameters for t2_MAFLD_0: {'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 100}
Bagging AUC for t2_MAFLD_0: 0.7270
Best hyperparameters for t2_MAFLD_Obesity: {'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 100}
Bagging AUC for t2_MAFLD_Obesity: 0.7528
Best hyperparameters for t2_MAFLD_Diabetes: {'max_features': 0.5, 'max_samples': 1.0, 'n_estimators': 100}
Bagging AUC for t2_MAFLD_Diabetes: 0.9173
Best hyperparameters for t2_MAFLD_MD: {'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 100}
Bagging AUC for t2_MAFLD_MD: 0.7829
---Apply Best Hyper Param for Test Set---
Overall Test Accuracy: 0.9337
AUC Scores for Each Target:
t2_MAFLD_0: 0.7270
t2_MAFLD_Obesity: 0.7528
t2_MAFLD_Diabetes: 0.9173
t2_MAFLD_MD: 0.7829


### Emsemble method - Random Forest

In [16]:
# Initialize DataFrames for storing probabilities and predictions
y_prob_df = pd.DataFrame()
y_pred_df = pd.DataFrame()

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300], 
    'max_depth': [None, 10, 20, 30], 
    'min_samples_split': [2, 5, 10], 
    'min_samples_leaf': [1, 2, 4]
}

# Loop through each target variable for model training and evaluation
for target in targets:
    # Initialize RandomForestClassifier and GridSearchCV
    rf_clf = RandomForestClassifier(random_state=2023)
    grid_search = GridSearchCV(rf_clf, param_grid, cv=5, scoring='roc_auc')

    # Perform hyperparameter tuning
    grid_search.fit(X_train, y_train[target])

    # Best model and parameters after hyperparameter tuning
    best_rf = grid_search.best_estimator_
    print(f"Best hyperparameters for {target}: {grid_search.best_params_}")

    # Predict probabilities and classes on the test set
    y_prob = best_rf.predict_proba(X_test)[:, 1]
    y_pred = best_rf.predict(X_test)

    # Store the probabilities and predictions
    y_prob_df[target] = y_prob
    y_pred_df[target] = y_pred

    # Calculate and print AUC for the target
    auc_score = roc_auc_score(y_test[target], y_prob)
    print(f"Random Forest AUC for {target}: {auc_score:.4f}")

# Calculate overall performance metrics
true_labels_flat = np.hstack([y_test[t] for t in targets])
pred_probs_flat = np.hstack([y_prob_df[t] for t in targets])
pred_labels_flat = np.hstack([y_pred_df[t] for t in targets])

accuracy = accuracy_score(true_labels_flat, pred_labels_flat)
auc_micro = roc_auc_score(true_labels_flat, pred_probs_flat)
auc_macro = np.mean([roc_auc_score(y_test[t], y_prob_df[t]) for t in targets])

# Print overall results
print("---Apply Best Hyper Param for Test Set---")
print(f"Overall Test Accuracy: {accuracy:.4f}")
print(f"AUC Scores for Each Target:")
for target in targets:
    print(f"{target}: {roc_auc_score(y_test[target], y_prob_df[target]):.4f}")


### Emsemble method - Gradient Boosting

In [None]:
# Initialize DataFrames for storing probabilities and predictions
y_prob_df = pd.DataFrame()
y_pred_df = pd.DataFrame()

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Loop through each target variable for model training and evaluation
for target in targets:
    # Initialize GradientBoostingClassifier and GridSearchCV
    gb_clf = GradientBoostingClassifier(random_state=2023)
    grid_search = GridSearchCV(gb_clf, param_grid, cv=5, scoring='roc_auc')

    # Perform hyperparameter tuning
    grid_search.fit(X_train, y_train[target])

    # Best model and parameters after hyperparameter tuning
    best_gb = grid_search.best_estimator_
    print(f"Best hyperparameters for {target}: {grid_search.best_params_}")

    # Predict probabilities and classes on the test set
    y_prob = best_gb.predict_proba(X_test)[:, 1]
    y_pred = best_gb.predict(X_test)

    # Store the probabilities and predictions
    y_prob_df[target] = y_prob
    y_pred_df[target] = y_pred

    # Calculate and print AUC for the target
    auc_score = roc_auc_score(y_test[target], y_prob)
    print(f"Gradient Boosting AUC for {target}: {auc_score:.4f}")

# Calculate overall performance metrics
true_labels_flat = np.hstack([y_test[t] for t in targets])
pred_probs_flat = np.hstack([y_prob_df[t] for t in targets])
pred_labels_flat = np.hstack([y_pred_df[t] for t in targets])

accuracy = accuracy_score(true_labels_flat, pred_labels_flat)
auc_micro = roc_auc_score(true_labels_flat, pred_probs_flat)
auc_macro = np.mean([roc_auc_score(y_test[t], y_prob_df[t]) for t in targets])

# Print overall results
print("---Apply Best Hyper Param for Test Set---")
print(f"Overall Test Accuracy: {accuracy:.4f}")
print(f"AUC Scores for Each Target:")
for target in targets:
    print(f"{target}: {roc_auc_score(y_test[target], y_prob_df[target]):.4f}")


### ANN - 1 hidden layer


### ANN - 3 layers

###  Proposed - Multitask learning with Autoencoder