# Environment setting

In [None]:
# Execution-environment switch: Config reads this flag to choose between the
# Kaggle input paths (truthy) and the local ../data paths (falsy).
kaggle = True

In [None]:
# Offline install: --no-index keeps pip away from PyPI and --find-links points it
# at the package files stored in the attached download-autogluon dataset.
!pip install -q autogluon --no-index --find-links=file:///kaggle/input/download-autogluon

In [None]:
# Install lifelines 0.30.0 from local files in the pip-install-lifelines dataset.
# The four packages installed first are presumably its dependencies (TODO confirm);
# lifelines itself is installed last.
!pip install -q /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
!pip install -q /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
!pip install -q /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
!pip install -q /kaggle/input/pip-install-lifelines/formulaic-1.0.2-py3-none-any.whl
!pip install -q /kaggle/input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl

## Autogluon W/O FE

In [None]:
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
import polars as pl
pd.options.display.max_columns = None
warnings.filterwarnings('ignore')
from joblib import dump, load

# lifelines
from lifelines import CoxPHFitter
from lifelines import KaplanMeierFitter
from lifelines import NelsonAalenFitter
from lifelines import BreslowFlemingHarringtonFitter

# for models
import lightgbm as lgb
from scipy.stats import rankdata 
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import PowerTransformer

"""
To evaluate the equitable prediction of transplant survival outcomes,
we use the concordance index (C-index) between a series of event
times and a predicted score across each race group.
 
It represents the global assessment of the model discrimination power:
this is the model’s ability to correctly provide a reliable ranking
of the survival times based on the individual risk scores.
 
The concordance index is a value between 0 and 1 where:
 
0.5 is the expected result from random predictions,
1.0 is perfect concordance (with no censoring, otherwise <1.0),
0.0 is perfect anti-concordance (with no censoring, otherwise >0.0)

"""

import pandas as pd
import pandas.api.types
import numpy as np
from lifelines.utils import concordance_index
from colorama import Fore, Back, Style

class ParticipantVisibleError(Exception):
    """Raised by `score` when a submission column is not numeric."""


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Stratified concordance index: mean minus standard deviation of the per-race-group C-index.

    Parameters
    ----------
    solution : DataFrame holding `row_id_column_name`, 'efs', 'efs_time' and 'race_group'.
    submission : DataFrame holding `row_id_column_name` and a numeric 'prediction' column.
    row_id_column_name : name of the ID column present in both frames.

    Returns
    -------
    float : mean(per-group C-index) - sqrt(var(per-group C-index)).

    Raises
    ------
    ParticipantVisibleError : if any non-ID submission column is not numeric.
    KeyError : if `row_id_column_name` is missing from either frame.

    The inputs are not modified; the ID column is dropped on local copies only.

    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> y_pred = {'prediction': {0: 1.0, 1: 0.0, 2: 1.0}}
    >>> y_pred = pd.DataFrame(y_pred)
    >>> y_pred.insert(0, row_id_column_name, range(len(y_pred)))
    >>> y_true = { 'efs': {0: 1.0, 1: 0.0, 2: 0.0}, 'efs_time': {0: 25.1234,1: 250.1234,2: 2500.1234}, 'race_group': {0: 'race_group_1', 1: 'race_group_1', 2: 'race_group_1'}}
    >>> y_true = pd.DataFrame(y_true)
    >>> y_true.insert(0, row_id_column_name, range(len(y_true)))
    >>> score(y_true.copy(), y_pred.copy(), row_id_column_name)
    0.75
    """

    # Drop the ID column without touching the caller's frames (the original used
    # `del`, which silently removed the column from the objects passed in).
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    event_label = 'efs'
    interval_label = 'efs_time'
    prediction_label = 'prediction'
    for col in submission.columns:
        if not pandas.api.types.is_numeric_dtype(submission[col]):
            raise ParticipantVisibleError(f'Submission column {col} must be a number')
    # Column-wise concat aligns the two frames on their index.
    merged_df = pd.concat([solution, submission], axis=1)
    # After reset_index the index labels equal row positions, which is what makes
    # the label -> iloc lookup below valid.
    merged_df = merged_df.reset_index()
    merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)
    metric_list = []
    for race in merged_df_race_dict.keys():
        indices = sorted(merged_df_race_dict[race])
        merged_df_race = merged_df.iloc[indices]
        # The prediction is negated before being passed as the score argument.
        c_index_race = concordance_index(
                        merged_df_race[interval_label],
                        -merged_df_race[prediction_label],
                        merged_df_race[event_label])
        metric_list.append(c_index_race)
    return float(np.mean(metric_list)-np.sqrt(np.var(metric_list)))


In [None]:
class Config:
    """Static settings container: input paths, CV/training constants and model hyper-parameters."""

    # Input locations: Kaggle dataset mount when `kaggle` is truthy, local ../data otherwise.
    if kaggle:
        train_path = Path('/kaggle/input/equity-post-HCT-survival-predictions/train.csv')
        test_path = Path('/kaggle/input/equity-post-HCT-survival-predictions/test.csv')
        subm_path = Path('/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv')
    else:
        train_path = '../data/train.csv'
        test_path = '../data/test.csv'
        subm_path = '../data/sample_submission.csv'

    # NOTE(review): `early_stopping_round` and `early_stop` hold the same value; both are kept.
    early_stopping_round = 300

    batch_size = 32768  # passed to FeatureEngineer, which forwards it to pl.read_csv
    early_stop = 300
    penalizer = 0.01
    n_splits = 10
    seed = 42

    # Eight weights; presumably ensemble blend weights — usage is outside this view.
    weights = [1.0, 1.0, 8.0, 4.0, 8.0, 4.0, 6.0, 6.0]

    # CatBoost, RMSE loss.
    ctb_params = dict(
        loss_function='RMSE',
        learning_rate=0.03,
        random_state=42,
        task_type='CPU',
        num_trees=6000,
        subsample=0.85,
        reg_lambda=8.0,
        depth=8,
    )

    # LightGBM, plain regression objective.
    lgb_params = dict(
        objective='regression',
        min_child_samples=32,
        num_iterations=6000,
        learning_rate=0.03,
        extra_trees=True,
        reg_lambda=8.0,
        reg_alpha=0.1,
        num_leaves=64,
        metric='rmse',
        max_depth=8,
        device='cpu',
        max_bin=128,
        verbose=-1,
        seed=42,
    )

    # XGBoost regression; the only parameter set here that targets CUDA.
    xgb_params = dict(
        device='cuda',
        max_depth=3,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=5000,
        learning_rate=0.03,
        enable_categorical=True,
        min_child_weight=80,
        early_stopping_rounds=300,
        tree_method="hist",
        objective='reg:squaredlogerror',
        eval_metric='rmse',
    )

    # First CatBoost model with the Cox loss (depth-wise growth).
    cox1_params = dict(
        grow_policy='Depthwise',
        min_child_samples=8,
        loss_function='Cox',
        learning_rate=0.03,
        random_state=42,
        task_type='CPU',
        num_trees=6000,
        subsample=0.85,
        reg_lambda=8.0,
        depth=8,
    )

    # Second CatBoost model with the Cox loss (loss-guided growth).
    cox2_params = dict(
        grow_policy='Lossguide',
        loss_function='Cox',
        learning_rate=0.03,
        random_state=42,
        task_type='CPU',
        num_trees=6000,
        subsample=0.85,
        reg_lambda=8.0,
        num_leaves=32,
        depth=8,
    )

    # XGBoost with the Cox survival objective.
    xgb_cox_params = dict(
        max_depth=3,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.02,
        enable_categorical=True,
        min_child_weight=80,
        objective='survival:cox',
        eval_metric='cox-nloglik',
    )

    # LightGBM with the Tweedie objective; otherwise identical to lgb_params.
    lgb_tw_params = dict(
        objective='tweedie',
        min_child_samples=32,
        num_iterations=6000,
        learning_rate=0.03,
        extra_trees=True,
        reg_lambda=8.0,
        reg_alpha=0.1,
        num_leaves=64,
        metric='rmse',
        max_depth=8,
        device='cpu',
        max_bin=128,
        verbose=-1,
        seed=42,
    )

In [None]:
class FeatureEngineer:
    """Load a competition CSV with Polars, normalise dtypes, rebuild the HLA
    aggregate columns, then add a few features on the pandas side."""

    def __init__(self, batch_size):
        # Forwarded to pl.read_csv by load_data.
        self._batch_size = batch_size

    def load_data(self, path):
        """Read the CSV at `path` into a Polars DataFrame."""
        frame = pl.read_csv(path, batch_size=self._batch_size)
        return frame

    def cast_datatypes(self, df):
        """Cast the known numeric columns to Float32 (nulls kept), every other
        column to String with nulls replaced by 'Unknown', then cast 'ID' to Int32."""
        numeric_names = [
            'hla_high_res_8', 'hla_low_res_8', 'hla_high_res_6', 'hla_low_res_6',
            'hla_high_res_10', 'hla_low_res_10', 'hla_match_dqb1_high',
            'hla_match_dqb1_low', 'hla_match_drb1_high', 'hla_match_drb1_low',
            'hla_nmdp_6', 'year_hct', 'hla_match_a_high', 'hla_match_a_low',
            'hla_match_b_high', 'hla_match_b_low', 'hla_match_c_high',
            'hla_match_c_low', 'donor_age', 'age_at_hct', 'comorbidity_score',
            'karnofsky_score', 'efs', 'efs_time',
        ]

        for name in df.columns:
            column = pl.col(name)
            if name in numeric_names:
                converted = column.cast(pl.Float32)
            else:
                converted = column.fill_null('Unknown').cast(pl.String)
            df = df.with_columns(converted)

        # 'ID' went through the String branch above; bring it back to an integer.
        id_as_int = pl.col('ID').cast(pl.Int32)
        return df.with_columns(id_as_int)

    def recalculate_hla_sums(self, df):
        """Overwrite the HLA aggregate columns with the sum of their component
        match columns, counting a null component as 0."""

        def component(name):
            return pl.col(name).fill_null(0)

        def summed(names):
            # Left-to-right addition, same evaluation order as a + b + c + ...
            total = component(names[0])
            for name in names[1:]:
                total = total + component(name)
            return total

        recipes = [
            ("hla_nmdp_6", ["hla_match_a_low", "hla_match_b_low", "hla_match_drb1_high"]),
            ("hla_low_res_6", ["hla_match_a_low", "hla_match_b_low", "hla_match_drb1_low"]),
            ("hla_high_res_6", ["hla_match_a_high", "hla_match_b_high", "hla_match_drb1_high"]),
            ("hla_low_res_8", ["hla_match_a_low", "hla_match_b_low",
                               "hla_match_c_low", "hla_match_drb1_low"]),
            ("hla_high_res_8", ["hla_match_a_high", "hla_match_b_high",
                                "hla_match_c_high", "hla_match_drb1_high"]),
            ("hla_low_res_10", ["hla_match_a_low", "hla_match_b_low", "hla_match_c_low",
                                "hla_match_drb1_low", "hla_match_dqb1_low"]),
            ("hla_high_res_10", ["hla_match_a_high", "hla_match_b_high", "hla_match_c_high",
                                 "hla_match_drb1_high", "hla_match_dqb1_high"]),
        ]
        expressions = [summed(parts).alias(target) for target, parts in recipes]
        return df.with_columns(*expressions)

    def numeric_fe(self, df):
        """Add donor-vs-recipient age features to the pandas frame in place and return it."""
        donor = df['donor_age']
        recipient = df['age_at_hct']
        df['age_diff'] = donor - recipient
        df['age_ratio'] = donor / recipient
        return df

    def cat_fe(self, df):
        """Placeholder for categorical features; currently returns `df` unchanged."""
        return df

    def select_features(self, df):
        """Keep only the columns of `df` that appear in the base or selected feature lists.

        The result's column order comes from a set intersection and is therefore not fixed.
        """
        base_features = ['ID', 'dri_score', 'psych_disturb', 'cyto_score', 'diabetes',
            'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia',
            'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue',
            'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status',
            'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6',
            'hla_match_c_low', 'rituximab', 'hla_match_drb1_low',
            'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail',
            'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct',
            'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe',
            'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer',
            'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue',
            'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score',
            'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related',
            'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high',
            'pulm_moderate', 'hla_low_res_10', 'efs', 'efs_time']
        selected_features = ['older_donor', 'total_null_count', 'null_count_diff', 'age_ratio']

        wanted = set(base_features + selected_features)
        features = list(set(df.columns) & wanted)
        return df[features]

    def info(self, df):
        """Print the shape and memory footprint of `df` and display its first rows."""
        print(f'\nShape of dataframe: {df.shape}')

        megabytes = df.memory_usage().sum() / 1024**2
        print('Memory usage: {:.2f} MB\n'.format(megabytes))

        display(df.head())

    def apply_fe(self, path):
        """Run the full pipeline on the CSV at `path`.

        Returns the pandas DataFrame and the list of columns detected as categorical.
        """
        polars_frame = self.recalculate_hla_sums(self.cast_datatypes(self.load_data(path)))
        df = polars_frame.to_pandas()
        df = self.cat_fe(df)
        df = self.numeric_fe(df)

        self.info(df)

        # NOTE(review): this compares a pandas dtype against a Polars dtype class;
        # confirm it really selects the string columns after to_pandas().
        cat_cols = []
        for col in df.columns:
            if df[col].dtype == pl.String:
                cat_cols.append(col)
        print(cat_cols)

        return df, cat_cols

In [None]:
# Build the train/test frames for the "without FE" AutoGluon predictors.
# `cat_cols` is assigned twice; the value kept is the one computed from the test file.
feature_engineer = FeatureEngineer(Config.batch_size)
train_data, cat_cols = feature_engineer.apply_fe(Config.train_path)
test_data, cat_cols = feature_engineer.apply_fe(Config.test_path)

In [None]:
# AutoGluon may be unavailable in some environments; the test-time inference cell
# below only runs when this flag is True.
use_autogluon = True

In [None]:
# Load the pre-trained AutoGluon predictors and their out-of-fold predictions.
# Everything is gated on `use_autogluon` — the same flag that guards test-time
# inference — so that setting the flag to False really skips AutoGluon instead of
# failing on the import or on the model loads.
if use_autogluon:
    from autogluon.tabular import TabularPredictor

    # require_version_match=False: load even if the saved models were produced by a
    # different AutoGluon version than the one installed here.
    # autogluon_predictor = TabularPredictor.load('/kaggle/input/cibmtr-autogluon-1-2-2/autogluon_2',require_version_match=False)
    autogluon_1_predictor = TabularPredictor.load('/kaggle/input/cibmtr-autogluon-1/autogluon_1',require_version_match=False)
    autogluon_2_predictor = TabularPredictor.load('/kaggle/input/cibmtr-autogluon-2/autogluon_2',require_version_match=False)
    autogluon_3_predictor = TabularPredictor.load('/kaggle/input/cibmtr-autogluon-3/autogluon_3',require_version_match=False)
    autogluon_4_predictor = TabularPredictor.load('/kaggle/input/cibmtr-autogluon-4/autogluon_4',require_version_match=False)
    autogluon_5_predictor = TabularPredictor.load('/kaggle/input/cibmtr-autogluon-5/autogluon_5',require_version_match=False)
    autogluon_6_predictor = TabularPredictor.load('/kaggle/input/cibmtr-autogluon-6/autogluon_6',require_version_match=False)
    autogluon_7_predictor = TabularPredictor.load('/kaggle/input/cibmtr-autogluon-7/autogluon_7',require_version_match=False)
    autogluon_8_predictor = TabularPredictor.load('/kaggle/input/cibmtr-autogluon-8/autogluon_8',require_version_match=False)
    autogluon_cls_predictor = TabularPredictor.load('/kaggle/input/cibmtr-autogluon-classification/autogluon_cls',require_version_match=False)

    # Alternative predictors, currently disabled:
    # autogluon_kaplan_predictor = TabularPredictor.load('/kaggle/input/autogluon-kaplan-no-fe/autogluon_Kaplan_No_FE',require_version_match=False)
    # autogluon_nelson_predictor = TabularPredictor.load('/kaggle/input/cibmtr-autogluon-nelson-no-fe/autogluon_Nelson_No_FE',require_version_match=False)
    # autogluon_bfh_predictor = TabularPredictor.load('/kaggle/input/autogluon-bfh-no-fe-models/autogluon_BreslowFleming-Harrington_No_FE',require_version_match=False)
    # autogluon_quantile_predictor = TabularPredictor.load('/kaggle/input/cibmtr-autogluon-quantile-no-fe/autogluon_quantile_No_FE',require_version_match=False)
    # autogluon_ranklog_predictor = TabularPredictor.load('/kaggle/input/cibmtr-autogluon-ranklog-no-fe/autogluon_ranklog_No_FE',require_version_match=False)
    # autogluon_cls_predictor = TabularPredictor.load('/kaggle/input/autogluon-efs-binary-classification/autogluon_classification_No_FE/autogluon_classification_No_FE',require_version_match=False)

    # Out-of-fold predictions stored with each predictor; the classifier returns
    # class probabilities instead of point predictions.
    autogluon_1_oof_preds = autogluon_1_predictor.predict_oof()
    autogluon_2_oof_preds = autogluon_2_predictor.predict_oof()
    autogluon_3_oof_preds = autogluon_3_predictor.predict_oof()
    autogluon_4_oof_preds = autogluon_4_predictor.predict_oof()
    autogluon_5_oof_preds = autogluon_5_predictor.predict_oof()
    autogluon_6_oof_preds = autogluon_6_predictor.predict_oof()
    autogluon_7_oof_preds = autogluon_7_predictor.predict_oof()
    autogluon_8_oof_preds = autogluon_8_predictor.predict_oof()
    autogluon_cls_oof_preds = autogluon_cls_predictor.predict_proba_oof()
    # autogluon_nelson_oof_preds = autogluon_nelson_predictor.predict_oof()
    # autogluon_bfh_oof_preds = autogluon_bfh_predictor.predict_oof()
    # autogluon_quantile_oof_preds = autogluon_quantile_predictor.predict_oof()
    # autogluon_ranklog_oof_preds = autogluon_ranklog_predictor.predict_oof()
    # autogluon_cls_oof_preds = autogluon_cls_predictor.predict_proba_oof()

In [None]:
%%time
# Test-time inference with every loaded predictor on the frame produced by
# FeatureEngineer; skipped entirely when `use_autogluon` is False.
if use_autogluon:
    autogluon_1_preds = autogluon_1_predictor.predict(test_data)
    autogluon_2_preds = autogluon_2_predictor.predict(test_data)
    autogluon_3_preds = autogluon_3_predictor.predict(test_data)
    autogluon_4_preds = autogluon_4_predictor.predict(test_data)
    autogluon_5_preds = autogluon_5_predictor.predict(test_data)
    autogluon_6_preds = autogluon_6_predictor.predict(test_data)
    autogluon_7_preds = autogluon_7_predictor.predict(test_data)
    autogluon_8_preds = autogluon_8_predictor.predict(test_data)
    # The classifier yields class probabilities rather than point predictions.
    autogluon_cls_preds = autogluon_cls_predictor.predict_proba(test_data)
# autogluon_nelson_preds = autogluon_nelson_predictor.predict(test)
# autogluon_bfh_preds = autogluon_bfh_predictor.predict(test)
# autogluon_quantile_preds = autogluon_qantile_predictor.predict(test)
# autogluon_ranklog_preds = autogluon_ranklog_predictor.predict(test)
# autogluon_cls_preds = autogluon_cls_predictor.predict(test)

## ZDH AutoGluon

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np, pandas as pd
import matplotlib.pyplot as plt
# Raise the pandas display limits so wide and long frames are shown in full.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# NOTE(review): a `score` function is already defined earlier in this notebook;
# this import rebinds the name and needs a `metric` module to be importable —
# confirm it is still required.
from metric import score

# Second pipeline: the raw competition files are re-read with pandas into
# `train` / `test`, separate from the `train_data` / `test_data` frames above.
test = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/test.csv")
print("Test shape:", test.shape )

train = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/train.csv")
print("Train shape:",train.shape)
train.head()

In [None]:
def recalculate_hla_sums(df):
    """Return a copy of `df` whose HLA aggregate columns are recomputed as the sum
    of their component match columns, with a missing component counted as 0."""
    out = df.copy()

    # target column -> component columns, in the order they are added up
    recipes = (
        ('hla_nmdp_6', ('hla_match_a_low', 'hla_match_b_low', 'hla_match_drb1_high')),
        ('hla_low_res_6', ('hla_match_a_low', 'hla_match_b_low', 'hla_match_drb1_low')),
        ('hla_high_res_6', ('hla_match_a_high', 'hla_match_b_high', 'hla_match_drb1_high')),
        ('hla_low_res_8', ('hla_match_a_low', 'hla_match_b_low', 'hla_match_c_low',
                           'hla_match_drb1_low')),
        ('hla_high_res_8', ('hla_match_a_high', 'hla_match_b_high', 'hla_match_c_high',
                            'hla_match_drb1_high')),
        ('hla_low_res_10', ('hla_match_a_low', 'hla_match_b_low', 'hla_match_c_low',
                            'hla_match_drb1_low', 'hla_match_dqb1_low')),
        ('hla_high_res_10', ('hla_match_a_high', 'hla_match_b_high', 'hla_match_c_high',
                             'hla_match_drb1_high', 'hla_match_dqb1_high')),
    )
    for target, parts in recipes:
        running = out[parts[0]].fillna(0)
        for part in parts[1:]:
            running = running + out[part].fillna(0)
        out[target] = running
    return out
# Rebind both frames to copies with the HLA aggregate columns recomputed.
train = recalculate_hla_sums(train)
test = recalculate_hla_sums(test)

In [None]:
# Identifier / target columns that are excluded from FEATURES.
RMV = ["ID","efs","efs_time","target"]
FEATURES = [c for c in train.columns if not c in RMV]
CAT_COLS = []
NUM_COLS = []

# Split the features by the dtype pandas inferred for the test frame:
# object columns are treated as categorical, everything else as numeric.
for c in FEATURES:
    if test[c].dtype == "object":
        CAT_COLS.append(c)
    else:
        NUM_COLS.append(c)
print(f"In these features, there are {len(CAT_COLS)} CATEGORICAL FEATURES: {CAT_COLS}")

def update(df):
    """Normalise `df` in place using the module-level CAT_COLS / NUM_COLS lists.

    Categorical columns are stringified and cast to category, then stripped of
    the characters , [ ] { } : " \\ <. Numeric float64/int64 columns get their
    missing values filled with 0 and are downcast to 32 bits. Returns `df`.

    NOTE(review): `astype(str)` runs before `fillna("NaN")`, so missing values
    have already become strings when fillna runs — confirm this is intended.
    """
    global CAT_COLS
    for name in CAT_COLS:
        as_text = df[name].astype(str)
        df[name] = as_text.fillna("NaN").astype("category")

    for name in NUM_COLS:
        kind = df[name].dtype
        if kind == "float64":
            df[name] = df[name].fillna(0).astype("float32")
        elif kind == "int64":
            df[name] = df[name].fillna(0).astype("int32")

    j_ch = ',[]{}:"\\<'
    # Columns are independent, so each one is stripped of every character in turn.
    for name in CAT_COLS:
        for ch in j_ch:
            df[name] = df[name].apply(lambda x: str(x).replace(ch, ""))
    return df
# `update` modifies the frame it is given and returns that same frame.
train = update(train)
test = update(test)

In [None]:
from sklearn.model_selection import StratifiedKFold
from lifelines import KaplanMeierFitter, CoxPHFitter, NelsonAalenFitter
from sklearn.preprocessing import OneHotEncoder, quantile_transform, FunctionTransformer, PolynomialFeatures, StandardScaler
# Module-level fold splitter; update_target_with_survival_probabilities reads `skf`
# directly and stratifies on race_group.
n_splits = 10
seed = 42
skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)

from sklearn.decomposition import NMF

def scale_with_efs_miya(efs):
    """Return 1 / log(efs + 3.5); works element-wise on arrays."""
    log_term = np.log(efs + 3.5)
    return 1 / log_term ** 1

def shift_points(points, efs):
    """Translate the rows with efs == 1 by the offset between the two class centroids.

    Parameters
    ----------
    points : array-like, converted with np.array; indexed with a boolean mask on
        its first axis and broadcast against `efs[:, None]`, so a 2-D
        (n_samples, n_dims) layout is expected.
    efs : array-like of 0/1 event indicators, one per row of `points`.

    Returns
    -------
    (shifted_points, center_efs0, center_efs1) where the centers are the
    per-class means and shifted_points = points + efs[:, None] * (center_efs1 - center_efs0).

    The unused `scaling_factors` computation of the earlier version was removed;
    it never influenced the result and only tied this function to
    `scale_with_efs_miya`.
    """
    points = np.array(points)
    efs = np.array(efs)
    center_efs0 = points[efs == 0].mean(axis=0)
    center_efs1 = points[efs == 1].mean(axis=0)
    shift_vector = center_efs1 - center_efs0
    # Rows with efs == 0 are multiplied by 0 and therefore stay where they are.
    shifted_points = points + efs[:, None] * shift_vector
    return shifted_points, center_efs0, center_efs1

def transform_quantile(time, event):
    """Build a target that spreads eventful efs_times over the quantile scale and
    collapses all event-free rows onto one value below that scale.

    Rows with event == 1 get the quantile transform of the negated time; rows
    with event == 0 get (minimum transformed value - 0.3). Rows with any other
    event value stay NaN.

    From https://www.kaggle.com/code/ambrosm/esp-eda-which-makes-sense"""
    had_event = event == 1
    censored = event == 0

    negated_times = (-time[had_event].values).reshape(-1, 1)
    event_scores = quantile_transform(negated_times).ravel()

    result = np.full(len(time), np.nan)
    result[had_event] = event_scores
    result[censored] = event_scores.min() - 0.3
    return result

def update_target_with_survival_probabilities(df, method="kaplan", time_col='efs_time', event_col='efs'):
    """Add an out-of-fold 'target' column to `df` (in place) and return `df`.

    For every split of the module-level `skf` (stratified on 'race_group') an
    estimator is built from the training part and evaluated on the validation part:

    - "kaplan":   Kaplan-Meier survival function at each validation time;
                  afterwards 0.15 is subtracted for rows without an event.
    - "nelson":   negated Nelson-Aalen cumulative hazard; afterwards rows without
                  an event are replaced by -sqrt(cumulative hazard).
    - "cox":      partial hazard of a penalised CoxPHFitter on one-hot encoded
                  features (uses the module-level CAT_COLS).
    - "quantile": `transform_quantile` applied to the validation rows of the fold.

    Parameters
    ----------
    df : DataFrame with `time_col`, `event_col`, 'race_group' (and 'ID' for "cox").
    method : one of "kaplan", "nelson", "cox", "quantile".
    time_col, event_col : names of the duration and event-indicator columns.

    Raises
    ------
    ValueError : if `method` is not one of the supported names (previously this
        silently produced an all-zero target).
    """
    supported = ("kaplan", "nelson", "cox", "quantile")
    if method not in supported:
        raise ValueError(f"Unknown method {method!r}; expected one of {supported}")

    res = np.zeros(df.shape[0])
    for train_idx, val_idx in skf.split(df, df["race_group"]):
        X_trn, X_val = df.iloc[train_idx], df.iloc[val_idx]
        if method == "kaplan":
            kmf = KaplanMeierFitter()
            kmf.fit(durations = X_trn[time_col], event_observed = X_trn[event_col])
            res[val_idx] =  kmf.survival_function_at_times(X_val[time_col]).values
        elif method == "nelson":
            naf = NelsonAalenFitter()
            naf.fit(durations=X_trn[time_col], event_observed=X_trn[event_col])
            res[val_idx] = -naf.cumulative_hazard_at_times(X_val[time_col]).values
        elif method == "cox":
            data_trn = pd.get_dummies(X_trn, columns=CAT_COLS, drop_first=True).drop("ID",axis=1)
            data_val = pd.get_dummies(X_val, columns=CAT_COLS, drop_first=True).drop("ID",axis=1)
            # Drop constant columns if they exist
            fit_frame = data_trn.loc[:, data_trn.nunique() > 1]
            # A category absent from the validation fold yields no dummy column
            # there; reindex supplies it as all-zero instead of raising KeyError.
            valid_frame = data_val.reindex(columns=fit_frame.columns, fill_value=0)
            cph = CoxPHFitter(penalizer=0.01)
            cph.fit(fit_frame, duration_col=time_col, event_col=event_col)
            res[val_idx] = cph.predict_partial_hazard(valid_frame).values
        elif method == 'quantile':
            # Honour the caller-supplied column names rather than fixed 'efs_time' / 'efs'.
            y_tr = transform_quantile(time=X_val[time_col], event=X_val[event_col])
            res[val_idx] = y_tr
    df['target'] = res
    if method == "kaplan":
        df.loc[df[event_col] == 0, 'target'] -= 0.15
    if method == "nelson":
        no_event = df[event_col] == 0
        df.loc[no_event, "target"] = (-(-df.loc[no_event, "target"])**0.5)
    return df
# Adds the 'target' column to train using the "quantile" method.
train = update_target_with_survival_probabilities(train, method="quantile", time_col='efs_time', event_col='efs')

In [None]:
def collapse_year(x):
    """Map the year 2020 to 2019; return every other value unchanged."""
    return 2019 if x == 2020 else x

def cat_fe(df):
    """Return a copy of `df` with binned and combined categorical features added.

    New columns, in order: KPS_Bin, Comorbidity_Bin, Combined_Bin, age_bin,
    donor_sex, recipient_sex, tbi_status+gvhd_proph.

    NOTE(review): the KPS and comorbidity bins are left-closed (right=False), so a
    karnofsky_score of exactly 100 falls outside the last bin and becomes NaN —
    confirm this is intended.
    """
    out = df.copy()

    kps_bin = pd.cut(
        out["karnofsky_score"],
        bins=[0, 10, 50, 80, 100],
        labels=["Critical", "Severely_Dependent", "Partially_Independent", "Healthy"],
        right=False,
    )
    out["KPS_Bin"] = kps_bin

    comorbidity_bin = pd.cut(
        out["comorbidity_score"],
        bins=[0, 2, 5, np.inf],
        labels=["Low", "Medium", "High"],
        right=False,
    )
    out["Comorbidity_Bin"] = comorbidity_bin

    # Cross of the two bins as a plain string.
    out["Combined_Bin"] = kps_bin.astype(str) + "_" + comorbidity_bin.astype(str)

    age_edges = [0, 0.0441, 16, 30, 50, 100]
    out["age_bin"] = pd.cut(out["age_at_hct"], bins=age_edges)

    # sex_match is split on '-' into its two halves.
    sex_parts = out["sex_match"].str.split('-', expand=True)
    out[["donor_sex", "recipient_sex"]] = sex_parts

    tbi_text = out["tbi_status"].astype(str)
    gvhd_text = out["gvhd_proph"].astype(str)
    out["tbi_status+gvhd_proph"] = tbi_text + '_' + gvhd_text
    return out

# cat_fe returns a new frame; both names are rebound to the enriched copies.
train = cat_fe(train)
test = cat_fe(test)

In [None]:
# gaps = [1,2,3]
# Locus-level HLA match columns (A, B, C) at high and low resolution.
# NOTE(review): neither list is referenced in the cells visible here — confirm they are still needed.
hla_high_cols = ['hla_match_a_high', 'hla_match_b_high', 'hla_match_c_high']
hla_low_cols = ['hla_match_a_low', 'hla_match_b_low', 'hla_match_c_low']
def numeric_fe(df):
    """Add numeric interaction features to `df` in place and return it.

    Needs the donor_sex / recipient_sex columns created by `cat_fe`.
    """
    # 2020 is folded into 2019 before any year-based feature is derived.
    df['year_hct'] = df['year_hct'].apply(collapse_year)

    donor_age = df["donor_age"]
    recipient_age = df["age_at_hct"]
    df["age_diff"] = donor_age - recipient_age
    df["age_ratio"] = donor_age / recipient_age
    df["age_ts"] = recipient_age / donor_age

    kps = df["karnofsky_score"]
    comorbidity = df["comorbidity_score"]
    df["KPS_Minus_Comorbidity"] = kps - comorbidity
    df["KPS_Comorbidity_sum"] = kps + comorbidity
    df["KPS_Multi_Comorbidity"] = kps * comorbidity

    df["years_since_2000"] = df['year_hct'] - 2000

    same_sex = df["donor_sex"] == df["recipient_sex"]
    df["sex_match_is_same"] = same_sex.astype(int)
    return df
# numeric_fe adds its columns to the frame it receives and returns that same frame.
train = numeric_fe(train)
test = numeric_fe(test)

In [None]:
from sklearn.cluster import KMeans
def create_kmeans_features(train, test, n_clusters=8, cat_cols=None, num_cols=None, seed=42):
    """Fit KMeans on the training features and add a 'kmeans_cluster' column to both frames.

    Parameters
    ----------
    train, test : DataFrames; both receive the new column in place.
    n_clusters : number of KMeans clusters.
    cat_cols : categorical columns, one-hot encoded (first level dropped) before
        clustering. None is treated as an empty list.
    num_cols : numeric columns passed to KMeans as they are. None is treated as
        an empty list.
    seed : random_state for KMeans.

    Returns
    -------
    (train, test) — the same objects that were passed in.

    The test encoding is re-indexed to the training columns, so categories seen
    only in test are dropped and categories missing from test become all-zero.
    """
    # Fix: the earlier version assigned to a misspelled `cal_cols`, leaving
    # `cat_cols` as None and making the concatenation below raise TypeError.
    cat_cols = [] if cat_cols is None else list(cat_cols)
    num_cols = [] if num_cols is None else list(num_cols)
    cols = cat_cols + num_cols

    train_encoded = pd.get_dummies(train[cols], columns=cat_cols, drop_first=True)
    test_encoded  = pd.get_dummies(test[cols],  columns=cat_cols, drop_first=True)
    test_encoded  = test_encoded.reindex(columns=train_encoded.columns,fill_value=0)

    kmeans = KMeans(n_clusters=n_clusters, random_state=seed)
    train['kmeans_cluster'] = kmeans.fit_predict(train_encoded)
    test['kmeans_cluster'] = kmeans.predict(test_encoded)

    return train, test

# Clusters on the original feature split (CAT_COLS / NUM_COLS) with 5 clusters.
train, test = create_kmeans_features(train, test,n_clusters=5, cat_cols=CAT_COLS, num_cols=NUM_COLS)

In [None]:
# new FE version.3 ( for usage within the CV loop )

def race_group_white_FE(df, disease_rank_df=None, conditioning_rank_df=None):
    """Add frequency-rank features derived from the rows with race_group == "White".

    When a rank table is not supplied it is built from `df`: the values of
    prim_disease_hct (resp. conditioning_intensity) among "White" rows are
    dense-ranked by descending count. Supplying tables built on the training
    data applies the same ranks to another frame.

    Values missing from a rank table receive (max rank + 1). Six interaction
    features with comorbidity_score, karnofsky_score and donor_age are then added.

    Returns (new_df, disease_rank_df, conditioning_rank_df); `df` itself is not modified.
    """

    def build_rank_table(frame, key, rank_col):
        # Count each value among "White" rows; the most frequent value gets rank 1.
        white_rows = frame[frame['race_group'] == "White"]
        table = white_rows[key].value_counts().reset_index()
        table.columns = [key, 'count']
        table[rank_col] = table['count'].rank(method='dense', ascending=False).astype(int)
        return table

    def attach_rank(frame, table, key, rank_col):
        # Left-join the rank; unseen values fall back to one past the worst rank.
        merged = frame.merge(table[[key, rank_col]], on=key, how='left')
        worst = table[rank_col].max() if not table.empty else 0
        merged[rank_col] = merged[rank_col].fillna(worst + 1).astype(int)
        return merged

    if disease_rank_df is None:
        disease_rank_df = build_rank_table(df, 'prim_disease_hct', 'White_disease_rank')
    if conditioning_rank_df is None:
        conditioning_rank_df = build_rank_table(df, 'conditioning_intensity', 'White_conditioning_rank')

    df = attach_rank(df, disease_rank_df, 'prim_disease_hct', 'White_disease_rank')
    df = attach_rank(df, conditioning_rank_df, 'conditioning_intensity', 'White_conditioning_rank')

    # Interaction features: WDR = White disease rank, WCR = White conditioning rank.
    for tag, rank_col in (('WDR', 'White_disease_rank'), ('WCR', 'White_conditioning_rank')):
        rank = df[rank_col]
        df[f'comorbidity_score*{tag}'] = df['comorbidity_score'] * rank
        df[f'karnofsky_score/{tag}'] = df['karnofsky_score'] / rank
        df[f'donor_age*{tag}'] = df['donor_age'] * rank

    return df, disease_rank_df, conditioning_rank_df

# Trainデータでランクを作成
# Build the rank tables from the training data.
train, disease_rank_df, conditioning_rank_df = race_group_white_FE(train)

# Apply the training-derived ranks to the test data.
test, _, _ =race_group_white_FE(test, disease_rank_df, conditioning_rank_df)

In [None]:
def dfrank(newdf: pd.DataFrame):
    """Append normalised rank features for a fixed set of columns.

    For each of donor_age, age_at_hct, prim_disease_hct and year_hct that is
    present (visited in the frame's own column order), two columns are appended:

    - `<col>_rank`:   descending rank divided by the row count
    - `<col>_rerank`: ascending rank divided by the row count

    Ties take the highest rank of their group (method="max") and missing values
    are ranked last in both directions (na_option='bottom').
    """
    num_cols = ['donor_age', 'age_at_hct','prim_disease_hct', 'year_hct']
    n_rows = len(newdf)

    extras = []
    for name in newdf.columns:
        if name not in num_cols:
            continue
        source = newdf[str(name)]
        for suffix, ascending in (("rank", False), ("rerank", True)):
            ranked = source.rank(method="max", ascending=ascending, na_option='bottom') / n_rows
            extras.append(ranked.rename(f"{str(name)}_{suffix}"))

    # Nothing to rank: hand back the input object itself.
    if not extras:
        return newdf
    return pd.concat([newdf, *extras], axis=1)
# Ranks are computed within each frame separately (train ranks over train rows, test over test rows).
train = dfrank(train)
test = dfrank(test)

In [None]:
# Recompute the feature lists now that engineered columns exist. The lower-case
# cat_cols / num_cols are separate from the CAT_COLS / NUM_COLS built earlier.
RMV = ["ID","efs","efs_time","target"]
FEATURES = [c for c in train.columns if not c in RMV]
cat_cols = []
num_cols = []

# Unlike the first split, category-dtype columns also count as categorical here.
for c in FEATURES:
    if test[c].dtype == "object" or test[c].dtype == "category":
        cat_cols.append(c)
    else:
        num_cols.append(c)
print(f"In these features, there are {len(cat_cols)} CATEGORICAL FEATURES: {cat_cols}")

def update(df):
    """Normalise `df` in place using the module-level cat_cols / num_cols lists.

    This redefinition replaces the earlier `update`; the only difference is that
    it reads the lower-case lists. Categorical columns are stringified and cast
    to category, then stripped of the characters , [ ] { } : " \\ <. Numeric
    float64/int64 columns get their missing values filled with 0 and are
    downcast to 32 bits. Returns `df`.
    """
    global cat_cols
    for name in cat_cols:
        as_text = df[name].astype(str)
        df[name] = as_text.fillna("NaN").astype("category")

    for name in num_cols:
        kind = df[name].dtype
        if kind == "float64":
            df[name] = df[name].fillna(0).astype("float32")
        elif kind == "int64":
            df[name] = df[name].fillna(0).astype("int32")

    j_ch = ',[]{}:"\\<'
    # Columns are independent, so each one is stripped of every character in turn.
    for name in cat_cols:
        for ch in j_ch:
            df[name] = df[name].apply(lambda x: str(x).replace(ch, ""))
    return df
# Second normalisation pass, this time over the recomputed cat_cols / num_cols.
train = update(train)
test = update(test)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
def tf_fe(train, test, text_columns, max_features=3000, analyzer='char_wb'):
    """Append TF-IDF features computed from several text columns.

    A separate TfidfVectorizer is fitted on each column of `train` and applied
    to the same column of `test`; the resulting features are named
    tfidf_<column>_<i>.

    :param train: training DataFrame
    :param test: test DataFrame
    :param text_columns: names of the columns to vectorise
    :param max_features: maximum number of TF-IDF features per column
    :param analyzer: TfidfVectorizer analyzer; the default 'char_wb' builds
        character n-grams inside word boundaries
    :return: (train_with_tfidf, test_with_tfidf), each the input frame with the
        new columns concatenated on the right
    """

    def to_frame(matrix, col):
        # Densify the sparse TF-IDF matrix and label its columns.
        names = [f"tfidf_{col}_{i}" for i in range(matrix.shape[1])]
        return pd.DataFrame(matrix.toarray(), columns=names)

    train_blocks = []
    test_blocks = []
    for col in tqdm(text_columns, desc="Processing text columns", unit="col"):
        vectorizer = TfidfVectorizer(analyzer=analyzer, max_features=max_features)
        fitted_train = vectorizer.fit_transform(train[col])
        projected_test = vectorizer.transform(test[col])
        train_blocks.append(to_frame(fitted_train, col))
        test_blocks.append(to_frame(projected_test, col))

    train_with_tfidf = pd.concat([train] + train_blocks, axis=1)
    test_with_tfidf = pd.concat([test] + test_blocks, axis=1)
    return train_with_tfidf, test_with_tfidf
# Columns vectorised with character-level TF-IDF, capped at 1000 features per column.
TF_IDF_COLS = ["conditioning_intensity","dri_score","sex_match"]
train, test = tf_fe(train, test, text_columns=TF_IDF_COLS, max_features=1000)

In [None]:
def year_tf(df):
    """
    Add a cosine/sine encoding of ``year_hct`` (period of 100 years).

    Writes the columns ``cos_year`` and ``sin_year`` into ``df`` in place and
    returns the same frame.
    """
    phase = df['year_hct'] * (2 * np.pi) / 100
    for column, trig in (('cos_year', np.cos), ('sin_year', np.sin)):
        df[column] = trig(phase)
    return df

def FE(df):
    """
    Add ``age_at_hctmin`` = ``year_hct`` - ``age_at_hct``.

    The column is written into ``df`` in place and the same frame is returned.
    """
    df['age_at_hctmin'] = df['year_hct'].sub(df['age_at_hct'])
    return df
# Apply the cyclic year encoding and the year-minus-age feature to both frames.
# Both helpers write into the frame they receive and return it.
train =  year_tf(train)
test  =  year_tf(test)
train = FE(train)
test  = FE(test)

In [None]:
# Columns that are identifiers or targets and must not be used as features.
RMV = ["ID","efs","efs_time","target"]
FEATURES = [c for c in train.columns if not c in RMV]
cat_cols = []
num_cols = []

# Feature type is decided from the dtype in `test`: object/category columns are
# treated as categorical, everything else as numerical.
for c in FEATURES:
    if test[c].dtype == "object" or test[c].dtype == "category":
        cat_cols.append(c)
    else:
        num_cols.append(c)
print(f"In these features, there are {len(cat_cols)} CATEGORICAL FEATURES: {cat_cols}")

In [None]:
# Stack train and test so that every categorical column is encoded with one
# shared set of integer codes.
combined = pd.concat([train,test],axis=0,ignore_index=True)
print("We LABEL ENCODE the CATEGORICAL FEATURES: ",end="")
for c in FEATURES:

    # NOTE(review): the previous cell builds `cat_cols` (lower case); `CAT_COLS`
    # is not defined in this part of the notebook — confirm it exists earlier
    # and is the list intended here.
    if c in CAT_COLS:
        print(f"{c}, ",end="")
        combined[c],_ = combined[c].factorize()
        # Shift the codes so the smallest one (factorize uses -1 for missing) becomes 0.
        combined[c] -= combined[c].min()
        combined[c] = combined[c].astype("int32")
        combined[c] = combined[c].astype("category")
        
    else:
        # Downcast 64-bit numeric columns to 32 bits.
        if combined[c].dtype=="float64":
            combined[c] = combined[c].astype("float32")
        if combined[c].dtype=="int64":
            combined[c] = combined[c].astype("int32")
    
# Split the stacked frame back into its train and test parts; `train` keeps the
# same length after the first assignment, so len(train) is still the boundary.
train = combined.iloc[:len(train)].copy()
test = combined.iloc[len(train):].reset_index(drop=True).copy()

In [None]:
# Fixed list of columns kept for the AutoGluon predictors; the last entry,
# 'target', is the label and is dropped again for the test frame below.
# NOTE(review): several names here (e.g. 'KPS_Bin', 'kmeans_cluster', the *_rank
# columns) are not created in this part of the notebook — presumably they come
# from earlier feature-engineering cells; confirm.
f_fe = ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes',
       'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia',
       'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue',
       'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status',
       'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match',
       'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low',
       'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail',
       'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity',
       'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high',
       'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low',
       'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph',
       'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group',
       'comorbidity_score', 'karnofsky_score', 'hepatic_mild',
       'tce_div_match', 'donor_related', 'melphalan_dose',
       'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate',
       'hla_low_res_10', 'KPS_Bin', 'Comorbidity_Bin', 'Combined_Bin',
       'age_bin', 'donor_sex', 'recipient_sex', 'tbi_status+gvhd_proph',
       'age_diff', 'age_ratio', 'age_ts', 'KPS_Minus_Comorbidity',
       'KPS_Comorbidity_sum', 'KPS_Multi_Comorbidity', 'years_since_2000',
       'sex_match_is_same', 'kmeans_cluster', 'White_disease_rank',
       'White_conditioning_rank', 'comorbidity_score*WDR',
       'karnofsky_score/WDR', 'donor_age*WDR', 'comorbidity_score*WCR',
       'karnofsky_score/WCR', 'donor_age*WCR', 'prim_disease_hct_rank',
       'prim_disease_hct_rerank', 'year_hct_rank', 'year_hct_rerank',
       'donor_age_rank', 'donor_age_rerank', 'age_at_hct_rank',
       'age_at_hct_rerank', 'tfidf_conditioning_intensity_5',
       'tfidf_conditioning_intensity_14', 'tfidf_dri_score_9',
       'tfidf_dri_score_10', 'tfidf_sex_match_1', 'cos_year', 'sin_year',
       'age_at_hctmin','target']
train = train[f_fe]
test = test[[c for c in f_fe if c not in ['target']]]

In [None]:
from autogluon.tabular import TabularPredictor
# On Kaggle a pre-trained predictor is loaded; otherwise a new regression
# predictor is fitted on `train` with 'target' as the label.
# NOTE(review): the training branch saves to "./autogluon_nelson" although the
# Kaggle branch loads a directory named "autogluon_kaplan" — confirm the
# intended output path.
if kaggle:
    predictor = TabularPredictor.load("/kaggle/input/cibmtr-autogluon-kaplan-models/autogluon_kaplan",require_version_match=False)
else:
    predictor = TabularPredictor(label= 'target',eval_metric ='mae',path=f"./autogluon_nelson",
                                 problem_type='regression').fit(train[f_fe], 
                                                            presets='best_quality',
                                                            time_limit=3600*8, 
                                                            included_model_types=['NN_TORCH', 'CAT', 'GBM','XGB'],) 
# Out-of-fold predictions for the training rows and predictions for the test rows.
auto_kaplan_oof = predictor.predict_oof()
auto_kaplan_preds = predictor.predict(test)

In [None]:
from autogluon.tabular import TabularPredictor
# On Kaggle a pre-trained predictor is loaded; otherwise a new regression
# predictor is fitted on `train` with 'target' as the label.
# NOTE(review): every AutoGluon cell in this section fits on the same 'target'
# column in its training branch — presumably the target was swapped between
# separate training runs; confirm.
if kaggle:
    predictor = TabularPredictor.load("/kaggle/input/cibmtr-autogluon-nelson-models/autogluon_nelson",require_version_match=False)
else:
    predictor = TabularPredictor(label= 'target',eval_metric ='mae',path=f"./autogluon_nelson",
                                 problem_type='regression').fit(train[f_fe], 
                                                            presets='best_quality',
                                                            time_limit=3600*8, 
                                                            included_model_types=['NN_TORCH', 'CAT', 'GBM','XGB'],) 
# Out-of-fold predictions for the training rows and predictions for the test rows.
auto_nelson_oof = predictor.predict_oof()
auto_nelson_preds = predictor.predict(test)

In [None]:
from autogluon.tabular import TabularPredictor
# On Kaggle a pre-trained predictor is loaded; otherwise a new regression
# predictor is fitted on `train` with 'target' as the label.
# NOTE(review): both the loaded directory and the training output path are
# named "autogluon_nelson" even though the results are stored as auto_cox_* —
# confirm the directory name is intended.
if kaggle:
    predictor = TabularPredictor.load("/kaggle/input/cibmtr-autogluon-cox-models/autogluon_nelson",require_version_match=False)
else:
    predictor = TabularPredictor(label= 'target',eval_metric ='mae',path=f"./autogluon_nelson",
                                 problem_type='regression').fit(train[f_fe], 
                                                            presets='best_quality',
                                                            time_limit=3600*8, 
                                                            included_model_types=['NN_TORCH', 'CAT', 'GBM','XGB'],) 
# Out-of-fold predictions for the training rows and predictions for the test rows.
auto_cox_oof = predictor.predict_oof()
auto_cox_preds = predictor.predict(test)

In [None]:
from autogluon.tabular import TabularPredictor
# On Kaggle a pre-trained predictor is loaded; otherwise a new regression
# predictor is fitted on `train` with 'target' as the label.
# NOTE(review): both the loaded directory and the training output path are
# named "autogluon_nelson" even though the results are stored as
# auto_quantile_* — confirm the directory name is intended.
if kaggle:
    predictor = TabularPredictor.load("/kaggle/input/cibmtr-autogluon-quantile-models/autogluon_nelson",require_version_match=False)
else:
    predictor = TabularPredictor(label= 'target',eval_metric ='mae',path=f"./autogluon_nelson",
                                 problem_type='regression').fit(train[f_fe], 
                                                            presets='best_quality',
                                                            time_limit=3600*8, 
                                                            included_model_types=['NN_TORCH', 'CAT', 'GBM','XGB'],) 
# Out-of-fold predictions for the training rows and predictions for the test rows.
auto_quantile_oof = predictor.predict_oof()
auto_quantile_preds = predictor.predict(test)

In [None]:
from metric import score
# Score the out-of-fold predictions of each AutoGluon predictor with `score`.
# NOTE: `train` is re-read from the raw CSV here, replacing the
# feature-engineered frame used by the cells above.
# NOTE(review): assigning the OOF predictions to y_pred assumes they line up
# with the row order / default index of the raw CSV — confirm.
train = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/train.csv")
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = auto_kaplan_oof
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for AutoGluon Kaplan =",m) 
y_pred["prediction"] = auto_nelson_oof
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for AutoGluon Nelson =",m) 
y_pred["prediction"] = auto_quantile_oof
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for AutoGluon quantile =",m) 
y_pred["prediction"] = auto_cox_oof
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for AutoGluon Cox =",m) 

In [None]:
!pip uninstall -y autogluon autogluon.common autogluon.core autogluon.features autogluon.multimodal autogluon.tabular autogluon.timeseries

In [None]:
!pip install -q /kaggle/input/download-lightning-and-pytorch-tabular/pytorch_lightning-2.4.0-py3-none-any.whl
!pip install -q /kaggle/input/download-lightning-and-pytorch-tabular/scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install -q /kaggle/input/omegaconf-2301/omegaconf-2.3.0-py3-none-any.whl
!pip install -q /kaggle/input/download-lightning-and-pytorch-tabular/torchmetrics-1.5.2-py3-none-any.whl
!pip install -q /kaggle/input/download-lightning-and-pytorch-tabular/pytorch_tabnet-4.1.0-py3-none-any.whl
!pip install -q /kaggle/input/download-lightning-and-pytorch-tabular/einops-0.7.0-py3-none-any.whl
!pip install -q /kaggle/input/download-lightning-and-pytorch-tabular/pytorch_tabular-1.1.1-py2.py3-none-any.whl

# NN 1

## Prepare

### prepare data

In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import TensorDataset
from warnings import filterwarnings

filterwarnings('ignore')


def get_X_cat(df, cat_cols, transformers=None):
    """
    Encode the categorical columns of ``df`` as an integer matrix.

    When ``transformers`` is None a LabelEncoder is fitted per column of
    ``cat_cols``; otherwise the given transformers are applied in the same
    order as ``cat_cols``.

    Returns (transformers, array with one row per sample and one column per
    categorical feature).
    """
    if transformers is None:
        transformers = []
        for name in cat_cols:
            encoder = LabelEncoder()
            encoder.fit(df[name])
            transformers.append(encoder)

    encoded_columns = []
    for name, encoder in zip(cat_cols, transformers):
        encoded_columns.append(encoder.transform(df[name]))

    # Columns were collected feature-major; transpose to sample-major.
    return transformers, np.transpose(np.array(encoded_columns))


def preprocess_data(train, val):
    """
    Build model inputs for one train/validation pair.

    Categorical columns are label-encoded via ``get_categoricals``. Numerical
    columns are mean-imputed (with ``add_indicator=True``) and standardized;
    both the imputer and the scaler are fitted on ``train`` only and then
    applied to ``val``.

    Returns (X_cat_val, X_num_train, X_num_val, dl_train, dl_val, transformers).
    """
    X_cat_train, X_cat_val, numerical, transformers = get_categoricals(train, val)

    standardizer = StandardScaler()
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean', add_indicator=True)

    # Fit on the training split ...
    X_num_train = standardizer.fit_transform(imputer.fit_transform(train[numerical]))
    # ... and reuse the fitted statistics for the validation split.
    X_num_val = standardizer.transform(imputer.transform(val[numerical]))

    dl_train = init_dl(X_cat_train, X_num_train, train, training=True)
    dl_val = init_dl(X_cat_val, X_num_val, val)
    return X_cat_val, X_num_train, X_num_val, dl_train, dl_val, transformers


def get_categoricals(train, val):
    """
    Label-encode the categorical columns of ``train`` and ``val``.

    Categorical columns with a single distinct value in ``train`` are dropped.
    Values of ``val`` that never occur in the matching ``train`` column are
    overwritten with NaN (``val`` is modified in place).

    Returns (X_cat_train, X_cat_val, numerical column names, fitted transformers).
    """
    candidate_cols, numerical = get_feature_types(train)

    kept_cols = []
    for name in candidate_cols:
        if train[name].nunique() != 1:
            kept_cols.append(name)
        # Mask categories the encoder will not have seen during fitting.
        unseen = ~val[name].isin(train[name])
        if unseen.any():
            val.loc[unseen, name] = np.nan

    transformers, X_cat_train = get_X_cat(train, kept_cols)
    X_cat_val = get_X_cat(val, kept_cols, transformers)[1]
    return X_cat_train, X_cat_val, numerical, transformers


def init_dl(X_cat, X_num, df, training=False):
    """
    Wrap one split into a DataLoader yielding 4-tuples:
    (categorical codes, numerical features, log(efs_time), efs).

    The batch size is fixed to 2048; batches are shuffled only when
    ``training`` is True.
    """
    tensors = (
        torch.tensor(X_cat, dtype=torch.long),
        torch.tensor(X_num, dtype=torch.float32),
        # The time target is used on a log scale.
        torch.log(torch.tensor(df.efs_time.values, dtype=torch.float32)),
        torch.tensor(df.efs.values, dtype=torch.long),
    )
    return torch.utils.data.DataLoader(
        TensorDataset(*tensors),
        batch_size=2048,
        pin_memory=True,
        shuffle=training,
    )


def get_feature_types(train):
    """
    Split the feature columns of ``train`` into categorical and numerical names.

    Identifier/target columns ("ID", "efs", "efs_time", "y") are excluded from
    both lists. A feature is categorical when its dtype is object or when it
    has between 3 and 24 distinct values; every other feature is numerical.

    Returns (categorical_cols, numerical), both in column order.
    """
    RMV = ["ID", "efs", "efs_time", "y"]
    FEATURES = [c for c in train.columns if c not in RMV]
    # Scan only FEATURES: scanning every column would turn an identifier or
    # target column with 3-24 distinct values into a categorical model input.
    categorical_cols = [
        col for col in FEATURES
        if train[col].dtype == "object" or 2 < train[col].nunique() < 25
    ]
    numerical = [col for col in FEATURES if col not in categorical_cols]
    return categorical_cols, numerical


def add_features(df):
    """
    Add engineered columns to ``df`` in place and return it.

    * ``is_cyto_score_same``: 1 where cyto_score equals cyto_score_detail, else 0
    * ``age_diff``: donor_age - age_at_hct
    * ``age_ratio``: donor_age / age_at_hct
    * ``year_hct`` is shifted by -2000

    Note: because ``year_hct`` is shifted in place, calling this twice on the
    same frame shifts it twice.
    """
    donor_age = df['donor_age']
    recipient_age = df['age_at_hct']

    df['is_cyto_score_same'] = df['cyto_score'].eq(df['cyto_score_detail']).astype(int)
    df['age_diff'] = donor_age.sub(recipient_age)
    df['age_ratio'] = donor_age.div(recipient_age)
    df['year_hct'] = df['year_hct'] - 2000

    return df


def load_data():
    """
    Read the test and train CSV files and run ``add_features`` on both.

    The Kaggle input paths are used when the global ``kaggle`` flag is true,
    otherwise the local ``../data`` copies. Returns (test, train).
    """
    if kaggle:
        test_path = "/kaggle/input/equity-post-HCT-survival-predictions/test.csv"
        train_path = "/kaggle/input/equity-post-HCT-survival-predictions/train.csv"
    else:
        test_path = "../data/test.csv"
        train_path = "../data/train.csv"

    test = pd.read_csv(test_path)
    train = pd.read_csv(train_path)

    test = add_features(test)
    train = add_features(train)

    for label, frame in (("Test", test), ("Train", train)):
        print(f"{label} shape:", frame.shape)
    return test, train

### prepare model

In [None]:
import functools
from typing import List

import pytorch_lightning as pl
import numpy as np
import torch
from lifelines.utils import concordance_index
from pytorch_lightning.cli import ReduceLROnPlateau
from pytorch_tabular.models.common.layers import ODST
from torch import nn
from pytorch_lightning.utilities import grad_norm


class CatEmbeddings(nn.Module):
    """
    Embeds every categorical feature and projects the concatenated embeddings.
    """
    def __init__(
        self,
        projection_dim: int,
        categorical_cardinality: List[int],
        embedding_dim: int
    ):
        """
        projection_dim: size of the output produced by the projection network.
        categorical_cardinality: number of distinct categories of each categorical feature.
        embedding_dim: embedding size used for every categorical feature.

        self.embeddings holds one embedding table per categorical feature.
        self.projection is Linear -> GELU -> Linear, mapping the concatenated
        embeddings to ``projection_dim``.
        """
        super().__init__()
        tables = []
        for n_categories in categorical_cardinality:
            tables.append(nn.Embedding(n_categories, embedding_dim))
        self.embeddings = nn.ModuleList(tables)

        concat_dim = embedding_dim * len(categorical_cardinality)
        self.projection = nn.Sequential(
            nn.Linear(concat_dim, projection_dim),
            nn.GELU(),
            nn.Linear(projection_dim, projection_dim),
        )

    def forward(self, x_cat):
        """
        Look up column ``i`` of ``x_cat`` in embedding table ``i``, concatenate
        the results along the feature axis and apply the projection.
        """
        embedded = []
        for i, table in enumerate(self.embeddings):
            embedded.append(table(x_cat[:, i]))
        return self.projection(torch.cat(embedded, dim=1))


class NN(nn.Module):
    """
    Network combining projected categorical embeddings with numerical inputs.
    """
    def __init__(
            self,
            continuous_dim: int,
            categorical_cardinality: List[int],
            embedding_dim: int,
            projection_dim: int,
            hidden_dim: int,
            dropout: float = 0
    ):
        """
        continuous_dim: number of numerical input features.
        categorical_cardinality: number of distinct categories of each categorical feature.
        embedding_dim: embedding size used for every categorical feature.
        projection_dim: output size of the categorical projection.
        hidden_dim: output size of the ODST layer.
        dropout: dropout probability (used on the input and after batch norm).

        self.embeddings: CatEmbeddings module for the categorical inputs.
        self.mlp: ODST -> BatchNorm1d -> Dropout.
        self.out: linear layer mapping the hidden representation to one value.
        self.dropout: dropout applied to the concatenated input.
        Every nn.Linear weight is Xavier-normal initialised and every bias zeroed.
        """
        super().__init__()
        self.embeddings = CatEmbeddings(projection_dim, categorical_cardinality, embedding_dim)
        self.mlp = nn.Sequential(
            ODST(projection_dim + continuous_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(dropout),
        )
        self.out = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)

        # Re-initialise all linear layers, in module traversal order.
        linear_layers = [module for module in self.modules() if isinstance(module, nn.Linear)]
        for layer in linear_layers:
            nn.init.xavier_normal_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x_cat, x_cont):
        """
        Embed the categorical inputs, concatenate them with the numerical ones,
        apply dropout and the MLP.

        Returns (output of the final linear layer, hidden representation).
        """
        projected = self.embeddings(x_cat)
        features = self.dropout(torch.cat([projected, x_cont], dim=1))
        hidden = self.mlp(features)
        return self.out(hidden), hidden


@functools.lru_cache
def combinations(N, device=None):
    """
    Return all index pairs (i, j) with 0 <= i < j < N as an (num_pairs, 2) tensor.

    N: number of items to pair up.
    device: device for the result; when None, CUDA is used if it is available
        and the CPU otherwise (so the call no longer fails on CPU-only hosts).

    Results are memoised per (N, device) by functools.lru_cache, so the same
    tensor object is handed to every caller and must not be modified in place.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    ind = torch.arange(N)
    comb = torch.combinations(ind, r=2)
    return comb.to(device)


class LitNN(pl.LightningModule):
    """
    Lightning module around ``NN``.

    Training minimises a weighted pairwise hinge ranking loss on the target
    ``y``, plus 0.1 x the spread of that loss across race groups, plus an
    auxiliary regression loss computed on samples with efs == 1.
    """

    # Race names that trigger ``special_race_weight`` when race labels are
    # passed to ``calc_loss`` as strings.
    _SPECIAL_RACES = frozenset({
        'White',
        'Native Hawaiian or other Pacific Islander',
        'Black or African-American',
    })

    def __init__(
            self,
            continuous_dim: int,
            categorical_cardinality: List[int],
            embedding_dim: int,
            projection_dim: int,
            hidden_dim: int,
            lr: float = 1e-3,
            dropout: float = 0.2,
            weight_decay: float = 1e-3,
            aux_weight: float = 0.1,
            margin: float = 0.5,
            race_index: int = 0,
            efs_pair_weight: float = 1.0,
            same_race_weight: float = 1.0,
            special_race_weight: float = 1.0
    ):
        """
        continuous_dim: The number of continuous input features.
        categorical_cardinality: A list of integers, where each element corresponds to the number of unique categories for each categorical feature.
        embedding_dim: The dimension of the embeddings for the categorical features.
        projection_dim: The dimension of the projected space after embedding concatenation.
        hidden_dim: The size of the hidden representation produced by the network.
        lr: The learning rate for the optimizer.
        dropout: Dropout probability to avoid overfitting.
        weight_decay: Weight decay passed to the Adam optimizer.
        aux_weight: Multiplier of the auxiliary regression loss.
        margin: Margin of the pairwise hinge loss.
        race_index: Column of ``x_cat`` that holds the race group.
        efs_pair_weight: Weight of pairs in which both samples have efs == 1.
        same_race_weight: Weight of pairs whose samples share a race group.
        special_race_weight: Weight of pairs containing a race listed in ``_SPECIAL_RACES``.
        """
        super(LitNN, self).__init__()
        self.save_hyperparameters()

        # Creates an instance of the NN model defined above
        self.model = NN(
            continuous_dim=self.hparams.continuous_dim,
            categorical_cardinality=self.hparams.categorical_cardinality,
            embedding_dim=self.hparams.embedding_dim,
            projection_dim=self.hparams.projection_dim,
            hidden_dim=self.hparams.hidden_dim,
            dropout=self.hparams.dropout
        )
        # Per-batch (y, y_hat, efs, race) tuples collected during validation/test.
        self.targets = []

        # Small feed-forward head with a 1-dimensional output, used for the auxiliary loss
        self.aux_cls = nn.Sequential(
            nn.Linear(self.hparams.hidden_dim, self.hparams.hidden_dim // 3),
            nn.GELU(),
            nn.Linear(self.hparams.hidden_dim // 3, 1)
        )

    def on_before_optimizer_step(self, optimizer):
        """
        Compute and log the 2-norm of the gradients of ``self.model``.
        If using mixed precision, the gradients are already unscaled here.
        """
        norms = grad_norm(self.model, norm_type=2)
        self.log_dict(norms)

    def forward(self, x_cat, x_cont):
        """
        Forward pass that outputs the 1-dimensional prediction and the hidden representation.
        """
        x, emb = self.model(x_cat, x_cont)
        return x.squeeze(1), emb

    def training_step(self, batch, batch_idx):
        """
        Process one training batch.

        A batch is (categorical data, continuous data, y, efs). ``y_hat`` is the
        ranking prediction and ``aux_pred`` the auxiliary head's prediction from
        the hidden representation. The auxiliary loss is the squared error
        against ``y`` averaged over the samples with efs == 1.

        Returns ranking loss + aux_weight * auxiliary loss.
        """
        x_cat, x_cont, y, efs = batch
        y_hat, emb = self(x_cat, x_cont)
        aux_pred = self.aux_cls(emb).squeeze(1)
        loss, race_loss = self.get_full_loss(efs, x_cat, y, y_hat)
        aux_loss = nn.functional.mse_loss(aux_pred, y, reduction='none')
        aux_mask = efs == 1
        # clamp keeps a batch without any event from producing 0/0 = NaN
        aux_loss = (aux_loss * aux_mask).sum() / aux_mask.sum().clamp(min=1)
        self.log("train_loss", loss, on_epoch=True, prog_bar=True, logger=True)
        self.log("race_loss", race_loss, on_epoch=True, prog_bar=True, logger=True, on_step=False)
        self.log("aux_loss", aux_loss, on_epoch=True, prog_bar=True, logger=True, on_step=False)
        return loss + aux_loss * self.hparams.aux_weight

    def get_race_losses(self, efs, x_cat, y, y_hat):
        """
        Compute the pairwise loss separately inside every race group of the
        batch and return the (population) standard deviation of those losses.
        """
        race_ids = x_cat[:, self.hparams.race_index]
        race_losses = []
        for race in torch.unique(race_ids):
            ind = race_ids == race
            # Pass the race labels of the *subset*, so that they stay aligned
            # with y[ind] / y_hat[ind] / efs[ind] inside calc_loss.
            race_losses.append(self.calc_loss(y[ind], y_hat[ind], efs[ind], race_ids[ind]))
        race_loss = sum(race_losses) / len(race_losses)
        races_loss_std = sum((r - race_loss)**2 for r in race_losses) / len(race_losses)
        return torch.sqrt(races_loss_std)

    def get_full_loss(self, efs, x_cat, y, y_hat):
        """
        Pairwise loss on the whole batch plus 0.1 x the race-group spread.

        The race group is read from column ``race_index`` of ``x_cat``.
        Returns (total loss, detached race-group spread for logging).
        """
        race_group = x_cat[:, self.hparams.race_index]
        loss = self.calc_loss(y, y_hat, efs, race_group)
        race_loss = self.get_race_losses(efs, x_cat, y, y_hat)
        loss = loss + 0.1 * race_loss
        return loss, race_loss.detach()

    def _encode_races(self, race_group, device):
        """
        Return (integer race id per sample, special-race flag per sample) on ``device``.

        ``race_group`` may be a tensor of encoded labels or a sequence of labels.
        """
        if torch.is_tensor(race_group):
            race_ids = race_group.to(device)
            # Encoded integer labels can never equal the race *names* in
            # _SPECIAL_RACES, so nothing is flagged on this path.
            # NOTE(review): special_race_weight therefore has no effect for
            # label-encoded input; an id -> name mapping would be needed.
            special_flag = torch.zeros(race_ids.shape[0], dtype=torch.bool, device=device)
            return race_ids, special_flag

        # Sequence input: map each distinct label to an integer in first-seen order.
        lookup = {}
        ids = [lookup.setdefault(r, len(lookup)) for r in race_group]
        flags = [r in self._SPECIAL_RACES for r in race_group]
        race_ids = torch.tensor(ids, dtype=torch.long, device=device)
        special_flag = torch.tensor(flags, dtype=torch.bool, device=device)
        return race_ids, special_flag

    @staticmethod
    def _pair_weight(pair_mask, weight):
        """Per-pair weight: ``weight`` where ``pair_mask`` is True, 1.0 elsewhere."""
        return torch.where(
            pair_mask,
            torch.tensor(weight, device=pair_mask.device),
            torch.tensor(1.0, device=pair_mask.device)
        )

    def calc_loss(self, y, y_hat, efs, race_group):
        """
        Weighted pairwise hinge loss with margin.

        * Builds all sample pairs and keeps those where at least one sample has efs == 1.
        * A pair is weighted by ``same_race_weight`` when both samples share a race group.
        * A pair is additionally weighted by ``efs_pair_weight`` when both samples have efs == 1.
        * A pair is additionally weighted by ``special_race_weight`` when either sample
          belongs to a race in ``_SPECIAL_RACES`` (only detectable for string labels).
        * The weighted hinge losses are averaged over the pairs accepted by ``get_mask``.
        """
        N = y.shape[0]
        # All 2-combinations of the samples (shape: [num_pairs, 2]);
        # .to() is a no-op when the cached pairs already live on y's device.
        comb = combinations(N).to(y.device)

        # Keep only pairs in which at least one sample had an event (efs == 1)
        valid_event_mask = (efs[comb[:, 0]] == 1) | (efs[comb[:, 1]] == 1)
        comb = comb[valid_event_mask]
        left, right = comb[:, 0], comb[:, 1]

        race_ids, special_flag = self._encode_races(race_group, comb.device)

        # Same race group -> same_race_weight
        weight_factor = self._pair_weight(
            race_ids[left] == race_ids[right], self.hparams.same_race_weight
        )
        # Both samples with efs == 1 -> efs_pair_weight
        additional_weight = self._pair_weight(
            (efs[left] == 1) & (efs[right] == 1), self.hparams.efs_pair_weight
        )
        # Either sample of a special race -> special_race_weight
        special_weight = self._pair_weight(
            special_flag[left] | special_flag[right], self.hparams.special_race_weight
        )

        # The final pair weight is the product of the three factors
        final_weight = weight_factor * additional_weight * special_weight

        pred_left = y_hat[left]
        pred_right = y_hat[right]
        y_left = y[left]
        y_right = y[right]

        # +1 when the left target is larger than the right one, otherwise -1
        y_pair = 2 * (y_left > y_right).int() - 1

        # Hinge loss with margin
        loss_pair = nn.functional.relu(-y_pair * (pred_left - pred_right) + self.hparams.margin)

        weighted_loss_pair = loss_pair.double() * final_weight.double()

        # Pairs whose ordering is not determined because of censoring are dropped
        mask = self.get_mask(comb, efs, y_left, y_right)

        # clamp keeps a batch without any valid pair from producing 0/0 = NaN
        loss = (weighted_loss_pair * mask.double()).sum() / mask.sum().clamp(min=1)
        return loss

    def get_mask(self, comb, efs, y_left, y_right):
        """
        Build the "valid pair" mask by excluding these comparisons:
        * left has efs == 1, right has efs == 0, and y_left >= y_right
        * right has efs == 1, left has efs == 0, and y_right >= y_left
        The two cases are OR-ed and the result is inverted.
        """
        left_outlived = y_left >= y_right
        left_1_right_0 = (efs[comb[:, 0]] == 1) & (efs[comb[:, 1]] == 0)
        mask2 = (left_outlived & left_1_right_0)
        right_outlived = y_right >= y_left
        right_1_left_0 = (efs[comb[:, 1]] == 1) & (efs[comb[:, 0]] == 0)
        mask2 |= (right_outlived & right_1_left_0)
        mask2 = ~mask2
        mask = mask2
        return mask

    def validation_step(self, batch, batch_idx):
        """
        Process one validation batch: log the loss and store the batch's
        targets, predictions, events and race labels for the epoch-end metric.
        """
        x_cat, x_cont, y, efs = batch
        y_hat, emb = self(x_cat, x_cont)
        loss, race_loss = self.get_full_loss(efs, x_cat, y, y_hat)
        self.targets.append([y, y_hat.detach(), efs, x_cat[:, self.hparams.race_index]])
        self.log("val_loss", loss, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def on_validation_epoch_end(self):
        """
        At the end of the validation epoch, compute and log the concordance indices.
        """
        cindex, metric = self._calc_cindex()
        self.log("cindex", metric, on_epoch=True, prog_bar=True, logger=True)
        self.log("cindex_simple", cindex, on_epoch=True, prog_bar=True, logger=True)
        self.targets.clear()

    def _calc_cindex(self):
        """
        Concatenate the stored batches and return
        (global c-index, race-stratified metric).
        """
        y = torch.cat([t[0] for t in self.targets]).cpu().numpy()
        y_hat = torch.cat([t[1] for t in self.targets]).cpu().numpy()
        efs = torch.cat([t[2] for t in self.targets]).cpu().numpy()
        races = torch.cat([t[3] for t in self.targets]).cpu().numpy()
        metric = self._metric(efs, races, y, y_hat)
        cindex = concordance_index(y, y_hat, efs)
        return cindex, metric

    def _metric(self, efs, races, y, y_hat):
        """
        Mean of the per-race c-indices minus their standard deviation.
        """
        metric_list = []
        for race in np.unique(races):
            y_ = y[races == race]
            y_hat_ = y_hat[races == race]
            efs_ = efs[races == race]
            metric_list.append(concordance_index(y_, y_hat_, efs_))
        metric = float(np.mean(metric_list) - np.sqrt(np.var(metric_list)))
        return metric

    def test_step(self, batch, batch_idx):
        """
        Same as the validation step, but logs under "test_loss".
        """
        x_cat, x_cont, y, efs = batch
        y_hat, emb = self(x_cat, x_cont)
        loss, race_loss = self.get_full_loss(efs, x_cat, y, y_hat)
        self.targets.append([y, y_hat.detach(), efs, x_cat[:, self.hparams.race_index]])
        self.log("test_loss", loss)
        return loss

    def on_test_epoch_end(self) -> None:
        """
        At the end of the test epoch, compute and log the concordance indices.
        """
        cindex, metric = self._calc_cindex()
        self.log("test_cindex", metric, on_epoch=True, prog_bar=True, logger=True)
        self.log("test_cindex_simple", cindex, on_epoch=True, prog_bar=True, logger=True)
        self.targets.clear()

    def configure_optimizers(self):
        """
        Configure the optimizer and learning-rate scheduler:
        * Optimizer: Adam with weight decay.
        * Scheduler: CosineAnnealingLR (T_max=45, eta_min=6e-3), stepped once per epoch.
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)
        scheduler_config = {
            "scheduler": torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer,
                T_max=45,
                eta_min=6e-3
            ),
            "interval": "epoch",
            "frequency": 1,
            "strict": False,
        }

        return {"optimizer": optimizer, "lr_scheduler": scheduler_config}

## Train and Predict

In [None]:
# import json
import pytorch_lightning as pl
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import torch
from pytorch_lightning.callbacks import LearningRateMonitor, TQDMProgressBar
from pytorch_lightning.callbacks import StochasticWeightAveraging
from sklearn.model_selection import KFold, StratifiedKFold

pl.seed_everything(42)

def main(hparams):
    """
    Train the network with 10-fold cross-validation and predict the test set.

    Steps:
    * load data and fill efs and efs_time of the test data with 1 (placeholders
      so the test frame fits the same preprocessing/dataloader code)
    * get the categorical columns of the full training frame
    * split the training data stratified on race_group x (age_at_hct == 0.044)
    * per fold: preprocess, train, predict the held-out rows, then predict the
      test rows

    hparams: dict of LitNN hyperparameters forwarded to ``train_final``.

    Returns (val_pred, test_pred): out-of-fold predictions for every training
    row, and the test predictions SUMMED over the folds (not averaged).
    """
    test, train_original = load_data()
    test['efs_time'] = 1
    test['efs'] = 1
    test_pred = np.zeros(test.shape[0])
    val_pred = np.zeros(train_original.shape[0])

    categorical_cols, _ = get_feature_types(train_original)

    # KFold.split ignores its y argument, so the stratification labels only
    # take effect with StratifiedKFold.
    strata = train_original.race_group.astype(str) + (train_original.age_at_hct == 0.044).astype(str)
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    for train_index, test_index in kf.split(train_original, strata):
        tt = train_original.copy()
        train = tt.iloc[train_index]
        val = tt.iloc[test_index]
        print('+++++++++',val.shape)
        X_cat_val, X_num_train, X_num_val, dl_train, dl_val, transformers = preprocess_data(train, val)

        model = train_final(X_num_train, dl_train, dl_val, transformers, hparams=hparams, categorical_cols=categorical_cols)

        ### eval on valid
        # no_grad: inference only, so no autograd graph needs to be kept
        with torch.no_grad():
            pred, _ = model.cuda().eval()(
                torch.tensor(X_cat_val, dtype=torch.long).cuda(),
                torch.tensor(X_num_val, dtype=torch.float32).cuda()
            )

        val_pred[test_index] = pred.detach().cpu().numpy()

        # Create submission: re-run preprocessing with the test frame in the
        # validation slot so it is encoded with this fold's transformers.
        train = tt.iloc[train_index]
        X_cat_val, X_num_train, X_num_val, dl_train, dl_val, transformers = preprocess_data(train, test)

        with torch.no_grad():
            pred, _ = model.cuda().eval()(
                torch.tensor(X_cat_val, dtype=torch.long).cuda(),
                torch.tensor(X_num_val, dtype=torch.float32).cuda()
            )

        test_pred += pred.detach().cpu().numpy()

    return val_pred , test_pred



def train_final(X_num_train, dl_train, dl_val, transformers, hparams=None, categorical_cols=None):
    """
    Build a LitNN, fit it on ``dl_train``, evaluate it on ``dl_val`` and return it.

    Parameters
    ----------
    X_num_train : array-like
        Preprocessed numerical training matrix; only ``shape[1]`` is read, to size
        the continuous input of the network.
    dl_train : DataLoader
        Training loader handed to ``trainer.fit``.
    dl_val : DataLoader
        Hold-out loader handed to ``trainer.test`` after fitting.
    transformers : list
        One fitted encoder per categorical column; ``len(t.classes_)`` gives the
        cardinality of each embedding table.
    hparams : dict, optional
        Keyword arguments forwarded to ``LitNN``. When ``None`` the defaults
        below are used.
    categorical_cols : list of str
        Categorical column names; the position of ``"race_group"`` in this list
        is passed to the model as ``race_index``.

    Returns
    -------
    LitNN
        The fitted model, switched to eval mode.
    """
    if hparams is None:
        hparams = {
            "embedding_dim": 16,
            "projection_dim": 112,
            "hidden_dim": 56,
            "lr": 0.06464861983337984,
            "dropout": 0.05463240181423116,
            "aux_weight": 0.26545778308743806,
            "margin": 0.2588153271003354,
            "weight_decay": 0.0002773544957610778,
            "efs_pair_weight": 1.0,
            "same_race_weight": 1.0,
            # Was a second "same_race_weight" entry (a duplicate key that Python
            # silently collapses). The explicit hparams dicts passed to main()
            # in this notebook all carry "special_race_weight", so the defaults
            # now expose the same key set.
            "special_race_weight": 1.0
        }
    model = LitNN(
        continuous_dim=X_num_train.shape[1],
        categorical_cardinality=[len(t.classes_) for t in transformers],
        race_index=categorical_cols.index("race_group"),
        **hparams
    )
    # NOTE(review): the checkpoint monitors "val_loss", but fit() below is given no
    # validation loader — confirm the checkpoint callback behaves as intended.
    checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor="val_loss", save_top_k=1)
    trainer = pl.Trainer(
        accelerator='cuda',
        max_epochs=55,
        log_every_n_steps=6,
        callbacks=[
            checkpoint_callback,
            LearningRateMonitor(logging_interval='epoch'),
            TQDMProgressBar(),
            # Weight averaging kicks in at epoch 40 of the 55 configured above.
            StochasticWeightAveraging(swa_lrs=1e-5, swa_epoch_start=40, annealing_epochs=15)
        ],
    )
    trainer.fit(model, dl_train)
    # The hold-out fold is only scored here (test loop); it does not drive training.
    trainer.test(model, dl_val)
    return model.eval()




In [None]:
# Run 1: baseline pair / race weights. main() returns (out-of-fold preds, test preds).
hparams = {
    "embedding_dim": 16,
    "projection_dim": 112,
    "hidden_dim": 56,
    "lr": 0.06464861983337984,
    "dropout": 0.05463240181423116,
    "aux_weight": 0.26545778308743806,
    "margin": 0.2588153271003354,
    "weight_decay": 0.0002773544957610778,
    "efs_pair_weight": 1.0,
    "same_race_weight": 1.0,
    "special_race_weight": 1.0,
}
nn_oof_preds, nn_preds = main(hparams)

# Run 2: identical to run 1 except that efs_pair_weight is raised to 4.0.
hparams = {**hparams, "efs_pair_weight": 4.0}
nn_efs1_oof_preds, nn_efs1_preds = main(hparams)

# Run 3: efs_pair_weight back to 1.0, both race weights raised to 1.2.
hparams = {
    **hparams,
    "efs_pair_weight": 1.0,
    "same_race_weight": 1.2,
    "special_race_weight": 1.2,
}
nn_w_oof_preds, nn_w_preds = main(hparams)
print("done")

# NN 2

## Prepare

In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import TensorDataset


def get_X_cat(df, cat_cols, transformers=None):
    """
    Encode the categorical columns of ``df`` into an integer matrix.

    When ``transformers`` is None a fresh LabelEncoder is fitted on each column
    of ``cat_cols``; otherwise the supplied encoders are applied in the same
    order as ``cat_cols``.

    Returns ``(transformers, X_cat)`` where ``X_cat`` holds one row per sample
    and one column per categorical feature.
    """
    if transformers is None:
        transformers = []
        for col in cat_cols:
            transformers.append(LabelEncoder().fit(df[col]))

    encoded_columns = []
    for col, encoder in zip(cat_cols, transformers):
        encoded_columns.append(encoder.transform(df[col]))

    # Stacked as (n_features, n_samples); transpose to (n_samples, n_features).
    return transformers, np.array(encoded_columns).T


def preprocess_data(train, val):
    """
    Turn a (train, val) pair of frames into model-ready arrays and dataloaders.

    Categorical columns are label-encoded through ``get_categoricals``.
    Numerical columns are mean-imputed (with missing-indicator columns appended)
    and then standardized; both steps are fitted on ``train`` only and re-applied
    to ``val``.

    Returns ``(X_cat_val, X_num_train, X_num_val, dl_train, dl_val, transformers)``.
    """
    X_cat_train, X_cat_val, numerical, transformers = get_categoricals(train, val)

    imputer = SimpleImputer(missing_values=np.nan, strategy='mean', add_indicator=True)
    scaler = StandardScaler()

    # Fit on the training split, then reuse the fitted statistics for the other split.
    X_num_train = scaler.fit_transform(imputer.fit_transform(train[numerical]))
    X_num_val = scaler.transform(imputer.transform(val[numerical]))

    dl_train = init_dl(X_cat_train, X_num_train, train, training=True)
    dl_val = init_dl(X_cat_val, X_num_val, val)
    return X_cat_val, X_num_train, X_num_val, dl_train, dl_val, transformers


def get_categoricals(train, val):
    """
    Label-encode the categorical columns of ``train`` and ``val``.

    Categorical columns that are constant in ``train`` are dropped. Values of
    ``val`` that never occur in the matching ``train`` column are replaced by NaN
    before encoding, so the encoders fitted on ``train`` are only asked for
    labels they know (plus NaN).

    The masking is done on a private copy of ``val``: the caller's frame is left
    untouched. This matters because the same test frame is passed in once per
    fold; mutating it would let one fold's masking leak into the next fold.

    Returns ``(X_cat_train, X_cat_val, numerical, transformers)`` where
    ``numerical`` is the list of numerical column names and ``transformers`` the
    fitted encoders, one per kept categorical column.
    """
    categorical_cols, numerical = get_feature_types(train)

    # A column with a single distinct value in train carries no signal.
    kept_cols = [col for col in categorical_cols if train[col].nunique() != 1]

    val = val.copy()
    for col in kept_cols:
        unseen = ~val[col].isin(train[col])
        if unseen.any():
            val.loc[unseen, col] = np.nan

    transformers, X_cat_train = get_X_cat(train, kept_cols)
    _, X_cat_val = get_X_cat(val, kept_cols, transformers)
    return X_cat_train, X_cat_val, numerical, transformers


def init_dl(X_cat, X_num, df, training=False):
    """
    Wrap one data split into a torch DataLoader.

    Each sample is a 4-tuple: encoded categoricals (long), numerical features
    (float32), log of ``df.efs_time`` (float32) and ``df.efs`` (long).
    The batch size is fixed at 2048; batches are shuffled only when
    ``training`` is True.
    """
    cat_tensor = torch.tensor(X_cat, dtype=torch.long)
    num_tensor = torch.tensor(X_num, dtype=torch.float32)
    # The time target is modelled on the log scale.
    log_time = torch.log(torch.tensor(df.efs_time.values, dtype=torch.float32))
    event = torch.tensor(df.efs.values, dtype=torch.long)

    dataset = TensorDataset(cat_tensor, num_tensor, log_time, event)
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=2048,
        shuffle=training,
        pin_memory=True,
    )


def get_feature_types(train):
    """
    Split the columns of ``train`` into categorical and numerical names.

    A column is categorical when its dtype is ``object`` or when it has more
    than 2 and fewer than 25 distinct values. Numerical columns are all the
    remaining ones except the identifier / target columns
    (``ID``, ``efs``, ``efs_time``, ``y``).

    Returns ``(categorical_cols, numerical)``, both in frame column order.
    """
    categorical_cols = []
    for col in train.columns:
        series = train[col]
        n_unique = series.nunique()
        if series.dtype == "object" or 2 < n_unique < 25:
            categorical_cols.append(col)

    excluded = ["ID", "efs", "efs_time", "y"]
    numerical = [
        col for col in train.columns
        if col not in excluded and col not in categorical_cols
    ]
    return categorical_cols, numerical


def add_features(df):
    """
    Add engineered columns to ``df`` in place and return the same frame.

    New columns, in order: ``sex_match_bool``, ``big_age``,
    ``is_cyto_score_same``, ``strange_age``, ``age_bin``, ``age_ts``.
    ``year_hct`` is rewritten: 2019 is merged into 2020 and 2000 is subtracted.
    """
    # Compare the two halves of sex_match around "-"; keep NaN where the source is missing.
    halves = df.sex_match.astype(str).str.split("-")
    df['sex_match_bool'] = halves.str[0] == halves.str[1]
    df.loc[df.sex_match.isna(), 'sex_match_bool'] = np.nan

    age = df.age_at_hct
    df['big_age'] = age.gt(16)
    df['is_cyto_score_same'] = df['cyto_score'].eq(df['cyto_score_detail']).astype(int)
    # 0.044 is singled out as a special age value (also isolated by the first age bin).
    df['strange_age'] = age.eq(0.044)
    df['age_bin'] = pd.cut(age, bins=[0, 0.0441, 16, 30, 50, 100])
    df['age_ts'] = age.div(df.donor_age)

    # year_hct: fold 2019 into 2020, then express as an offset from 2000.
    df.loc[df.year_hct == 2019, 'year_hct'] = 2020
    df['year_hct'] = df['year_hct'] - 2000

    return df


def load_data():
    """
    Read the test and train CSVs and run ``add_features`` on each.

    The file locations depend on the notebook-level ``kaggle`` flag: Kaggle
    input paths when it is truthy, ``../data`` otherwise.

    Returns ``(test, train)`` — note the order.
    """
    test_path = (
        "/kaggle/input/equity-post-HCT-survival-predictions/test.csv"
        if kaggle
        else "../data/test.csv"
    )
    test = add_features(pd.read_csv(test_path))
    print("Test shape:", test.shape)

    train_path = (
        "/kaggle/input/equity-post-HCT-survival-predictions/train.csv"
        if kaggle
        else "../data/train.csv"
    )
    train = add_features(pd.read_csv(train_path))
    print("Train shape:", train.shape)

    return test, train

## Model

In [None]:
import functools
from typing import List

import pytorch_lightning as pl
import numpy as np
import torch
from lifelines.utils import concordance_index
from pytorch_lightning.cli import ReduceLROnPlateau
from pytorch_tabular.models.common.layers import ODST
from torch import nn
from pytorch_lightning.utilities import grad_norm


class CatEmbeddings(nn.Module):
    """
    Embeds every categorical feature and projects the concatenated embeddings.
    """
    def __init__(
        self,
        projection_dim: int,
        categorical_cardinality: List[int],
        embedding_dim: int
    ):
        """
        projection_dim: width of the vector returned by ``forward``.
        categorical_cardinality: number of distinct categories per categorical feature
            (one embedding table is created per entry).
        embedding_dim: width of each individual embedding.

        Attributes created:
        self.embeddings: ModuleList with one ``nn.Embedding`` per categorical feature.
        self.projection: Linear -> GELU -> Linear stack mapping the concatenated
            embeddings to ``projection_dim``.
        """
        super().__init__()

        tables = []
        for n_categories in categorical_cardinality:
            tables.append(nn.Embedding(n_categories, embedding_dim))
        self.embeddings = nn.ModuleList(tables)

        concat_dim = embedding_dim * len(categorical_cardinality)
        self.projection = nn.Sequential(
            nn.Linear(concat_dim, projection_dim),
            nn.GELU(),
            nn.Linear(projection_dim, projection_dim),
        )

    def forward(self, x_cat):
        """
        Look up column ``i`` of ``x_cat`` in embedding table ``i``, concatenate the
        results along the feature axis and return their projection.
        """
        embedded = []
        for col_idx, table in enumerate(self.embeddings):
            embedded.append(table(x_cat[:, col_idx]))
        return self.projection(torch.cat(embedded, dim=1))


class NN(nn.Module):
    """
    Network combining projected categorical embeddings with continuous features.
    """
    def __init__(
            self,
            continuous_dim: int,
            categorical_cardinality: List[int],
            embedding_dim: int,
            projection_dim: int,
            hidden_dim: int,
            dropout: float = 0
    ):
        """
        continuous_dim: number of continuous input features.
        categorical_cardinality: number of distinct categories per categorical feature.
        embedding_dim: width of each categorical embedding.
        projection_dim: width of the projected categorical representation.
        hidden_dim: width of the hidden representation produced by ``self.mlp``.
        dropout: dropout probability, used both on the fused input and inside ``self.mlp``.

        Attributes created:
        self.embeddings: ``CatEmbeddings`` for the categorical block.
        self.mlp: ODST layer -> BatchNorm1d -> Dropout.
        self.out: linear head mapping the hidden representation to one value.
        self.dropout: dropout applied to the fused input before ``self.mlp``.

        Every ``nn.Linear`` found in the module tree is re-initialised with
        Xavier-normal weights and zero biases.
        """
        super().__init__()
        self.embeddings = CatEmbeddings(projection_dim, categorical_cardinality, embedding_dim)

        fused_dim = projection_dim + continuous_dim
        self.mlp = nn.Sequential(
            ODST(fused_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(dropout),
        )
        self.out = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)

        # Re-initialise all linear layers (including those inside sub-modules).
        linear_layers = [module for module in self.modules() if isinstance(module, nn.Linear)]
        for layer in linear_layers:
            nn.init.xavier_normal_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x_cat, x_cont):
        """
        Embed the categoricals, concatenate them with the continuous features,
        apply dropout and the MLP.

        Returns ``(prediction, hidden)``: the output of the linear head
        (last dimension of size 1) and the hidden representation it was
        computed from.
        """
        cat_features = self.embeddings(x_cat)
        fused = self.dropout(torch.cat([cat_features, x_cont], dim=1))
        hidden = self.mlp(fused)
        return self.out(hidden), hidden


@functools.lru_cache
def combinations(N):
    """
    Return every index pair ``(i, j)`` with ``0 <= i < j < N`` as an ``(n_pairs, 2)`` tensor.

    Results are memoised per ``N`` with ``functools.lru_cache`` so that repeated
    batch sizes do not rebuild the pair table.

    The tensor is placed on the CUDA device when one is available (the original
    behaviour); on CPU-only machines it is returned on the CPU instead of raising.
    """
    pairs = torch.combinations(torch.arange(N), r=2)
    return pairs.cuda() if torch.cuda.is_available() else pairs


class LitNN(pl.LightningModule):
    """
    Lightning wrapper around ``NN`` trained with a pairwise ranking objective.

    The training objective is:
    * a margin-based pairwise hinge loss on the ordering of the targets,
    * plus 0.1 x the standard deviation of that loss across race groups,
    * plus ``aux_weight`` x an auxiliary MSE regression of the target, computed
      on event rows (``efs == 1``) only.
    """
    def __init__(
            self,
            continuous_dim: int,
            categorical_cardinality: List[int],
            embedding_dim: int,
            projection_dim: int,
            hidden_dim: int,
            lr: float = 1e-3,
            dropout: float = 0.2,
            weight_decay: float = 1e-3,
            aux_weight: float = 0.1,
            margin: float = 0.5,
            race_index: int = 0
    ):
        """
        continuous_dim: The number of continuous input features.
        categorical_cardinality: A list of integers, where each element corresponds to the number of unique categories for each categorical feature.
        embedding_dim: The dimension of the embeddings for the categorical features.
        projection_dim: The dimension of the projected space after embedding concatenation.
        hidden_dim: The size of the hidden representation produced by the network.
        lr: The learning rate for the optimizer.
        dropout: Dropout probability.
        weight_decay: Weight decay passed to the Adam optimizer.
        aux_weight: Weight of the auxiliary regression loss in the training objective.
        margin: Margin of the pairwise hinge loss.
        race_index: Column of the categorical input that holds the race group.
        """
        super(LitNN, self).__init__()
        # Stores every constructor argument under self.hparams.
        self.save_hyperparameters()

        # Backbone producing (prediction, hidden representation).
        self.model = NN(
            continuous_dim=self.hparams.continuous_dim,
            categorical_cardinality=self.hparams.categorical_cardinality,
            embedding_dim=self.hparams.embedding_dim,
            projection_dim=self.hparams.projection_dim,
            hidden_dim=self.hparams.hidden_dim,
            dropout=self.hparams.dropout
        )
        # Per-batch (y, y_hat, efs, race) records collected during validation / test.
        self.targets = []

        # Small head on top of the hidden representation for the auxiliary regression.
        self.aux_cls = nn.Sequential(
            nn.Linear(self.hparams.hidden_dim, self.hparams.hidden_dim // 3),
            nn.GELU(),
            nn.Linear(self.hparams.hidden_dim // 3, 1)
        )

    def on_before_optimizer_step(self, optimizer):
        """
        Log the gradient 2-norms of the backbone before each optimizer step.
        If using mixed precision, the gradients are already unscaled here.
        """
        norms = grad_norm(self.model, norm_type=2)
        self.log_dict(norms)

    def forward(self, x_cat, x_cont):
        """
        Return ``(prediction, hidden)``: the backbone prediction with its trailing
        size-1 dimension squeezed away, and the hidden representation.
        """
        x, emb = self.model(x_cat, x_cont)
        return x.squeeze(1), emb

    def training_step(self, batch, batch_idx):
        """
        Compute the training objective for one batch.

        A batch is ``(x_cat, x_cont, y, efs)``: categorical inputs, continuous
        inputs, the time target and the event indicator.

        The ranking loss (including its race-spread term) comes from
        ``get_full_loss``. The auxiliary loss is the MSE between the auxiliary
        head and ``y``, averaged over rows with ``efs == 1`` only.

        Returns ``loss + aux_weight * aux_loss``.
        """
        x_cat, x_cont, y, efs = batch
        y_hat, emb = self(x_cat, x_cont)
        aux_pred = self.aux_cls(emb).squeeze(1)
        loss, race_loss = self.get_full_loss(efs, x_cat, y, y_hat)
        aux_loss = nn.functional.mse_loss(aux_pred, y, reduction='none')
        aux_mask = efs == 1
        # clamp: a batch without any event would otherwise give 0 / 0 = NaN and
        # poison the weights; with the clamp the auxiliary term is simply 0.
        aux_loss = (aux_loss * aux_mask).sum() / aux_mask.sum().clamp(min=1)
        self.log("train_loss", loss, on_epoch=True, prog_bar=True, logger=True)
        self.log("race_loss", race_loss, on_epoch=True, prog_bar=True, logger=True, on_step=False)
        self.log("aux_loss", aux_loss, on_epoch=True, prog_bar=True, logger=True, on_step=False)
        return loss + aux_loss * self.hparams.aux_weight

    def get_full_loss(self, efs, x_cat, y, y_hat):
        """
        Return ``(loss, race_loss)`` where ``loss`` is the batch ranking loss plus
        0.1 x ``race_loss`` and ``race_loss`` is the spread of the ranking loss
        across race groups (see ``get_race_losses``).
        """
        loss = self.calc_loss(y, y_hat, efs)
        race_loss = self.get_race_losses(efs, x_cat, y, y_hat)
        loss += 0.1 * race_loss
        return loss, race_loss

    def get_race_losses(self, efs, x_cat, y, y_hat):
        """
        Compute the ranking loss separately for every race group present in the
        batch and return the (population) standard deviation of those losses.
        """
        races = torch.unique(x_cat[:, self.hparams.race_index])
        race_losses = []
        for race in races:
            ind = x_cat[:, self.hparams.race_index] == race
            race_losses.append(self.calc_loss(y[ind], y_hat[ind], efs[ind]))
        race_loss = sum(race_losses) / len(race_losses)
        races_loss_std = sum((r - race_loss)**2 for r in race_losses) / len(race_losses)
        return torch.sqrt(races_loss_std)

    def calc_loss(self, y, y_hat, efs):
        """
        Pairwise margin ranking loss for survival data.

        Steps:
        * enumerate all index pairs with ``combinations``;
        * keep pairs where at least one of the two subjects had the event;
        * label each pair +1 when the left target is strictly larger than the
          right one, -1 otherwise (ties therefore count as -1);
        * hinge loss ``relu(-label * (pred_left - pred_right) + margin)``;
        * zero out pairs that ``get_mask`` flags as not comparable;
        * average over the remaining pairs.

        If no valid pair exists (e.g. a group with a single row) the result
        is 0 rather than NaN.
        """
        N = y.shape[0]
        comb = combinations(N)
        comb = comb[(efs[comb[:, 0]] == 1) | (efs[comb[:, 1]] == 1)]
        pred_left = y_hat[comb[:, 0]]
        pred_right = y_hat[comb[:, 1]]
        y_left = y[comb[:, 0]]
        y_right = y[comb[:, 1]]
        y = 2 * (y_left > y_right).int() - 1
        loss = nn.functional.relu(-y * (pred_left - pred_right) + self.hparams.margin)
        mask = self.get_mask(comb, efs, y_left, y_right)
        # clamp: with zero valid pairs the numerator is 0, so dividing by 1 yields 0
        # instead of the NaN that 0 / 0 would produce; non-empty cases are unchanged.
        loss = (loss.double() * (mask.double())).sum() / mask.sum().clamp(min=1)
        return loss

    def get_mask(self, comb, efs, y_left, y_right):
        """
        Return a boolean mask that is True for pairs that can be ranked.

        A pair is rejected when exactly one subject had the event, the other is
        censored, and the event subject's target is greater than or equal to the
        censored subject's target:
        * left event, right censored, ``y_left >= y_right``;
        * right event, left censored, ``y_right >= y_left``.
        Both rejection cases are OR-ed together and the result is inverted.
        """
        left_outlived = y_left >= y_right
        left_1_right_0 = (efs[comb[:, 0]] == 1) & (efs[comb[:, 1]] == 0)
        mask2 = (left_outlived & left_1_right_0)
        right_outlived = y_right >= y_left
        right_1_left_0 = (efs[comb[:, 1]] == 1) & (efs[comb[:, 0]] == 0)
        mask2 |= (right_outlived & right_1_left_0)
        mask2 = ~mask2
        mask = mask2
        return mask

    def validation_step(self, batch, batch_idx):
        """
        Compute and log ``val_loss`` for one batch and record the batch targets
        and predictions for the epoch-level concordance index.
        """
        x_cat, x_cont, y, efs = batch
        y_hat, emb = self(x_cat, x_cont)
        loss, race_loss = self.get_full_loss(efs, x_cat, y, y_hat)
        self.targets.append([y, y_hat.detach(), efs, x_cat[:, self.hparams.race_index]])
        self.log("val_loss", loss, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def on_validation_epoch_end(self):
        """
        Log the race-stratified (``cindex``) and plain (``cindex_simple``)
        concordance index for the validation epoch, then reset the records.
        """
        cindex, metric = self._calc_cindex()
        self.log("cindex", metric, on_epoch=True, prog_bar=True, logger=True)
        self.log("cindex_simple", cindex, on_epoch=True, prog_bar=True, logger=True)
        self.targets.clear()

    def _calc_cindex(self):
        """
        Concatenate the recorded batches and return ``(cindex, metric)``: the
        global concordance index and the race-stratified score from ``_metric``.
        """
        y = torch.cat([t[0] for t in self.targets]).cpu().numpy()
        y_hat = torch.cat([t[1] for t in self.targets]).cpu().numpy()
        efs = torch.cat([t[2] for t in self.targets]).cpu().numpy()
        races = torch.cat([t[3] for t in self.targets]).cpu().numpy()
        metric = self._metric(efs, races, y, y_hat)
        cindex = concordance_index(y, y_hat, efs)
        return cindex, metric

    def _metric(self, efs, races, y, y_hat):
        """
        Compute the concordance index within each race group and return
        mean minus standard deviation of the per-group values.
        """
        metric_list = []
        for race in np.unique(races):
            y_ = y[races == race]
            y_hat_ = y_hat[races == race]
            efs_ = efs[races == race]
            metric_list.append(concordance_index(y_, y_hat_, efs_))
        metric = float(np.mean(metric_list) - np.sqrt(np.var(metric_list)))
        return metric

    def test_step(self, batch, batch_idx):
        """
        Same as ``validation_step`` but logs under ``test_loss``.
        """
        x_cat, x_cont, y, efs = batch
        y_hat, emb = self(x_cat, x_cont)
        loss, race_loss = self.get_full_loss(efs, x_cat, y, y_hat)
        self.targets.append([y, y_hat.detach(), efs, x_cat[:, self.hparams.race_index]])
        self.log("test_loss", loss)
        return loss

    def on_test_epoch_end(self) -> None:
        """
        Log the race-stratified (``test_cindex``) and plain (``test_cindex_simple``)
        concordance index for the test epoch, then reset the records.
        """
        cindex, metric = self._calc_cindex()
        self.log("test_cindex", metric, on_epoch=True, prog_bar=True, logger=True)
        self.log("test_cindex_simple", cindex, on_epoch=True, prog_bar=True, logger=True)
        self.targets.clear()


    def configure_optimizers(self):
        """
        Configure the optimizer and learning-rate scheduler:
        * Optimizer: Adam with ``lr`` and ``weight_decay`` taken from the hyperparameters.
        * Scheduler: CosineAnnealingLR with ``T_max=45`` and ``eta_min=6e-3``, stepped once per epoch.
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)
        scheduler_config = {
            "scheduler": torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer,
                T_max=45,
                eta_min=6e-3
            ),
            "interval": "epoch",
            "frequency": 1,
            "strict": False,
        }

        return {"optimizer": optimizer, "lr_scheduler": scheduler_config}

## Train and Predict

In [None]:
import json
import pytorch_lightning as pl
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import torch
from pytorch_lightning.callbacks import LearningRateMonitor, TQDMProgressBar
from pytorch_lightning.callbacks import StochasticWeightAveraging
from sklearn.model_selection import StratifiedKFold

# Fix the global seed once, before main() builds the folds and models.
# The StratifiedKFold in main() is created without its own random_state, so this
# call is presumably what keeps the fold split reproducible — see the Lightning
# docs for exactly which RNGs seed_everything covers.
pl.seed_everything(42)

def main(hparams):
    """
    Cross-validated training and prediction for the NN 2 model.

    Steps:
    * load the data and give the test frame dummy ``efs`` / ``efs_time`` values of 1
      so that it can go through the same dataloader code;
    * split the training data into stratified folds on race_group x (age_at_hct == 0.044);
    * for each fold: preprocess, train with ``train_final``, store the
      out-of-fold predictions, and add the fold's test predictions to the running sum;
    * average the test predictions over the folds.

    Parameters
    ----------
    hparams : dict or None
        Hyperparameters forwarded to ``train_final``; ``None`` selects its defaults.

    Returns
    -------
    (test_pred, nn_oof_preds) : tuple of np.ndarray
        Fold-averaged test predictions and out-of-fold training predictions.
    """
    n_splits = 5

    test, train_original = load_data()
    test['efs_time'] = 1
    test['efs'] = 1
    test_pred = np.zeros(test.shape[0])
    nn_oof_preds = np.zeros(train_original.shape[0])

    # NOTE(review): race_index is later derived from this unfiltered list, while
    # get_categoricals may drop constant columns per fold — confirm they stay aligned.
    categorical_cols, _ = get_feature_types(train_original)

    strat_key = (
        train_original.race_group.astype(str)
        + (train_original.age_at_hct == 0.044).astype(str)
    )
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, )
    for train_index, val_index in kf.split(train_original, strat_key):
        train = train_original.iloc[train_index]
        # Private copy: preprocessing may write NaN into the validation frame.
        val = train_original.iloc[val_index].copy()

        X_cat_val, X_num_train, X_num_val, dl_train, dl_val, transformers = preprocess_data(train, val)
        # hparams is now actually forwarded (it used to be accepted and ignored).
        model = train_final(
            X_num_train, dl_train, dl_val, transformers,
            hparams=hparams, categorical_cols=categorical_cols
        )
        model = model.cuda().eval()

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            val_pred, _ = model(
                torch.tensor(X_cat_val, dtype=torch.long).cuda(),
                torch.tensor(X_num_val, dtype=torch.float32).cuda()
            )
        nn_oof_preds[val_index] = val_pred.cpu().numpy()

        # Re-run the preprocessing with the test frame in the validation slot so
        # that it is encoded / scaled with this fold's training statistics.
        X_cat_test, _, X_num_test, _, _, _ = preprocess_data(train, test)
        with torch.no_grad():
            pred, _ = model(
                torch.tensor(X_cat_test, dtype=torch.long).cuda(),
                torch.tensor(X_num_test, dtype=torch.float32).cuda()
            )
        test_pred += pred.cpu().numpy()

    # Average over folds (tied to n_splits instead of a separate literal).
    test_pred /= n_splits
    return test_pred, nn_oof_preds



def train_final(X_num_train, dl_train, dl_val, transformers, hparams=None, categorical_cols=None):
    """
    Instantiate a LitNN, fit it on ``dl_train``, score it on ``dl_val`` and
    return it in eval mode.

    ``hparams`` is forwarded to ``LitNN`` as keyword arguments; when it is None
    the defaults defined below are used. ``transformers`` supplies the
    cardinality of each categorical feature and ``categorical_cols`` the
    position of ``"race_group"``.
    """
    if hparams is None:
        hparams = dict(
            embedding_dim=16,
            projection_dim=112,
            hidden_dim=56,
            lr=0.06464861983337984,
            dropout=0.05463240181423116,
            aux_weight=0.26545778308743806,
            margin=0.2588153271003354,
            weight_decay=0.0002773544957610778,
        )

    n_continuous = X_num_train.shape[1]
    cardinalities = [len(encoder.classes_) for encoder in transformers]
    race_position = categorical_cols.index("race_group")
    model = LitNN(
        continuous_dim=n_continuous,
        categorical_cardinality=cardinalities,
        race_index=race_position,
        **hparams
    )

    callbacks = [
        pl.callbacks.ModelCheckpoint(monitor="val_loss", save_top_k=1),
        LearningRateMonitor(logging_interval='epoch'),
        TQDMProgressBar(),
        # Weight averaging starts at epoch 45 of the 70 configured below.
        StochasticWeightAveraging(swa_lrs=1e-5, swa_epoch_start=45, annealing_epochs=15),
    ]
    trainer = pl.Trainer(
        accelerator='cuda',
        max_epochs=70,
        log_every_n_steps=6,
        callbacks=callbacks,
    )

    trainer.fit(model, dl_train)
    # The hold-out loader is only scored (test loop) after fitting.
    trainer.test(model, dl_val)
    return model.eval()


# hparams=None: train_final builds the model from its built-in default hyperparameters.
hparams = None
# This main() returns (test predictions, out-of-fold predictions) — test first.
nn2_preds, nn2_oof_preds = main(hparams)
print("done")

# ZDH GBDTs

In [None]:
import warnings
warnings.filterwarnings('ignore')
from collections import namedtuple
import numpy as np
import polars as pl
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

from sklearn.base import clone
import optuna
import os
from colorama import Fore
from collections import Counter
from scipy.stats import skew, kurtosis

from tqdm import tqdm
from IPython.display import clear_output
from lifelines import KaplanMeierFitter, CoxPHFitter, NelsonAalenFitter
# from lifelines.fitters.nelson_aalen_fitter import NelsonAalenFitter
pd.options.display.max_columns = None
from lifelines.utils import concordance_index


import lightgbm as lgb
from lightgbm import early_stopping  
from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from sklearn.model_selection import *
from sklearn.metrics import *

import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification,  BitsAndBytesConfig

# Notebook-level settings for the GBDT section.
# presumably the random seed and CV fold count used by the models below — their
# usage is outside this cell; verify against the training code.
seed = 114514
n_splits = 10
# Selects the Kaggle input paths in load_data() (re-declared here; same value as at the top).
kaggle = True

In [None]:
# V_1.3 # Include TabNET + OPTUNA Tunning
# !pip install -qq pytorch_tabnet
import pandas as pd
import numpy as np
from tqdm import tqdm
from catboost import CatBoostClassifier, Pool, CatBoostRegressor
from pytorch_tabnet.tab_model import TabNetRegressor,TabNetClassifier
import lightgbm as lgb
from lightgbm import LGBMRegressor, early_stopping
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.model_selection import *
from sklearn.metrics import *
from IPython.display import clear_output
from xgboost import XGBRegressor, XGBClassifier
# from sklearn.ensemble import VotingRegressor, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from colorama import Fore
from nltk.corpus import stopwords
import nltk
import string
import optuna
import warnings
from sklearn.preprocessing import LabelEncoder

warnings.simplefilter('ignore')
import optuna
from typing import Dict, Any, Optional, Union, Tuple
from colorama import Fore
import logging
import pandas.api.types
from lifelines.utils import concordance_index
import pandas as pd
from sklearn.preprocessing import OneHotEncoder


class ParticipantVisibleError(Exception):
    """Exception type dedicated to errors that should be shown to the participant."""


# Default value of the `seed` argument of AbdBase.__init__.
SEED = 42


class AbdBase:
    """Cross-validated training helper around LightGBM, CatBoost, XGBoost, TabNet and voting models.

    The class-level lists below enumerate the option values that the input
    validation and ``Train_ML`` compare the constructor arguments against.
    """
    # Model identifiers recognised by Train_ML(model_name=...).
    model_name = ["LGBM", "CAT", "XGB", "Voting", 'TABNET']
    # Metric names accepted for the `metric` constructor argument.
    metrics = ["roc_auc", "accuracy", "f1", "precision", "recall", 'rmse', 'wmae', "rmsle", "mae", "r2", 'mse',
               'mape']
    problem_types = ["classification", "regression"]  # task type: classification or regression
    # Cross-validation strategy codes accepted for `fold_type`.
    # NOTE(review): Train_ML only builds splitters for 'SKF', 'KF', 'GKF' and 'RKF';
    # 'GSKF' passes validation but ends in NotImplementedError there.
    cv_types = ['SKF', 'KF', 'GKF', 'GSKF', "RKF"]
    # Version tag printed by _display_initial_info (kept as a one-element list).
    current_version = ['V_1.3']

    def __init__(self, train_data, test_data=None, target_column=None, tf_vec=False, gpu=False, numpy_data=False,
                 handle_date=False,
                 problem_type="classification", metric="roc_auc", seed=SEED, ohe_fe=False, label_encode=False,
                 target_encode=False,
                 n_splits=5, txt_columns=None, cat_features=None, num_classes=None, prob=False, stat_fe=None,
                 logger: Optional[logging.Logger] = None,
                 early_stop=False, test_prob=False, fold_type='SKF', weights=None, multi_column_tfidf=None):
        """Validate the inputs, run the enabled feature steps and split features from target.

        :param train_data: training DataFrame; must contain ``target_column``
        :param test_data: optional test DataFrame
        :param target_column: name of the column to predict
        :param tf_vec: falsy to skip, otherwise a dict with ``text_column``, ``max_features``
            and ``n_components`` for the single-column TF-IDF + SVD step
        :param gpu: ask the models built in ``Train_ML`` to use the GPU
        :param numpy_data: store ``X_train`` / ``y_train`` / ``X_test`` as NumPy arrays
        :param handle_date: expand the ``date`` column into calendar features
        :param problem_type: one of ``problem_types``
        :param metric: one of ``metrics``
        :param seed: random seed handed to splitters and models
        :param ohe_fe: falsy to skip, otherwise a dict with ``cat_c`` (columns to one-hot encode)
        :param label_encode: falsy to skip, otherwise a dict with ``cat_c`` (columns to label-encode)
        :param target_encode: falsy to skip, otherwise a dict with ``cat_c`` and ``target_col``
            forwarded to ``factorize_and_encode``
        :param n_splits: number of cross-validation folds
        :param txt_columns: default text columns for the text-statistics step
        :param cat_features: names of categorical features (passed to CatBoost by ``Train_ML``)
        :param num_classes: number of classes for classification problems
        :param prob: use ``predict_proba`` for train/validation predictions
        :param stat_fe: falsy to skip, otherwise a dict that may carry ``txt_columns``
        :param logger: logger to use; a default stream logger is created when omitted
        :param early_stop: enable early stopping in ``Train_ML``
        :param test_prob: use ``predict_proba`` for test predictions
        :param fold_type: one of ``cv_types``
        :param weights: sample weights used by the ``wmae`` metric
        :param multi_column_tfidf: falsy to skip, otherwise a dict with ``text_columns`` and
            ``max_features`` for the multi-column TF-IDF step
        :raises ValueError: from ``_validate_input`` / ``checkTarget`` on invalid arguments
        :raises KeyError: if ``test_data`` lacks a feature column that the training data has
        """
        self.train_data = train_data
        self.test_data = test_data
        self.target_column = target_column
        self.problem_type = problem_type
        self.metric = metric
        self.seed = seed
        self.n_splits = n_splits
        self.txt_columns = txt_columns
        self.cat_features = cat_features if cat_features else []
        self.num_classes = num_classes
        self.prob = prob
        self.test_prob = test_prob
        self.early_stop = early_stop
        self.fold_type = fold_type
        self.weights = weights
        self.tf_vec = tf_vec
        self.stat_fe = stat_fe
        self.multi_column_tfidf = multi_column_tfidf
        self.gpu = gpu
        self.numpy_data = numpy_data
        self.handle_date = handle_date
        self.ohe_fe = ohe_fe
        self.label_encode = label_encode
        self.target_encode = target_encode
        self.logger = logger or self._setup_default_logger()

        self._validate_input()
        self.checkTarget()
        self._display_initial_info()

        # Calendar features derived from the `date` column.
        if self.handle_date:
            print(Fore.YELLOW + f"\nAdding Date Features")

            if self.train_data is not None:
                self.train_data = self.date(
                    df=self.train_data,
                )

            if self.test_data is not None:
                self.test_data = self.date(
                    df=self.test_data,
                )

        # Per-row statistics of the text columns.
        if self.stat_fe:
            print(Fore.YELLOW + f"\nAdding Stats Features")
            self.txt_columns = stat_fe.get('txt_columns', txt_columns)

            if self.train_data is not None:
                self.train_data = self.text_stat(
                    df=self.train_data,
                    txt_cols=self.txt_columns,
                )

            if self.test_data is not None:
                self.test_data = self.text_stat(
                    df=self.test_data,
                    txt_cols=self.txt_columns,
                )

        # TF-IDF followed by SVD on a single text column.
        # NOTE(review): apply_tfidf_svd fits a new vectorizer and SVD on every call, so the
        # train and test components produced here come from two independent fits.
        if self.tf_vec:
            self.text_column = tf_vec.get('text_column', '')
            self.max_features = tf_vec.get('max_features', 1000)
            self.n_components = tf_vec.get('n_components', 10)
            print(Fore.YELLOW + f"\nTf-IDF Processing For Col: {self.text_column}")
            if self.train_data is not None:
                self.train_data = self.apply_tfidf_svd(
                    df=self.train_data,
                    text_column=self.text_column,
                    max_features=self.max_features,
                    n_components=self.n_components
                )

            if self.test_data is not None:
                self.test_data = self.apply_tfidf_svd(
                    df=self.test_data,
                    text_column=self.text_column,
                    max_features=self.max_features,
                    n_components=self.n_components
                )

        # TF-IDF over several text columns, fitted on train and applied to test.
        if self.multi_column_tfidf:

            self.text_columns = multi_column_tfidf.get('text_columns', [])
            self.max_features = multi_column_tfidf.get('max_features', 1000)

            print(Fore.YELLOW + f"\nMulti-TF_IDF Processing For Columns: {self.text_columns}")

            if self.train_data is not None and self.test_data is not None:
                self.train_data, self.test_data = self.tf_fe(
                    train=self.train_data,
                    test=self.test_data,
                    text_columns=self.text_columns,
                    max_features=self.max_features,
                )

        # One-hot encoding of categorical columns.
        if self.ohe_fe:

            print(Fore.YELLOW + f"\n---> Adding OHE Features\n")
            self.cat_c = ohe_fe.get('cat_c', [])
            if self.train_data is not None and self.test_data is not None:
                self.train_data, self.test_data = self.ohe_transform(
                    train=self.train_data,
                    test=self.test_data,
                    cat_cols=self.cat_c,
                )

        # Label encoding of categorical columns.
        if self.label_encode:
            print(Fore.YELLOW + f"\n---> Applying Label Encoder\n")
            self.cat_c = label_encode.get('cat_c', [])
            if self.train_data is not None and self.test_data is not None:
                self.train_data, self.test_data = self.label_encode_transform(
                    train=self.train_data,
                    test=self.test_data,
                    cat_cols=self.cat_c,
                )

        # Factorize categorical columns and down-cast numeric dtypes.
        if self.target_encode:
            print(Fore.YELLOW + f"\n---> Applying Target Encoder\n")
            self.cat_c = target_encode.get('cat_c', [])
            self.target_col = target_encode.get('target_col', [])
            if self.train_data is not None and self.test_data is not None:
                self.train_data, self.test_data = self.factorize_and_encode(
                    train=self.train_data,
                    test=self.test_data,
                    cat_cols=self.cat_c,
                    target_col=self.target_col,
                )

        # Split the training frame into features and target.
        feature_frame = self.train_data.drop(self.target_column, axis=1)
        target_values = self.train_data[self.target_column]
        self.X_train = feature_frame.to_numpy() if self.numpy_data else feature_frame
        self.y_train = target_values.to_numpy() if self.numpy_data else target_values
        # NOTE(review): `self.model_name` is the class-level list of supported models, so this
        # comparison with a string is never true and y_train is left unchanged.
        self.y_train = self.y_train.reshape(-1, 1) if self.model_name == 'TABNET' else self.y_train

        if self.test_data is None:
            # Previously the test frame was indexed before this check, which crashed here.
            self.X_test = None
        else:
            # Column names are taken from the DataFrame (not from X_train) so that this also
            # works when X_train has been converted to a NumPy array.
            other_targets = [t for t in ["target1", "target2", "target3", "target4"] if t != self.target_column]
            not_expected_in_test = ['efs', 'efs_time'] + other_targets
            missing = [c for c in feature_frame.columns
                       if c not in not_expected_in_test and c not in self.test_data.columns]
            if missing:
                raise KeyError(f"Test data is missing feature columns: {missing}")
            self.X_test = self.test_data.to_numpy() if self.numpy_data else self.test_data

    
    @staticmethod
    def label_encode_transform(train: pd.DataFrame, test: pd.DataFrame, cat_cols: list):
        """Label-encode ``cat_cols`` in place, using the categories seen in ``train``.

        :param train: training frame; each listed column is replaced by its integer codes
        :param test: test frame; each listed column is replaced by the code learnt from
            ``train``, or -1 for missing values and for labels that ``train`` never contained
        :param cat_cols: names of the columns to encode
        :return: the same ``train`` and ``test`` objects, modified in place
        """
        for col in cat_cols:
            le = LabelEncoder()
            train[col] = le.fit_transform(train[col])

            # A label's code is its position in `classes_`, so one vectorised index lookup
            # replaces a separate `le.transform` call for every test row.
            positions = pd.Index(le.classes_).get_indexer(test[col])
            # Missing test values keep the "unknown" code -1, as before.
            test[col] = np.where(test[col].isna().to_numpy(), -1, positions)

        return train, test

    @staticmethod
    def factorize_and_encode(train: pd.DataFrame, test: pd.DataFrame, cat_cols: list,
                             target_col) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Integer-encode categorical columns over train+test and shrink numeric dtypes.

        :param train: training frame
        :param test: test frame
        :param cat_cols: columns to factorize; names absent from both frames are skipped
        :param target_col: column label, or a list/set/Index/array of labels, removed from
            the returned test frame (an empty collection removes nothing)
        :return: ``(train_encoded, test_encoded)``, both with a fresh 0..n-1 index
        """
        # Stack the rows of both frames; the original index labels are discarded.
        combined = pd.concat([train, test], axis=0, ignore_index=True)

        for c in cat_cols:
            if c in combined.columns:
                combined[c], _ = combined[c].factorize()  # e.g. ['a', 'b', 'c'] -> [0, 1, 2]
                # Shift so the smallest code is 0 (pandas' factorize uses -1 for missing values).
                combined[c] -= combined[c].min()
                combined[c] = combined[c].astype("int32").astype("category")

        # Down-cast the remaining 64-bit numeric columns to 32 bits to save memory.
        for c in combined.columns:
            if c not in cat_cols:
                if combined[c].dtype == "float64":
                    combined[c] = combined[c].astype("float32")
                elif combined[c].dtype == "int64":
                    combined[c] = combined[c].astype("int32")

        train_encoded = combined.iloc[:len(train)].copy()
        test_encoded = combined.iloc[len(train):].reset_index(drop=True).copy()

        # Accept one label as well as a collection of labels; wrapping a list in another
        # list (as was done before) is not a valid `columns` argument.
        if isinstance(target_col, (list, set, pd.Index, np.ndarray)):
            columns_to_drop = list(target_col)
        else:
            columns_to_drop = [target_col]
        test_encoded = test_encoded.drop(columns=columns_to_drop)

        return train_encoded, test_encoded

    @staticmethod
    def date(df):
        """Replace the ``date`` column of ``df`` with calendar features, modifying ``df`` in place.

        Adds year/day/month, month and weekday names, ISO week, sine/cosine encodings of
        month, day and year, a coarse ``group`` period counter and a one-row year lag.

        :param df: frame with a ``date`` column that ``pd.to_datetime`` can parse
        :return: the same ``df`` object, without the ``date`` column
        """
        df['date'] = pd.to_datetime(df['date'])
        stamp = df['date'].dt

        # Plain calendar parts, added in this column order.
        calendar_parts = {
            'year': stamp.year,
            'day': stamp.day,
            'month': stamp.month,
            'month_name': stamp.month_name(),   # e.g. 1 -> January
            'day_of_week': stamp.day_name(),
            'week': stamp.isocalendar().week,
        }
        for feature, values in calendar_parts.items():
            df[feature] = values

        # Cyclical encodings so that the ends of a month/day cycle stay close together.
        month_angle = 2 * np.pi * df['month'] / 12
        day_angle = 2 * np.pi * df['day'] / 31
        df['month_sin'] = np.sin(month_angle)
        df['month_cos'] = np.cos(month_angle)
        df['day_sin'] = np.sin(day_angle)
        df['day_cos'] = np.cos(day_angle)

        # Period counter relative to 2020: 48 per year, 4 per month, plus day // 7.
        years_since_2020 = df['year'] - 2020
        df['group'] = years_since_2020 * 48 + df['month'] * 4 + df['day'] // 7

        del df['date']

        year_angle = df['year'] * (2 * np.pi) / 100
        df['cos_year'] = np.cos(year_angle)
        df['sin_year'] = np.sin(year_angle)
        # Year of the previous row and the change relative to it.
        df['year_lag_1'] = df['year'].shift(1)
        df['year_diff'] = df['year'] - df['year_lag_1']

        return df

    @staticmethod
    def ohe_transform(train: pd.DataFrame, test: pd.DataFrame, cat_cols: list):
        """One-hot encode ``cat_cols``; the encoder is fitted on ``train`` and reused for ``test``.

        Categories that only occur in ``test`` are ignored by the encoder. Both returned
        frames have the encoded columns appended and a fresh 0..n-1 index.

        :param train: training frame
        :param test: test frame
        :param cat_cols: categorical columns to replace by indicator columns
        :return: ``(train, test)`` with ``cat_cols`` replaced
        """
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

        def _swap_in_indicators(frame, indicators):
            # Wrap the encoded matrix, then glue it to the non-categorical remainder.
            indicator_frame = pd.DataFrame(indicators,
                                           columns=encoder.get_feature_names_out(cat_cols),
                                           index=frame.index)
            remainder = frame.drop(columns=cat_cols).reset_index(drop=True)
            return pd.concat([remainder, indicator_frame.reset_index(drop=True)], axis=1)

        train_out = _swap_in_indicators(train, encoder.fit_transform(train[cat_cols]))
        test_out = _swap_in_indicators(test, encoder.transform(test[cat_cols]))

        return train_out, test_out

    def CIBMTR_score(self, solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
        """Mean minus standard deviation of the per-``race_group`` concordance index.

        Note: the row-id column is deleted from both ``solution`` and ``submission`` in place.

        :param solution: frame with ``efs``, ``efs_time``, ``race_group`` and the row-id column
        :param submission: frame with ``prediction`` and the row-id column
        :param row_id_column_name: name of the row-id column to remove from both frames
        :return: ``mean(c_index) - sqrt(var(c_index))`` over the race groups
        :raises ParticipantVisibleError: if a submission column is not numeric
        """
        del solution[row_id_column_name]
        del submission[row_id_column_name]

        for column in submission.columns:
            if pandas.api.types.is_numeric_dtype(submission[column]):
                continue
            raise ParticipantVisibleError(f'Submission column {column} must be a number')

        merged = pd.concat([solution, submission], axis=1).reset_index()
        rows_by_race = dict(merged.groupby(['race_group']).groups)

        c_indices = []
        for row_labels in rows_by_race.values():
            race_rows = merged.iloc[sorted(row_labels)]
            # Predictions are negated before being passed as the predicted scores.
            c_indices.append(concordance_index(race_rows['efs_time'],
                                               -race_rows['prediction'],
                                               race_rows['efs']))

        spread = np.sqrt(np.var(c_indices))
        return float(np.mean(c_indices) - spread)

    @staticmethod
    def text_stat(df, txt_cols):
        """Append per-row text statistics for every column in ``txt_cols`` (modifies ``df``).

        For each column ``c`` present in ``df`` the columns ``c_length``, ``c_word_count``,
        ``c_char_count``, ``c_avg_word_length``, ``c_punctuation_count``,
        ``c_capitalized_count``, ``c_special_char_count``, ``c_stopwords_count``,
        ``c_unique_word_count`` and ``c_lexical_diversity`` are added. Columns missing
        from ``df`` are reported with a warning and skipped.

        :param df: frame holding the text columns
        :param txt_cols: names of the text columns to analyse
        :return: the same ``df`` object with the new columns
        """
        english_stopwords = set(stopwords.words('english'))

        def chars_in_words(text):
            return sum(len(token) for token in text.split())

        def punctuation_marks(text):
            return sum(1 for ch in text if ch in string.punctuation)

        def uppercase_words(text):
            return sum(1 for token in text.split() if token.isupper())

        def special_chars(text):
            return sum(1 for ch in text if not (ch.isalnum() or ch.isspace()))

        def stopword_hits(text):
            return sum(1 for token in text.split() if token.lower() in english_stopwords)

        def distinct_words(text):
            return len(set(text.split()))

        for col in tqdm(txt_cols, desc="Processing text columns"):
            if col not in df.columns:
                print(f"Warning: Column '{col}' not found in DataFrame.")
                continue

            text = df[col]
            df[f'{col}_length'] = text.str.len().astype(int)
            df[f'{col}_word_count'] = text.str.split().str.len().astype(int)
            df[f'{col}_char_count'] = text.apply(chars_in_words).astype(int)
            # A zero word count is replaced by 1 to avoid dividing by zero.
            df[f'{col}_avg_word_length'] = df[f'{col}_char_count'] / df[f'{col}_word_count'].replace(0, 1)

            df[f'{col}_punctuation_count'] = text.apply(punctuation_marks).astype(int)
            df[f'{col}_capitalized_count'] = text.apply(uppercase_words).astype(int)
            df[f'{col}_special_char_count'] = text.apply(special_chars).astype(int)
            df[f'{col}_stopwords_count'] = text.apply(stopword_hits).astype(int)
            df[f'{col}_unique_word_count'] = text.apply(distinct_words).astype(int)
            df[f'{col}_lexical_diversity'] = df[f'{col}_unique_word_count'] / df[f'{col}_word_count'].replace(0, 1)

        return df

    @staticmethod
    def apply_tfidf_svd(df, text_column, max_features=1000, n_components=10):
        """Replace ``text_column`` by SVD components of its TF-IDF matrix.

        A new vectorizer and a new SVD are fitted on ``df`` at every call.

        :param df: frame holding the text column
        :param text_column: name of the text column to vectorise
        :param max_features: vocabulary size limit of the TF-IDF vectorizer
        :param n_components: number of SVD components to keep
        :return: a new frame with a fresh 0..n-1 index, the ``<text_column>_tfidf_<i>``
            columns appended and ``text_column`` removed
        """
        tfidf_matrix = TfidfVectorizer(max_features=max_features, stop_words='english').fit_transform(df[text_column])
        components = TruncatedSVD(n_components).fit_transform(tfidf_matrix)

        component_frame = pd.DataFrame(components)
        component_frame.columns = [text_column + "_tfidf_" + str(position)
                                   for position in component_frame.columns.to_list()]

        widened = pd.concat([df.reset_index(drop=True), component_frame], axis="columns")
        return widened.drop(text_column, axis=1)

    @staticmethod
    def tf_fe(train, test, text_columns, max_features=3000, analyzer='char_wb'):
        """Append TF-IDF features for several text columns to ``train`` and ``test``.

        One vectorizer per column is fitted on ``train`` and applied to ``test``. The
        original text columns are kept in the returned frames.

        :param train: training frame
        :param test: test frame
        :param text_columns: names of the text columns to vectorise
        :param max_features: vocabulary size limit per column
        :param analyzer: ``analyzer`` argument of ``TfidfVectorizer`` (default ``'char_wb'``)
        :return: ``(train_with_tfidf, test_with_tfidf)``
        """
        train_features = []
        test_features = []

        for col in tqdm(text_columns, desc="Processing text columns", unit="col"):
            vectorizer = TfidfVectorizer(analyzer=analyzer, max_features=max_features)
            train_matrix = vectorizer.fit_transform(train[col])
            test_matrix = vectorizer.transform(test[col])
            feature_names = [f"tfidf_{col}_{i}" for i in range(train_matrix.shape[1])]
            # Reuse the index of the source frame: with a default RangeIndex the column-wise
            # concat below would align on labels and misplace rows whenever the input frame
            # is not indexed 0..n-1.
            train_features.append(pd.DataFrame(train_matrix.toarray(), columns=feature_names, index=train.index))
            test_features.append(pd.DataFrame(test_matrix.toarray(), columns=feature_names, index=test.index))

        train_with_tfidf = pd.concat([train, *train_features], axis=1)
        test_with_tfidf = pd.concat([test, *test_features], axis=1)

        return train_with_tfidf, test_with_tfidf

    def checkTarget(self):
        """Refuse a target column stored with ``object`` dtype; it has to be encoded first.

        :raises ValueError: if the target column of the training data has dtype ``object``
        """
        target_is_object = self.train_data[self.target_column].dtype == 'object'
        if target_is_object:
            raise ValueError('Encode Target First')

    def _display_initial_info(self):
        """Print the supported options and the configuration selected for this instance."""

        def available_options():
            # (heading, option list) pairs, evaluated one at a time while printing.
            yield "Available Models:", self.model_name
            yield "Available Metrics:", self.metrics
            yield "Available Problem Types:", self.problem_types
            yield "Available Fold Types:", self.cv_types

        def selected_settings():
            # (label, displayed value) pairs, evaluated one at a time while printing.
            yield "Problem Type Selected", self.problem_type.upper()
            yield "Metric Selected", self.metric.upper()
            yield "Fold Type Selected", self.fold_type
            yield "Calculate Train Probabilities", str(self.prob)
            yield "Calculate Test Probabilities", str(self.test_prob)
            yield "Early Stopping", str(self.early_stop)
            yield "GPU", str(self.gpu)

        print(Fore.RED + f"*** AbdBase {self.current_version} ***\n")
        print(Fore.RED + " *** Available Settings *** \n")
        for heading, options in available_options():
            print(Fore.RED + heading, ", ".join([Fore.CYAN + option for option in options]))

        print(Fore.RED + "\n *** Configuration *** \n")
        for label, value in selected_settings():
            print(Fore.RED + f"{label}: {Fore.CYAN + value}")

    def _validate_input(self):
        """Check the constructor arguments and raise ``ValueError`` on the first problem found.

        Checked in order: type of the train/test data, presence of the target column,
        problem type, metric, number of splits and fold type.

        :raises ValueError: if any of the checks fails
        """
        if not isinstance(self.train_data, pd.DataFrame):
            raise ValueError("Training data must be a pandas DataFrame.")
        if self.test_data is not None and not isinstance(self.test_data, pd.DataFrame):
            raise ValueError("Test data must be a pandas DataFrame.")
        if self.target_column not in self.train_data.columns:
            raise ValueError(f"Target column '{self.target_column}' not found in the training dataset.")
        if self.problem_type not in self.problem_types:
            raise ValueError("Invalid problem type. Choose either 'classification' or 'regression'.")

        # This class defines no `regression_metrics` attribute, so reading it directly turned
        # an invalid metric into an AttributeError. It is still honoured when a subclass
        # provides one.
        extra_metrics = getattr(self, "regression_metrics", ())
        if self.metric not in self.metrics and self.metric not in extra_metrics:
            raise ValueError("Invalid metric. Choose from available metrics.")

        if not isinstance(self.n_splits, int) or self.n_splits < 2:
            raise ValueError("n_splits must be an integer greater than 1.")
        if self.fold_type not in self.cv_types:
            raise ValueError(f"Invalid fold type. Choose from {self.cv_types}.")

    from sklearn.metrics import (
        roc_auc_score, accuracy_score, f1_score, precision_score, recall_score,
        mean_absolute_error, r2_score, mean_squared_error
    )

    def weighted_mean_absolute_error(self, y_true, y_pred, weights):
        """Weighted MAE: ``sum(weights * |y_true - y_pred|) / sum(weights)``."""
        absolute_errors = np.abs(y_true - y_pred)
        weighted_total = np.sum(weights * absolute_errors)
        return weighted_total / np.sum(weights)

    def rmsLe(self, y_true, y_pred):
        """Root mean squared logarithmic error, with predictions floored at 1e-6."""
        floored_pred = np.maximum(y_pred, 1e-6)
        msle = mean_squared_log_error(y_true, floored_pred)
        return np.sqrt(msle)

    def mape(self, y_true, y_pred):
        """Mean absolute percentage error, delegated to ``mean_absolute_percentage_error``."""
        percentage_error = mean_absolute_percentage_error(y_true, y_pred)
        return percentage_error

    def get_metric(self, y_true, y_pred, weights=None):
        """Score ``y_pred`` against ``y_true`` with the metric selected in ``self.metric``.

        :param y_true: ground-truth values
        :param y_pred: predictions; rounded before accuracy/f1/precision/recall are computed
        :param weights: sample weights, required for the ``wmae`` metric
        :return: the metric value
        :raises ValueError: for an unknown metric, or for ``wmae`` without ``weights``
        """
        metric = self.metric
        # `num_classes` defaults to None in __init__; treat that as "not multi-class"
        # instead of failing on the `None > 2` comparison.
        multiclass = self.num_classes is not None and self.num_classes > 2

        if metric == 'roc_auc':
            return roc_auc_score(y_true, y_pred, multi_class="ovr" if multiclass else None)
        if metric == 'accuracy':
            return accuracy_score(y_true, y_pred.round())
        if metric in ('f1', 'precision', 'recall'):
            scorer = f1_score if metric == 'f1' else precision_score if metric == 'precision' else recall_score
            rounded = y_pred.round()
            # Multi-class problems use the weighted average, binary ones the scorer default.
            return scorer(y_true, rounded, average='weighted') if multiclass else scorer(y_true, rounded)
        if metric == 'mae':
            return mean_absolute_error(y_true, y_pred)
        if metric == 'r2':
            return r2_score(y_true, y_pred)
        if metric == 'rmse':
            # The `squared` keyword of mean_squared_error is gone from recent scikit-learn;
            # use the dedicated function when it exists and fall back to the flag otherwise.
            try:
                from sklearn.metrics import root_mean_squared_error
            except ImportError:
                return mean_squared_error(y_true, y_pred, squared=False)
            return root_mean_squared_error(y_true, y_pred)
        if metric == 'wmae' and weights is not None:
            return self.weighted_mean_absolute_error(y_true, y_pred, weights)
        if metric == 'rmsle':
            return self.rmsLe(y_true, y_pred)
        if metric == 'mse':
            # The plain call returns the (non-rooted) MSE in every scikit-learn version.
            return mean_squared_error(y_true, y_pred)
        if metric == "mape":
            return self.mape(y_true, y_pred)
        raise ValueError(f"Unsupported metric '{self.metric}'")

    def Train_ML(self, params, model_name, e_stop=50, target=None, estimator=None, g_col=None, tab_net_train_params=None,
                 optuna=False, V_weights=None, y_log=False):
        """Cross-validate one model type and collect out-of-fold and test predictions.

        :param params: keyword arguments forwarded to the model constructor
        :param model_name: 'LGBM', 'CAT', 'XGB', 'TABNET' or 'Voting'
        :param e_stop: early-stopping rounds, used only when ``self.early_stop`` is true
        :param target: label interpolated into the CatBoost model file names
        :param estimator: ``estimators`` list for the voting ensemble
        :param g_col: column of ``X_train`` that holds the groups for the 'GKF' fold type
        :param tab_net_train_params: extra keyword arguments for the TabNet ``fit`` call
        :param optuna: when true, progress printing and output clearing are suppressed
        :param V_weights: optional weights for the voting ensemble
        :param y_log: fit on ``log1p(y)`` and map predictions back with ``expm1``
        :return: ``(oof_predictions, mean_test_preds, last_model, all_models,
            mean_oof_score, mean_train_score)``; the two scores are strings formatted to
            four decimals and ``mean_test_preds`` is None without test data
        :raises ValueError: for an unsupported metric, problem type or model name
        :raises NotImplementedError: for a fold type without a splitter
        """
        verbose = optuna == False  # same rule as the former `optuna == False` checks

        if verbose:
            print(f"The EarlyStopping is {e_stop}")
        if self.metric not in self.metrics:
            raise ValueError(f"Metric '{self.metric}' is not supported. Choose from Given Metrics.")
        if self.problem_type not in self.problem_types:
            raise ValueError(
                f"Problem type '{self.problem_type}' is not supported. Choose from: 'classification', 'regression'.")

        if self.fold_type == 'SKF':
            kfold = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.seed)
        elif self.fold_type == 'KF':
            kfold = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.seed)
        elif self.fold_type == 'GKF':
            kfold = GroupKFold(n_splits=self.n_splits)
        elif self.fold_type == 'RKF':
            kfold = RepeatedKFold(n_splits=self.n_splits, n_repeats=1, random_state=self.seed)
        else:
            raise NotImplementedError("Select the Given Cv Statergy")

        is_classification = self.problem_type == 'classification'
        # `num_classes` defaults to None (e.g. for regression); treat that as "not
        # multi-class" instead of failing on the `None > 2` comparison.
        multiclass = self.num_classes is not None and self.num_classes > 2

        train_scores = []  # per-fold score on the training part
        oof_scores = []  # per-fold score on the validation part
        all_models = []  # fitted (or loaded) model of every fold

        if multiclass:
            oof_predictions = np.zeros((len(self.y_train), self.num_classes))
        else:
            oof_predictions = np.zeros(len(self.y_train))

        # Test predictions of every fold, averaged after the loop.
        if self.X_test is None:
            test_preds = None
        elif multiclass:
            test_preds = np.zeros((len(self.X_test), self.n_splits, self.num_classes))
        else:
            test_preds = np.zeros((len(self.X_test), self.n_splits))

        # CatBoost takes the positions of the categorical columns.
        cat_features_indices = [self.X_train.columns.get_loc(col) for col in
                                self.cat_features] if model_name == 'CAT' else None

        def distribute_test_weights(test_sample_size, weights):
            # Repeat the weight vector until it covers `test_sample_size` rows, then cut it.
            return np.tile(weights, int(np.ceil(test_sample_size / len(weights))))[:test_sample_size]

        if self.fold_type != 'GKF':
            splits = kfold.split(self.X_train, self.y_train)
        else:
            splits = kfold.split(self.X_train, self.y_train, groups=self.X_train[g_col])

        for fold, (train_idx, val_idx) in enumerate(tqdm(splits, desc="Training Folds", total=self.n_splits)):
            if self.numpy_data:
                X_train, X_val = self.X_train[train_idx], self.X_train[val_idx]
                y_train, y_val = self.y_train[train_idx], self.y_train[val_idx]
            else:
                X_train, X_val = self.X_train.iloc[train_idx], self.X_train.iloc[val_idx]
                y_train, y_val = self.y_train.iloc[train_idx], self.y_train.iloc[val_idx]

            if y_log:  # train in log space
                y_train = np.log1p(y_train)
                y_val = np.log1p(y_val)

            if self.weights is not None:
                # Validation weights are the configured weights tiled to the fold size;
                # training rows are weighted uniformly.
                val_weights = distribute_test_weights(len(y_val), self.weights)
                train_weights = np.ones(len(y_train))

            # ---- build the model -------------------------------------------------
            if model_name == 'LGBM':
                model_cls = lgb.LGBMClassifier if is_classification else lgb.LGBMRegressor
                model = model_cls(**params, random_state=self.seed, verbose=-1,
                                  device='gpu' if self.gpu else 'cpu')
            elif model_name == 'TABNET':
                model_cls = TabNetClassifier if is_classification else TabNetRegressor
                model = model_cls(**params, seed=self.seed, verbose=-1,
                                  device_name='gpu' if self.gpu else 'cpu')
            elif model_name == 'XGB':
                model_cls = XGBClassifier if is_classification else XGBRegressor
                model = model_cls(**params, random_state=self.seed, verbose=-1,
                                  tree_method='gpu_hist' if self.gpu else 'hist')
            elif model_name == 'CAT':
                train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features_indices)
                val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features_indices)
                model_cls = CatBoostClassifier if is_classification else CatBoostRegressor
                model = model_cls(**params, random_state=self.seed, verbose=0,
                                  task_type='GPU' if self.gpu else 'CPU')
            elif model_name == 'Voting':
                model_cls = VotingClassifier if is_classification else VotingRegressor
                model = model_cls(estimators=estimator, weights=V_weights)
            else:
                raise ValueError(f"model_name must be one of {self.model_name}.")

            # ---- fit (or load) ----------------------------------------------------
            callbacks = [early_stopping(stopping_rounds=e_stop, verbose=False)] if self.early_stop else None
            eval_metric = self.metric.lower() if self.metric.lower() in ['mae', 'mse', 'rmse', 'rmsle',
                                                                         'wmae'] else self.metric
            if model_name == 'LGBM':
                model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                          eval_metric=eval_metric,
                          callbacks=callbacks)

            elif model_name == 'TABNET':
                # Tolerate the default `tab_net_train_params=None`.
                model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=self.metric,
                          **(tab_net_train_params or {}))

            elif model_name == 'XGB':
                model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=eval_metric,
                          early_stopping_rounds=e_stop if self.early_stop else None, verbose=False)

            elif model_name == 'CAT':
                # `kaggle` is a notebook-level flag: load the saved model of this fold on
                # Kaggle, otherwise fit and save it under the same file name pattern.
                if kaggle:
                    model.load_model(f'/kaggle/input/cibmtr-catboost-{target}/catboost_{fold}_{target}_{model_name}.cbm')
                else:
                    model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=e_stop if self.early_stop else None)
                    model.save_model(f'catboost_{fold}_{target}_{model_name}.cbm')
            elif model_name == 'Voting':
                model.fit(X_train, y_train)

            # ---- predict ----------------------------------------------------------
            if is_classification:
                y_train_pred = model.predict_proba(X_train)[:, 1] if self.prob else model.predict(X_train)
                y_val_pred = model.predict_proba(X_val)[:, 1] if self.prob else model.predict(X_val)
            else:
                y_train_pred = model.predict(X_train)
                y_val_pred = model.predict(X_val)

            if y_log:  # back to the original scale before scoring
                y_train_pred = np.expm1(y_train_pred)
                y_val_pred = np.expm1(y_val_pred)
                y_train = np.expm1(y_train)
                y_val = np.expm1(y_val)

            oof_predictions[val_idx] = y_val_pred

            # ---- score ------------------------------------------------------------
            if self.metric == "accuracy":
                train_scores.append(accuracy_score(y_train,
                                                   np.argmax(y_train_pred, axis=1) if multiclass else (
                                                           y_train_pred > 0.5).astype(int)))
                oof_scores.append(accuracy_score(y_val, np.argmax(y_val_pred, axis=1) if multiclass else (
                        y_val_pred > 0.5).astype(int)))
            elif self.metric == "roc_auc":
                train_scores.append(
                    roc_auc_score(y_train, y_train_pred, multi_class="ovr" if multiclass else None))
                oof_scores.append(roc_auc_score(y_val, y_val_pred, multi_class="ovr" if multiclass else None))

            elif self.metric == 'wmae' and self.weights is not None:
                train_scores.append(self.get_metric(y_train, y_train_pred, train_weights))
                oof_scores.append(self.get_metric(y_val, y_val_pred, val_weights))

            else:
                train_scores.append(self.get_metric(y_train, y_train_pred))
                oof_scores.append(self.get_metric(y_val, y_val_pred))

            # ---- test predictions of this fold ------------------------------------
            if self.X_test is not None:
                if is_classification:
                    test_preds[:, fold] = model.predict_proba(self.X_test)[:, 1] if self.test_prob else model.predict(
                        self.X_test)
                elif model_name == 'TABNET':
                    pred = model.predict(self.X_test)
                    test_preds[:, fold] = pred.squeeze()
                else:
                    test_preds[:, fold] = model.predict(self.X_test)

            if verbose:
                print(
                    f"Fold {fold + 1} - Train {self.metric.upper()}: {train_scores[-1]:.4f}, OOF {self.metric.upper()}: {oof_scores[-1]:.4f}")
            all_models.append(model)
            if verbose:
                clear_output(wait=True)

        mean_train_scores = f"{np.mean(train_scores):.4f}"
        mean_off_scores = f"{np.mean(oof_scores):.4f}"

        if verbose:
            print(f"Overall Train {self.metric.upper()}: {mean_train_scores}")
            print(f"Overall OOF {self.metric.upper()}: {mean_off_scores} ")

        mean_test_preds = test_preds.mean(axis=1) if self.X_test is not None else None

        # Without test data there is nothing to map back from log space.
        if y_log and mean_test_preds is not None:
            mean_test_preds = np.expm1(mean_test_preds)

        return oof_predictions, mean_test_preds, model, all_models, mean_off_scores, mean_train_scores

    def _setup_default_logger(self) -> logging.Logger:
        logger = logging.getLogger(self.__class__.__name__)
        logger.setLevel(logging.INFO)

        if logger.handlers:
            logger.handlers.clear()

        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        return logger

    def RUN_OPTUNA(
            self, MODEL_NAME: str, PARAMS: Dict[str, Any], DIRECTION: str = 'minimize',
            TRIALS: int = 10, SEED: int = 42, ENABLE_PRUNER: bool = False,
            PRUNER_PARAMS: Optional[Dict[str, Any]] = None, y_log: bool = False) -> "optuna.study.Study":
        """Run an Optuna search for one model and return the finished study.

        Every trial delegates to ``OPTUNE_TRAIN``, which returns
        ``(train_score, val_score)``; the validation score is what Optuna optimises.
        The best pair seen so far is tracked locally so both scores can be logged.

        Args:
            MODEL_NAME: Model key forwarded to ``OPTUNE_TRAIN``.
            PARAMS: Search space / fixed parameters forwarded to ``OPTUNE_TRAIN``.
            DIRECTION: ``'minimize'`` or ``'maximize'``.
            TRIALS: Number of trials to run.
            SEED: Seed of the TPE sampler.
            ENABLE_PRUNER: Attach a ``MedianPruner`` to the study.
            PRUNER_PARAMS: Keyword arguments for ``MedianPruner``; a default
                configuration is used when this is ``None`` or empty.
            y_log: Forwarded unchanged to ``OPTUNE_TRAIN``.

        Returns:
            The Optuna study object (the original annotation said ``Dict`` but the
            study itself has always been returned).

        Raises:
            Exception: whatever the optimisation raises is logged and re-raised.
        """
        sampler = optuna.samplers.TPESampler(seed=SEED)

        pruner = None
        if ENABLE_PRUNER:
            # Only keywords MedianPruner accepts. The previous default also carried
            # 'n_valid_steps', which is not a MedianPruner argument and made the
            # constructor fail with TypeError whenever the default config was used.
            pruner_config = PRUNER_PARAMS or {
                'n_startup_trials': 5,
                'n_warmup_steps': 3,
            }
            # NOTE(review): OPTUNE_TRAIN does not pass the trial on to Train_ML, so no
            # intermediate values are reported and the pruner has nothing to act on.
            pruner = optuna.pruners.MedianPruner(**pruner_config)

        study = optuna.create_study(sampler=sampler, direction=DIRECTION, pruner=pruner)

        best_scores = {'train_score': None, 'val_score': None}

        def objective(trial):
            train_score, val_score = self.OPTUNE_TRAIN(trial, MODEL_NAME=MODEL_NAME, PARAMS=PARAMS, y_log=y_log)

            # Remember the train score that belongs to the best validation score.
            if best_scores['val_score'] is None or (
                    (DIRECTION == 'minimize' and val_score < best_scores['val_score']) or
                    (DIRECTION == 'maximize' and val_score > best_scores['val_score'])
            ):
                best_scores['train_score'] = train_score
                best_scores['val_score'] = val_score

            return val_score

        try:
            study.optimize(objective, n_trials=TRIALS)

            # With zero trials there is no score to format and no best_params to read.
            if best_scores['val_score'] is None:
                self.logger.warning(f"No trial completed for {MODEL_NAME}; nothing to report.")
                return study

            self.logger.info(Fore.RED + f"--> Best Train Score for {MODEL_NAME}: " +
                             Fore.CYAN + f"{best_scores['train_score']:.4f}")
            self.logger.info(Fore.RED + f"--> Best Validation Score for {MODEL_NAME}: " +
                             Fore.CYAN + f"{best_scores['val_score']:.4f}")
            self.logger.info(Fore.RED + f"--> Best Parameters: " + Fore.CYAN + f"{study.best_params}")

            return study

        except Exception as e:
            self.logger.error(f"Optuna Optimization Failed: {str(e)}")
            raise

    def OPTUNE_TRAIN(self, trial: optuna.trial.Trial, MODEL_NAME: str = "", optuna=True,
                     PARAMS: Optional[Dict[str, Union[Tuple[Union[int, float], Union[int, float]], Any]]] = None,
                     y_log: bool = False) -> Tuple[float, float]:
        """Sample one parameter set from ``PARAMS``, train with it and return the scores.

        Any value in ``PARAMS`` that is a 2-tuple is treated as a ``(low, high)``
        range: integer lows are sampled with ``suggest_int``, float lows with
        ``suggest_float(..., log=True)``. Every other value is passed through as is.

        Args:
            trial: Optuna trial used for sampling.
            MODEL_NAME: Model key forwarded to ``Train_ML``.
            optuna: Flag forwarded to ``Train_ML`` (note it shadows the ``optuna``
                module inside this method).
            PARAMS: Search space / fixed parameters; not mutated (a shallow copy is used).
            y_log: Forwarded to ``Train_ML``.

        Returns:
            ``(train_score, validation_score)`` as floats.

        Raises:
            Exception: sampling or training errors are logged and re-raised; a score
                that cannot be parsed as a float surfaces as ``ValueError``.
        """
        params = PARAMS.copy() if PARAMS else {}

        # Only values of existing keys are replaced, so iterating the dict is safe.
        for name in params:
            spec = params[name]
            try:
                if not (isinstance(spec, tuple) and len(spec) == 2):
                    continue
                low, high = spec
                if isinstance(low, int):
                    params[name] = trial.suggest_int(name, low, high)
                elif isinstance(low, float):
                    params[name] = trial.suggest_float(name, low, high, log=True)
            except Exception as e:
                self.logger.error(f"Error suggesting parameter {name}: {e}")
                raise

        try:
            result = self.Train_ML(
                params=params, model_name=MODEL_NAME, e_stop=40, estimator=None,
                g_col=None, tab_net_train_params=None, optuna=optuna, y_log=y_log,
            )

            # Positions 4 and 5 of the result hold the out-of-fold and train scores.
            test_score, train_score = result[4], result[5]
            try:
                test_score = float(test_score)
                train_score = float(train_score)
            except ValueError as e:
                raise ValueError(f"Score '{test_score}' and {train_score} is not a valid float. Original error: {e}")
            return train_score, test_score

        except Exception as e:
            self.logger.error(f"Training failed for {MODEL_NAME}: {e}")
            raise

In [None]:
# The four competition CSVs live under the Kaggle input folder when `kaggle` is
# truthy and under ../data otherwise; the file names are the same in both places.
train, test, sample, data_description = (
    pd.read_csv(
        ('/kaggle/input/equity-post-HCT-survival-predictions/' if kaggle else '../data/') + csv_name
    )
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv', 'data_dictionary.csv')
)

# Snapshot of the ground-truth columns taken before any feature engineering
# touches `train`; c_index_score reads it later.
train_solution = train.loc[:, ['ID', 'efs', 'efs_time', 'race_group']].copy()

In [None]:
def recalculate_hla_sums(df):
    """Return a copy of ``df`` with the aggregate HLA match columns recomputed.

    Each aggregate is the sum of its per-locus ``hla_match_*`` columns, with a
    missing locus counted as 0. The input frame is not modified.
    """
    out = df.copy()

    # aggregate column -> per-locus columns it is the sum of (insertion order kept)
    recipes = {
        'hla_nmdp_6': ['hla_match_a_low', 'hla_match_b_low', 'hla_match_drb1_high'],
        'hla_low_res_6': ['hla_match_a_low', 'hla_match_b_low', 'hla_match_drb1_low'],
        'hla_high_res_6': ['hla_match_a_high', 'hla_match_b_high', 'hla_match_drb1_high'],
        'hla_low_res_8': ['hla_match_a_low', 'hla_match_b_low', 'hla_match_c_low', 'hla_match_drb1_low'],
        'hla_high_res_8': ['hla_match_a_high', 'hla_match_b_high', 'hla_match_c_high', 'hla_match_drb1_high'],
        'hla_low_res_10': ['hla_match_a_low', 'hla_match_b_low', 'hla_match_c_low',
                           'hla_match_drb1_low', 'hla_match_dqb1_low'],
        'hla_high_res_10': ['hla_match_a_high', 'hla_match_b_high', 'hla_match_c_high',
                            'hla_match_drb1_high', 'hla_match_dqb1_high'],
    }

    for aggregate, loci in recipes.items():
        total = out[loci[0]].fillna(0)
        for locus in loci[1:]:
            total = total + out[locus].fillna(0)
        out[aggregate] = total
    return out
# Recompute the aggregate HLA columns on both splits with the same rule.
train, test = (recalculate_hla_sums(frame) for frame in (train, test))

In [None]:
from sklearn.model_selection import StratifiedKFold
from scipy.stats import rankdata
from sklearn.preprocessing import OneHotEncoder, quantile_transform, FunctionTransformer, PolynomialFeatures, StandardScaler
# Shared fold splitter, read as a global by update_target_with_survival_probabilities.
# `n_splits` and `seed` are not defined in this cell; they must already exist in the kernel.
skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)

from sklearn.decomposition import NMF

def scale_with_efs_miya(efs):
    """Return ``1 / log(efs + 3.5)`` element-wise (works for scalars and arrays)."""
    shifted_log = np.log(efs + 3.5)
    return 1 / shifted_log

def shift_points(points, efs):
    """Translate every ``efs == 1`` point by the offset between the two class centroids.

    The offset is ``mean(points[efs == 1]) - mean(points[efs == 0])``. Rows with
    ``efs == 0`` get a zero multiple of it; rows with ``efs == 1`` get the full offset.

    Args:
        points: Array-like of shape ``(n, d)``.
        efs: Array-like of length ``n`` holding 0/1 event flags.

    Returns:
        tuple: ``(shifted_points, center_efs0, center_efs1)``.

    Note:
        If either class is empty its centroid is NaN, and because ``0 * NaN`` is
        NaN every returned point becomes NaN. Callers should ensure both classes
        are present.
    """
    points = np.array(points)
    efs = np.array(efs)

    center_efs0 = points[efs == 0].mean(axis=0)
    center_efs1 = points[efs == 1].mean(axis=0)
    shift_vector = center_efs1 - center_efs0

    # A per-row rescaling of the shift (via scale_with_efs_miya) used to be computed
    # here but was never applied; the dead computation has been removed.
    shifted_points = points + efs[:, None] * shift_vector
    return shifted_points, center_efs0, center_efs1

def transform_rank_log(time, event):
    """Turn (time, event) into a single risk-like target via ranks and a log.

    Censored rows (``event == 0``) are first moved past the last observed event,
    everything is ranked, censored ranks are pushed further out by twice the
    sample size, and the result is ``-log(rank / max_rank)``. The latest
    censored row therefore maps to 0 and early events map to the largest values.

    From https://www.kaggle.com/code/cdeotte/nn-mlp-baseline-cv-670-lb-676
    """
    censored = event == 0
    values = time.values.copy()

    last_event_time = values[event == 1].max()   # last patient who dies
    first_censored_time = values[censored].min()  # first patient who survives

    # Slide all censored times so they start where the events end.
    values[censored] = time[censored] + last_event_time - first_censored_time

    ranks = rankdata(values)
    ranks[censored] += len(ranks) * 2
    return -np.log(ranks / ranks.max())

def transform_quantile(time, event):
    """Build a target from the quantile-transformed negated event times.

    Rows with ``event == 1`` receive ``quantile_transform(-time)`` computed over
    the event rows only (default ``quantile_transform`` arguments). Rows with
    ``event == 0`` all receive one constant: the smallest event value minus 0.3.

    From https://www.kaggle.com/code/ambrosm/esp-eda-which-makes-sense
    """
    had_event = event == 1
    # Negate so that earlier events end up with the larger transformed value.
    event_scores = quantile_transform(-time[had_event].values.reshape(-1, 1)).ravel()

    result = np.full(len(time), np.nan)
    result[had_event] = event_scores
    result[event == 0] = event_scores.min() - 0.3
    return result

def update_target_with_survival_probabilities(df, method="kaplan", title='target', time_col='efs_time', event_col='efs'):
    """Add a fold-wise survival-derived target column to ``df`` and return ``df``.

    The rows are split with the global ``skf`` (stratified on ``race_group``); for
    every fold the value written to the validation rows depends on ``method``:

    * ``"kaplan"``   - Kaplan-Meier survival probability at the row's time, fitted
      on the training rows; afterwards rows with ``event_col == 0`` are lowered by 0.15.
    * ``"nelson"``   - negative Nelson-Aalen cumulative hazard, fitted on the
      training rows; afterwards censored rows become ``-sqrt(cumulative hazard)``.
    * ``"cox"``      - partial hazard of a penalised Cox model fitted on the
      one-hot encoded training rows.
    * ``"quantile"`` / ``"rank_log"`` - ``transform_quantile`` /
      ``transform_rank_log`` applied to the validation rows themselves.

    Args:
        df: Frame holding ``race_group``, ``time_col`` and ``event_col``. It is
            modified in place (the new column is added to it).
        method: One of the five names above.
        title: Name of the column to create.
        time_col: Duration column.
        event_col: Event-indicator column (1 = event, 0 = censored).

    Returns:
        The same ``df`` object with ``title`` added.

    Raises:
        ValueError: if ``method`` is not one of the supported names (previously an
            unknown name silently produced an all-zero column).
    """
    supported = ("kaplan", "nelson", "cox", "quantile", "rank_log")
    if method not in supported:
        raise ValueError(f"Unknown method {method!r}; expected one of {supported}")

    res = np.zeros(df.shape[0])
    for train_idx, val_idx in skf.split(df, df["race_group"]):
        X_trn, X_val = df.iloc[train_idx], df.iloc[val_idx]
        if method == "kaplan":
            kmf = KaplanMeierFitter()
            kmf.fit(durations = X_trn[time_col], event_observed = X_trn[event_col])
            res[val_idx] =  kmf.survival_function_at_times(X_val[time_col]).values
        elif method == "nelson":
            naf = NelsonAalenFitter()
            naf.fit(durations=X_trn[time_col], event_observed=X_trn[event_col])
            res[val_idx] = -naf.cumulative_hazard_at_times(X_val[time_col]).values
        elif method == "cox":
            # NOTE(review): relies on a global `cat_cols` that must already exist when
            # this runs. Encoding the two folds separately can yield different dummy
            # columns, in which case the column selection below raises KeyError.
            data_trn = pd.get_dummies(X_trn, columns=cat_cols, drop_first=True).drop("ID",axis=1)
            data_val = pd.get_dummies(X_val, columns=cat_cols, drop_first=True).drop("ID",axis=1)
            # Drop constant columns if they exist
            train_data = data_trn.loc[:, data_trn.nunique() > 1]
            valid_data = data_val[train_data.columns]
            cph = CoxPHFitter(penalizer=0.01)
            cph.fit(train_data, duration_col=time_col, event_col=event_col)
            res[val_idx] = cph.predict_partial_hazard(valid_data).values
        elif method == 'quantile':
            # Honour time_col / event_col instead of the hard-coded efs_time / efs.
            res[val_idx] = transform_quantile(time=X_val[time_col], event=X_val[event_col])
        elif method == 'rank_log':
            res[val_idx] = transform_rank_log(time=X_val[time_col], event=X_val[event_col])
    df[title] = res
    if method == "kaplan":
        df.loc[df[event_col] == 0, title] -= 0.15
    if method == "nelson":
        # Values are <= 0 here (negated hazard), so negate before the square root.
        censored = df[event_col] == 0
        df.loc[censored, title] = -((-df.loc[censored, title]) ** 0.5)
    return df
# Build four alternative regression targets on `train`, one per transformation:
# target1 = Kaplan-Meier, target2 = Nelson-Aalen, target3 = quantile, target4 = rank-log.
# Each call adds its column to the frame returned by the previous call.
train = update_target_with_survival_probabilities(train, method="kaplan", title="target1",time_col='efs_time', event_col='efs')
train = update_target_with_survival_probabilities(train, method="nelson", title="target2",time_col='efs_time', event_col='efs')
train = update_target_with_survival_probabilities(train, method="quantile", title="target3",time_col='efs_time', event_col='efs')
train = update_target_with_survival_probabilities(train, method="rank_log", title="target4",time_col='efs_time', event_col='efs')

In [None]:
# Columns that must never be used as model inputs: the id, the raw labels and the derived targets.
RMV = ["ID", "efs", "efs_time", "target1", "target2", "target3", "target4"]
FEATURES = [name for name in train.columns if name not in RMV]
CAT_COLS, NUM_COLS = [], []

# A feature is categorical when its dtype in `test` is object; everything else is numeric.
for c in FEATURES:
    (CAT_COLS if test[c].dtype == "object" else NUM_COLS).append(c)
print(f"In these features, there are {len(CAT_COLS)} CATEGORICAL FEATURES: {CAT_COLS}")

def update(df):
    """Normalise dtypes of ``df`` in place using the global CAT_COLS / NUM_COLS lists.

    * Categorical columns are cast to strings, then to ``category``.
    * float64 / int64 numeric columns get missing values replaced by 0 and are
      narrowed to float32 / int32.
    * The characters ``, [ ] { } : " \\ <`` are then stripped from every
      categorical value, one character at a time.

    Returns the same ``df`` object.
    """
    for col in CAT_COLS:
        # NOTE(review): after astype(str) there should be nothing left for fillna to
        # replace (missing values become the text 'nan') - confirm before relying on "NaN".
        df[col] = df[col].astype(str).fillna("NaN").astype("category")

    for col in NUM_COLS:
        kind = df[col].dtype
        if kind == "float64":
            df[col] = df[col].fillna(0).astype("float32")
        elif kind == "int64":
            df[col] = df[col].fillna(0).astype("int32")

    # Columns are independent, so each one is cleaned completely before the next.
    for col in CAT_COLS:
        for ch in ',[]{}:"\\<':
            df[col] = df[col].apply(lambda x: str(x).replace(ch, ""))
    return df
# Apply the same dtype normalisation to both splits.
train, test = (update(frame) for frame in (train, test))

In [None]:
def collapse_year(x):
    """Map the year 2020 onto 2019; return every other value unchanged."""
    return 2019 if x == 2020 else x

def cat_fe(df):
    """Add binned / combined categorical features to ``df`` in place and return it.

    New columns: ``KPS_Bin``, ``Comorbidity_Bin``, ``Combined_Bin``, ``age_bin``,
    ``donor_sex``, ``recipient_sex`` and ``tbi_status+gvhd_proph``.

    NOTE(review): the KPS and comorbidity bins are left-closed / right-open, so a
    Karnofsky score of exactly 100 falls outside the last bin and becomes missing.
    The age bins use the default right-closed intervals.
    The trailing numbers are the original author's annotations.
    """
    kps_score = df["karnofsky_score"]
    comorbidity = df["comorbidity_score"]

    df["KPS_Bin"] = pd.cut(
        kps_score,
        bins=[0, 10, 50, 80, 100],
        labels=["Critical", "Severely_Dependent", "Partially_Independent", "Healthy"],
        right=False,
    )  # 1.111457
    df["Comorbidity_Bin"] = pd.cut(
        comorbidity,
        bins=[0, 2, 5, np.inf],
        labels=["Low", "Medium", "High"],
        right=False,
    )  # 1.528398

    kps_label = df["KPS_Bin"].astype(str)
    comorbidity_label = df["Comorbidity_Bin"].astype(str)
    df["Combined_Bin"] = kps_label + "_" + comorbidity_label  # 1.450107

    df["age_bin"] = pd.cut(df["age_at_hct"], bins=[0, 0.0441, 16, 30, 50, 100])

    # "M-F" style codes: the part before the dash goes to donor_sex, the rest to recipient_sex.
    sex_parts = df["sex_match"].str.split('-', expand=True)
    df[["donor_sex", "recipient_sex"]] = sex_parts

    tbi_label = df["tbi_status"].astype(str)
    gvhd_label = df["gvhd_proph"].astype(str)
    df["tbi_status+gvhd_proph"] = tbi_label + '_' + gvhd_label
    return df

# Derive the categorical features on both splits.
train, test = (cat_fe(frame) for frame in (train, test))

In [None]:
# gaps = [1,2,3]
# Per-locus HLA column groups (defined here; not used by numeric_fe itself).
hla_high_cols = [f'hla_match_{locus}_high' for locus in ('a', 'b', 'c')]
hla_low_cols = [f'hla_match_{locus}_low' for locus in ('a', 'b', 'c')]


def numeric_fe(df):
    """Add arithmetic features to ``df`` in place and return it.

    ``year_hct`` is first passed through ``collapse_year``; age, Karnofsky /
    comorbidity and year features are then derived, plus a 0/1 flag telling
    whether donor and recipient sex are equal. A ratio whose denominator is 0
    evaluates to ``inf``. The trailing numbers are the original author's annotations.
    """
    df['year_hct'] = df['year_hct'].apply(collapse_year)

    donor_age, patient_age = df["donor_age"], df["age_at_hct"]
    df["age_diff"] = donor_age - patient_age  # 1.560917
    df["age_ratio"] = donor_age / patient_age  # 1.281368
    df["age_ts"] = patient_age / donor_age

    kps, comorbidity = df["karnofsky_score"], df["comorbidity_score"]
    df["KPS_Minus_Comorbidity"] = kps - comorbidity  # 2.166210
    df["KPS_Comorbidity_sum"] = kps + comorbidity  # 1.533922
    df["KPS_Multi_Comorbidity"] = kps * comorbidity  # 0.99
    df["years_since_2000"] = df['year_hct'] - 2000  # 1.141439

    same_sex = df["donor_sex"] == df["recipient_sex"]
    df["sex_match_is_same"] = same_sex.astype(int)
    return df
# Derive the numeric features on both splits.
train, test = (numeric_fe(frame) for frame in (train, test))

In [None]:
from sklearn.cluster import KMeans
def create_kmeans_features(train, test, n_clusters=8, cat_cols=None, num_cols=None, seed=42):
    """Add a ``kmeans_cluster`` column to ``train`` and ``test`` (in place).

    Categorical columns are one-hot encoded, K-Means is fitted on the encoded
    training rows, and the test rows are assigned to the nearest fitted centre.
    The test encoding is re-indexed onto the training columns, so categories
    seen only in ``test`` are ignored and categories missing from ``test`` are 0.

    Args:
        train, test: Frames containing every column in ``cat_cols + num_cols``.
        n_clusters: Number of clusters.
        cat_cols: Columns to one-hot encode; ``None`` means none.
        num_cols: Columns used as they are; ``None`` means none.
        seed: ``random_state`` of the K-Means model.

    Returns:
        tuple: the same ``(train, test)`` objects, each with ``kmeans_cluster`` added.

    Raises:
        ValueError: if no columns are given at all.
    """
    # The original assigned the fallback to a misspelled name (`cal_cols`), so
    # cat_cols=None crashed with TypeError on the concatenation below.
    cat_cols = list(cat_cols) if cat_cols is not None else []
    num_cols = list(num_cols) if num_cols is not None else []
    cols = cat_cols + num_cols
    if not cols:
        raise ValueError("create_kmeans_features needs at least one column in cat_cols or num_cols")

    train_encoded = pd.get_dummies(train[cols], columns=cat_cols, drop_first=True)
    test_encoded  = pd.get_dummies(test[cols],  columns=cat_cols, drop_first=True)
    # Align test to the training layout; unseen columns are dropped, missing ones are 0.
    test_encoded  = test_encoded.reindex(columns=train_encoded.columns,fill_value=0)

    kmeans = KMeans(n_clusters=n_clusters, random_state=seed)
    train['kmeans_cluster'] = kmeans.fit_predict(train_encoded)
    test['kmeans_cluster'] = kmeans.predict(test_encoded)

    return train, test

# Cluster on the original feature lists only: CAT_COLS / NUM_COLS were built before
# cat_fe / numeric_fe ran, so the columns those two added are not part of the clustering input.
train, test = create_kmeans_features(train, test,n_clusters=5, cat_cols=CAT_COLS, num_cols=NUM_COLS)

In [None]:
# new FE version.3 ( for usage within the CV loop )

def race_group_white_FE(df, disease_rank_df=None, conditioning_rank_df=None):
    """Add frequency-rank features derived from the rows whose race_group is "White".

    When a rank table is not supplied it is built from ``df``: value counts of the
    column among "White" rows, dense-ranked so the most frequent value gets rank 1.
    Pass the tables returned for the training frame to apply the same ranks to
    another frame. Values absent from a table get ``max rank + 1``.

    Args:
        df: Input frame (not modified; ``merge`` returns a new frame).
        disease_rank_df: Optional rank table for ``prim_disease_hct``.
        conditioning_rank_df: Optional rank table for ``conditioning_intensity``.

    Returns:
        tuple: ``(frame with the new columns, disease_rank_df, conditioning_rank_df)``.
    """
    def _white_rank_table(column, rank_name):
        # Frequency of each value among "White" rows, most frequent -> rank 1.
        table = df[df['race_group'] == "White"][column].value_counts().reset_index()
        table.columns = [column, 'count']
        table[rank_name] = table['count'].rank(method='dense', ascending=False).astype(int)
        return table

    if disease_rank_df is None:
        disease_rank_df = _white_rank_table('prim_disease_hct', 'White_disease_rank')
    if conditioning_rank_df is None:
        conditioning_rank_df = _white_rank_table('conditioning_intensity', 'White_conditioning_rank')

    # Attach each rank to the rows; unseen values fall back to (largest rank + 1).
    lookups = (
        ('prim_disease_hct', 'White_disease_rank', disease_rank_df),
        ('conditioning_intensity', 'White_conditioning_rank', conditioning_rank_df),
    )
    for key, rank_name, table in lookups:
        df = df.merge(table[[key, rank_name]], on=key, how='left')
        largest_rank = table[rank_name].max() if not table.empty else 0
        df[rank_name] = df[rank_name].fillna(largest_rank + 1).astype(int)

    # Interaction features: WDR = White disease rank, WCR = White conditioning rank.
    for tag, rank_name in (('WDR', 'White_disease_rank'), ('WCR', 'White_conditioning_rank')):
        rank = df[rank_name]
        df[f'comorbidity_score*{tag}'] = df['comorbidity_score'] * rank
        df[f'karnofsky_score/{tag}'] = df['karnofsky_score'] / rank
        df[f'donor_age*{tag}'] = df['donor_age'] * rank

    return df, disease_rank_df, conditioning_rank_df

# Build the rank tables from the training data ...
train, disease_rank_df, conditioning_rank_df = race_group_white_FE(train)

# ... and apply those same training-derived ranks to the test data.
test, _, _ = race_group_white_FE(
    test,
    disease_rank_df=disease_rank_df,
    conditioning_rank_df=conditioning_rank_df,
)

In [None]:
def dfrank(newdf: pd.DataFrame):
    """Append normalised rank features for a fixed set of columns.

    For each of ``donor_age``, ``age_at_hct``, ``prim_disease_hct`` and
    ``year_hct`` that is present (in the frame's own column order) two columns
    are appended, both divided by the number of rows:

    * ``<col>_rank``   - rank in descending order (largest value gets rank 1);
    * ``<col>_rerank`` - rank in ascending order (smallest value gets rank 1).

    Ties take the highest rank of the group and missing values are ranked last
    in both directions. A new frame is returned; when none of the columns is
    present the input frame itself is returned.
    """
    rank_sources = ['donor_age', 'age_at_hct', 'prim_disease_hct', 'year_hct']
    present = [name for name in newdf.columns if name in rank_sources]
    if not present:
        return newdf

    n_rows = len(newdf)
    additions = []
    for name in present:
        for ascending, suffix in ((False, "rank"), (True, "rerank")):
            ranked = newdf[str(name)].rank(method="max", ascending=ascending, na_option='bottom') / n_rows
            additions.append(ranked.rename(f"{str(name)}_{suffix}"))
    return pd.concat([newdf, *additions], axis=1)
# Ranks are computed within each split separately (each frame is ranked against its own rows).
train, test = (dfrank(frame) for frame in (train, test))

In [None]:
# Re-derive the feature lists now that the engineered columns exist.
RMV = ["ID", "efs", "efs_time", "target1", "target2", "target3", "target4"]
FEATURES = [name for name in train.columns if name not in RMV]
cat_cols, num_cols = [], []

# Here both object and category dtypes (as seen in `test`) count as categorical.
for c in FEATURES:
    (cat_cols if (test[c].dtype == "object" or test[c].dtype == "category") else num_cols).append(c)
print(f"In these features, there are {len(cat_cols)} CATEGORICAL FEATURES: {cat_cols}")

def update(df):
    """Normalise dtypes of ``df`` in place using the global cat_cols / num_cols lists.

    Same steps as the earlier ``update`` in this notebook, but driven by the
    lower-case lists that include the engineered columns:

    * categorical columns are cast to strings, then to ``category``;
    * float64 / int64 numeric columns get missing values replaced by 0 and are
      narrowed to float32 / int32;
    * the characters ``, [ ] { } : " \\ <`` are stripped from every categorical value.

    Returns the same ``df`` object.
    """
    for col in cat_cols:
        # NOTE(review): after astype(str) there should be nothing left for fillna to
        # replace (missing values become the text 'nan') - confirm before relying on "NaN".
        df[col] = df[col].astype(str).fillna("NaN").astype("category")

    for col in num_cols:
        kind = df[col].dtype
        if kind == "float64":
            df[col] = df[col].fillna(0).astype("float32")
        elif kind == "int64":
            df[col] = df[col].fillna(0).astype("int32")

    # Columns are independent, so each one is cleaned completely before the next.
    for col in cat_cols:
        for ch in ',[]{}:"\\<':
            df[col] = df[col].apply(lambda x: str(x).replace(ch, ""))
    return df
# Apply the dtype normalisation again, now covering the engineered columns too.
train, test = (update(frame) for frame in (train, test))

In [None]:
def c_index_score(modeloff, model_name, weights=None):
    """Print the competition score of out-of-fold predictions against ``train_solution``.

    Args:
        modeloff: Either one prediction vector, or a list / tuple / array whose
            elements are all NumPy arrays; in the latter case the elements are
            combined as a weighted sum (not normalised).
        model_name: Label used in the printed message.
        weights: One weight per element for the weighted sum; defaults to all 1.

    The score comes from ``base.CIBMTR_score`` and is only printed; nothing is returned.
    """
    is_ensemble = isinstance(modeloff, (list, tuple, np.ndarray)) and all(
        isinstance(member, np.ndarray) for member in modeloff
    )

    if is_ensemble:
        if weights is None:
            weights = [1] * len(modeloff)

        assert len(modeloff) == len(weights), "The number of models must match the number of weights."

        blended = 0
        for weight, member in zip(weights, modeloff):
            blended = blended + weight * member
    else:
        blended = modeloff

    y_pred = train_solution[["ID"]].copy()
    y_pred["prediction"] = blended

    # Copies are handed over so the scorer cannot alter the shared frames.
    c_index = base.CIBMTR_score(train_solution.copy(), y_pred.copy(), "ID")
    print(Fore.YELLOW + f"The Score of {model_name} is: {c_index:.4f}")

In [None]:
f_fe = [ 'hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6', 'hla_high_res_6', 'hla_high_res_10', 'hla_match_dqb1_high', 'hla_nmdp_6', 'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'year_hct', 'hla_match_a_high', 'donor_age', 'hla_match_b_low', 'age_at_hct', 'hla_match_a_low', 'hla_match_b_high', 'comorbidity_score', 'karnofsky_score', 'hla_low_res_8', 'hla_match_drb1_high', 'hla_low_res_10', 'age_diff', 'age_ratio', 'age_ts', 'KPS_Minus_Comorbidity', 'KPS_Comorbidity_sum', 'KPS_Multi_Comorbidity', 'years_since_2000', 'sex_match_is_same', 'kmeans_cluster', 'White_disease_rank', 'White_conditioning_rank', 'comorbidity_score*WDR', 'karnofsky_score/WDR', 'donor_age*WDR', 'comorbidity_score*WCR', 'karnofsky_score/WCR', 'donor_age*WCR', 'prim_disease_hct_rank', 'prim_disease_hct_rerank', 'year_hct_rank', 'year_hct_rerank', 'donor_age_rank', 'donor_age_rerank', 'age_at_hct_rank', 'age_at_hct_rerank', 'tfidf_conditioning_intensity_5', 'tfidf_conditioning_intensity_14', 'tfidf_dri_score_9', 'tfidf_dri_score_10', 'tfidf_sex_match_1', 'dri_score_High', 'dri_score_High - TED AML case missing cytogenetics', 'dri_score_Intermediate', 'dri_score_Intermediate - TED AML case missing cytogenetics', 'dri_score_Low', 'dri_score_Missing disease status', 'dri_score_N/A - disease not classifiable', 'dri_score_N/A - non-malignant indication', 'dri_score_N/A - pediatric', 'dri_score_TBD cytogenetics', 'dri_score_Very high', 'dri_score_nan', 'psych_disturb_No', 'psych_disturb_Not done', 'psych_disturb_Yes', 'psych_disturb_nan', 'cyto_score_Favorable', 'cyto_score_Intermediate', 'cyto_score_Normal', 'cyto_score_Not tested', 'cyto_score_Other', 'cyto_score_Poor', 'cyto_score_TBD', 'cyto_score_nan', 'diabetes_No', 'diabetes_Not done', 'diabetes_Yes', 'diabetes_nan', 'tbi_status_No TBI', 'tbi_status_TBI + Cy +- Other', 'tbi_status_TBI +- Other -cGy fractionated', 'tbi_status_TBI +- Other -cGy single', 'tbi_status_TBI +- Other =cGy', 'tbi_status_TBI +- Other >cGy', 
'arrhythmia_No', 'arrhythmia_Not done', 'arrhythmia_Yes', 'arrhythmia_nan', 'graft_type_Bone marrow', 'graft_type_Peripheral blood', 'vent_hist_No', 'vent_hist_Yes', 'vent_hist_nan', 'renal_issue_No', 'renal_issue_Not done', 'renal_issue_Yes', 'renal_issue_nan', 'pulm_severe_No', 'pulm_severe_Not done', 'pulm_severe_Yes', 'pulm_severe_nan', 'prim_disease_hct_AI', 'prim_disease_hct_ALL', 'prim_disease_hct_AML', 'prim_disease_hct_CML', 'prim_disease_hct_HD', 'prim_disease_hct_HIS', 'prim_disease_hct_IEA', 'prim_disease_hct_IIS', 'prim_disease_hct_IMD', 'prim_disease_hct_IPA', 'prim_disease_hct_MDS', 'prim_disease_hct_MPN', 'prim_disease_hct_NHL', 'prim_disease_hct_Other acute leukemia', 'prim_disease_hct_Other leukemia', 'prim_disease_hct_PCD', 'prim_disease_hct_SAA', 'cmv_status_+/+', 'cmv_status_+/-', 'cmv_status_-/+', 'cmv_status_-/-', 'cmv_status_nan', 'tce_imm_match_G/B', 'tce_imm_match_G/G', 'tce_imm_match_H/B', 'tce_imm_match_H/H', 'tce_imm_match_P/B', 'tce_imm_match_P/G', 'tce_imm_match_P/H', 'tce_imm_match_P/P', 'tce_imm_match_nan', 'rituximab_No', 'rituximab_Yes', 'rituximab_nan', 'prod_type_BM', 'prod_type_PB', 'cyto_score_detail_Favorable', 'cyto_score_detail_Intermediate', 'cyto_score_detail_Not tested', 'cyto_score_detail_Poor', 'cyto_score_detail_TBD', 'cyto_score_detail_nan', 'conditioning_intensity_MAC', 'conditioning_intensity_N/A F(pre-TED) not submitted', 'conditioning_intensity_NMA', 'conditioning_intensity_No drugs reported', 'conditioning_intensity_RIC', 'conditioning_intensity_TBD', 'conditioning_intensity_nan', 'ethnicity_Hispanic or Latino', 'ethnicity_Non-resident of the U.S.', 'ethnicity_Not Hispanic or Latino', 'ethnicity_nan', 'obesity_No', 'obesity_Not done', 'obesity_Yes', 'obesity_nan', 'mrd_hct_Negative', 'mrd_hct_Positive', 'mrd_hct_nan', 'in_vivo_tcd_No', 'in_vivo_tcd_Yes', 'in_vivo_tcd_nan', 'tce_match_Fully matched', 'tce_match_GvH non-permissive', 'tce_match_HvG non-permissive', 'tce_match_Permissive', 'tce_match_nan', 
'hepatic_severe_No', 'hepatic_severe_Not done', 'hepatic_severe_Yes', 'hepatic_severe_nan', 'prior_tumor_No', 'prior_tumor_Not done', 'prior_tumor_Yes', 'prior_tumor_nan', 'peptic_ulcer_No', 'peptic_ulcer_Not done', 'peptic_ulcer_Yes', 'peptic_ulcer_nan', 'gvhd_proph_CDselect +- other', 'gvhd_proph_CDselect alone', 'gvhd_proph_CSA + MMF +- others(not FK)', 'gvhd_proph_CSA + MTX +- others(not MMFFK)', 'gvhd_proph_CSA +- others(not FKMMFMTX)', 'gvhd_proph_CSA alone', 'gvhd_proph_Cyclophosphamide +- others', 'gvhd_proph_Cyclophosphamide alone', 'gvhd_proph_FK+ MMF +- others', 'gvhd_proph_FK+ MTX +- others(not MMF)', 'gvhd_proph_FKalone', 'gvhd_proph_No GvHD Prophylaxis', 'gvhd_proph_Other GVHD Prophylaxis', 'gvhd_proph_TDEPLETION +- other', 'gvhd_proph_nan', 'rheum_issue_No', 'rheum_issue_Not done', 'rheum_issue_Yes', 'rheum_issue_nan', 'sex_match_F-F', 'sex_match_F-M', 'sex_match_M-F', 'sex_match_M-M', 'sex_match_nan', 'race_group_American Indian or Alaska Native', 'race_group_Asian', 'race_group_Black or African-American', 'race_group_More than one race', 'race_group_Native Hawaiian or other Pacific Islander', 'race_group_White', 'hepatic_mild_No', 'hepatic_mild_Not done', 'hepatic_mild_Yes', 'hepatic_mild_nan', 'tce_div_match_Bi-directional non-permissive', 'tce_div_match_GvH non-permissive', 'tce_div_match_HvG non-permissive', 'tce_div_match_Permissive mismatched', 'tce_div_match_nan', 'donor_related_Multiple donor (non-UCB)', 'donor_related_Related', 'donor_related_Unrelated', 'donor_related_nan', 'melphalan_dose_MEL', 'melphalan_dose_N/A Mel not given', 'melphalan_dose_nan', 'cardiac_No', 'cardiac_Not done', 'cardiac_Yes', 'cardiac_nan', 'pulm_moderate_No', 'pulm_moderate_Not done', 'pulm_moderate_Yes', 'pulm_moderate_nan', 'KPS_Bin_Critical', 'KPS_Bin_Healthy', 'KPS_Bin_Partially_Independent', 'KPS_Bin_Severely_Dependent', 'KPS_Bin_nan', 'Comorbidity_Bin_High', 'Comorbidity_Bin_Low', 'Comorbidity_Bin_Medium', 'Combined_Bin_Critical_Low', 
'Combined_Bin_Healthy_High', 'Combined_Bin_Healthy_Low', 'Combined_Bin_Healthy_Medium', 'Combined_Bin_Partially_Independent_High', 'Combined_Bin_Partially_Independent_Low', 'Combined_Bin_Partially_Independent_Medium', 'Combined_Bin_Severely_Dependent_High', 'Combined_Bin_Severely_Dependent_Low', 'Combined_Bin_nan_Low', 'age_bin_(0.0 0.0441', 'age_bin_(0.0441 16.0', 'age_bin_(16.0 30.0', 'age_bin_(30.0 50.0', 'age_bin_(50.0 100.0', 'donor_sex_F', 'donor_sex_M', 'recipient_sex_F', 'recipient_sex_M', 'recipient_sex_None', 'tbi_status+gvhd_proph_No TBI_CDselect +- other', 'tbi_status+gvhd_proph_No TBI_CDselect alone', 'tbi_status+gvhd_proph_No TBI_CSA + MMF +- others(not FK)', 'tbi_status+gvhd_proph_No TBI_CSA + MTX +- others(not MMFFK)', 'tbi_status+gvhd_proph_No TBI_CSA +- others(not FKMMFMTX)', 'tbi_status+gvhd_proph_No TBI_CSA alone', 'tbi_status+gvhd_proph_No TBI_Cyclophosphamide +- others', 'tbi_status+gvhd_proph_No TBI_Cyclophosphamide alone', 'tbi_status+gvhd_proph_No TBI_FK+ MMF +- others', 'tbi_status+gvhd_proph_No TBI_FK+ MTX +- others(not MMF)', 'tbi_status+gvhd_proph_No TBI_FKalone', 'tbi_status+gvhd_proph_No TBI_No GvHD Prophylaxis', 'tbi_status+gvhd_proph_No TBI_Other GVHD Prophylaxis', 'tbi_status+gvhd_proph_No TBI_Parent Q = yes but no agent', 'tbi_status+gvhd_proph_No TBI_TDEPLETION +- other', 'tbi_status+gvhd_proph_No TBI_TDEPLETION alone', 'tbi_status+gvhd_proph_No TBI_nan', 'tbi_status+gvhd_proph_TBI + Cy +- Other_CDselect +- other', 'tbi_status+gvhd_proph_TBI + Cy +- Other_CDselect alone', 'tbi_status+gvhd_proph_TBI + Cy +- Other_CSA + MMF +- others(not FK)', 'tbi_status+gvhd_proph_TBI + Cy +- Other_CSA + MTX +- others(not MMFFK)', 'tbi_status+gvhd_proph_TBI + Cy +- Other_CSA +- others(not FKMMFMTX)', 'tbi_status+gvhd_proph_TBI + Cy +- Other_CSA alone', 'tbi_status+gvhd_proph_TBI + Cy +- Other_Cyclophosphamide +- others', 'tbi_status+gvhd_proph_TBI + Cy +- Other_Cyclophosphamide alone', 'tbi_status+gvhd_proph_TBI + Cy +- Other_FK+ MMF +- others', 
'tbi_status+gvhd_proph_TBI + Cy +- Other_FK+ MTX +- others(not MMF)', 'tbi_status+gvhd_proph_TBI + Cy +- Other_FKalone', 'tbi_status+gvhd_proph_TBI + Cy +- Other_Other GVHD Prophylaxis', 'tbi_status+gvhd_proph_TBI + Cy +- Other_TDEPLETION +- other', 'tbi_status+gvhd_proph_TBI + Cy +- Other_TDEPLETION alone', 'tbi_status+gvhd_proph_TBI +- Other -cGy fractionated_CSA + MMF +- others(not FK)', 'tbi_status+gvhd_proph_TBI +- Other -cGy fractionated_Cyclophosphamide +- others', 'tbi_status+gvhd_proph_TBI +- Other -cGy fractionated_Cyclophosphamide alone', 'tbi_status+gvhd_proph_TBI +- Other -cGy fractionated_FK+ MTX +- others(not MMF)', 'tbi_status+gvhd_proph_TBI +- Other -cGy fractionated_TDEPLETION +- other', 'tbi_status+gvhd_proph_TBI +- Other -cGy single_CDselect alone', 'tbi_status+gvhd_proph_TBI +- Other -cGy single_CSA + MMF +- others(not FK)', 'tbi_status+gvhd_proph_TBI +- Other -cGy single_Cyclophosphamide +- others', 'tbi_status+gvhd_proph_TBI +- Other -cGy single_FK+ MMF +- others', 'tbi_status+gvhd_proph_TBI +- Other -cGy single_FK+ MTX +- others(not MMF)', 'tbi_status+gvhd_proph_TBI +- Other -cGy single_Other GVHD Prophylaxis', 'tbi_status+gvhd_proph_TBI +- Other -cGy single_TDEPLETION alone', 'tbi_status+gvhd_proph_TBI +- Other -cGy unknown dose_Cyclophosphamide +- others', 'tbi_status+gvhd_proph_TBI +- Other -cGy unknown dose_Cyclophosphamide alone', 'tbi_status+gvhd_proph_TBI +- Other -cGy unknown dose_FK+ MMF +- others', 'tbi_status+gvhd_proph_TBI +- Other -cGy unknown dose_Other GVHD Prophylaxis', 'tbi_status+gvhd_proph_TBI +- Other -cGy unknown dose_TDEPLETION alone', 'tbi_status+gvhd_proph_TBI +- Other =cGy_CDselect alone', 'tbi_status+gvhd_proph_TBI +- Other =cGy_CSA + MMF +- others(not FK)', 'tbi_status+gvhd_proph_TBI +- Other =cGy_CSA +- others(not FKMMFMTX)', 'tbi_status+gvhd_proph_TBI +- Other =cGy_CSA alone', 'tbi_status+gvhd_proph_TBI +- Other =cGy_Cyclophosphamide +- others', 'tbi_status+gvhd_proph_TBI +- Other =cGy_Cyclophosphamide alone', 
'tbi_status+gvhd_proph_TBI +- Other =cGy_FK+ MMF +- others', 'tbi_status+gvhd_proph_TBI +- Other =cGy_FK+ MTX +- others(not MMF)', 'tbi_status+gvhd_proph_TBI +- Other =cGy_FKalone', 'tbi_status+gvhd_proph_TBI +- Other =cGy_Parent Q = yes but no agent', 'tbi_status+gvhd_proph_TBI +- Other =cGy_TDEPLETION alone', 'tbi_status+gvhd_proph_TBI +- Other =cGy_nan', 'tbi_status+gvhd_proph_TBI +- Other >cGy_CDselect +- other', 'tbi_status+gvhd_proph_TBI +- Other >cGy_CDselect alone', 'tbi_status+gvhd_proph_TBI +- Other >cGy_CSA + MMF +- others(not FK)', 'tbi_status+gvhd_proph_TBI +- Other >cGy_Cyclophosphamide +- others', 'tbi_status+gvhd_proph_TBI +- Other >cGy_Cyclophosphamide alone', 'tbi_status+gvhd_proph_TBI +- Other >cGy_FK+ MMF +- others', 'tbi_status+gvhd_proph_TBI +- Other >cGy_FK+ MTX +- others(not MMF)', 'tbi_status+gvhd_proph_TBI +- Other >cGy_FKalone', 'tbi_status+gvhd_proph_TBI +- Other >cGy_No GvHD Prophylaxis', 'tbi_status+gvhd_proph_TBI +- Other >cGy_Other GVHD Prophylaxis', 'tbi_status+gvhd_proph_TBI +- Other >cGy_TDEPLETION +- other', 'tbi_status+gvhd_proph_TBI +- Other >cGy_TDEPLETION alone', 'tbi_status+gvhd_proph_TBI +- Other >cGy_nan', 'tbi_status+gvhd_proph_TBI +- Other unknown dose_CSA + MMF +- others(not FK)', 'tbi_status+gvhd_proph_TBI +- Other unknown dose_FK+ MMF +- others',]
CATS=[ 'dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'cardiac', 'pulm_moderate', 'KPS_Bin', 'Comorbidity_Bin', 'Combined_Bin', 'age_bin', 'donor_sex', 'recipient_sex', 'tbi_status+gvhd_proph']

In [None]:
# Configuration for the model wrapper trained on target1.
TF_IDF_COLS = ["conditioning_intensity","dri_score","sex_match"]
ohe_cols = {'cat_c': cat_cols}
# label_cols and txt_cols are built here but not passed to AbdBase below
# (label_encode=None); note txt_cols uses the earlier upper-case CAT_COLS list.
label_cols = {'cat_c': cat_cols}
txt_cols = {'txt_columns': CAT_COLS}
tf_cols = {
    'text_columns': TF_IDF_COLS,
    'max_features': 1000,
          }

# `seed` and `AbdBase` are not defined in this cell; they must already exist in the kernel.
# NOTE(review): ohe_fe / multi_column_tfidf presumably make AbdBase create the one-hot and
# tfidf_* columns that f_fe refers to - confirm against the AbdBase constructor.
base = AbdBase(train_data=train, test_data=test, target_column='target1', gpu=False,
                 problem_type="regression", metric="mae", seed=seed,ohe_fe=ohe_cols,label_encode=None, stat_fe=None, multi_column_tfidf=tf_cols,
                 n_splits=10,early_stop=True,num_classes=0,cat_features=None, 
                 fold_type='RKF')

# Keep only the hand-picked feature list; a KeyError here means f_fe names a column
# the wrapper did not produce.
base.X_train = base.X_train[f_fe]
base.X_test = base.X_test[f_fe]

def year_tf(df):
    """Add cyclic encodings of ``year_hct`` to ``df`` in place and return it.

    The year is mapped to an angle with a period of 100 (``year_hct * 2*pi / 100``);
    its cosine and sine are stored as ``cos_year`` and ``sin_year``.
    """
    angle = df['year_hct'] * (2 * np.pi) / 100
    df['cos_year'], df['sin_year'] = np.cos(angle), np.sin(angle)
    return df

def FE(df):
    """Add ``age_at_hctmin`` (``year_hct`` minus ``age_at_hct``) to ``df`` in place and return it."""
    year, age = df['year_hct'], df['age_at_hct']
    df['age_at_hctmin'] = year - age
    return df
    
# Add the cyclic year features and the year-minus-age feature to both matrices.
# year_tf and FE write their columns onto the frame they receive and return that same
# object, so the reassignment only makes the data flow explicit.
base.X_train = year_tf(base.X_train)
base.X_test = year_tf(base.X_test)
base.X_train = FE(base.X_train)
base.X_test = FE(base.X_test)

In [None]:
%%time

# CatBoost hyper-parameters. NOTE(review): the high-precision values look like the output of
# an automated search — provenance is not visible here, confirm before re-tuning.
CP = {'iterations': 3114, 'learning_rate': 0.020996758976627422, 'depth': 7, 'l2_leaf_reg': 1.213620321659482,
      'border_count': 153, 'bagging_temperature': 0.24943381016139077}

# NOTE(review): `base` was built with target_column='target1' while Train_ML is told
# target="kaplan"; AbdBase is not visible here, so confirm the two refer to the same target.
ctb1 = base.Train_ML(CP,'CAT',e_stop=300,target="kaplan")
# Element 0 of the Train_ML result is what gets scored (presumably the out-of-fold predictions).
c_index_score(ctb1[0],'CAT_MODEL') # 0.6789 — NOTE(review): same figure is written on all four CatBoost cells; verify for this target

In [None]:
# Encoder configuration handed to AbdBase below.
TF_IDF_COLS = ["conditioning_intensity","dri_score","sex_match"]
ohe_cols = {'cat_c': cat_cols}
# NOTE(review): label_cols and txt_cols are built here but not passed to the AbdBase call in
# this cell (label_encode=None, and txt_cols is not referenced) — confirm they are still needed.
label_cols = {'cat_c': cat_cols}
txt_cols = {'txt_columns': CAT_COLS}
tf_cols = {
    'text_columns': TF_IDF_COLS,
    'max_features': 1000,
          }

# Rebuild the training harness with 'target2' as the regression target.
# This rebinds the notebook-global `base`, replacing the harness created by an earlier cell.
base = AbdBase(train_data=train, test_data=test, target_column='target2', gpu=False,
                 problem_type="regression", metric="mae", seed=seed,ohe_fe=ohe_cols,label_encode=None, stat_fe=None, multi_column_tfidf=tf_cols,
                 n_splits=10,early_stop=True,num_classes=0,cat_features=None, 
                 fold_type='RKF')

# Keep only the columns listed in f_fe (defined outside this cell) in both design matrices.
base.X_train = base.X_train[f_fe]
base.X_test = base.X_test[f_fe]

def year_tf(df):
    """Add cyclic encodings of ``year_hct`` to ``df`` in place and return it.

    The year is mapped to an angle with a period of 100 (``year_hct * 2*pi / 100``);
    its cosine and sine are stored as ``cos_year`` and ``sin_year``.
    """
    angle = df['year_hct'] * (2 * np.pi) / 100
    df['cos_year'], df['sin_year'] = np.cos(angle), np.sin(angle)
    return df

def FE(df):
    """Add ``age_at_hctmin`` (``year_hct`` minus ``age_at_hct``) to ``df`` in place and return it."""
    year, age = df['year_hct'], df['age_at_hct']
    df['age_at_hctmin'] = year - age
    return df
    
# Add the cyclic year features and the year-minus-age feature to both matrices.
# year_tf and FE write their columns onto the frame they receive and return that same
# object, so the reassignment only makes the data flow explicit.
base.X_train = year_tf(base.X_train)
base.X_test = year_tf(base.X_test)
base.X_train = FE(base.X_train)
base.X_test = FE(base.X_test)

In [None]:
%%time

# CatBoost hyper-parameters. NOTE(review): the high-precision values look like the output of
# an automated search — provenance is not visible here, confirm before re-tuning.
CP = {'iterations': 3114, 'learning_rate': 0.020996758976627422, 'depth': 7, 'l2_leaf_reg': 1.213620321659482,
      'border_count': 153, 'bagging_temperature': 0.24943381016139077}

# NOTE(review): `base` was built with target_column='target2' while Train_ML is told
# target="nelson"; AbdBase is not visible here, so confirm the two refer to the same target.
ctb2 = base.Train_ML(CP,'CAT',e_stop=300,target="nelson")
# Element 0 of the Train_ML result is what gets scored (presumably the out-of-fold predictions).
c_index_score(ctb2[0],'CAT_MODEL') # 0.6789 — NOTE(review): same figure is written on all four CatBoost cells; verify for this target

In [None]:
# Encoder configuration handed to AbdBase below.
TF_IDF_COLS = ["conditioning_intensity","dri_score","sex_match"]
ohe_cols = {'cat_c': cat_cols}
# NOTE(review): label_cols and txt_cols are built here but not passed to the AbdBase call in
# this cell (label_encode=None, and txt_cols is not referenced) — confirm they are still needed.
label_cols = {'cat_c': cat_cols}
txt_cols = {'txt_columns': CAT_COLS}
tf_cols = {
    'text_columns': TF_IDF_COLS,
    'max_features': 1000,
          }

# Rebuild the training harness with 'target3' as the regression target.
# This rebinds the notebook-global `base`, replacing the harness created by an earlier cell.
base = AbdBase(train_data=train, test_data=test, target_column='target3', gpu=False,
                 problem_type="regression", metric="mae", seed=seed,ohe_fe=ohe_cols,label_encode=None, stat_fe=None, multi_column_tfidf=tf_cols,
                 n_splits=10,early_stop=True,num_classes=0,cat_features=None, 
                 fold_type='RKF')

# Keep only the columns listed in f_fe (defined outside this cell) in both design matrices.
base.X_train = base.X_train[f_fe]
base.X_test = base.X_test[f_fe]

def year_tf(df):
    """Add cyclic encodings of ``year_hct`` to ``df`` in place and return it.

    The year is mapped to an angle with a period of 100 (``year_hct * 2*pi / 100``);
    its cosine and sine are stored as ``cos_year`` and ``sin_year``.
    """
    angle = df['year_hct'] * (2 * np.pi) / 100
    df['cos_year'], df['sin_year'] = np.cos(angle), np.sin(angle)
    return df

def FE(df):
    """Add ``age_at_hctmin`` (``year_hct`` minus ``age_at_hct``) to ``df`` in place and return it."""
    year, age = df['year_hct'], df['age_at_hct']
    df['age_at_hctmin'] = year - age
    return df
    
# Add the cyclic year features and the year-minus-age feature to both matrices.
# year_tf and FE write their columns onto the frame they receive and return that same
# object, so the reassignment only makes the data flow explicit.
base.X_train = year_tf(base.X_train)
base.X_test = year_tf(base.X_test)
base.X_train = FE(base.X_train)
base.X_test = FE(base.X_test)

In [None]:
%%time

# CatBoost hyper-parameters. NOTE(review): the high-precision values look like the output of
# an automated search — provenance is not visible here, confirm before re-tuning.
CP = {'iterations': 3114, 'learning_rate': 0.020996758976627422, 'depth': 7, 'l2_leaf_reg': 1.213620321659482,
      'border_count': 153, 'bagging_temperature': 0.24943381016139077}

# NOTE(review): `base` was built with target_column='target3' while Train_ML is told
# target="quantile"; AbdBase is not visible here, so confirm the two refer to the same target.
ctb3 = base.Train_ML(CP,'CAT',e_stop=300,target="quantile")
# Element 0 of the Train_ML result is what gets scored (presumably the out-of-fold predictions).
c_index_score(ctb3[0],'CAT_MODEL') # 0.6789 — NOTE(review): same figure is written on all four CatBoost cells; verify for this target

In [None]:
# Encoder configuration handed to AbdBase below.
TF_IDF_COLS = ["conditioning_intensity","dri_score","sex_match"]
ohe_cols = {'cat_c': cat_cols}
# NOTE(review): label_cols and txt_cols are built here but not passed to the AbdBase call in
# this cell (label_encode=None, and txt_cols is not referenced) — confirm they are still needed.
label_cols = {'cat_c': cat_cols}
txt_cols = {'txt_columns': CAT_COLS}
tf_cols = {
    'text_columns': TF_IDF_COLS,
    'max_features': 1000,
          }

# Rebuild the training harness with 'target4' as the regression target.
# This rebinds the notebook-global `base`, replacing the harness created by an earlier cell.
base = AbdBase(train_data=train, test_data=test, target_column='target4', gpu=False,
                 problem_type="regression", metric="mae", seed=seed,ohe_fe=ohe_cols,label_encode=None, stat_fe=None, multi_column_tfidf=tf_cols,
                 n_splits=10,early_stop=True,num_classes=0,cat_features=None, 
                 fold_type='RKF')

# Keep only the columns listed in f_fe (defined outside this cell) in both design matrices.
base.X_train = base.X_train[f_fe]
base.X_test = base.X_test[f_fe]

def year_tf(df):
    """Add cyclic encodings of ``year_hct`` to ``df`` in place and return it.

    The year is mapped to an angle with a period of 100 (``year_hct * 2*pi / 100``);
    its cosine and sine are stored as ``cos_year`` and ``sin_year``.
    """
    angle = df['year_hct'] * (2 * np.pi) / 100
    df['cos_year'], df['sin_year'] = np.cos(angle), np.sin(angle)
    return df

def FE(df):
    """Add ``age_at_hctmin`` (``year_hct`` minus ``age_at_hct``) to ``df`` in place and return it."""
    year, age = df['year_hct'], df['age_at_hct']
    df['age_at_hctmin'] = year - age
    return df
    
# Add the cyclic year features and the year-minus-age feature to both matrices.
# year_tf and FE write their columns onto the frame they receive and return that same
# object, so the reassignment only makes the data flow explicit.
base.X_train = year_tf(base.X_train)
base.X_test = year_tf(base.X_test)
base.X_train = FE(base.X_train)
base.X_test = FE(base.X_test)

In [None]:
%%time

# CatBoost hyper-parameters. NOTE(review): the high-precision values look like the output of
# an automated search — provenance is not visible here, confirm before re-tuning.
CP = {'iterations': 3114, 'learning_rate': 0.020996758976627422, 'depth': 7, 'l2_leaf_reg': 1.213620321659482,
      'border_count': 153, 'bagging_temperature': 0.24943381016139077}

# NOTE(review): `base` was built with target_column='target4' while Train_ML is told
# target="ranklog"; AbdBase is not visible here, so confirm the two refer to the same target.
ctb4 = base.Train_ML(CP,'CAT',e_stop=300,target="ranklog")
# Element 0 of the Train_ML result is what gets scored (presumably the out-of-fold predictions).
c_index_score(ctb4[0],'CAT_MODEL') # 0.6789 — NOTE(review): same figure is written on all four CatBoost cells; verify for this target

In [None]:
# Element 0 of each Train_ML result — the same element passed to c_index_score in the
# training cells (presumably the out-of-fold predictions; Train_ML is not visible here).
ctb_kaplan_oof = ctb1[0]
ctb_nelson_oof = ctb2[0]
ctb_quantile_oof = ctb3[0]
ctb_ranklog_oof = ctb4[0]

In [None]:
# Element 1 of each Train_ML result (presumably the test-set predictions — TODO confirm
# against Train_ML's return value, which is not visible here).
ctb_kaplan_preds = ctb1[1]
ctb_nelson_preds = ctb2[1]
ctb_quantile_preds = ctb3[1]
ctb_ranklog_preds = ctb4[1]

# GBDTs

## Prepare and Config

In [None]:
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
import polars as pl
pd.options.display.max_columns = None
warnings.filterwarnings('ignore')
from joblib import dump, load

# lifelines
from lifelines import CoxPHFitter
from lifelines import KaplanMeierFitter
from lifelines import NelsonAalenFitter
from lifelines import BreslowFlemingHarringtonFitter

# for models
import lightgbm as lgb
from scipy.stats import rankdata 
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import PowerTransformer

"""
To evaluate the equitable prediction of transplant survival outcomes,
we use the concordance index (C-index) between a series of event
times and a predicted score across each race group.
 
It represents the global assessment of the model discrimination power:
this is the model’s ability to correctly provide a reliable ranking
of the survival times based on the individual risk scores.
 
The concordance index is a value between 0 and 1 where:
 
0.5 is the expected result from random predictions,
1.0 is perfect concordance (with no censoring, otherwise <1.0),
0.0 is perfect anti-concordance (with no censoring, otherwise >0.0)

"""

import pandas as pd
import pandas.api.types
import numpy as np
from lifelines.utils import concordance_index
from colorama import Fore, Back, Style

class ParticipantVisibleError(Exception):
    """Error raised by ``score`` when a submission column is not numeric."""


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Stratified concordance index: mean of the per-``race_group`` C-index minus its
    standard deviation across groups.

    Parameters
    ----------
    solution : DataFrame with the id column plus ``efs``, ``efs_time`` and ``race_group``.
    submission : DataFrame with the id column plus a numeric ``prediction`` column.
    row_id_column_name : name of the id column present in both frames.

    The two frames are aligned by index (``pd.concat(..., axis=1)``), not by id value.
    Neither input frame is modified; the id column is dropped on local copies only.

    Raises
    ------
    ParticipantVisibleError
        If any submission column is not numeric.
    KeyError
        If ``row_id_column_name`` is missing from either frame.

    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> y_pred = {'prediction': {0: 1.0, 1: 0.0, 2: 1.0}}
    >>> y_pred = pd.DataFrame(y_pred)
    >>> y_pred.insert(0, row_id_column_name, range(len(y_pred)))
    >>> y_true = { 'efs': {0: 1.0, 1: 0.0, 2: 0.0}, 'efs_time': {0: 25.1234,1: 250.1234,2: 2500.1234}, 'race_group': {0: 'race_group_1', 1: 'race_group_1', 2: 'race_group_1'}}
    >>> y_true = pd.DataFrame(y_true)
    >>> y_true.insert(0, row_id_column_name, range(len(y_true)))
    >>> score(y_true.copy(), y_pred.copy(), row_id_column_name)
    0.75
    """

    # Drop the id column without touching the caller's objects. The previous
    # `del frame[col]` silently removed the column from whatever frame was passed in.
    solution = solution.drop(columns=row_id_column_name)
    submission = submission.drop(columns=row_id_column_name)

    event_label = 'efs'
    interval_label = 'efs_time'
    prediction_label = 'prediction'
    for col in submission.columns:
        if not pandas.api.types.is_numeric_dtype(submission[col]):
            raise ParticipantVisibleError(f'Submission column {col} must be a number')
    # Side-by-side join on the shared index; reset_index gives positional labels so the
    # groupby label sets below can be used with .iloc.
    merged_df = pd.concat([solution, submission], axis=1)
    merged_df.reset_index(inplace=True)
    merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)
    metric_list = []
    for race in merged_df_race_dict.keys():
        # Rows belonging to this race group
        indices = sorted(merged_df_race_dict[race])
        merged_df_race = merged_df.iloc[indices]
        # Predictions are negated before being passed to concordance_index, so a higher
        # submitted prediction is scored as an earlier expected event.
        c_index_race = concordance_index(
                        merged_df_race[interval_label],
                        -merged_df_race[prediction_label],
                        merged_df_race[event_label])
        metric_list.append(c_index_race)
    # Penalise dispersion between groups: mean minus (population) standard deviation.
    return float(np.mean(metric_list)-np.sqrt(np.var(metric_list)))


In [None]:
class Config:
    """Static configuration for the GBDT section: data locations, CV settings and
    per-model hyper-parameter dictionaries.

    The class is used as a namespace (attributes are read as ``Config.<name>``); it relies
    on the notebook-global ``kaggle`` flag being defined before this cell runs.
    """

    # Data locations. Both branches yield pathlib.Path objects so downstream code sees one type.
    if not kaggle:
        train_path = Path('../data/train.csv')
        test_path = Path('../data/test.csv')
        subm_path = Path('../data/sample_submission.csv')
    else:
        train_path = Path('/kaggle/input/equity-post-HCT-survival-predictions/train.csv')
        test_path = Path('/kaggle/input/equity-post-HCT-survival-predictions/test.csv')
        subm_path = Path('/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv')

    # NOTE(review): `early_stopping_round` and `early_stop` carry the same value; both are
    # kept because either name may be referenced by other cells.
    early_stopping_round = 300

    batch_size = 32768   # forwarded to FeatureEngineer, which passes it to pl.read_csv
    early_stop = 300
    penalizer = 0.01     # forwarded to Targets, which passes it to CoxPHFitter
    n_splits = 10        # number of CV folds
    seed = 42

    # NOTE(review): presumably ensemble blend weights; the consumer is not visible here.
    weights = [1.0, 1.0, 8.0, 4.0, 8.0, 4.0, 6.0, 6.0]
    # weights = [0.0, 0.0, 8.0, 4.0, 8.0, 4.0, 6.0, 6.0]

    # CatBoost regressor (RMSE loss)
    ctb_params = {
        'loss_function': 'RMSE',
        'learning_rate': 0.03,
        'random_state': 42,
        'task_type': 'CPU',
        'num_trees': 6000,
        'subsample': 0.85,
        'reg_lambda': 8.0,
        'depth': 8,
        # 'thread_count': 12,
    }

    # LightGBM regressor (L2 objective)
    lgb_params = {
        'objective': 'regression',
        'min_child_samples': 32,
        'num_iterations': 6000,
        'learning_rate': 0.03,
        'extra_trees': True,
        'reg_lambda': 8.0,
        'reg_alpha': 0.1,
        'num_leaves': 64,
        'metric': 'rmse',
        'max_depth': 8,
        'device': 'cpu',
        'max_bin': 128,
        'verbose': -1,
        'seed': 42
    }

    # XGBoost regressor.
    # NOTE(review): this is the only parameter set that requests a GPU ('device': 'cuda');
    # every other model here is configured for CPU — confirm that is intended.
    xgb_params = {
        'device': 'cuda',
        'max_depth': 3,
        'colsample_bytree': 0.5,
        'subsample': 0.8,
        'n_estimators': 5000,
        'learning_rate': 0.03,
        'enable_categorical': True,
        'min_child_weight': 80,
         "early_stopping_rounds": 300,
         "tree_method": "hist",
         "objective": 'reg:squaredlogerror',
         "eval_metric": 'rmse', 
    }

    # Parameters for the first CatBoost model with Cox loss function
    cox1_params = {
        'grow_policy': 'Depthwise',
        'min_child_samples': 8,
        'loss_function': 'Cox',
        'learning_rate': 0.03,
        'random_state': 42,
        'task_type': 'CPU',
        'num_trees': 6000,
        'subsample': 0.85,
        'reg_lambda': 8.0,
        'depth': 8
    }

    # Parameters for the second CatBoost model with Cox loss function
    cox2_params = {
        'grow_policy': 'Lossguide',
        'loss_function': 'Cox',
        'learning_rate': 0.03,
        'random_state': 42,
        'task_type': 'CPU',
        'num_trees': 6000,
        'subsample': 0.85,
        'reg_lambda': 8.0,
        'num_leaves': 32,
        'depth': 8
    }

    # XGBoost with the Cox survival objective
    xgb_cox_params = {
        'max_depth': 3,  
        'colsample_bytree': 0.5,  
        'subsample': 0.8,  
        'n_estimators': 2000,  
        'learning_rate': 0.02,  
        'enable_categorical': True,
        'min_child_weight': 80,
        'objective': 'survival:cox',
        'eval_metric': 'cox-nloglik',
    }
    
    # LightGBM with the Tweedie objective (otherwise identical to lgb_params)
    lgb_tw_params = {
        'objective': 'tweedie',
        'min_child_samples': 32,
        'num_iterations': 6000,
        'learning_rate': 0.03,
        'extra_trees': True,
        'reg_lambda': 8.0,
        'reg_alpha': 0.1,
        'num_leaves': 64,
        'metric': 'rmse',
        'max_depth': 8,
        'device': 'cpu',
        'max_bin': 128,
        'verbose': -1,
        'seed': 42
    }

## Feature Engineering

In [None]:
class FeatureEngineer:
    """Loads a competition CSV with polars, normalises dtypes, rebuilds the HLA
    aggregate columns and adds a couple of age features.

    ``apply_fe`` is the entry point; it returns a pandas DataFrame together with the
    list of string-typed (categorical) column names.
    """

    def __init__(self, batch_size):
        """Store the ``batch_size`` that ``load_data`` forwards to ``pl.read_csv``."""
        self._batch_size = batch_size

    def load_data(self, path):
        """Read the CSV at ``path`` into a polars DataFrame."""
        return pl.read_csv(path, batch_size=self._batch_size)

    def cast_datatypes(self, df):
        """Cast known numeric columns to Float32 and every other column to String.

        Nulls in numeric columns are kept as nulls; nulls in the remaining columns are
        replaced by the literal ``'Unknown'``. ``ID`` is finally cast to Int32.
        """
        num_cols = [
            'hla_high_res_8', 'hla_low_res_8', 'hla_high_res_6', 'hla_low_res_6',
            'hla_high_res_10', 'hla_low_res_10', 'hla_match_dqb1_high', 'hla_match_dqb1_low',
            'hla_match_drb1_high', 'hla_match_drb1_low', 'hla_nmdp_6', 'year_hct',
            'hla_match_a_high', 'hla_match_a_low', 'hla_match_b_high', 'hla_match_b_low',
            'hla_match_c_high', 'hla_match_c_low', 'donor_age', 'age_at_hct',
            'comorbidity_score', 'karnofsky_score', 'efs', 'efs_time',
        ]

        for col in df.columns:
            if col in num_cols:
                df = df.with_columns(pl.col(col).cast(pl.Float32))
            else:
                df = df.with_columns(pl.col(col).fill_null('Unknown').cast(pl.String))

        return df.with_columns(pl.col('ID').cast(pl.Int32))

    def recalculate_hla_sums(self, df):
        """Overwrite the HLA aggregate columns with sums of their per-locus components.

        Missing components count as 0, so the aggregates are never null after this step.
        """

        def locus(name):
            # One per-locus match column with nulls treated as zero.
            return pl.col(name).fill_null(0)

        df = df.with_columns(
            (locus("hla_match_a_low") + locus("hla_match_b_low") +
             locus("hla_match_drb1_high")).alias("hla_nmdp_6"),

            (locus("hla_match_a_low") + locus("hla_match_b_low") +
             locus("hla_match_drb1_low")).alias("hla_low_res_6"),

            (locus("hla_match_a_high") + locus("hla_match_b_high") +
             locus("hla_match_drb1_high")).alias("hla_high_res_6"),

            (locus("hla_match_a_low") + locus("hla_match_b_low") +
             locus("hla_match_c_low") + locus("hla_match_drb1_low")
            ).alias("hla_low_res_8"),

            (locus("hla_match_a_high") + locus("hla_match_b_high") +
             locus("hla_match_c_high") + locus("hla_match_drb1_high")
            ).alias("hla_high_res_8"),

            (locus("hla_match_a_low") + locus("hla_match_b_low") +
             locus("hla_match_c_low") + locus("hla_match_drb1_low") +
             locus("hla_match_dqb1_low")).alias("hla_low_res_10"),

            (locus("hla_match_a_high") + locus("hla_match_b_high") +
             locus("hla_match_c_high") + locus("hla_match_drb1_high") +
             locus("hla_match_dqb1_high")).alias("hla_high_res_10"),
        )

        return df

    def numeric_fe(self, df):
        """Add ``age_diff`` (donor minus recipient age) and ``age_ratio`` (donor / recipient)
        to the pandas frame ``df`` in place and return it.
        """
        df['age_diff'] = df['donor_age'] - df['age_at_hct']
        df['age_ratio'] = df['donor_age'] / df['age_at_hct']

        return df

    def cat_fe(self, df):
        """Placeholder for categorical feature engineering; currently returns ``df`` unchanged."""
        return df

    def select_features(self, df):
        """Return ``df`` restricted to the whitelisted columns that it actually contains.

        The resulting column order is not deterministic because it comes from a set
        intersection. Not called by ``apply_fe`` at present.
        """
        base_features = ['ID', 'dri_score', 'psych_disturb', 'cyto_score', 'diabetes',
            'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia',
            'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue',
            'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status',
            'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6',
            'hla_match_c_low', 'rituximab', 'hla_match_drb1_low',
            'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail',
            'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct',
            'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe',
            'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer',
            'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue',
            'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score',
            'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related',
            'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high',
            'pulm_moderate', 'hla_low_res_10', 'efs', 'efs_time']

        selected_features = ['older_donor', 'total_null_count', 'null_count_diff', 'age_ratio'] 
        features = list(set(df.columns) & set(base_features + selected_features))
        return df[features]

    def info(self, df):
        """Print the shape and memory footprint of ``df`` and display its first rows."""
        print(f'\nShape of dataframe: {df.shape}') 
        
        mem = df.memory_usage().sum() / 1024**2
        print('Memory usage: {:.2f} MB\n'.format(mem))

        display(df.head())

    @staticmethod
    def _string_columns(df):
        """Return the names of the string/object-typed columns of a pandas frame, in column order."""
        # The dtype (not the Series) is passed so the check depends on the declared dtype only.
        return [col for col in df.columns if pd.api.types.is_string_dtype(df[col].dtype)]

    def apply_fe(self, path):
        """Run the full pipeline on the CSV at ``path``.

        Returns
        -------
        (df, cat_cols) : the engineered pandas DataFrame and the list of its
        string-typed column names.
        """
        df = self.load_data(path)
        df = self.cast_datatypes(df)
        df = self.recalculate_hla_sums(df)
        df = df.to_pandas()
        df = self.cat_fe(df)
        df = self.numeric_fe(df)

        # df = self.select_features(df)

        self.info(df)

        # `df` is a pandas frame at this point, so categorical columns are detected with a
        # pandas dtype check rather than by comparing a pandas dtype against `pl.String`.
        cat_cols = self._string_columns(df)
        print(cat_cols)

        return df, cat_cols

In [None]:
# Run the same feature pipeline on the train and test CSVs.
# NOTE: cat_cols is assigned twice; the list derived from the test file is the one that
# survives. The two are expected to match — TODO confirm.
feature_engineer = FeatureEngineer(Config.batch_size)
train_data, cat_cols = feature_engineer.apply_fe(Config.train_path)
test_data, cat_cols = feature_engineer.apply_fe(Config.test_path)

## Model

### define targets

In [None]:
from sklearn.preprocessing import OneHotEncoder, quantile_transform, FunctionTransformer, PolynomialFeatures, StandardScaler
class Targets:
    """Builds the regression targets ``target1`` … ``target8`` from ``efs`` / ``efs_time``.

    Every ``create_target*`` method writes its column onto ``self.data`` in place and
    returns that same frame. All fold-based targets use the splitter from
    ``_prepare_cv`` (KFold, shuffled, random_state=42), so the folds are identical
    across targets.
    """

    def __init__(self, data, cat_cols, penalizer, n_splits):
        """
        data      : pandas DataFrame; the methods read 'ID', 'efs', 'efs_time' and 'race_group'.
        cat_cols  : columns one-hot encoded for the Cox model in ``create_target1``.
        penalizer : passed to ``CoxPHFitter``.
        n_splits  : number of KFold splits.
        """
        
        self.data = data
        self.cat_cols = cat_cols
        
        self._length = len(self.data)
        self._penalizer = penalizer
        self._n_splits = n_splits

    def _prepare_cv(self):
        """Return a fresh ``(KFold splitter, zero-filled out-of-fold array)`` pair."""
        
        oof_preds = np.zeros(self._length)
            
        cv = KFold(n_splits=self._n_splits, shuffle=True, random_state=42)
        # cv = StratifiedKFold(n_splits=self._n_splits, shuffle=True, random_state=42)

        return cv, oof_preds

    def validate_model(self, preds, title):
        """Score ``preds`` (one value per row of ``self.data``) with ``score``, print the
        result labelled with ``title`` and return it."""
            
        y_true = self.data[['ID', 'efs', 'efs_time', 'race_group']].copy()
        y_pred = self.data[['ID']].copy()
        
        y_pred['prediction'] = preds
            
        c_index_score = score(y_true.copy(), y_pred.copy(), 'ID')
        print(f'Overall Stratified C-Index Score for {title}: {c_index_score:.4f}')
        return c_index_score

    def transform_rank_log(self, time, event):
        """Rank-based target: censored rows are shifted past the last event, everything is
        ranked, censored ranks are pushed further out, and the negated log of the
        normalised rank is returned.

        ``time`` and ``event`` are pandas Series of equal length. Requires at least one
        event row and one censored row (``max``/``min`` are taken over each subset).
    
        From https://www.kaggle.com/code/cdeotte/nn-mlp-baseline-cv-670-lb-676"""
        transformed = time.values.copy()
        mx = transformed[event == 1].max() # last patient who dies
        mn = transformed[event == 0].min() # first patient who survives
        # Shift censored times so that every one of them is >= the latest event time.
        transformed[event == 0] = time[event == 0] + mx - mn
        transformed = rankdata(transformed)
        # Push censored ranks beyond every possible event rank.
        transformed[event == 0] += len(transformed) * 2
        transformed = transformed / transformed.max()
        transformed = np.log(transformed)
        return - transformed

    def transform_quantile(self, time, event):
        """Quantile-based target: event rows get ``quantile_transform`` of their negated
        time; every censored row gets one constant, 0.3 below the smallest event value.
    
        From https://www.kaggle.com/code/ambrosm/esp-eda-which-makes-sense"""
        transformed = np.full(len(time), np.nan)
        transformed_dead = quantile_transform(- time[event == 1].values.reshape(-1, 1)).ravel()
        transformed[event == 1] = transformed_dead
        transformed[event == 0] = transformed_dead.min() - 0.3 
        return transformed
        
    def create_target1(self):  

        """Out-of-fold Cox proportional-hazards partial hazard, stored as ``target1``.

        Categorical columns are one-hot encoded and remaining NaNs are replaced by -1.
        Inside the CV loop, columns that are constant within the training fold are dropped.
        Otherwise, the code produces error:

        delta contains nan value(s). Convergence halted. Please see the following tips in the lifelines documentation: 
        https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model
        """

        cv, oof_preds = self._prepare_cv()

        # Apply one hot encoding to categorical columns
        data = pd.get_dummies(self.data, columns=self.cat_cols, drop_first=True).drop('ID', axis=1) 
        data = data.fillna(-1)
        # data = data.drop(additional_features, axis=1)

        for train_index, valid_index in cv.split(self.data):
        # for fold, (train_index, valid_index) in enumerate(cv.split(self.data, self.data['race_group'].astype(str) + (self.data['age_at_hct']==0.044).astype(str))):

            train_data = data.iloc[train_index]
            valid_data = data.iloc[valid_index]

            # Drop constant columns if they exist
            train_data = train_data.loc[:, train_data.nunique() > 1]
            valid_data = valid_data[train_data.columns]

            cph = CoxPHFitter(penalizer=self._penalizer)
            cph.fit(train_data, duration_col='efs_time', event_col='efs')
            
            oof_preds[valid_index] = cph.predict_partial_hazard(valid_data)              

        self.data['target1'] = oof_preds 
        self.validate_model(oof_preds, 'Cox') 

        return self.data

    def create_target2(self):        
        """Kaplan-Meier survival probability, stored as ``target2``.

        The estimator is fitted on each training fold and evaluated at every validation
        row's own ``efs_time``."""

        cv, oof_preds = self._prepare_cv()

        for train_index, valid_index in cv.split(self.data):
        # for fold, (train_index, valid_index) in enumerate(cv.split(self.data, self.data['race_group'].astype(str) + (self.data['age_at_hct']==0.044).astype(str))):

            train_data = self.data.iloc[train_index]
            valid_data = self.data.iloc[valid_index]

            kmf = KaplanMeierFitter()
            kmf.fit(durations=train_data['efs_time'], event_observed=train_data['efs'])
            
            oof_preds[valid_index] = kmf.survival_function_at_times(valid_data['efs_time']).values

        self.data['target2'] = oof_preds  
        self.validate_model(oof_preds, 'Kaplan-Meier')

        return self.data

    def create_target3(self):        
        """Negated Nelson-Aalen cumulative hazard, stored as ``target3``.

        Fitted on each training fold and evaluated at every validation row's own ``efs_time``."""

        cv, oof_preds = self._prepare_cv()

        for train_index, valid_index in cv.split(self.data):
        # for fold, (train_index, valid_index) in enumerate(cv.split(self.data, self.data['race_group'].astype(str) + (self.data['age_at_hct']==0.044).astype(str))):

            train_data = self.data.iloc[train_index]
            valid_data = self.data.iloc[valid_index]
            
            naf = NelsonAalenFitter()
            naf.fit(durations=train_data['efs_time'], event_observed=train_data['efs'])
            
            # Sign flipped so that larger values correspond to lower accumulated hazard.
            oof_preds[valid_index] = -naf.cumulative_hazard_at_times(valid_data['efs_time']).values

        self.data['target3'] = oof_preds  
        self.validate_model(oof_preds, 'Nelson-Aalen')

        return self.data

    def create_target4(self):
        """Signed time, stored as ``target4``: ``efs_time`` for rows with an event,
        ``-efs_time`` for rows with ``efs == 0``. No cross-validation and no scoring."""

        self.data['target4'] = self.data.efs_time.copy()
        self.data.loc[self.data.efs == 0, 'target4'] *= -1

        return self.data

    # kmf target additional transform
    def create_target5(self):
        """Kaplan-Meier on a transformed time axis followed by a Yeo-Johnson power
        transform, stored as ``target5``.

        The time transform is ``log(efs_time) / log(efs + 3) ** 3`` clipped below at 1e-6.
        Both the KM estimator and the power transform are fitted on the training fold
        and then applied to the validation fold."""
        
        cv, oof_preds = self._prepare_cv()

        for train_index, valid_index in cv.split(self.data):
        # for fold, (train_index, valid_index) in enumerate(cv.split(self.data, self.data['race_group'].astype(str) + (self.data['age_at_hct']==0.044).astype(str))):
        # for train_index, valid_index in cv.split(self.data, self.data['race_group'].astype(str) + '_' + self.data['efs'].astype(str)):
            train_data = self.data.iloc[train_index].copy()
            valid_data = self.data.iloc[valid_index].copy()

            train_data['efs_time_trans'] = np.log(train_data['efs_time'])/(np.log(train_data['efs']+3))**3
            train_data['efs_time_trans'] = train_data['efs_time_trans'].clip(lower=1e-6)
            valid_data['efs_time_trans'] = np.log(valid_data['efs_time'])/(np.log(valid_data['efs']+3))**3
            valid_data['efs_time_trans'] = valid_data['efs_time_trans'].clip(lower=1e-6)

            kmf = KaplanMeierFitter()
            kmf.fit(
                durations=train_data['efs_time_trans'],
                event_observed=train_data['efs']
            )

            train_data['kmf'] = kmf.survival_function_at_times(train_data['efs_time_trans']).values
            valid_data['kmf'] = kmf.survival_function_at_times(valid_data['efs_time_trans']).values

            pt = PowerTransformer(method='yeo-johnson', standardize=False)
            train_data['kmf_trans'] = pt.fit_transform(train_data[['kmf']])
            valid_data['kmf_trans'] = pt.transform(valid_data[['kmf']])

            oof_preds[valid_index] = valid_data['kmf_trans'].values.squeeze()

        self.data['target5'] = oof_preds
        self.validate_model(oof_preds, 'Kaplan-Meier transformed')

        return self.data

    def create_target6(self):
        """Breslow-Fleming-Harrington survival probability, stored as ``target6``.

        Fitted on each training fold and evaluated at every validation row's own ``efs_time``."""
        cv, oof_preds = self._prepare_cv()

        for train_index, valid_index in cv.split(self.data):

            train_data = self.data.iloc[train_index]
            valid_data = self.data.iloc[valid_index]
            
            bfh = BreslowFlemingHarringtonFitter()
            bfh.fit(durations=train_data['efs_time'], event_observed=train_data['efs'])
            
            oof_preds[valid_index] = bfh.survival_function_at_times(valid_data['efs_time']).values

        self.data['target6'] = oof_preds  
        self.validate_model(oof_preds, 'BreslowFleming-Harrington')

        return self.data

    def create_target7(self):
        """``transform_quantile`` applied fold by fold, stored as ``target7``.

        NOTE(review): the transform is computed from each validation fold alone;
        ``train_data`` is assigned but never used. Values are therefore relative to the
        fold they fall in — confirm this is intended."""
        cv, oof_preds = self._prepare_cv()
        for train_index, valid_index in cv.split(self.data):

            train_data = self.data.iloc[train_index]
            valid_data = self.data.iloc[valid_index]
            y_tr = self.transform_quantile(time=valid_data.efs_time, event=valid_data.efs)
            oof_preds[valid_index] = y_tr
        
        self.data['target7'] = oof_preds
        self.validate_model(oof_preds, 'transform_quantile')
        return self.data

    def create_target8(self):
        """``transform_rank_log`` applied fold by fold, stored as ``target8``.

        NOTE(review): the transform is computed from each validation fold alone;
        ``train_data`` is assigned but never used. Ranks are therefore relative to the
        fold they fall in — confirm this is intended."""
        cv, oof_preds = self._prepare_cv()
        for train_index, valid_index in cv.split(self.data):

            train_data = self.data.iloc[train_index]
            valid_data = self.data.iloc[valid_index]
            y_tr = self.transform_rank_log(time=valid_data.efs_time, event=valid_data.efs)
            oof_preds[valid_index] = y_tr
        
        self.data['target8'] = oof_preds
        self.validate_model(oof_preds, 'transform_rank_log')
        return self.data


### define model

In [None]:
from catboost import  Pool
class Model:
    
    def __init__(self, data, cat_cols, early_stop, penalizer, n_splits):
        
        self.targets = Targets(data, cat_cols, penalizer, n_splits)
        
        self.data = data
        # self.cat_cols = ['peptic_ulcer', 'ethnicity', 'prim_disease_hct', 'tbi_status', 'tce_div_match', 'donor_related', 'melphalan_dose', 'diabetes', 'pulm_severe', 'cardiac', 'obesity', 'rheum_issue', 'gvhd_proph', 'graft_type', 'cmv_status', 'renal_issue', 'sex_match', 'race_group', 'tce_imm_match', 'cyto_score_detail', 'arrhythmia', 'mrd_hct', 'pulm_moderate', 'hepatic_severe', 'dri_score', 'in_vivo_tcd', 'conditioning_intensity', 'rituximab', 'prior_tumor', 'hepatic_mild', 'psych_disturb', 'tce_match', 'prod_type', 'cyto_score', 'vent_hist']
        self.cat_cols = cat_cols
        self._early_stop = early_stop

    def create_targets(self):
        print('creating target 1')
        self.data = self.targets.create_target1()
        print('creating target 2')
        self.data = self.targets.create_target2()
        print('creating target 3')
        self.data = self.targets.create_target3()
        print('creating target 4')
        self.data = self.targets.create_target4()
        print('creating target 5')
        self.data = self.targets.create_target5()
        print('creating target 6')
        self.data = self.targets.create_target6()
        print('creating target 7')
        self.data = self.targets.create_target7()
        print('creating target 8')
        self.data = self.targets.create_target8()

        return self.data
        
    def train_model(self, params, target, title):
        
        for col in self.cat_cols:
            self.data[col] = self.data[col].astype('category')

        target_cols = [item for item in self.data.columns if item.startswith('target')]
        # X = self.data.drop(['ID', 'efs', 'efs_time', 'target1', 'target2', 'target3', 'target4', 'target5'], axis=1)
        X = self.data.drop(['ID', 'efs', 'efs_time']+target_cols, axis=1)
        y = self.data[target]
        
        models, fold_scores = [], []
            
        cv, oof_preds = self.targets._prepare_cv()
    
        for fold, (train_index, valid_index) in enumerate(cv.split(X, y)):
        # for fold, (train_index, valid_index) in enumerate(cv.split(self.data, self.data['race_group'].astype(str) + (self.data['age_at_hct']==0.044).astype(str))):
                
            X_train = X.iloc[train_index]
            X_valid = X.iloc[valid_index]
                
            y_train = y.iloc[train_index]
            y_valid = y.iloc[valid_index]
    
            if title.startswith('LightGBM'):
                        
                model = lgb.LGBMRegressor(**params)
                model.fit(
                    X_train, 
                    y_train,  
                    eval_set=[(X_valid, y_valid)],
                    eval_metric='rmse',
                    callbacks=[lgb.early_stopping(self._early_stop, verbose=0), lgb.log_evaluation(0)]
                )
                data = X_valid        
                
                
            elif title.startswith('XGBoost'):
                model = xgb.XGBRegressor(**params)
                model.fit(
                    X_train, 
                    y_train, 
                    eval_set=[(X_valid, y_valid)], 
                    verbose=False
                )
                
                data = X_valid        
                
            elif title.startswith('CatBoost'):
                # feature selection
                # X_train = X_train.drop(columns=['age_diff'])
                # X_valid = X_valid.drop(columns=['age_diff'])
                model = CatBoostRegressor(**params, verbose=0, cat_features=self.cat_cols)

                # if target in ['target6','target7','target8']:
                #     model.fit(
                #         X_train,
                #         y_train,
                #         eval_set=(X_valid, y_valid),
                #         early_stopping_rounds=self._early_stop, 
                #         verbose=0
                #     )               
                #     model.save_model(f'./checkpoints/baseline/catboost_{fold}_{target}_{title}_skf.cbm')

                # else:
                if kaggle:
                    prfx = '/kaggle/input/cib-catboost-models' if target not in ['target6','target7','target8'] else '/kaggle/input/cibmtr-catboost-6-8'
                    model.load_model(f'{prfx}/catboost_{fold}_{target}_{title}.cbm')
                else:
                    model.load_model(f'./checkpoints/baseline/catboost_{fold}_{target}_{title}.cbm')
                
                data = Pool(data=X_valid, cat_features=self.cat_cols)                 
            models.append(model)
            
            oof_preds[valid_index] = model.predict(data)

            y_true_fold = self.data.iloc[valid_index][['ID', 'efs', 'efs_time', 'race_group']].copy()
            y_pred_fold = self.data.iloc[valid_index][['ID']].copy()
            
            y_pred_fold['prediction'] = oof_preds[valid_index]
    
            fold_score = score(y_true_fold, y_pred_fold, 'ID')
            fold_scores.append(fold_score)
    
        self.targets.validate_model(oof_preds, title)
        
        return models, oof_preds

    def infer_model(self, data, models, title):
        """Average the predictions of the fold models of one model family.

        Args:
            data: Feature frame to predict on. It must contain an ``ID`` column
                (dropped before prediction) and every column listed in
                ``self.cat_cols``. The caller's frame is not modified.
            models: Fitted fold models; each must expose ``predict``.
            title: Model family name; must start with ``'LightGBM'``,
                ``'XGBoost'`` or ``'CatBoost'``.

        Returns:
            The element-wise mean of ``model.predict(...)`` over ``models``.

        Raises:
            ValueError: If ``title`` does not name a supported family. The
                previous implementation fell through and returned ``None``,
                which only failed later when the predictions were stacked.
        """
        if not title.startswith(('LightGBM', 'XGBoost', 'CatBoost')):
            raise ValueError(
                f"Unsupported model title {title!r}; expected a name starting with "
                "'LightGBM', 'XGBoost' or 'CatBoost'."
            )

        # `drop` returns a new frame, so the dtype casts below never touch the caller's data.
        features = data.drop(['ID'], axis=1)
        # if title.startswith('CatBoost'):
            # features = features.drop(['age_diff'], axis=1)
        for col in self.cat_cols:
            features[col] = features[col].astype('category')

        if title.startswith('CatBoost'):
            # CatBoost models are fed a Pool that declares the categorical columns.
            features = Pool(data=features, cat_features=self.cat_cols)

        return np.mean([model.predict(features) for model in models], axis=0)

### create targets

In [None]:
# cat_cols = ['peptic_ulcer', 'ethnicity', 'prim_disease_hct', 'tbi_status', 'tce_div_match', 'donor_related', 'melphalan_dose', 'diabetes', 'pulm_severe', 'cardiac', 'obesity', 'rheum_issue', 'gvhd_proph', 'graft_type', 'cmv_status', 'renal_issue', 'sex_match', 'race_group', 'tce_imm_match', 'cyto_score_detail', 'arrhythmia', 'mrd_hct', 'pulm_moderate', 'hepatic_severe', 'dri_score', 'in_vivo_tcd', 'conditioning_intensity', 'rituximab', 'prior_tumor', 'hepatic_mild', 'psych_disturb', 'tce_match', 'prod_type', 'cyto_score', 'vent_hist']
# Build the shared training wrapper from the notebook-level configuration, then replace
# `train_data` with the frame returned by `create_targets()`.
model = Model(
    data=train_data,
    cat_cols=cat_cols,
    early_stop=Config.early_stopping_round,
    penalizer=Config.penalizer,
    n_splits=Config.n_splits,
)
train_data = model.create_targets()

### train and preds

In [None]:
def _train_and_infer(params, target, title):
    """Train the fold models for one (params, target, family) and predict the test set.

    Args:
        params: Hyper-parameter dict handed to ``model.train_model``.
        target: Name of the target column to fit (``'target1'`` ... ``'target8'``).
        title: Model family (``'CatBoost'``, ``'LightGBM'`` or ``'XGBoost'``).

    Returns:
        ``(fold_models, oof_preds, test_preds)``.
    """
    fold_models, oof = model.train_model(params, target=target, title=title)
    return fold_models, oof, model.infer_model(test_data, fold_models, title=title)


# NOTE(review): in the visible part of `train_model`, CatBoost models are loaded from
# checkpoints whose path depends only on fold, target and title. `cox1` and `cox2` share
# target and title, so they appear to load the same files — confirm this is intended.

# Cox target
ctb1_models, ctb1_oof_preds, ctb1_preds = _train_and_infer(Config.ctb_params, 'target1', 'CatBoost')
lgb1_models, lgb1_oof_preds, lgb1_preds = _train_and_infer(Config.lgb_params, 'target1', 'LightGBM')
xgb1_models, xgb1_oof_preds, xgb1_preds = _train_and_infer(Config.xgb_params, 'target1', 'XGBoost')
# Kaplan Meier target
ctb2_models, ctb2_oof_preds, ctb2_preds = _train_and_infer(Config.ctb_params, 'target2', 'CatBoost')
lgb2_models, lgb2_oof_preds, lgb2_preds = _train_and_infer(Config.lgb_params, 'target2', 'LightGBM')
lgb22_models, lgb22_oof_preds, lgb22_preds = _train_and_infer(Config.lgb_tw_params, 'target2', 'LightGBM')
xgb2_models, xgb2_oof_preds, xgb2_preds = _train_and_infer(Config.xgb_params, 'target2', 'XGBoost')
# Nelson Aalen target
ctb3_models, ctb3_oof_preds, ctb3_preds = _train_and_infer(Config.ctb_params, 'target3', 'CatBoost')
lgb3_models, lgb3_oof_preds, lgb3_preds = _train_and_infer(Config.lgb_params, 'target3', 'LightGBM')
xgb3_models, xgb3_oof_preds, xgb3_preds = _train_and_infer(Config.xgb_params, 'target3', 'XGBoost')
# Cox-loss target
cox1_models, cox1_oof_preds, cox1_preds = _train_and_infer(Config.cox1_params, 'target4', 'CatBoost')
cox2_models, cox2_oof_preds, cox2_preds = _train_and_infer(Config.cox2_params, 'target4', 'CatBoost')
cox3_models, cox3_oof_preds, cox3_preds = _train_and_infer(Config.xgb_cox_params, 'target4', 'XGBoost')
# Kaplan Meier transformed target
ctb5_models, ctb5_oof_preds, ctb5_preds = _train_and_infer(Config.ctb_params, 'target5', 'CatBoost')
lgb5_models, lgb5_oof_preds, lgb5_preds = _train_and_infer(Config.lgb_params, 'target5', 'LightGBM')
xgb5_models, xgb5_oof_preds, xgb5_preds = _train_and_infer(Config.xgb_params, 'target5', 'XGBoost')
# BreslowFleming-Harrington
ctb6_models, ctb6_oof_preds, ctb6_preds = _train_and_infer(Config.ctb_params, 'target6', 'CatBoost')
lgb6_models, lgb6_oof_preds, lgb6_preds = _train_and_infer(Config.lgb_params, 'target6', 'LightGBM')
xgb6_models, xgb6_oof_preds, xgb6_preds = _train_and_infer(Config.xgb_params, 'target6', 'XGBoost')
# quantile
ctb7_models, ctb7_oof_preds, ctb7_preds = _train_and_infer(Config.ctb_params, 'target7', 'CatBoost')
lgb7_models, lgb7_oof_preds, lgb7_preds = _train_and_infer(Config.lgb_params, 'target7', 'LightGBM')
xgb7_models, xgb7_oof_preds, xgb7_preds = _train_and_infer(Config.xgb_params, 'target7', 'XGBoost')
# rank log
ctb8_models, ctb8_oof_preds, ctb8_preds = _train_and_infer(Config.ctb_params, 'target8', 'CatBoost')
lgb8_models, lgb8_oof_preds, lgb8_preds = _train_and_infer(Config.lgb_params, 'target8', 'LightGBM')
xgb8_models, xgb8_oof_preds, xgb8_preds = _train_and_infer(Config.xgb_params, 'target8', 'XGBoost')


## Classifier for post-processing

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import xgboost as xgb
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

# Fold settings shared by both stacking levels (plain shuffled K-fold).
n_splits, seed = Config.n_splits, Config.seed
# cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
cv = KFold(random_state=seed, shuffle=True, n_splits=n_splits)
n_train = train_data.shape[0]

# Model inputs are every column except the ID, the raw outcome columns and the `target*` columns.
target_cols = list(filter(lambda name: name.startswith('target'), train_data.columns))
drop_cols = ['ID', 'efs', 'efs_time', *target_cols]
features = train_data.drop(columns=drop_cols).columns

# =============================================================================
# Level 1: LGB, XGB, CTB
# =============================================================================
# Each fold fits three `efs` classifiers, stores their out-of-fold probabilities and their
# test-set probabilities. The OOF columns become the level-2 training matrix; the test
# probabilities are averaged over folds to form the level-2 test matrix.

oof_lgb = np.zeros(n_train)
oof_xgb = np.zeros(n_train)
oof_ctb = np.zeros(n_train)

cls_preds_lgb = []
cls_preds_xgb = []
cls_preds_ctb = []

# The test matrix does not depend on the fold, so it is prepared once instead of per fold.
X_test = test_data[features].copy()
for col in cat_cols:
    if col in X_test.columns:
        X_test[col] = X_test[col].astype('category')

# NOTE(review): categories are inferred separately for the train, validation and test frames.
# Confirm the `cat_cols` columns already share one categorical dtype upstream; otherwise the
# integer category codes may not line up between frames.

# for train_index, valid_index in cv.split(train_data, train_data['race_group'].astype(str) + (train_data['age_at_hct']==0.044).astype(str)):
for fold, (train_index, valid_index) in enumerate(cv.split(train_data), 1):
    print(f"====== Level1 - Fold {fold} ======")
    
    train_fold = train_data.iloc[train_index].copy()
    valid_fold = train_data.iloc[valid_index].copy()
    
    X_train = train_fold.drop(columns=drop_cols).copy()
    y_train = train_fold['efs']
    X_valid = valid_fold.drop(columns=drop_cols).copy()
    y_valid = valid_fold['efs']
    
    for col in cat_cols:
        if col in X_train.columns:
            X_train[col] = X_train[col].astype('category')
        if col in X_valid.columns:
            X_valid[col] = X_valid[col].astype('category')

    # LightGBM
    model_lgb = LGBMClassifier(
        objective='binary',
        # `eval_metric` is an argument of fit(), not of the constructor; passed to the
        # constructor it is not a recognised parameter, so early stopping fell back to the
        # objective's default metric. `metric='auc'` makes early stopping track AUC, in
        # line with the XGBoost and CatBoost models below.
        metric='auc',
        boosting_type='gbdt',
        max_depth=3,
        learning_rate=0.02,
        n_estimators=5000,
        random_state=seed
    )
    model_lgb.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[early_stopping(300, verbose=0), log_evaluation(0)],
        categorical_feature=cat_cols
    )
    preds_lgb = model_lgb.predict_proba(X_valid)[:, 1]
    oof_lgb[valid_index] = preds_lgb
    
    cls_pred_lgb = model_lgb.predict_proba(X_test)[:, 1]
    cls_preds_lgb.append(cls_pred_lgb)
    
    # XGBoost
    model_xgb = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        use_label_encoder=False,
        max_depth=3,
        learning_rate=0.02,
        n_estimators=5000,
        random_state=seed,
        enable_categorical=True,
        early_stopping_rounds=300,
        verbose=False
    )
    model_xgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
    preds_xgb = model_xgb.predict_proba(X_valid)[:, 1]
    oof_xgb[valid_index] = preds_xgb
    
    cls_pred_xgb = model_xgb.predict_proba(X_test)[:, 1]
    cls_preds_xgb.append(cls_pred_xgb)
    
    # CatBoost (no early stopping; `use_best_model=True` keeps the best iteration on the eval set)
    model_ctb = CatBoostClassifier(
        iterations=5000,
        learning_rate=0.02,
        depth=3,
        eval_metric='AUC',
        random_seed=seed,
        # od_type='Iter',
        # od_wait=300,
        verbose=False
    )
    model_ctb.fit(
        X_train, y_train,
        eval_set=(X_valid, y_valid),
        cat_features=cat_cols,
        use_best_model=True,
        verbose=False
    )
    preds_ctb = model_ctb.predict_proba(X_valid)[:, 1]
    oof_ctb[valid_index] = preds_ctb
    
    cls_pred_ctb = model_ctb.predict_proba(X_test)[:, 1]
    cls_preds_ctb.append(cls_pred_ctb)
    
    auc_lgb_fold = roc_auc_score(y_valid, preds_lgb)
    auc_xgb_fold = roc_auc_score(y_valid, preds_xgb)
    auc_ctb_fold = roc_auc_score(y_valid, preds_ctb)
    print(f"Fold {fold} AUC - LightGBM: {auc_lgb_fold:.4f}, XGBoost: {auc_xgb_fold:.4f}, CatBoost: {auc_ctb_fold:.4f}")

auc_lgb = roc_auc_score(train_data['efs'], oof_lgb)
auc_xgb = roc_auc_score(train_data['efs'], oof_xgb)
auc_ctb = roc_auc_score(train_data['efs'], oof_ctb)
print("----- Overall Base Model CV Scores -----")
print(f"LightGBM: {auc_lgb:.4f}")
print(f"XGBoost:  {auc_xgb:.4f}")
print(f"CatBoost: {auc_ctb:.4f}")

# Level-2 inputs: one column per base learner, in the order LightGBM, XGBoost, CatBoost.
X_meta_train = np.column_stack((oof_lgb, oof_xgb, oof_ctb))
y_meta = train_data['efs'].values

cls_pred_lgb_final = np.mean(cls_preds_lgb, axis=0)
cls_pred_xgb_final = np.mean(cls_preds_xgb, axis=0)
cls_pred_ctb_final = np.mean(cls_preds_ctb, axis=0)
X_meta_test = np.column_stack((cls_pred_lgb_final, cls_pred_xgb_final, cls_pred_ctb_final))

# =============================================================================
# Level 2: Stacking with Logistic Regression
# =============================================================================
# A logistic regression is fitted per fold on the level-1 out-of-fold probabilities; its
# held-out predictions form `cls_oof_preds` and its test predictions are averaged over folds.

test_preds_meta_folds = []
cls_oof_preds = np.zeros(n_train)

# for train_index, valid_index in cv.split(train_data, train_data['race_group'].astype(str) + (train_data['age_at_hct']==0.044).astype(str)):
for fold, (meta_train_index, meta_valid_index) in enumerate(cv.split(X_meta_train), start=1):
    print(f"====== Level2 (LR) - Fold {fold} ======")
    X_meta_tr, y_meta_tr = X_meta_train[meta_train_index], y_meta[meta_train_index]
    X_meta_val = X_meta_train[meta_valid_index]

    lr_meta = LogisticRegression(max_iter=1000, random_state=seed).fit(X_meta_tr, y_meta_tr)

    # Probability of the positive class for the held-out rows and for the test set.
    preds_meta_val = lr_meta.predict_proba(X_meta_val)[:, 1]
    preds_meta_test = lr_meta.predict_proba(X_meta_test)[:, 1]

    cls_oof_preds[meta_valid_index] = preds_meta_val
    test_preds_meta_folds.append(preds_meta_test)

meta_cv_auc = roc_auc_score(y_meta, cls_oof_preds)
print("Level2 (LR) CV AUC:", meta_cv_auc)

# Final test prediction: mean of the per-fold meta-model predictions.
cls_preds = np.mean(test_preds_meta_folds, axis=0)


In [None]:
if use_autogluon:
    # Blend the stacked classifier (15%) with column `1` of the AutoGluon probability
    # frames (85%). `.to_numpy()` keeps both results plain ndarrays, exactly as in the
    # non-AutoGluon path; without it the sums become pandas Series and every later
    # assignment or indexing silently switches from positional to index-aligned.
    # NOTE: not idempotent — re-running this cell blends the already blended values again.
    cls_oof_preds = 0.15*cls_oof_preds + 0.85*autogluon_cls_oof_preds.loc[:, 1].to_numpy()
    cls_preds = 0.15*cls_preds + 0.85*autogluon_cls_preds.loc[:, 1].to_numpy()

# Ensembling

## Classifier thresholds

### Explore max F1 thresholds for each race

In [None]:
# For every race group, scan 1000 evenly spaced cut-offs on the out-of-fold classifier
# score and remember the first one that reaches the highest F1 against `efs`.
race_thresh_dict = {}
for race in train_data['race_group'].unique():
    race_ind = train_data['race_group'] == race
    y_true = train_data.loc[race_ind, 'efs']
    probs = cls_oof_preds[race_ind]
    auc = roc_auc_score(y_true, probs)

    print(race, "AUC:", auc)

    thresholds = np.linspace(0, 1, 1000)
    best_f1, best_thresh = 0, 0

    for thresh in thresholds:
        y_pred = (probs >= thresh).astype(int)
        current_f1 = f1_score(y_true, y_pred)
        if current_f1 <= best_f1:
            # Ties keep the earlier (lower) threshold.
            continue
        best_f1, best_thresh = current_f1, thresh

    print("最大のF1スコア:", best_f1, "を与える閾値:", best_thresh)
    race_thresh_dict[race] = best_thresh

    # Confusion matrix at the selected threshold.
    y_pred_best = (probs >= best_thresh).astype(int)
    cm = confusion_matrix(y_true, y_pred_best)
    print("混同行列（F1最大の閾値での予測）:")
    print(cm)

race_thresh_dict

### Elastic Net Stacking

We aggregate the OOF predictions target-wise, because there are too many models to ensemble with a simple weighted mean. This also avoids overfitting the CV score by tuning too many Optuna parameters.

In [None]:
from sklearn.linear_model import MultiTaskElasticNet, ElasticNet
import optuna

# Every stacked model is declared once as (feature name, OOF predictions, test predictions).
# The three parallel lists used further down (`columns`, `oof_preds`, `preds`) are derived
# from this single table, in the same order as before.
stack_members = [
    ('ctb1', ctb1_oof_preds, ctb1_preds),  # cox ph fitter
    ('lgb1', lgb1_oof_preds, lgb1_preds),
    ('xgb1', xgb1_oof_preds, xgb1_preds),
    ('ctb2', ctb2_oof_preds, ctb2_preds),  # kaplan meier
    ('lgb2', lgb2_oof_preds, lgb2_preds),
    ('lgb22', lgb22_oof_preds, lgb22_preds),  # tweedie objective
    ('xgb2', xgb2_oof_preds, xgb2_preds),
    ('ctb3', ctb3_oof_preds, ctb3_preds),  # nelson aalen
    ('lgb3', lgb3_oof_preds, lgb3_preds),
    # ('xgb3', xgb3_oof_preds, xgb3_preds),
    ('cox1', cox1_oof_preds, cox1_preds),  # cox objective
    ('cox2', cox2_oof_preds, cox2_preds),
    ('cox3', cox3_oof_preds, cox3_preds),
    ('ctb5', ctb5_oof_preds, ctb5_preds),  # kmf transformed target by octopus210
    ('lgb5', lgb5_oof_preds, lgb5_preds),
    ('xgb5', xgb5_oof_preds, xgb5_preds),
    ('ctb6', ctb6_oof_preds, ctb6_preds),  # breslowfleming
    ('lgb6', lgb6_oof_preds, lgb6_preds),
    ('xgb6', xgb6_oof_preds, xgb6_preds),
    ('ctb7', ctb7_oof_preds, ctb7_preds),  # transform quantile
    ('lgb7', lgb7_oof_preds, lgb7_preds),
    ('xgb7', xgb7_oof_preds, xgb7_preds),
    ('ctb8', ctb8_oof_preds, ctb8_preds),  # transform ranklog
    ('lgb8', lgb8_oof_preds, lgb8_preds),
    ('xgb8', xgb8_oof_preds, xgb8_preds),
    ('nn', -nn_oof_preds, -nn_preds),  # pairwise loss (sign flipped)
    ('nn2', -nn2_oof_preds, -nn2_preds),
    ('zdh_kp', ctb_kaplan_oof, ctb_kaplan_preds),  # by zdh
    ('zdh_nl', ctb_nelson_oof, ctb_nelson_preds),
    ('zdh_q', ctb_quantile_oof, ctb_quantile_preds),
    ('zdh_rl', ctb_ranklog_oof, ctb_ranklog_preds),
    ('auto_kp', auto_kaplan_oof, auto_kaplan_preds),
    ('auto_nl', auto_nelson_oof, auto_nelson_preds),
    ('auto_q', auto_quantile_oof, auto_quantile_preds),
    ('auto_cox', auto_cox_oof, auto_cox_preds),
    # autogluon_kaplan_preds, # by kekshibata
    # autogluon_nelson_preds # by kekshibata
    # cls_oof_preds / cls_preds,
    # 0.15*cls_oof_preds+0.85*autogluon_cls_oof_preds.loc[:,1]
]
if use_autogluon:
    stack_members += [
        ('auto1', autogluon_1_oof_preds, autogluon_1_preds),
        ('auto2', autogluon_2_oof_preds, autogluon_2_preds),
        ('auto3', autogluon_3_oof_preds, autogluon_3_preds),
        ('auto4', autogluon_4_oof_preds, autogluon_4_preds),
        ('auto5', autogluon_5_oof_preds, autogluon_5_preds),
        ('auto6', autogluon_6_oof_preds, autogluon_6_preds),
        ('auto7', autogluon_7_oof_preds, autogluon_7_preds),
        ('auto8', autogluon_8_oof_preds, autogluon_8_preds),
    ]

# Split the table back into the three lists the rest of the notebook works with.
columns, oof_preds, preds = (list(part) for part in zip(*stack_members))

# Stacking design matrix: one row per training sample, one column per stacked model.
X = pd.DataFrame(np.array(oof_preds).T, columns=columns)
# X = X.drop(columns=['auto1','auto2','auto3','auto5','auto6'])
# The eight targets are fitted jointly by one multi-task model.
y = model.data[[f'target{k}' for k in range(1, 9)]]

cv = KFold(n_splits=10, shuffle=True, random_state=42)
en_models = []
en_oof_preds = np.zeros(y.shape)

# === Optuna === #
# def objective(trial):
#     alpha = trial.suggest_float("alpha", 0.0, 1.0)
#     l1_ratio = trial.suggest_float("l1_ratio", 0.0, 1.0)
#     for fold, (train_index, valid_index) in enumerate(cv.split(X, y)):
#         X_train = X.iloc[train_index]
#         X_valid = X.iloc[valid_index]
            
#         y_train = y.iloc[train_index]
#         y_valid = y.iloc[valid_index]
    
#         elastic_net = MultiTaskElasticNet(l1_ratio=l1_ratio, alpha=alpha)
#         elastic_net.fit(X_train,y_train)
#         en_oof_preds[valid_index] = elastic_net.predict(X_valid)

#     _scores = []
#     for i in range(en_oof_preds.shape[1]):
#         # _scores.append(model.targets.validate_model(en_oof_preds[:,i], f'target {i+1}'))
#         _scores.append(model.targets.validate_model(en_oof_preds[:,i], f'target {i+1}'))
#         # _scores.append(detail_validate_model(en_oof_preds[:,i], f'target {i+1}', True)*0.5)
#     return np.sum(_scores)
    
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=500)

# print("Best params: ", study.best_params)
# print("Best score: ", study.best_value)
#{'alpha': 0.019676921068033953, 'l1_ratio': 0.9299672633702524}
# === Optuna === #

for fold, (train_index, valid_index) in enumerate(cv.split(X, y)):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    # Earlier settings, kept for reference:
    # elastic_net = MultiTaskElasticNet(l1_ratio=0.9299672633702524, alpha=0.019676921068033953)
    # elastic_net = MultiTaskElasticNet(l1_ratio=0.61, alpha=0.015)
    # elastic_net = ElasticNet(l1_ratio=0.1)
    elastic_net = MultiTaskElasticNet(l1_ratio=0.05, alpha=0.015).fit(X_train, y_train)
    en_oof_preds[valid_index] = elastic_net.predict(X_valid)
    en_models.append(elastic_net)

# Coefficients of the last fold's model only.
print(elastic_net.coef_)

_scores = []
for i in range(en_oof_preds.shape[1]):
    model.targets.validate_model(en_oof_preds[:,i], f'target {i+1}')

# Predict on the test data: average the fold models' outputs.
test_df = pd.DataFrame(np.array(preds).T, columns=columns)
# test_df = test_df.drop(columns=['zdh_kp','zdh_nl', 'zdh_q', 'zdh_rl'])
preds_per_model = [fold_model.predict(test_df) for fold_model in en_models]
en_preds = np.mean(preds_per_model, axis=0)

## optimize weights

### post-processing params

In [None]:
# Selects how the stacked predictions are post-processed with the classifier score:
# True subtracts fixed offsets below fixed score cut-offs, False applies the per-race
# parameters in `best_params` through `apply_adjustment`.
use_naive_post_processing = False
# NOTE(review): presumably the same switch for the rank ensemble; it is not referenced
# near its definition — confirm at its use site.
apply_naive_post_processing_rank_ensemble = True

In [None]:
# NOTE: a superseded `best_params` set used to be kept here as commented-out code. Only its
# first physical line carried a `#`, which left the continuation lines as live, invalid
# Python, so the dead block was dropped. Recover it from the notebook's version history if needed.
# Post-processing parameters, keyed first by target column and then by race group.
# `apply_adjustment` reads the four values per entry and subtracts
# max(alpha * (beta - cls), 0) + max(theta * (gamma - cls), 0) from the prediction.
# NOTE(review): the values look like the output of a hyper-parameter search — confirm their origin.
best_params = {'target1': {'More than one race': {'alpha': 2.152640428281422, 'beta': 0.5792007080000788, 'theta': 49.23769195632153, 'gamma': 0.38523952534219374}, 'Asian': {'alpha': 4.437803147380086, 'beta': 0.5418327439892101, 'theta': 33.97322832841521, 'gamma': 0.12376990231425139}, 'White': {'alpha': 10.31355645250159, 'beta': 0.5431295238955499, 'theta': 6.204391381017401, 'gamma': 0.1400086710348517}, 'American Indian or Alaska Native': {'alpha': 70.07386765063042, 'beta': 0.5434642161062178, 'theta': 0.06351053706971843, 'gamma': 0.033032458451892316}, 'Native Hawaiian or other Pacific Islander': {'alpha': 76.45785165552746, 'beta': 0.5171309086024187, 'theta': 81.95905896873602, 'gamma': 0.39101343674722083}, 'Black or African-American': {'alpha': 86.79455395037331, 'beta': 0.4877647089252286, 'theta': 76.10181391129231, 'gamma': 0.028845316661710352}}, 'target2': {'More than one race': {'alpha': 0.8110837893409355, 'beta': 0.5877661084980736, 'theta': 70.83267334890569, 'gamma': 0.2734864313200098}, 'Asian': {'alpha': 1.6404584063130776, 'beta': 0.5392851425533112, 'theta': 78.24666785468791, 'gamma': 0.14393116369201425}, 'White': {'alpha': 86.45541257670959, 'beta': 0.49301388701970866, 'theta': 69.52098205607768, 'gamma': 0.3109508515082501}, 'American Indian or Alaska Native': {'alpha': 1.296396992425549, 'beta': 0.5487259854699585, 'theta': 97.97878738691243, 'gamma': 0.023898595888245257}, 'Native Hawaiian or other Pacific Islander': {'alpha': 1.8768225647261314, 'beta': 0.5345482456342568, 'theta': 85.4380163397157, 'gamma': 0.015530141616448136}, 'Black or African-American': {'alpha': 79.05886492204074, 'beta': 0.4685626053401703, 'theta': 57.47671014224372, 'gamma': 0.04872233262364207}}, 'target3': {'More than one race': {'alpha': 0.5941680999449113, 'beta': 0.5628291634549697, 'theta': 34.910406590087995, 'gamma': 0.016988205955135085}, 'Asian': {'alpha': 42.75415132584927, 'beta': 0.4721109223378618, 'theta': 49.55792880611136, 'gamma': 
0.03168770787913031}, 'White': {'alpha': 46.82556950706927, 'beta': 0.4932029818960778, 'theta': 86.1392396563657, 'gamma': 0.3039935306650603}, 'American Indian or Alaska Native': {'alpha': 5.277913789156747, 'beta': 0.5091892256068017, 'theta': 51.28872356750218, 'gamma': 0.23067553471394375}, 'Native Hawaiian or other Pacific Islander': {'alpha': 2.6102567053927195, 'beta': 0.5305289593559397, 'theta': 41.56702678985413, 'gamma': 0.26309819484768565}, 'Black or African-American': {'alpha': 43.029471628139035, 'beta': 0.468168166222635, 'theta': 6.167479240846404, 'gamma': 0.19637198973950232}}, 'target4': {'More than one race': {'alpha': 99.89484884802813, 'beta': 0.5745562858916264, 'theta': 7.378186433219837, 'gamma': 0.37854124593366745}, 'Asian': {'alpha': 99.9705968198394, 'beta': 0.5652540122061038, 'theta': 32.422250976371565, 'gamma': 0.3989204114440998}, 'White': {'alpha': 99.71338683316239, 'beta': 0.5446843654627052, 'theta': 98.67407891283072, 'gamma': 0.078236349449899}, 'American Indian or Alaska Native': {'alpha': 99.98479829638308, 'beta': 0.5626564320260298, 'theta': 6.294122540806603, 'gamma': 0.20322635169243827}, 'Native Hawaiian or other Pacific Islander': {'alpha': 99.34406222579591, 'beta': 0.5753074686603651, 'theta': 73.54388319061322, 'gamma': 0.3938303483941479}, 'Black or African-American': {'alpha': 99.93361372800788, 'beta': 0.5789144211750551, 'theta': 66.5831546919557, 'gamma': 0.24940567987384782}}, 'target5': {'More than one race': {'alpha': 33.67067556806241, 'beta': 0.38549172248603325, 'theta': 95.65119373186913, 'gamma': 0.08173315354757583}, 'Asian': {'alpha': 69.50989573668102, 'beta': 0.47035930577653046, 'theta': 69.54280838964564, 'gamma': 0.21373494649064814}, 'White': {'alpha': 94.21638470671931, 'beta': 0.4922417433058817, 'theta': 21.804992770893495, 'gamma': 0.13342953356501253}, 'American Indian or Alaska Native': {'alpha': 35.53756585961691, 'beta': 0.4990210069708231, 'theta': 5.168835213237544, 'gamma': 
0.36618532734119863}, 'Native Hawaiian or other Pacific Islander': {'alpha': 72.70367361556025, 'beta': 0.49389843041055603, 'theta': 96.47391550039197, 'gamma': 0.04859225917825136}, 'Black or African-American': {'alpha': 73.70356541027238, 'beta': 0.46757312470678364, 'theta': 48.902400420220005, 'gamma': 0.09085512847729205}}, 'target6': {'More than one race': {'alpha': 0.3927065688323634, 'beta': 0.5983650120073558, 'theta': 84.34895237763759, 'gamma': 0.38708936173761926}, 'Asian': {'alpha': 81.51650401381376, 'beta': 0.4705425922116324, 'theta': 8.800416198710161, 'gamma': 0.05099736169883566}, 'White': {'alpha': 95.12689940798616, 'beta': 0.492735790359426, 'theta': 71.36789846972917, 'gamma': 0.26607254584036616}, 'American Indian or Alaska Native': {'alpha': 2.969349374533144, 'beta': 0.5149082370494863, 'theta': 7.451395076957817, 'gamma': 0.06104211137876875}, 'Native Hawaiian or other Pacific Islander': {'alpha': 1.952431902399395, 'beta': 0.5325992597729167, 'theta': 46.84292974819731, 'gamma': 0.2907584149298512}, 'Black or African-American': {'alpha': 89.36684669401004, 'beta': 0.46803611163715986, 'theta': 9.106904319346084, 'gamma': 0.13700986827718117}}, 'target7': {'More than one race': {'alpha': 27.342056760977087, 'beta': 0.23902079970920176, 'theta': 74.29114658110893, 'gamma': 0.3857386608910257}, 'Asian': {'alpha': 71.63740084533232, 'beta': 0.4719423613253588, 'theta': 5.275325566433647, 'gamma': 0.12302863313394932}, 'White': {'alpha': 80.10290149072446, 'beta': 0.49301181843787245, 'theta': 19.786162094256095, 'gamma': 0.28079667150942167}, 'American Indian or Alaska Native': {'alpha': 5.4577587139359185, 'beta': 0.5084731594241092, 'theta': 68.71540873661782, 'gamma': 0.3127378188245579}, 'Native Hawaiian or other Pacific Islander': {'alpha': 4.003109709074951, 'beta': 0.520970327885257, 'theta': 70.25214351862972, 'gamma': 0.12565039687098145}, 'Black or African-American': {'alpha': 90.0791997523357, 'beta': 0.4680676723607272, 'theta': 
49.35649027468699, 'gamma': 0.20709993598066617}}, 'target8': {'More than one race': {'alpha': 2.7326195586801827, 'beta': 0.4686647756173375, 'theta': 56.85932505297336, 'gamma': 0.38111626264277676}, 'Asian': {'alpha': 95.15459728896133, 'beta': 0.47188452231361794, 'theta': 74.17118740791163, 'gamma': 0.3666888402673796}, 'White': {'alpha': 97.53821292907097, 'beta': 0.49369053808619007, 'theta': 77.84825637252688, 'gamma': 0.39902617936250334}, 'American Indian or Alaska Native': {'alpha': 21.386717631622357, 'beta': 0.5074801097758145, 'theta': 29.251964834715388, 'gamma': 0.31444121174493006}, 'Native Hawaiian or other Pacific Islander': {'alpha': 36.129254417689346, 'beta': 0.4970064513792411, 'theta': 8.846582207359521, 'gamma': 0.10553648927919092}, 'Black or African-American': {'alpha': 12.532673444268525, 'beta': 0.4793141245423127, 'theta': 93.14072845734536, 'gamma': 0.2739754742855011}}}

def apply_adjustment(row):
    """Lower one row's prediction for the current target when its `cls` value is small.

    This function reads two module-level names: `col`, the target column that
    the calling loop is currently processing, and `best_params`, a nested
    mapping indexed as best_params[col][race_group]. It is therefore meant to
    be used as `frame.apply(apply_adjustment, axis=1)` from inside a loop whose
    loop variable is named `col`.

    Args:
        row: A row exposing 'race_group', 'cls' and the column named by `col`.

    Returns:
        row[col] minus two hinge terms:
        max(0, alpha * (beta - cls)) and max(0, theta * (gamma - cls)).
    """
    race = row['race_group']
    # Use the coefficients that were determined per race group for this target.
    coef = best_params[col][race]
    cls_value = row['cls']
    first_hinge = np.clip(coef["alpha"] * (coef["beta"] - cls_value), 0, None)
    second_hinge = np.clip(coef["theta"] * (coef["gamma"] - cls_value), 0, None)
    return row[col] - first_hinge - second_hinge

### blending weights (Elastic Net output)

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import rankdata

# Assumes the out-of-fold predictions (en_oof_preds, cls_oof_preds) are already defined.
en_target_cols = ['target1', 'target2', 'target3', 'target4',
                  'target5', 'target6', 'target7', 'target8']

df_oof = pd.DataFrame(np.array(en_oof_preds), columns=en_target_cols)
df_oof['cls'] = cls_oof_preds
df_oof['race_group'] = train_data['race_group'].values

# Naive post-processing ladder: each `cls` cutoff subtracts its own step, and the
# steps accumulate (a row with cls < 0.1 receives all four decrements).
cls_cutoffs = (0.4, 0.3, 0.2, 0.1)
steps_for_targets_1_4_8 = (0.1, 0.5, 0.3, 0.2)
steps_for_target_5 = (0.001, 0.005, 0.003, 0.002)
steps_for_other_targets = (0.01, 0.05, 0.03, 0.02)

# NOTE: the loop variable must remain named `col`; apply_adjustment reads it as a global.
for col in en_target_cols:
    if not use_naive_post_processing:
        # Per-race parametric adjustment of the current target column.
        df_oof[col] = df_oof.apply(apply_adjustment, axis=1)
    else:
        if col in ['target1', 'target4', 'target8']:
            naive_steps = steps_for_targets_1_4_8
        elif col == 'target5':
            naive_steps = steps_for_target_5
        else:
            naive_steps = steps_for_other_targets
        for cls_cutoff, naive_step in zip(cls_cutoffs, naive_steps):
            df_oof.loc[df_oof.cls < cls_cutoff, col] -= naive_step

    # Replace the adjusted values of this target by their ranks.
    df_oof[col] = rankdata(df_oof[col].values)

# Matrix of ranked target columns only (rows = samples, columns = targets).
ranked_oof_preds = df_oof[en_target_cols].values

# Helper used by the weight search to score one candidate weight vector.
def evaluate_model(weights, ranked_oof_preds):
    """Blend the ranked OOF columns with `weights` and validate the blend.

    Args:
        weights: 1-D sequence with one weight per column of `ranked_oof_preds`.
        ranked_oof_preds: 2-D array, one row per sample and one column per model/target.

    Returns:
        Whatever `model.targets.validate_model` returns for the blended
        predictions (callers in this notebook compare it as a score).
    """
    blended_oof = np.dot(weights, ranked_oof_preds.T)
    return model.targets.validate_model(blended_oof, 'Ensemble Model')

# Random search over integer blending weights.
# A dedicated, seeded generator is used so that the selected weights (and the
# submission that is built from them) are reproducible after a kernel restart.
BLEND_SEARCH_SEED = 42
N_BLEND_TRIALS = 10000  # number of random weight vectors to evaluate
blend_rng = np.random.default_rng(BLEND_SEARCH_SEED)

best_score = 0.00000000
best_weights = None

for trial in range(N_BLEND_TRIALS):
    # One integer weight in [0, 9] per ranked prediction column.
    weights = blend_rng.integers(0, 10, size=ranked_oof_preds.shape[1])
    score_ = evaluate_model(weights, ranked_oof_preds)
    if score_ > best_score:
        best_score = score_
        best_weights = weights
        print(f'Overall Stratified C-Index Score for {trial}: {best_score:.4f}')

# Fail loudly here instead of with an obscure error in np.dot below
# (happens only if no trial ever scored above the initial best_score).
if best_weights is None:
    raise RuntimeError("Weight search found no weight vector with a score above 0.")

print(f"Best weights: {best_weights}")
print(f"Best score: {best_score}")

# Use the best weights found to build and validate the blended OOF predictions.
ensemble_oof_preds = np.dot(best_weights, ranked_oof_preds.T)
model.targets.validate_model(ensemble_oof_preds, 'Ensemble Model')

### blending weights (Simple Rankdata)

In [None]:
# from sklearn.linear_model import MultiTaskElasticNet, ElasticNet
# import optuna

# oof_preds = [
#     ctb1_oof_preds, # cox ph fitter
#     lgb1_oof_preds, 
#     xgb1_oof_preds,
#     ctb2_oof_preds, # kaplan meier
#     lgb2_oof_preds,
#     lgb22_oof_preds, # tweedie objective
#     xgb2_oof_preds,
#     ctb3_oof_preds, # nelson aalen
#     lgb3_oof_preds,
#     # xgb3_oof_preds,
#     cox1_oof_preds, # cox objective
#     cox2_oof_preds,
#     cox3_oof_preds,
#     ctb5_oof_preds, # kmf transformed target by octopus210
#     lgb5_oof_preds,
#     xgb5_oof_preds,
#     ctb6_oof_preds, # breslowfleming
#     lgb6_oof_preds,
#     xgb6_oof_preds,
#     ctb7_oof_preds, # transform quantile
#     lgb7_oof_preds,
#     xgb7_oof_preds,
#     ctb8_oof_preds, # transform ranklog
#     lgb8_oof_preds,
#     xgb8_oof_preds,
#     -nn_oof_preds, # pairwise loss
#     -nn2_oof_preds,
#     ctb_kaplan_oof, # by zdh
#     ctb_nelson_oof,
#     ctb_quantile_oof,
#     ctb_ranklog_oof,
#     auto_kaplan_oof,
#     auto_nelson_oof,
#     auto_quantile_oof,
#     auto_cox_oof,
#     # cls_oof_preds,
#     # 0.15*cls_oof_preds+0.85*autogluon_cls_oof_preds.loc[:,1]
# ]
# if use_autogluon:
#     oof_preds += [autogluon_1_oof_preds,
#     autogluon_2_oof_preds,
#     autogluon_3_oof_preds,
#     autogluon_4_oof_preds,
#     autogluon_5_oof_preds,
#     autogluon_6_oof_preds,
#     autogluon_7_oof_preds,
#     autogluon_8_oof_preds]

# preds = [
#     ctb1_preds, 
#     lgb1_preds, 
#     xgb1_preds,
#     ctb2_preds, 
#     lgb2_preds,
#     lgb22_preds,
#     xgb2_preds,
#     ctb3_preds, 
#     lgb3_preds,
#     # xgb3_preds,
#     cox1_preds,
#     cox2_preds,
#     cox3_preds,
#     ctb5_preds,  # kmf transformed target by octopus210
#     lgb5_preds,
#     xgb5_preds,
#     ctb6_preds,  
#     lgb6_preds,
#     xgb6_preds,
#     ctb7_preds, 
#     lgb7_preds,
#     xgb7_preds,
#     ctb8_preds,  
#     lgb8_preds,
#     xgb8_preds,
#     -nn_preds,
#     -nn2_preds,
#     ctb_kaplan_preds,
#     ctb_nelson_preds,
#     ctb_quantile_preds,
#     ctb_ranklog_preds,
#     auto_kaplan_preds,
#     auto_nelson_preds,
#     auto_quantile_preds,
#     auto_cox_preds,
#     # autogluon_kaplan_preds, # by kekshibata
#     # autogluon_nelson_preds # by kekshibata
#     # cls_preds,
# ]
# if use_autogluon:
#     preds += [
#     autogluon_1_preds,
#     autogluon_2_preds,
#     autogluon_3_preds,
#     autogluon_4_preds,
#     autogluon_5_preds,
#     autogluon_6_preds,
#     autogluon_7_preds,
#     autogluon_8_preds
#     ]

# columns = ['ctb1','lgb1','xgb1','ctb2', 'lgb2', 'lgb22','xgb2','ctb3', 'lgb3', 'cox1', 'cox2','cox3','ctb5','lgb5','xgb5','ctb6','lgb6','xgb6','ctb7','lgb7','xgb7','ctb8','lgb8','xgb8', 'nn', 'nn2', 'zdh_kp', 'zdh_nl', 'zdh_q', 'zdh_rl', 'auto_kp','auto_nl','auto_q','auto_cox']
# if use_autogluon:
#     columns +=['auto1','auto2','auto3','auto4','auto5','auto6','auto7','auto8']

# df_oof = pd.DataFrame(np.array(oof_preds).T, columns=columns)
# df_oof['cls'] = cls_oof_preds
# df_preds = pd.DataFrame(np.array(preds).T, columns=columns)
# df_preds['cls'] = cls_preds

# # === Apply naive post-processing ===
# if apply_naive_post_processing_rank_ensemble:
# # oof
#     if col in ['ctb1','lgb1','xgb1', 'cox1', 'cox2','cox3', 'ctb8','lgb8','xgb8', 'auto1','auto4','auto8']:
#         df_oof.loc[df_oof.cls < 0.4, col] -= 0.1
#         df_oof.loc[df_oof.cls < 0.3, col] -= 0.5
#         df_oof.loc[df_oof.cls < 0.2, col] -= 0.3
#         df_oof.loc[df_oof.cls < 0.1, col] -= 0.2
#     elif col in ['ctb5','lgb5','xgb5', 'auto5']:
#         df_oof.loc[df_oof.cls < 0.4, col] -= 0.001
#         df_oof.loc[df_oof.cls < 0.3, col] -= 0.005
#         df_oof.loc[df_oof.cls < 0.2, col] -= 0.003
#         df_oof.loc[df_oof.cls < 0.1, col] -= 0.002            
#     elif col not in ['zdh_kp', 'zdh_nl', 'zdh_q', 'zdh_rl', 'auto_kp','auto_nl','auto_q','auto_cox']:
#         df_oof.loc[df_oof.cls < 0.4, col] -= 0.01
#         df_oof.loc[df_oof.cls < 0.3, col] -= 0.05
#         df_oof.loc[df_oof.cls < 0.2, col] -= 0.03
#         df_oof.loc[df_oof.cls < 0.1, col] -= 0.02
# # preds
#     if col in ['ctb1','lgb1','xgb1', 'cox1', 'cox2','cox3', 'ctb8','lgb8','xgb8', 'auto1','auto4','auto8']:
#         df_preds.loc[df_preds.cls < 0.4, col] -= 0.1
#         df_preds.loc[df_preds.cls < 0.3, col] -= 0.5
#         df_preds.loc[df_preds.cls < 0.2, col] -= 0.3
#         df_preds.loc[df_preds.cls < 0.1, col] -= 0.2
#     elif col in ['ctb5','lgb5','xgb5', 'auto5']:
#         df_preds.loc[df_preds.cls < 0.4, col] -= 0.001
#         df_preds.loc[df_preds.cls < 0.3, col] -= 0.005
#         df_preds.loc[df_preds.cls < 0.2, col] -= 0.003
#         df_preds.loc[df_preds.cls < 0.1, col] -= 0.002            
#     elif col not in ['zdh_kp', 'zdh_nl', 'zdh_q', 'zdh_rl', 'auto_kp','auto_nl','auto_q','auto_cox']:
#         df_preds.loc[df_preds.cls < 0.4, col] -= 0.01
#         df_preds.loc[df_preds.cls < 0.3, col] -= 0.05
#         df_preds.loc[df_preds.cls < 0.2, col] -= 0.03
#         df_preds.loc[df_preds.cls < 0.1, col] -= 0.02

# # ============

# df_oof = df_oof.drop(columns=['cls'])
# df_preds = df_preds.drop(columns=['cls'])


# def validate_model(df, preds):
        
#     y_true = df[['ID', 'efs', 'efs_time', 'race_group']].copy()
#     y_pred = df[['ID']].copy()
    
#     y_pred['prediction'] = preds
        
#     c_index_score = score(y_true.copy(), y_pred.copy(), 'ID')
#     return c_index_score


# # === モデルの評価関数 ===
# def evaluate_model(weights, df, df_preds):
#     ensemble_oof_preds = np.dot(weights, df_preds.T)
#     # return model.targets.validate_model(ensemble_oof_preds, 'Ensemble Model')
#     return validate_model(df, ensemble_oof_preds)


# # === Optunaによる重み最適化 ===
# def objective(trial):
#     ranked_oof_preds = df_oof.copy()
#     _scores = []
#     for i in range(10):
#         num_models = ranked_oof_preds.shape[1]  
#         sampled_df = ranked_oof_preds.sample(frac=0.8, replace=False, random_state=42 + i) 
#         for col in sampled_df.columns:
#             sampled_df[col] = rankdata(sampled_df[col])
        
#         weights = np.array([
#             trial.suggest_int(f"weight_{i}", 0, 10)
#             for i in range(num_models)
#         ])
        
#         # 評価スコアの計算
#         _scores.append(evaluate_model(weights, train_data[['ID', 'efs','efs_time', 'race_group']].sample(frac=0.8, replace=False, random_state=42 + i), sampled_df))
#     return np.mean(_scores)


# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=5000)

# # 最適な重みとスコアの表示
# rank_best_weights = np.array([study.best_params[f"weight_{i}"] for i in range(len(study.best_params))])
# print("Best weights: ", rank_best_weights)
# print("Best score: ", study.best_value)

# # === 最適な重みを用いて最終的なアンサンブル予測の評価 ===
# rank_ensemble_oof_preds = rankdata(np.dot(rank_best_weights, df_oof.T))
# rank_ensemble_preds = rankdata(np.dot(rank_best_weights, df_preds.T))
# model.targets.validate_model(rank_ensemble_oof_preds, 'Simple Rankdata Ensemble Model')


## OOF evaluation

### 1. Elastic Net + post-processing + rankdata + weighted mean

In [None]:
# Rebuild the Elastic Net OOF frame, post-process and rank every target, then
# blend the ranked targets with the weights found by the search above.
en_target_cols = ['target1', 'target2', 'target3', 'target4',
                  'target5', 'target6', 'target7', 'target8']

df_oof = pd.DataFrame(np.array(en_oof_preds), columns=en_target_cols)
df_oof['cls'] = cls_oof_preds
df_oof['race_group'] = train_data['race_group'].values

# Naive post-processing ladder: each `cls` cutoff subtracts its own step, and the
# steps accumulate (a row with cls < 0.1 receives all four decrements).
cls_cutoffs = (0.4, 0.3, 0.2, 0.1)
steps_for_targets_1_4_8 = (0.1, 0.5, 0.3, 0.2)
steps_for_target_5 = (0.001, 0.005, 0.003, 0.002)
steps_for_other_targets = (0.01, 0.05, 0.03, 0.02)

# NOTE: the loop variable must remain named `col`; apply_adjustment reads it as a global.
for col in en_target_cols:
    if not use_naive_post_processing:
        # Per-race parametric adjustment of the current target column.
        df_oof[col] = df_oof.apply(apply_adjustment, axis=1)
    else:
        if col in ['target1', 'target4', 'target8']:
            naive_steps = steps_for_targets_1_4_8
        elif col == 'target5':
            naive_steps = steps_for_target_5
        else:
            naive_steps = steps_for_other_targets
        for cls_cutoff, naive_step in zip(cls_cutoffs, naive_steps):
            df_oof.loc[df_oof.cls < cls_cutoff, col] -= naive_step

    # Replace the adjusted values of this target by their ranks.
    df_oof[col] = rankdata(df_oof[col].values)

ranked_oof_preds = df_oof[en_target_cols].values
ensemble_oof_preds = np.dot(best_weights, ranked_oof_preds.T)
model.targets.validate_model(ensemble_oof_preds, 'Ensemble Model')

In [None]:
# Re-run the validation of the blended OOF predictions (`ensemble_oof_preds`)
# so that its result is displayed as this cell's output.
model.targets.validate_model(ensemble_oof_preds, 'Ensemble Model')

### 2. Adjusting with a custom PRL NN that is strong on efs=1 data

In [None]:
# Stack the classifier output, the blended ensemble and the negated efs=1 NN output.
oof_preds = [
    cls_oof_preds,
    ensemble_oof_preds,
    -nn_efs1_oof_preds,
]
df_oof = pd.DataFrame(np.array(oof_preds).T, columns=['cls', 'ens', 'efs1'])
# Attach race groups positionally (as the Elastic Net cell does); pd.concat(axis=1)
# would align on index labels and misplace rows if train_data's index is not 0..n-1.
df_oof['race_group'] = train_data['race_group'].values
# Float from the start: the column is filled with (fractional) rank values below.
df_oof['adjusted'] = 0.0

w = 0.25  # weight of the efs=1 NN rank added to the ensemble rank above the threshold

for race in df_oof.race_group.unique():
    race_ind = df_oof['race_group'] == race
    # `cls` is never modified in this loop, so both masks can be computed once per race.
    below_thresh = race_ind & (df_oof.cls < race_thresh_dict[race])
    above_thresh = race_ind & (df_oof.cls >= race_thresh_dict[race])

    for score_col in ['ens', 'efs1']:
        # Rank within the race group.
        df_oof.loc[race_ind, score_col] = rankdata(df_oof.loc[race_ind, score_col])
        # Push below-threshold rows far under every above-threshold row.
        df_oof.loc[below_thresh, score_col] -= 1000000000
        # Re-rank the above-threshold rows among themselves.
        df_oof.loc[above_thresh, score_col] = rankdata(df_oof.loc[above_thresh, score_col])

    # Below the threshold only the ensemble decides the order; above it the
    # efs=1 NN rank is mixed in with weight `w`.
    df_oof.loc[race_ind, 'adjusted'] = df_oof.loc[race_ind, 'ens']
    df_oof.loc[above_thresh, 'adjusted'] = (
        df_oof.loc[above_thresh, 'ens'] + w * df_oof.loc[above_thresh, 'efs1']
    )


adjusted_ensemble_oof_preds = rankdata(df_oof['adjusted'].values)
print('Overall CV for single NN with threshold manipulation:', model.targets.validate_model(adjusted_ensemble_oof_preds, 'model'))

### 3. Final NN blending

In [None]:
# Stack the classifier output, the adjusted ensemble ranks and the negated NN output.
oof_preds = [
    cls_oof_preds,
    adjusted_ensemble_oof_preds,
    -nn_w_oof_preds,
]
df_oof = pd.DataFrame(np.array(oof_preds).T, columns=['cls', 'ens', 'nn'])
# Attach race groups positionally (as the Elastic Net cell does); pd.concat(axis=1)
# would align on index labels and misplace rows if train_data's index is not 0..n-1.
df_oof['race_group'] = train_data['race_group'].values

# Rank both inputs within each race group.
for race in df_oof.race_group.unique():
    race_ind = df_oof['race_group'] == race
    df_oof.loc[race_ind, 'ens'] = rankdata(df_oof.loc[race_ind, 'ens'])
    df_oof.loc[race_ind, 'nn'] = rankdata(df_oof.loc[race_ind, 'nn'])

    # Disabled experiments, kept for reference:
    # df_oof.loc[race_ind & (df_oof.cls<race_thresh_dict[race]),'final'] -= 1000000000
    # df_oof.loc[race_ind & (df_oof.cls>0.9),'final'] += 1000000000
    # df_oof.loc[race_ind & (df_oof.cls<race_thresh_dict[race]),'nn'] -= 1000000000

# The blend is a whole-frame operation, so compute it once after every race
# group has been ranked instead of recomputing it on each loop iteration.
nn_blend_weight = 0.2
df_oof['final'] = df_oof['ens'] + nn_blend_weight * df_oof['nn']

final_ensemble_oof_preds = rankdata(df_oof['final'])



print('Overall CV for final ensemble:', model.targets.validate_model(final_ensemble_oof_preds, 'model'))

### 4. Finally, integrate the simple rankdata ensemble

Iterative search for the final blending weight.

In [None]:
# best_cv = 0.0
# best_final_weight = 0.0

# for weight in np.arange(0, 1.05, 0.05):  # 1.0 を含むために 1.05 まで
#     very_final_ensemble_oof_preds = final_ensemble_oof_preds + weight * rank_ensemble_oof_preds
#     cv_score = model.targets.validate_model(very_final_ensemble_oof_preds, 'model')
#     print(f'Weight: {weight:.2f}, CV Score: {cv_score}')
#     if cv_score > best_cv:
#         best_cv = cv_score
#         best_final_weight = weight

# print('最良のfinal_weight:', best_final_weight)
# print('最良のCV Score:', best_cv)


In [None]:
# very_final_ensemble_oof_preds = final_ensemble_oof_preds + best_final_weight * rank_ensemble_oof_preds
# print('Overall CV for very final ensemble:', model.targets.validate_model(very_final_ensemble_oof_preds, 'model'))

## Submission

In [None]:
# Build a DataFrame from the Elastic Net predictions on the test data.
en_target_cols = ['target1', 'target2', 'target3', 'target4',
                  'target5', 'target6', 'target7', 'target8']

df = pd.DataFrame(np.array(en_preds), columns=en_target_cols)
df['cls'] = cls_preds
df['race_group'] = test_data['race_group'].values

# Naive post-processing ladder: each `cls` cutoff subtracts its own step, and the
# steps accumulate (a row with cls < 0.1 receives all four decrements).
cls_cutoffs = (0.4, 0.3, 0.2, 0.1)
steps_for_targets_1_4_8 = (0.1, 0.5, 0.3, 0.2)
steps_for_target_5 = (0.001, 0.005, 0.003, 0.002)
steps_for_other_targets = (0.01, 0.05, 0.03, 0.02)

# Adjust every target (per-race parameters unless the naive ladder is enabled), then rank it.
# NOTE: the loop variable must remain named `col`; apply_adjustment reads it as a global.
for col in en_target_cols:
    if not use_naive_post_processing:
        df[col] = df.apply(apply_adjustment, axis=1)
    else:
        if col in ['target1', 'target4', 'target8']:
            naive_steps = steps_for_targets_1_4_8
        elif col == 'target5':
            naive_steps = steps_for_target_5
        else:
            naive_steps = steps_for_other_targets
        for cls_cutoff, naive_step in zip(cls_cutoffs, naive_steps):
            df.loc[df.cls < cls_cutoff, col] -= naive_step
    df[col] = rankdata(df[col].values)

# Compute the ensemble prediction from the ranked target columns.
ranked_preds = df[en_target_cols].values
ensemble_preds = np.dot(best_weights, ranked_preds.T)

# Combine the cls, ensemble and (negated) efs1 predictions.
preds = [
    cls_preds,
    ensemble_preds,
    -nn_efs1_preds,
]
df = pd.DataFrame(np.array(preds).T, columns=['cls', 'ens', 'efs1'])
# Attach race groups positionally (as done for the Elastic Net frame); pd.concat(axis=1)
# would align on index labels and misplace rows if test_data's index is not 0..n-1.
df['race_group'] = test_data['race_group'].values
# Float from the start: the column is filled with (fractional) rank values below.
df['adjusted'] = 0.0

w = 0.25  # weight of the efs=1 NN rank added to the ensemble rank above the threshold

# Per race: adjust according to the cls threshold and re-rank.
for race in df.race_group.unique():
    race_ind = df['race_group'] == race
    # `cls` is never modified in this loop, so both masks can be computed once per race.
    below_thresh = race_ind & (df.cls < race_thresh_dict[race])
    above_thresh = race_ind & (df.cls >= race_thresh_dict[race])

    for score_col in ['ens', 'efs1']:
        # Rank within the race group.
        df.loc[race_ind, score_col] = rankdata(df.loc[race_ind, score_col])
        # Push below-threshold rows far under every above-threshold row.
        df.loc[below_thresh, score_col] -= 1000000000
        # Re-rank the above-threshold rows among themselves.
        df.loc[above_thresh, score_col] = rankdata(df.loc[above_thresh, score_col])

    # Below the threshold only the ensemble decides the order; above it the
    # efs=1 NN rank is mixed in with weight `w`.
    df.loc[race_ind, 'adjusted'] = df.loc[race_ind, 'ens']
    df.loc[above_thresh, 'adjusted'] = (
        df.loc[above_thresh, 'ens'] + w * df.loc[above_thresh, 'efs1']
    )

adjusted_ensemble_preds = rankdata(df['adjusted'].values)

# Build the final prediction from cls, the adjusted ensemble and the NN
# predictions (the autogluon input is currently disabled).
preds = [
    cls_preds,
    adjusted_ensemble_preds,
    -nn_w_preds,
    # autogluon_1_preds
]
df = pd.DataFrame(np.array(preds).T, columns=['cls', 'ens', 'nn'])
# Attach race groups positionally (as done for the Elastic Net frame); pd.concat(axis=1)
# would align on index labels and misplace rows if test_data's index is not 0..n-1.
df['race_group'] = test_data['race_group'].values

# Rank both inputs within each race group.
for race in df.race_group.unique():
    race_ind = df['race_group'] == race
    df.loc[race_ind, 'ens'] = rankdata(df.loc[race_ind, 'ens'])
    df.loc[race_ind, 'nn'] = rankdata(df.loc[race_ind, 'nn'])
    # df.loc[race_ind, 'auto1'] = rankdata(df.loc[race_ind, 'auto1'])

# The blend is a whole-frame operation, so compute it once after every race
# group has been ranked instead of recomputing it on each loop iteration.
nn_blend_weight = 0.2
df['final'] = df['ens'] + nn_blend_weight * df['nn']

final_ensemble_preds = rankdata(df['final'])

# very_final_ensemble_preds = final_ensemble_preds + best_final_weight * rank_ensemble_preds


In [None]:
# Load the file at Config.subm_path (presumably the sample submission — confirm),
# set its `prediction` column to the final test ranking, and save it.
subm_data = pd.read_csv(Config.subm_path).assign(prediction=final_ensemble_preds)

subm_data.to_csv('submission.csv', index=False)
display(subm_data.head())