In [1]:

import numpy as np
import pandas as pd 
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

# Пути к файлам

In [2]:

# All three CSV files live in the same input directory.
_INPUT_DIR = "/kaggle/input/equity-post-HCT-survival-predictions"

train_path = _INPUT_DIR + "/train.csv"
test_path = _INPUT_DIR + "/test.csv"
submission_path = _INPUT_DIR + "/sample_submission.csv"

# Загрузка данных

In [3]:
# Read the train and test tables (in that order) into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

In [4]:
# Identifier and target columns that are excluded from the raw feature list.
RMV = ["ID", "efs", "efs_time"]
FEATURES = [column for column in train_data.columns if column not in RMV]
print(f"Здесь {len(FEATURES)} Фичей: {FEATURES}")

Здесь 57 Фичей: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_10']


# Преобразование категориальных фичей

In [5]:
# Points added to the HCT-CI score by a "Yes" answer in each comorbidity
# column. "No" and "Not done" always contribute 0.
_HCT_CI_YES_POINTS = {
    "arrhythmia": 1,
    "cardiac": 1,
    "diabetes": 1,
    "hepatic_mild": 1,
    "hepatic_severe": 3,
    "psych_disturb": 1,
    "obesity": 1,
    "rheum_issue": 2,
    "peptic_ulcer": 2,
    "renal_issue": 2,
    "prior_tumor": 3,
    "pulm_moderate": 2,
    "pulm_severe": 3,
}

# column name -> {answer -> points}; consumed by calculate_hct_ci_score.
hct_ci_mapping = {
    column: {"No": 0, "Not done": 0, "Yes": points}
    for column, points in _HCT_CI_YES_POINTS.items()
}
def calculate_hct_ci_score(row, mapping):
    """
    Compute the HCT-CI comorbidity score for one patient.

    The hepatic and pulmonary comorbidities each come as two graded columns
    (mild/moderate and severe). Only one column of each pair is counted, and
    the severe one takes precedence when both are "Yes". Every other
    condition in ``mapping`` is scored independently.

    Args:
        row (pd.Series or dict-like): Patient clinical data, keyed by column
            name. Columns that are absent are skipped.
        mapping (dict): HCT-CI score mapping, ``{column: {answer: points}}``.
            Answers that are not in a column's mapping (including NaN)
            contribute 0.

    Returns:
        int: HCT-CI score
    """
    # (severe column, milder column): the severe grade must be tested first
    # so that a row flagged in both columns gets the higher weight.
    graded_pairs = (
        ("hepatic_severe", "hepatic_mild"),
        ("pulm_severe", "pulm_moderate"),
    )

    score = 0
    for severe, milder in graded_pairs:
        if severe in row and row[severe] == "Yes":
            score += mapping[severe]["Yes"]
        elif milder in row and row[milder] == "Yes":
            score += mapping[milder]["Yes"]

    # Other conditions: each one simply adds its mapped points.
    graded_columns = {column for pair in graded_pairs for column in pair}
    for condition, mapping_values in mapping.items():
        if condition not in graded_columns and condition in row:
            score += mapping_values.get(row[condition], 0)

    return score

In [6]:
def cat2num(df):
    """
    Replace five ordinal text columns with numeric codes.

    The frame is modified in place and also returned. Labels mapped to
    ``None``, and labels that are not listed at all, end up as missing
    values in the resulting column.

    Args:
        df (pd.DataFrame): Frame containing the columns
            ``conditioning_intensity``, ``tbi_status``, ``dri_score``,
            ``cyto_score`` and ``cyto_score_detail``.

    Returns:
        pd.DataFrame: The same ``df`` object with those columns recoded.
    """
    # column name -> {text label -> numeric code}
    ordinal_codes = {
        'conditioning_intensity': {
            'NMA': 1,
            'RIC': 2,
            'MAC': 3,
            'TBD': None,
            'No drugs reported': None,
            'N/A, F(pre-TED) not submitted': None,
        },
        'tbi_status': {
            'No TBI': 0,
            'TBI +- Other, <=cGy': 1,
            'TBI +- Other, -cGy, fractionated': 2,
            'TBI + Cy +- Other': 3,
            'TBI +- Other, -cGy, single': 4,
            'TBI +- Other, >cGy': 5,
            'TBI +- Other, unknown dose': None,
        },
        'dri_score': {
            'Low': 1,
            'Intermediate': 2,
            'Intermediate - TED AML case <missing cytogenetics': 3,
            'High': 4,
            'High - TED AML case <missing cytogenetics': 5,
            'Very High': 6,
            'N/A - pediatric': -3,
            'N/A - non-malignant indication': -1,
            'TBD cytogenetics': -2,
            'N/A - disease not classifiable': -4,
            'Missing disease status': 0,
        },
        'cyto_score': {
            'Poor': 4,
            'Normal': 3,
            'Intermediate': 2,
            'Favorable': 1,
            'TBD': -1,
            'Other': -2,
            'Not tested': None,
        },
        'cyto_score_detail': {
            'Poor': 3,
            'Intermediate': 2,
            'Favorable': 1,
            'TBD': -1,
            'Not tested': None,
        },
    }

    for column, codes in ordinal_codes.items():
        df[column] = df[column].map(codes)

    return df

In [7]:
def fill_hla_combined_low(row):
    """
    Impute a missing ``hla_combined_low`` value for one row.

    A present value is returned unchanged. Otherwise the fallbacks are
    tried in this order:

    1. sum of the five low-resolution locus matches (DRB1, DQB1, A, B, C)
       when all five are present;
    2. ``hla_low_res_8 + hla_match_dqb1_low`` when both are present;
    3. if ``hla_low_res_6`` is present: ``hla_low_res_6`` plus DQB1 and C
       when both are present, else the sum of whichever of the five locus
       matches are present;
    4. otherwise the value stays missing.

    Args:
        row: Mapping/Series with the ``hla_*`` columns read below.

    Returns:
        The existing or imputed ``hla_combined_low`` value (may be NaN).
    """
    current = row['hla_combined_low']
    if not np.isnan(current):
        return current

    locus_columns = (
        'hla_match_drb1_low', 'hla_match_dqb1_low',
        'hla_match_a_low', 'hla_match_b_low', 'hla_match_c_low',
    )
    locus_values = [row[name] for name in locus_columns]
    available = [value for value in locus_values if not np.isnan(value)]
    if len(available) == len(locus_values):
        return sum(locus_values)

    dqb1 = row['hla_match_dqb1_low']
    low_res_8 = row['hla_low_res_8']
    if not (np.isnan(low_res_8) or np.isnan(dqb1)):
        return low_res_8 + dqb1

    low_res_6 = row['hla_low_res_6']
    if np.isnan(low_res_6):
        # No usable fallback: leave the value missing.
        return current

    c_low = row['hla_match_c_low']
    if np.isnan(dqb1) or np.isnan(c_low):
        return sum(available)
    return low_res_6 + sum([dqb1, c_low])

In [8]:
def add_features(df):
    """
    Add the engineered columns to ``df`` in place and return it.

    Steps, in order:
      * ``hct_ci_score`` from the raw comorbidity answers;
      * ``donor_recipient_age_diff`` as the absolute age gap;
      * ordinal recoding of the text columns handled by ``cat2num``;
      * ``hla_combined_low`` seeded from ``hla_low_res_10`` and then imputed
        row by row;
      * ``hla_match_ratio`` and ``years_since_2000``;
      * ``null_count``, counted over every column present at that point
        (raw, recoded and the engineered columns added above);
      * ``ci_score_danger``: 0 / 1 / 2 for scores of 0, 1-2 and 3+.

    Because ``cat2num`` recodes text labels, calling this function twice on
    the same frame would turn the already-recoded columns into missing
    values.

    Args:
        df (pd.DataFrame): Raw train or test frame.

    Returns:
        pd.DataFrame: The same frame object with the extra columns.
    """
    def _danger_level(score):
        # Bucket the comorbidity score into three risk levels.
        if score >= 3:
            return 2
        if score >= 1:
            return 1
        return 0

    df["hct_ci_score"] = df.apply(
        calculate_hct_ci_score, axis=1, args=(hct_ci_mapping,)
    )
    df['donor_recipient_age_diff'] = (df['donor_age'] - df['age_at_hct']).abs()
    df = cat2num(df)

    df['hla_combined_low'] = df['hla_low_res_10']
    df['hla_combined_low'] = df.apply(fill_hla_combined_low, axis=1)

    df['hla_match_ratio'] = df['hla_high_res_8'].add(df['hla_low_res_8']).div(16)
    df['years_since_2000'] = df['year_hct'].sub(2000)
    df['null_count'] = df.isna().sum(axis=1)
    df['ci_score_danger'] = df['hct_ci_score'].apply(_danger_level)
    return df

# add_features modifies its argument and returns it, so `train` / `test`
# refer to the same objects as `train_data` / `test_data`.
train, test = add_features(train_data), add_features(test_data)

In [9]:
# Collect the raw feature columns that are still object-typed and replace
# their missing values with the literal string "NAN" in both frames.
CATS = []
for c in FEATURES:
    if train[c].dtype != "object":
        continue
    CATS.append(c)
    for frame in (train, test):
        frame[c] = frame[c].fillna("NAN")
print(f"Среди данных фич, {len(CATS)} Категориальных фич: {CATS}")

Среди данных фич, 30 Категориальных фич: ['psych_disturb', 'diabetes', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'cardiac', 'pulm_moderate']


In [10]:
# Inspect the training frame after feature engineering and "NAN" filling.
train_data

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,hla_low_res_10,efs,efs_time,hct_ci_score,donor_recipient_age_diff,hla_combined_low,hla_match_ratio,years_since_2000,null_count,ci_score_danger
0,0,-1.0,No,,No,,,0.0,No,6.0,...,10.0,0.0,42.356,0,,10.0,,16,13,0
1,1,2.0,No,2.0,No,2.0,8.0,5.0,No,6.0,...,10.0,1.0,4.672,2,28.585,10.0,1.000,8,0,1
2,2,-1.0,No,,No,2.0,8.0,0.0,No,6.0,...,10.0,0.0,19.793,0,,10.0,1.000,19,7,0
3,3,4.0,No,2.0,No,2.0,8.0,0.0,No,6.0,...,10.0,0.0,102.349,1,14.015,10.0,1.000,9,0,1
4,4,4.0,No,,No,2.0,8.0,0.0,No,6.0,...,10.0,0.0,16.223,0,27.070,10.0,1.000,18,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28795,28795,3.0,NAN,1.0,No,2.0,8.0,0.0,No,6.0,...,10.0,0.0,18.633,3,26.924,10.0,1.000,18,7,2
28796,28796,4.0,No,4.0,Yes,1.0,4.0,0.0,No,5.0,...,8.0,1.0,4.892,4,12.695,8.0,0.625,17,1,2
28797,28797,-2.0,NAN,4.0,NAN,2.0,8.0,0.0,NAN,6.0,...,10.0,0.0,23.157,0,28.378,10.0,1.000,18,11,0
28798,28798,-1.0,No,4.0,No,1.0,4.0,0.0,No,3.0,...,5.0,0.0,52.351,3,58.030,5.0,0.500,18,7,2


In [11]:
# Target column and model inputs: every column of the engineered frame
# except the identifier and the two outcome columns.
target = "efs"
features = train_data.columns[
    ~train_data.columns.isin(["ID", "efs", "efs_time"])
].tolist()


In [12]:
# Number of model input columns (every column except ID, efs and efs_time).
len(features)

64

In [13]:
# Names of all object-dtype columns; these are passed to the models as
# categorical features.
cat_features = list(train_data.select_dtypes(include=["object"]).columns)

In [14]:
# Number of object-dtype columns treated as categorical features.
len(cat_features)

30

In [15]:
# Replace every remaining missing value with the sentinel -999, in place
# (in place so that other names bound to these frames see the change too).
for frame in (train_data, test_data):
    frame.fillna(-999, inplace=True)

In [16]:
# Independent copies of both frames for the label-encoded (XGBoost) pipeline;
# the originals keep their string categories for CatBoost.
train_data_xgb, test_data_xgb = train_data.copy(), test_data.copy()

In [17]:
# Cast the categorical columns of the XGBoost frames to plain strings
# (any value still missing becomes the label "missing").
for frame in (train_data_xgb, test_data_xgb):
    for col in cat_features:
        frame[col] = frame[col].fillna("missing").astype(str)


# Разделение на обучающую и валидационную выборки

In [18]:
# X_train, X_valid, y_train, y_valid = train_test_split(
#     train_data[features], train_data[target], test_size=0.2, random_state=42
# )

# Обучение модели Catboost

In [None]:
# model = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6, cat_features=cat_features, verbose=100)
# model.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=50)

# Кросс валидация

In [None]:
# 5-fold shuffled split plus one running test-prediction average per model.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
n_test_rows = test_data.shape[0]
cat_preds, xgb_preds = np.zeros(n_test_rows), np.zeros(n_test_rows)

In [None]:
# Label-encode the categorical columns of the XGBoost frames.
# Each encoder is fitted on the union of train and test values so that a
# category occurring only in the test set does not make `transform` raise
# ValueError. When the test set has no unseen categories the resulting codes
# are identical to fitting on train alone.
label_encoders = {}
for col in cat_features:
    le = LabelEncoder()
    le.fit(pd.concat([train_data_xgb[col], test_data_xgb[col]], ignore_index=True))
    train_data_xgb[col] = le.transform(train_data_xgb[col])
    test_data_xgb[col] = le.transform(test_data_xgb[col])
    label_encoders[col] = le  # kept so the codes can be mapped back later

In [None]:
# Fold-invariant CatBoost hyperparameters, defined once instead of per fold.
cat_params = {
    'depth': 6,
    'learning_rate': 0.04699005545173896,
    'l2_leaf_reg': 6.853082507365295,
    'colsample_bylevel': 0.9312642681213008,
    'min_data_in_leaf': 14,
    'grow_policy': 'Depthwise',
    'bootstrap_type': 'Bernoulli',
    'iterations': 1727,
}

# Train one CatBoost and one XGBoost model per fold. Each fold's test
# prediction contributes 1 / n_splits to the running average.
for train_idx, val_idx in kf.split(train_data):
    X_train, X_valid = train_data.iloc[train_idx][features], train_data.iloc[val_idx][features]
    y_train, y_valid = train_data.iloc[train_idx][target], train_data.iloc[val_idx][target]

    # CatBoost (consumes the string categories directly)
    cat_model = CatBoostRegressor(
        **cat_params,
        cat_features=cat_features,
        verbose=100)
    cat_model.fit(
        Pool(X_train, y_train, cat_features=cat_features),
        eval_set=Pool(X_valid, y_valid, cat_features=cat_features),
        early_stopping_rounds=50)
    cat_preds += cat_model.predict(test_data[features]) / kf.n_splits

    # XGBoost (works on the label-encoded frames)
    X_train_xgb, X_valid_xgb = train_data_xgb.iloc[train_idx][features], train_data_xgb.iloc[val_idx][features]
    xgb_model = XGBRegressor(
        device="cpu",
        max_depth=5,
        colsample_bytree=0.4309907360736148,
        subsample=0.6727848987288046,
        n_estimators=10_000,
        learning_rate=0.03509792076095853,
        eval_metric="mae",
        # Early stopping is configured here only: fit() no longer accepts
        # `early_stopping_rounds` in XGBoost 2.x, and older releases reject
        # two conflicting values. 50 matches the value previously passed to
        # fit() and the one used by the four-model cell below.
        early_stopping_rounds=50,
        objective='reg:logistic',
        enable_categorical=True,
        min_child_weight=10,
        reg_alpha=2.950200470036872,
        reg_lambda=1.484334590329492,
        gamma=0.008314053362236895
    )
    xgb_model.fit(X_train_xgb, y_train, eval_set=[(X_valid_xgb, y_valid)], verbose=100)
    xgb_preds += xgb_model.predict(test_data_xgb[features]) / kf.n_splits

# Ансамблирование

In [None]:
# Equal-weight average of the two models' test predictions.
final_preds = 0.5 * (cat_preds + xgb_preds)

# Предсказание на тестовых данных

In [None]:
#test_predictions = model.predict(test_data[features])

# Формирование submission

In [None]:

# Two-column submission (ID, prediction) written without the index.
submission = test_data[["ID"]].assign(prediction=final_preds)
submission.to_csv("submission.csv", index=False)

In [None]:
# Preview the submission frame that was written to submission.csv.
submission

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
import lightgbm as lgb

# ---------------------------------------------------------------------------
# Four-model ensemble: CatBoost + XGBoost + LightGBM (gbdt) + LightGBM (dart)
# ---------------------------------------------------------------------------

# Rebuild the label-encoded frames from the string-category frames so this
# cell produces the same encoding however often it is run. Re-encoding frames
# that were already integer-encoded would stringify the codes and reshuffle
# them.
train_data_xgb = train_data.copy()
test_data_xgb = test_data.copy()

# Label encoding for XGBoost and LightGBM. Each encoder is fitted on the
# union of train and test values so a test-only category cannot make
# `transform` raise.
label_encoders = {}
for col in cat_features:
    train_data_xgb[col] = train_data_xgb[col].fillna("missing").astype(str)
    test_data_xgb[col] = test_data_xgb[col].fillna("missing").astype(str)
    le = LabelEncoder()
    le.fit(pd.concat([train_data_xgb[col], test_data_xgb[col]], ignore_index=True))
    train_data_xgb[col] = le.transform(train_data_xgb[col])
    test_data_xgb[col] = le.transform(test_data_xgb[col])
    label_encoders[col] = le

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Running averages of the per-fold test predictions, one per model.
cat_preds = np.zeros(len(test_data))
xgb_preds = np.zeros(len(test_data))
lgb_preds1 = np.zeros(len(test_data))
lgb_preds2 = np.zeros(len(test_data))

# Hyperparameters do not change between folds, so they are defined once.
cat_params = {
    'depth': 6,
    'learning_rate': 0.04699005545173896,
    'l2_leaf_reg': 6.853082507365295,
    'colsample_bylevel': 0.9312642681213008,
    'min_data_in_leaf': 14,
    'grow_policy': 'Depthwise',
    'bootstrap_type': 'Bernoulli',
    'iterations': 1727
}
xgb_params = {
    'device': "cpu",
    'max_depth': 5,
    'colsample_bytree': 0.4309907360736148,
    'subsample': 0.6727848987288046,
    'n_estimators': 10_000,
    'learning_rate': 0.03509792076095853,
    'eval_metric': "mae",
    'early_stopping_rounds': 50,
    'objective': 'reg:logistic',
    'enable_categorical': True,
    'min_child_weight': 10,
    'reg_alpha': 2.950200470036872,
    'reg_lambda': 1.484334590329492,
    'gamma': 0.008314053362236895
}
lgb_params1 = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 31,
    'metric': 'mae',
    'verbose': -1,
    'seed': 42
}
lgb_params2 = {
    'objective': 'regression',
    'boosting_type': 'dart',
    'learning_rate': 0.01,
    'num_leaves': 31,
    'metric': 'mae',
    'verbose': -1,
    'seed': 42
}

for train_idx, val_idx in kf.split(train_data):
    # Split shared by all models
    X_train = train_data.iloc[train_idx][features]
    X_valid = train_data.iloc[val_idx][features]
    y_train = train_data.iloc[train_idx][target]
    y_valid = train_data.iloc[val_idx][target]

    # XGBoost and LightGBM use the label-encoded frames
    X_train_enc = train_data_xgb.iloc[train_idx][features]
    X_valid_enc = train_data_xgb.iloc[val_idx][features]

    # ---------------------------
    # CatBoost model
    cat_model = CatBoostRegressor(
        **cat_params,
        cat_features=cat_features,
        verbose=100
    )
    cat_model.fit(Pool(X_train, y_train, cat_features=cat_features),
                  eval_set=Pool(X_valid, y_valid, cat_features=cat_features),
                  early_stopping_rounds=50)
    cat_preds += cat_model.predict(test_data[features]) / kf.n_splits

    # ---------------------------
    # XGBoost model (early stopping is configured in the constructor)
    xgb_model = XGBRegressor(**xgb_params)
    xgb_model.fit(X_train_enc, y_train, eval_set=[(X_valid_enc, y_valid)], verbose=100)
    xgb_preds += xgb_model.predict(test_data_xgb[features]) / kf.n_splits

    # ---------------------------
    # LightGBM model 1 (boosting: gbdt)
    # Early stopping and periodic logging go through callbacks; the
    # `early_stopping_rounds` / `verbose` arguments of fit() were removed
    # in LightGBM 4.
    lgb_model1 = lgb.LGBMRegressor(**lgb_params1, n_estimators=10_000)
    lgb_model1.fit(X_train_enc, y_train,
                   eval_set=[(X_valid_enc, y_valid)],
                   callbacks=[lgb.early_stopping(stopping_rounds=50),
                              lgb.log_evaluation(period=100)])
    lgb_preds1 += lgb_model1.predict(test_data_xgb[features]) / kf.n_splits

    # ---------------------------
    # LightGBM model 2 (boosting: dart)
    # NOTE(review): LightGBM does not support early stopping in dart mode, so
    # this model always trains the full 10_000 rounds per fold. Consider
    # lowering n_estimators if the run time is a problem.
    lgb_model2 = lgb.LGBMRegressor(**lgb_params2, n_estimators=10_000)
    lgb_model2.fit(X_train_enc, y_train,
                   eval_set=[(X_valid_enc, y_valid)],
                   callbacks=[lgb.log_evaluation(period=100)])
    lgb_preds2 += lgb_model2.predict(test_data_xgb[features]) / kf.n_splits

# ---------------------------
# Equal-weight average of the predictions from all 4 models
final_preds = (cat_preds + xgb_preds + lgb_preds1 + lgb_preds2) / 4

submission = pd.DataFrame({"ID": test_data["ID"], "prediction": final_preds})
submission.to_csv("submission.csv", index=False)


0:	learn: 0.4956917	test: 0.4963420	best: 0.4963420 (0)	total: 39.8ms	remaining: 1m 8s
100:	learn: 0.4477689	test: 0.4540392	best: 0.4540392 (100)	total: 3.19s	remaining: 51.3s
200:	learn: 0.4395835	test: 0.4506268	best: 0.4506268 (200)	total: 6.19s	remaining: 47s
