In [17]:

import numpy as np
import pandas as pd 
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

# Пути к файлам

In [18]:

# Kaggle input locations of the competition's train table, test table and sample submission.
train_path = "/kaggle/input/equity-post-HCT-survival-predictions/train.csv"
test_path = "/kaggle/input/equity-post-HCT-survival-predictions/test.csv"
# NOTE(review): submission_path is not read by any of the visible cells.
submission_path = "/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv"

# Загрузка данных

In [19]:
# Read the raw train and test tables; later cells modify both frames in place.
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [20]:
# Columns that are never used as model inputs: the row id and the outcome columns.
RMV = ["ID", "efs", "efs_time"]
# Every other column of the raw train table, kept in file order.
FEATURES = [column for column in train_data.columns if column not in RMV]
print(f"Здесь {len(FEATURES)} Фичей: {FEATURES}")

Здесь 57 Фичей: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_10']


# Преобразование категориальных фичей

In [21]:
# Points contributed by each comorbidity column to the HCT-CI score.
# For every condition the answers "No" and "Not done" are worth 0 points and
# "Yes" is worth the weight listed below.
hct_ci_mapping = {
    condition: {"No": 0, "Not done": 0, "Yes": points}
    for condition, points in (
        ("arrhythmia", 1),
        ("cardiac", 1),
        ("diabetes", 1),
        ("hepatic_mild", 1),
        ("hepatic_severe", 3),
        ("psych_disturb", 1),
        ("obesity", 1),
        ("rheum_issue", 2),
        ("peptic_ulcer", 2),
        ("renal_issue", 2),
        ("prior_tumor", 3),
        ("pulm_moderate", 2),
        ("pulm_severe", 3),
    )
}
def calculate_hct_ci_score(row, mapping):
    """
    Compute an HCT-CI style comorbidity score for one patient.

    The hepatic (mild / severe) and pulmonary (moderate / severe) columns are
    graded pairs: at most one grade per pair is counted, and the severe grade
    takes precedence when both are flagged "Yes". Every other condition in
    ``mapping`` contributes the points mapped to the row's answer, or 0 when
    the answer has no entry in the mapping (e.g. a missing value).

    Args:
        row (pd.Series or dict-like): Patient clinical data keyed by column
            name. Conditions absent from ``row`` are skipped.
        mapping (dict): ``{condition: {answer: points}}`` score table.

    Returns:
        int: The summed score.
    """
    # Each pair is listed most-severe first so the first "Yes" found wins.
    graded_pairs = (
        ("hepatic_severe", "hepatic_mild"),
        ("pulm_severe", "pulm_moderate"),
    )

    score = 0
    graded_conditions = set()
    for pair in graded_pairs:
        graded_conditions.update(pair)
        for condition in pair:
            if condition in row and row[condition] == "Yes":
                score += mapping[condition]["Yes"]
                break

    # Remaining conditions are independent of each other and simply add up.
    for condition, mapping_values in mapping.items():
        if condition not in graded_conditions and condition in row:
            score += mapping_values.get(row[condition], 0)

    return score

In [22]:
def cat2num(df):
    """
    Replace five ordinal clinical string columns with numeric codes.

    Each column is passed through ``Series.map`` with a fixed lookup table.
    Labels mapped to ``None`` and labels that do not appear in the table both
    end up as missing values.

    Args:
        df (pd.DataFrame): Frame containing ``conditioning_intensity``,
            ``tbi_status``, ``dri_score``, ``cyto_score`` and
            ``cyto_score_detail``. It is modified in place.

    Returns:
        pd.DataFrame: The same ``df`` object, for call chaining.
    """
    ordinal_codes = {
        'conditioning_intensity': {
            'NMA': 1,
            'RIC': 2,
            'MAC': 3,
            'TBD': None,
            'No drugs reported': None,
            'N/A, F(pre-TED) not submitted': None,
        },
        'tbi_status': {
            'No TBI': 0,
            'TBI +- Other, <=cGy': 1,
            'TBI +- Other, -cGy, fractionated': 2,
            'TBI + Cy +- Other': 3,
            'TBI +- Other, -cGy, single': 4,
            'TBI +- Other, >cGy': 5,
            'TBI +- Other, unknown dose': None,
        },
        'dri_score': {
            'Low': 1,
            'Intermediate': 2,
            'Intermediate - TED AML case <missing cytogenetics': 3,
            'High': 4,
            'High - TED AML case <missing cytogenetics': 5,
            'Very High': 6,
            'N/A - pediatric': -3,
            'N/A - non-malignant indication': -1,
            'TBD cytogenetics': -2,
            'N/A - disease not classifiable': -4,
            'Missing disease status': 0,
        },
        'cyto_score': {
            'Poor': 4,
            'Normal': 3,
            'Intermediate': 2,
            'Favorable': 1,
            'TBD': -1,
            'Other': -2,
            'Not tested': None,
        },
        'cyto_score_detail': {
            'Poor': 3,
            'Intermediate': 2,
            'Favorable': 1,
            'TBD': -1,
            'Not tested': None,
        },
    }

    # Columns are converted one after another, in the order listed above.
    for column, codes in ordinal_codes.items():
        df[column] = df[column].map(codes)

    return df

In [23]:
def fill_hla_combined_low(row):
    """
    Return a value for ``hla_combined_low``, reconstructing it when it is NaN.

    An existing (non-NaN) ``hla_combined_low`` is returned unchanged.
    Otherwise the first applicable rule wins:

    1. all five low-resolution locus matches (DRB1, DQB1, A, B, C) are
       present -> their sum;
    2. ``hla_low_res_8`` and ``hla_match_dqb1_low`` are present -> their sum;
    3. ``hla_low_res_6`` is present -> ``hla_low_res_6`` plus the DQB1 and C
       matches when both are present, else the sum of whichever of the five
       locus matches are present;
    4. none of the above -> the value stays NaN.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with numeric HLA
            columns.

    Returns:
        The existing or reconstructed combined low-resolution match count.
    """
    combined = row['hla_combined_low']
    if not np.isnan(combined):
        return combined

    def present(value):
        return not np.isnan(value)

    drb1 = row['hla_match_drb1_low']
    dqb1 = row['hla_match_dqb1_low']
    a_low = row['hla_match_a_low']
    b_low = row['hla_match_b_low']
    c_low = row['hla_match_c_low']
    loci = [drb1, dqb1, a_low, b_low, c_low]

    # Rule 1: every individual locus is known.
    if all([present(value) for value in loci]):
        return sum(loci)

    # Rule 2: the 8-point aggregate plus DQB1.
    low_res_8 = row['hla_low_res_8']
    if present(low_res_8) and present(dqb1):
        return low_res_8 + dqb1

    # Rule 3: the 6-point aggregate, topped up with DQB1 and C when available.
    low_res_6 = row['hla_low_res_6']
    if present(low_res_6):
        extras = [dqb1, c_low]
        if all([present(value) for value in extras]):
            return low_res_6 + sum(extras)
        return sum([value for value in loci if present(value)])

    # Rule 4: nothing usable, keep the missing value.
    return combined

In [24]:
def add_features(df):
    """
    Add the engineered columns used by the models to ``df``.

    The frame is modified in place and also returned. Columns added:
    ``hct_ci_score``, ``donor_recipient_age_diff``, ``hla_combined_low``,
    ``hla_match_ratio``, ``years_since_2000``, ``null_count`` and
    ``ci_score_danger``. The ordinal string columns handled by ``cat2num``
    are converted to numeric codes along the way.

    Args:
        df (pd.DataFrame): Raw competition table.

    Returns:
        pd.DataFrame: The same ``df`` object with the extra columns.
    """
    def danger_level(score):
        # Three buckets: 0 for scores below 1, 1 for scores in [1, 3), 2 for 3 and above.
        if score >= 3:
            return 2
        if score >= 1:
            return 1
        return 0

    df["hct_ci_score"] = df.apply(calculate_hct_ci_score, axis=1, args=(hct_ci_mapping,))
    df['donor_recipient_age_diff'] = (df['donor_age'] - df['age_at_hct']).abs()
    df = cat2num(df)

    # Start from the 10-point aggregate and fill its gaps row by row.
    df['hla_combined_low'] = df['hla_low_res_10']
    df['hla_combined_low'] = df.apply(fill_hla_combined_low, axis=1)

    df['hla_match_ratio'] = (df['hla_high_res_8'] + df['hla_low_res_8']) / 16
    df['years_since_2000'] = df['year_hct'] - 2000

    # Counts missing values over every column present at this point,
    # including the columns added above.
    df['null_count'] = df.isna().sum(axis=1)
    df['ci_score_danger'] = df['hct_ci_score'].apply(danger_level)
    return df

# add_features() modifies its argument in place and returns it, so `train` / `test`
# are the same objects as `train_data` / `test_data`; later cells rely on that.
train = add_features(train_data)
test = add_features(test_data)

In [25]:
# Collect the feature columns stored as strings (object dtype in train) and
# replace their missing values with the literal label "NAN" in both frames.
CATS = []
for column in FEATURES:
    if train[column].dtype != "object":
        continue
    CATS.append(column)
    for frame in (train, test):
        frame[column] = frame[column].fillna("NAN")
print(f"Среди данных фич, {len(CATS)} Категориальных фич: {CATS}")

Среди данных фич, 30 Категориальных фич: ['psych_disturb', 'diabetes', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'cardiac', 'pulm_moderate']


In [26]:
# Shows the frame after the in-place feature engineering and "NAN" fill above
# (train_data and train are the same object).
train_data

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,hla_low_res_10,efs,efs_time,hct_ci_score,donor_recipient_age_diff,hla_combined_low,hla_match_ratio,years_since_2000,null_count,ci_score_danger
0,0,-1.0,No,,No,,,0.0,No,6.0,...,10.0,0.0,42.356,0,,10.0,,16,13,0
1,1,2.0,No,2.0,No,2.0,8.0,5.0,No,6.0,...,10.0,1.0,4.672,2,28.585,10.0,1.000,8,0,1
2,2,-1.0,No,,No,2.0,8.0,0.0,No,6.0,...,10.0,0.0,19.793,0,,10.0,1.000,19,7,0
3,3,4.0,No,2.0,No,2.0,8.0,0.0,No,6.0,...,10.0,0.0,102.349,1,14.015,10.0,1.000,9,0,1
4,4,4.0,No,,No,2.0,8.0,0.0,No,6.0,...,10.0,0.0,16.223,0,27.070,10.0,1.000,18,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28795,28795,3.0,NAN,1.0,No,2.0,8.0,0.0,No,6.0,...,10.0,0.0,18.633,3,26.924,10.0,1.000,18,7,2
28796,28796,4.0,No,4.0,Yes,1.0,4.0,0.0,No,5.0,...,8.0,1.0,4.892,4,12.695,8.0,0.625,17,1,2
28797,28797,-2.0,NAN,4.0,NAN,2.0,8.0,0.0,NAN,6.0,...,10.0,0.0,23.157,0,28.378,10.0,1.000,18,11,0
28798,28798,-1.0,No,4.0,No,1.0,4.0,0.0,No,3.0,...,5.0,0.0,52.351,3,58.030,5.0,0.500,18,7,2


In [27]:
# Target variable and model inputs.
# The inputs are every column of the engineered frame except the row id and
# the two outcome columns, so `efs_time` is neither a target nor a feature here.
target = "efs"
features = [col for col in train_data.columns if col not in ["ID", "efs", "efs_time"]]


In [29]:
# Sanity check: the 57 raw feature columns plus the 7 columns added by add_features().
len(features)

64

In [30]:
# Categorical columns = every column of train_data that still has object dtype.
cat_features = train_data.select_dtypes(include=["object"]).columns.tolist()

In [34]:
# Sanity check: should equal the number of categorical features reported earlier (CATS).
len(cat_features)

30

In [31]:
# Replace every remaining missing value with the sentinel -999, in place.
# The categorical feature columns were already filled with "NAN" above, so
# this is expected to affect numeric columns only.
train_data.fillna(-999, inplace=True)
test_data.fillna(-999, inplace=True)

In [32]:
# Make separate copies for XGBoost so label encoding does not touch the
# frames that CatBoost consumes with raw string categories.
train_data_xgb = train_data.copy()
test_data_xgb = test_data.copy()

In [35]:
# Convert the categorical columns of the XGBoost copies to plain strings.
# NOTE(review): if the -999 fill above has already run, no NaN is left and
# the fillna("missing") here changes nothing.
for col in cat_features:
    train_data_xgb[col] = train_data_xgb[col].fillna("missing").astype(str)
    test_data_xgb[col] = test_data_xgb[col].fillna("missing").astype(str)


# Разделение на обучающую и валидационную выборки

In [36]:
# X_train, X_valid, y_train, y_valid = train_test_split(
#     train_data[features], train_data[target], test_size=0.2, random_state=42
# )

# Обучение модели Catboost

In [37]:
# model = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6, cat_features=cat_features, verbose=100)
# model.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=50)

# Кросс валидация

In [38]:
# Label-encode the categorical columns of the XGBoost copies.
# Each encoder is fitted on the union of the train and test labels, so a
# category that occurs only in the test file gets its own code instead of
# making `transform` raise ValueError for an unseen label.
label_encoders = {}
for col in cat_features:
    le = LabelEncoder()
    le.fit(pd.concat([train_data_xgb[col], test_data_xgb[col]], ignore_index=True))
    train_data_xgb[col] = le.transform(train_data_xgb[col])
    test_data_xgb[col] = le.transform(test_data_xgb[col])
    # Keep the fitted encoder so the integer codes can be mapped back later.
    label_encoders[col] = le

In [39]:
# 5-fold shuffled split with a fixed seed, plus accumulators that will hold
# the fold-averaged test predictions of each model.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cat_preds = np.zeros(len(test_data))
xgb_preds = np.zeros(len(test_data))

In [40]:
# Train one CatBoost and one XGBoost model per fold; each fold adds 1/n_splits
# of its test prediction to the accumulators, giving the fold average.
for train_idx, val_idx in kf.split(train_data):
    X_train, X_valid = train_data.iloc[train_idx][features], train_data.iloc[val_idx][features]
    y_train, y_valid = train_data.iloc[train_idx][target], train_data.iloc[val_idx][target]

    # CatBoost (consumes the raw string categories)
    cat_model = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6, cat_features=cat_features, verbose=100)
    cat_model.fit(Pool(X_train, y_train, cat_features=cat_features), eval_set=Pool(X_valid, y_valid, cat_features=cat_features), early_stopping_rounds=50)
    cat_preds += cat_model.predict(test_data[features]) / kf.n_splits

    # XGBoost (works on the label-encoded copies)
    X_train_xgb, X_valid_xgb = train_data_xgb.iloc[train_idx][features], train_data_xgb.iloc[val_idx][features]
    # early_stopping_rounds is set on the estimator: passing it to fit() is
    # deprecated since XGBoost 1.6 and is not accepted by newer releases.
    xgb_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, max_depth=6, objective="reg:squarederror", eval_metric="rmse", early_stopping_rounds=50)
    xgb_model.fit(X_train_xgb, y_train, eval_set=[(X_valid_xgb, y_valid)], verbose=100)
    xgb_preds += xgb_model.predict(test_data_xgb[features]) / kf.n_splits

0:	learn: 0.4956728	test: 0.4962353	best: 0.4962353 (0)	total: 135ms	remaining: 2m 14s
100:	learn: 0.4484342	test: 0.4534428	best: 0.4534428 (100)	total: 7.02s	remaining: 1m 2s
200:	learn: 0.4412400	test: 0.4500004	best: 0.4500004 (200)	total: 13.5s	remaining: 53.7s
300:	learn: 0.4345282	test: 0.4482425	best: 0.4482191 (299)	total: 20.6s	remaining: 47.9s
400:	learn: 0.4298626	test: 0.4474071	best: 0.4473948 (399)	total: 27.3s	remaining: 40.8s
500:	learn: 0.4261466	test: 0.4471013	best: 0.4470937 (480)	total: 33.7s	remaining: 33.6s
600:	learn: 0.4225138	test: 0.4468820	best: 0.4468800 (596)	total: 40.4s	remaining: 26.8s
700:	learn: 0.4194171	test: 0.4467044	best: 0.4466666 (692)	total: 47.1s	remaining: 20.1s
800:	learn: 0.4165089	test: 0.4465697	best: 0.4465559 (765)	total: 54.4s	remaining: 13.5s
900:	learn: 0.4139308	test: 0.4463982	best: 0.4463959 (899)	total: 1m 1s	remaining: 6.73s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4463958715
bestIteration = 899

Shr



[100]	validation_0-rmse:0.45267
[200]	validation_0-rmse:0.45187
[248]	validation_0-rmse:0.45217
0:	learn: 0.4959962	test: 0.4948303	best: 0.4948303 (0)	total: 79.1ms	remaining: 1m 18s
100:	learn: 0.4490832	test: 0.4511778	best: 0.4511778 (100)	total: 6.56s	remaining: 58.4s
200:	learn: 0.4416795	test: 0.4476933	best: 0.4476933 (200)	total: 13.1s	remaining: 51.9s
300:	learn: 0.4348862	test: 0.4461429	best: 0.4461246 (297)	total: 20.4s	remaining: 47.5s
400:	learn: 0.4296792	test: 0.4453969	best: 0.4453955 (399)	total: 27.2s	remaining: 40.6s
500:	learn: 0.4255380	test: 0.4449543	best: 0.4449426 (499)	total: 33.9s	remaining: 33.8s
600:	learn: 0.4213973	test: 0.4447573	best: 0.4447485 (584)	total: 40.6s	remaining: 27s
700:	learn: 0.4178887	test: 0.4447196	best: 0.4446691 (676)	total: 47.3s	remaining: 20.2s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4445764192
bestIteration = 749

Shrink model to first 750 iterations.
[0]	validation_0-rmse:0.49435




[100]	validation_0-rmse:0.44988
[200]	validation_0-rmse:0.44812
[279]	validation_0-rmse:0.44860
0:	learn: 0.4958773	test: 0.4961152	best: 0.4961152 (0)	total: 72.3ms	remaining: 1m 12s
100:	learn: 0.4497297	test: 0.4495586	best: 0.4495586 (100)	total: 6.62s	remaining: 59s
200:	learn: 0.4421013	test: 0.4457835	best: 0.4457835 (200)	total: 13.1s	remaining: 52.2s
300:	learn: 0.4355811	test: 0.4439354	best: 0.4439354 (300)	total: 19.8s	remaining: 46s
400:	learn: 0.4308817	test: 0.4430175	best: 0.4430147 (388)	total: 27s	remaining: 40.3s
500:	learn: 0.4269243	test: 0.4426376	best: 0.4426376 (500)	total: 33.7s	remaining: 33.5s
600:	learn: 0.4234558	test: 0.4424224	best: 0.4423734 (586)	total: 40.2s	remaining: 26.7s
700:	learn: 0.4203235	test: 0.4422235	best: 0.4422161 (697)	total: 46.9s	remaining: 20s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4422161431
bestIteration = 697

Shrink model to first 698 iterations.
[0]	validation_0-rmse:0.49555




[100]	validation_0-rmse:0.44860
[200]	validation_0-rmse:0.44661
[251]	validation_0-rmse:0.44670
0:	learn: 0.4954219	test: 0.4961635	best: 0.4961635 (0)	total: 71.8ms	remaining: 1m 11s
100:	learn: 0.4476418	test: 0.4560502	best: 0.4560502 (100)	total: 6.96s	remaining: 1m 1s
200:	learn: 0.4403034	test: 0.4533444	best: 0.4533444 (200)	total: 13.3s	remaining: 52.9s
300:	learn: 0.4337253	test: 0.4519424	best: 0.4519326 (299)	total: 19.7s	remaining: 45.9s
400:	learn: 0.4294280	test: 0.4511333	best: 0.4511333 (400)	total: 26.6s	remaining: 39.7s
500:	learn: 0.4251268	test: 0.4507338	best: 0.4507148 (498)	total: 33.2s	remaining: 33s
600:	learn: 0.4216042	test: 0.4505304	best: 0.4504740 (586)	total: 40.2s	remaining: 26.7s
700:	learn: 0.4188201	test: 0.4504180	best: 0.4503964 (667)	total: 46.8s	remaining: 20s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4503853521
bestIteration = 714

Shrink model to first 715 iterations.
[0]	validation_0-rmse:0.49577




[100]	validation_0-rmse:0.45584
[199]	validation_0-rmse:0.45472
0:	learn: 0.4956422	test: 0.4958727	best: 0.4958727 (0)	total: 74.5ms	remaining: 1m 14s
100:	learn: 0.4477734	test: 0.4564605	best: 0.4564605 (100)	total: 6.47s	remaining: 57.6s
200:	learn: 0.4406113	test: 0.4534058	best: 0.4534058 (200)	total: 12.8s	remaining: 50.9s
300:	learn: 0.4336522	test: 0.4519343	best: 0.4519343 (300)	total: 19.9s	remaining: 46.2s
400:	learn: 0.4286809	test: 0.4513297	best: 0.4513285 (383)	total: 26.4s	remaining: 39.5s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4512864601
bestIteration = 425

Shrink model to first 426 iterations.
[0]	validation_0-rmse:0.49540




[100]	validation_0-rmse:0.45528
[200]	validation_0-rmse:0.45403
[249]	validation_0-rmse:0.45417


# Ансамблирование

In [41]:
# Average the two models' fold-averaged predictions with equal weight.
final_preds = (cat_preds + xgb_preds) / 2

# Предсказание на тестовых данных

In [42]:
#test_predictions = model.predict(test_data[features])

# Формирование submission

In [43]:

# Build the submission table (one prediction per test ID) and write it to the
# working directory without the index column.
submission = pd.DataFrame({"ID": test_data["ID"], "prediction": final_preds})
submission.to_csv("submission.csv", index=False)

In [44]:
# Display the submission table as a final visual check.
submission

Unnamed: 0,ID,prediction
0,28800,0.146889
1,28801,0.694377
2,28802,0.014893
