In [39]:

import numpy as np
import pandas as pd 
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

# Пути к файлам

In [40]:

train_path = "/kaggle/input/equity-post-HCT-survival-predictions/train.csv"
test_path = "/kaggle/input/equity-post-HCT-survival-predictions/test.csv"
submission_path = "/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv"

# Загрузка данных

In [41]:
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [42]:
# Целевая переменная и признаки
target = "efs"
features = [col for col in train_data.columns if col not in ["ID", "efs", "efs_time"]]


In [43]:
cat_features = train_data.select_dtypes(include=["object"]).columns.tolist()

In [44]:
train_data.fillna(-999, inplace=True)
test_data.fillna(-999, inplace=True)

In [45]:
# Создам копии данных для XGBoost
train_data_xgb = train_data.copy()
test_data_xgb = test_data.copy()

In [46]:
# Преобразую категориальные признаки
for col in cat_features:
    train_data_xgb[col] = train_data_xgb[col].fillna("missing").astype(str)
    test_data_xgb[col] = test_data_xgb[col].fillna("missing").astype(str)


# Разделение на обучающую и валидационную выборки

In [47]:
# X_train, X_valid, y_train, y_valid = train_test_split(
#     train_data[features], train_data[target], test_size=0.2, random_state=42
# )

# Обучение модели Catboost

In [48]:
# model = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6, cat_features=cat_features, verbose=100)
# model.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=50)

# Кросс валидация

In [49]:
# Label Encoding
label_encoders = {}
for col in cat_features:
    le = LabelEncoder()
    train_data_xgb[col] = le.fit_transform(train_data_xgb[col])
    test_data_xgb[col] = le.transform(test_data_xgb[col])
    label_encoders[col] = le

In [50]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cat_preds = np.zeros(len(test_data))
xgb_preds = np.zeros(len(test_data))

In [51]:
for train_idx, val_idx in kf.split(train_data):
    X_train, X_valid = train_data.iloc[train_idx][features], train_data.iloc[val_idx][features]
    y_train, y_valid = train_data.iloc[train_idx][target], train_data.iloc[val_idx][target]

    # CatBoost
    cat_model = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6, cat_features=cat_features, verbose=100)
    cat_model.fit(Pool(X_train, y_train, cat_features=cat_features), eval_set=Pool(X_valid, y_valid, cat_features=cat_features), early_stopping_rounds=50)
    cat_preds += cat_model.predict(test_data[features]) / kf.n_splits

    # XGBoost (работает с Label Encoded данными)
    X_train_xgb, X_valid_xgb = train_data_xgb.iloc[train_idx][features], train_data_xgb.iloc[val_idx][features]
    xgb_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, max_depth=6, objective="reg:squarederror", eval_metric="rmse")
    xgb_model.fit(X_train_xgb, y_train, eval_set=[(X_valid_xgb, y_valid)], early_stopping_rounds=50, verbose=100)
    xgb_preds += xgb_model.predict(test_data_xgb[features]) / kf.n_splits

0:	learn: 0.4955272	test: 0.4960645	best: 0.4960645 (0)	total: 59.3ms	remaining: 59.3s
100:	learn: 0.4478172	test: 0.4516996	best: 0.4516996 (100)	total: 6.11s	remaining: 54.4s
200:	learn: 0.4406517	test: 0.4485591	best: 0.4485591 (200)	total: 12.2s	remaining: 48.7s
300:	learn: 0.4340419	test: 0.4466285	best: 0.4466285 (300)	total: 18.4s	remaining: 42.7s
400:	learn: 0.4292495	test: 0.4459479	best: 0.4459479 (400)	total: 24.6s	remaining: 36.8s
500:	learn: 0.4256754	test: 0.4455471	best: 0.4455375 (498)	total: 31.4s	remaining: 31.3s
600:	learn: 0.4228411	test: 0.4453668	best: 0.4453337 (594)	total: 37.7s	remaining: 25s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4453337413
bestIteration = 594

Shrink model to first 595 iterations.
[0]	validation_0-rmse:0.49559




[100]	validation_0-rmse:0.45106
[200]	validation_0-rmse:0.45002
[223]	validation_0-rmse:0.45022
0:	learn: 0.4957907	test: 0.4945652	best: 0.4945652 (0)	total: 60.7ms	remaining: 1m
100:	learn: 0.4482785	test: 0.4496741	best: 0.4496741 (100)	total: 6.11s	remaining: 54.4s
200:	learn: 0.4415985	test: 0.4466069	best: 0.4466069 (200)	total: 12.3s	remaining: 48.9s
300:	learn: 0.4352471	test: 0.4445121	best: 0.4445121 (300)	total: 18.8s	remaining: 43.6s
400:	learn: 0.4306965	test: 0.4434724	best: 0.4434724 (400)	total: 25.2s	remaining: 37.6s
500:	learn: 0.4267637	test: 0.4430776	best: 0.4430684 (499)	total: 31.4s	remaining: 31.3s
600:	learn: 0.4231476	test: 0.4429127	best: 0.4429106 (599)	total: 37.7s	remaining: 25s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4428838957
bestIteration = 641

Shrink model to first 642 iterations.
[0]	validation_0-rmse:0.49423




[100]	validation_0-rmse:0.44993
[200]	validation_0-rmse:0.44784
[262]	validation_0-rmse:0.44808
0:	learn: 0.4956508	test: 0.4957923	best: 0.4957923 (0)	total: 59.4ms	remaining: 59.3s
100:	learn: 0.4490511	test: 0.4482099	best: 0.4482099 (100)	total: 6.57s	remaining: 58.5s
200:	learn: 0.4420047	test: 0.4442935	best: 0.4442935 (200)	total: 12.8s	remaining: 50.8s
300:	learn: 0.4356660	test: 0.4423767	best: 0.4423767 (300)	total: 19.1s	remaining: 44.3s
400:	learn: 0.4310700	test: 0.4417353	best: 0.4417353 (400)	total: 25.4s	remaining: 38s
500:	learn: 0.4275623	test: 0.4414461	best: 0.4414363 (499)	total: 31.7s	remaining: 31.6s
600:	learn: 0.4239894	test: 0.4410744	best: 0.4410704 (597)	total: 38.5s	remaining: 25.6s
700:	learn: 0.4214690	test: 0.4408923	best: 0.4408922 (692)	total: 45s	remaining: 19.2s
800:	learn: 0.4186472	test: 0.4407145	best: 0.4407085 (799)	total: 51.3s	remaining: 12.7s
900:	learn: 0.4159931	test: 0.4405964	best: 0.4405946 (899)	total: 57.7s	remaining: 6.33s
Stopped by 



[100]	validation_0-rmse:0.44813
[200]	validation_0-rmse:0.44688
[300]	validation_0-rmse:0.44686
[316]	validation_0-rmse:0.44698
0:	learn: 0.4953658	test: 0.4960601	best: 0.4960601 (0)	total: 61.7ms	remaining: 1m 1s
100:	learn: 0.4468422	test: 0.4552028	best: 0.4552028 (100)	total: 6.54s	remaining: 58.2s
200:	learn: 0.4396333	test: 0.4523430	best: 0.4523430 (200)	total: 13s	remaining: 51.5s
300:	learn: 0.4335776	test: 0.4506344	best: 0.4506344 (300)	total: 19.2s	remaining: 44.6s
400:	learn: 0.4295073	test: 0.4498701	best: 0.4498701 (400)	total: 25.6s	remaining: 38.2s
500:	learn: 0.4258177	test: 0.4494133	best: 0.4494038 (497)	total: 32s	remaining: 31.8s
600:	learn: 0.4231035	test: 0.4492354	best: 0.4492290 (598)	total: 38.7s	remaining: 25.7s
700:	learn: 0.4202389	test: 0.4491780	best: 0.4491707 (699)	total: 45.2s	remaining: 19.3s
800:	learn: 0.4177306	test: 0.4490500	best: 0.4490353 (784)	total: 51.5s	remaining: 12.8s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4



[100]	validation_0-rmse:0.45585
[200]	validation_0-rmse:0.45418
[236]	validation_0-rmse:0.45453
0:	learn: 0.4954396	test: 0.4956591	best: 0.4956591 (0)	total: 60.5ms	remaining: 1m
100:	learn: 0.4468204	test: 0.4558238	best: 0.4558238 (100)	total: 6.13s	remaining: 54.5s
200:	learn: 0.4397659	test: 0.4525955	best: 0.4525955 (200)	total: 12.2s	remaining: 48.5s
300:	learn: 0.4335470	test: 0.4510286	best: 0.4510286 (300)	total: 19s	remaining: 44.1s
400:	learn: 0.4296119	test: 0.4503374	best: 0.4503374 (400)	total: 25.2s	remaining: 37.7s
500:	learn: 0.4257896	test: 0.4501401	best: 0.4501401 (500)	total: 31.5s	remaining: 31.4s
600:	learn: 0.4226417	test: 0.4499350	best: 0.4499271 (575)	total: 37.8s	remaining: 25.1s
700:	learn: 0.4195765	test: 0.4498100	best: 0.4498061 (699)	total: 44.1s	remaining: 18.8s
800:	learn: 0.4170907	test: 0.4497730	best: 0.4497218 (782)	total: 50.9s	remaining: 12.6s
900:	learn: 0.4151013	test: 0.4496146	best: 0.4495998 (884)	total: 57.2s	remaining: 6.29s
Stopped by o



[100]	validation_0-rmse:0.45557
[200]	validation_0-rmse:0.45335
[277]	validation_0-rmse:0.45332


# Ансамблирование

In [52]:
# Усредняем предсказания
final_preds = (cat_preds + xgb_preds) / 2

# Предсказание на тестовых данных

In [53]:
#test_predictions = model.predict(test_data[features])

# Формирование submission

In [58]:

submission = pd.DataFrame({"ID": test_data["ID"], "prediction": final_preds})
submission.to_csv("submission.csv", index=False)

In [57]:
submission

Unnamed: 0,ID,prediction
0,28800,0.146896
1,28801,0.682974
2,28802,0.007342
