# Подготовка данных

In [1]:
%matplotlib inline
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import sklearn
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df = pd.read_csv(r"InternationalBifurca_DATA_2023-10-30_0629.csv", sep=',')

In [3]:
df = df.dropna(subset=['sex'])
df

Unnamed: 0,record_id,date,sex,age,adhoc_pci,weight,height,race,clinical_presentation,time_from_mi_symptoms_onse,...,time_to_death_f5,time_to_acs_f5,time_to_stroke_f5,time_to_pci_f5,time_to_cabg_f5,hospitalization_f5,bleeding_f5,major_required_trans_f5,tlr_f5,tvr_f5
0,MNRI0001,2018-02-01,2.0,77.0,1.0,84.0,165.0,1.0,5.0,4.0,...,,,,,,,,,,
1,MNRI0002,2018-01-24,1.0,68.0,0.0,81.0,171.0,1.0,1.0,,...,,,,,,,,,,
2,MNRI0003,2018-01-24,1.0,62.0,0.0,74.0,180.0,1.0,4.0,,...,,,,,,,,,,
3,MNRI0004,2018-01-30,1.0,67.0,1.0,84.0,167.0,1.0,2.0,,...,,,,,,,,,,
4,MNRI0005,2018-01-30,1.0,57.0,0.0,103.0,174.0,1.0,1.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2053,TRCH0026,2019-03-11,1.0,67.0,1.0,90.0,174.0,1.0,2.0,,...,,,,,,,,,,
2054,TRCH0027,2019-03-18,1.0,69.0,1.0,60.0,174.0,1.0,3.0,1.0,...,,,,,,,,,,
2055,TRCH0028,2019-03-19,2.0,81.0,0.0,50.0,160.0,1.0,2.0,,...,,,,,,,,,,
2056,TRCH0029,2019-03-28,1.0,86.0,1.0,74.0,170.0,1.0,3.0,1.0,...,,,,,,,,,,


In [4]:
anyInf = df[df == np.inf].sum()
anyInf[anyInf != 0]

stent_distal_vessel_size    inf
sb_stent_sb_diametr         inf
dtype: object

In [5]:
patient_info_cols = ['record_id', 'date', 'sex', 'age', 'adhoc_pci', 'weight', 'height', 
                     'race', 'clinical_presentation', 'time_from_mi_symptoms_onse', 
                     'ccs_class', 'diabet', 'insulin_diabetes', 'hypertension', 'smoking', 
                     'dyslipidemia', 'anemia', 'atrial_fibrilation', 'oac_use', 'valvular_disease', 
                     'valvular_disease_was_previ', 'if_yes_what_type___1', 'if_yes_what_type___2', 
                     'if_yes_what_type___3', 'if_yes_what_type___4', 'if_yes_what_type___5', 
                     'if_yes_what_type___6', 'if_yes_what_type___7', 'ef', 'creatinine', 'ckd', 
                     'mi_history', 'cerebrovascular_disease', 'previously_treated_cerebro', 'previous_stroke_tia', 
                     'peripheral_artery_disease', 'previously_treated_periphe', 'copd', 'history_of_cancer', 
                     'previous_pci', 'previous_cabg']

invention_cols = ['single_vessel', 'trifurcation', 'several_biffurcations', 'bifurcation_location', 
                  'lesion_ivolves', 'protected_left_main', 'angle', 'calcium', 'trombosis', 
                  'total_trobotic_occlusion', 'restenosis_reocclusion', 'overlap_sb', 'cto_bifurc', 
                  'syntax_score', 'medina_proximal', 'medina_distal', 'medina_side', 'mb_length_proximal', 
                  'sb_length', 'proximal_diametr', 'distal_diametr', 'side_diametr', 'stenosis_proximal', 
                  'stenosis_distal', 'timi_flow_main_branch', 'side_stenosis', 'timi_flow_side_branch', 
                  'major_lm', 'major_non_lm', 'minor_criteria', 'main_branch_rvd', 'def', 'def_2']

operation_cols = ['side_protection', 'main_predilatation', 'side_predilat', 'trombus_aspiration', 
                  'kissing_predilitation', 'stenting', 'drug_coated_balloon', 'balloon_angioplasty', 
                  'stent_was_implated_from_lm', 'stent_number', 'stent_number_bif', 'stent_technique', 
                  'first_stent_impanted', 'provisional_2_stent_techni', 'stent_direction', 'defered_stenting', 
                  'stent_diameter', 'stent_length', 'stent_type___1', 'stent_type___2', 'stent_type___3', 
                  'stent_type___4', 'stent_type___5', 'stent_type___6', 'stent_type___7', 'stent_type___9', 
                  'stent_type___8', 'dstent2', 'stent_length2', 'stent_distal_vessel_size', 
                  'sb_stent_side_branch_diametr', 'sb_stent_sb_diametr', 'twostent_technique', 
                  'sb_dilatation', 'stent_postdilatation', 'proximal_optimization', 'pot', 
                  'pot_balloon_diametr', 'kissing_post', 'modified_kis', 'several_kissing']

In [6]:
non_vascular_deaths = ['MNRI1054', 'MNRI1191', 'MNRI1351', 'MNRI1352', 'MNRI1473', 'MNRI1670', 'MNRI0637', 'MNRI0656', 'MNRI0751', 'MNRI0758',
                      'MNRI0805', 'MNRI0818', 'MNRI1054', 'MNRI0087', 'MNRI1191', 'MNRI0108', 'MNRI0307', 'MNRI0215', 'MNRI0322', 'MNRI0293',
                      'MNRI0156', 'MNRI0215', 'MNRI0488', 'MNRI0612', 'MNRI0708', 'MNRI0767', 'MNRI0772', 'MNRI0786', 'MNRI1105', 'MNRI1186',
                      'MNRI1462', 'MNRI1633']

In [7]:
df = df[~df['record_id'].isin(non_vascular_deaths)]

In [8]:
df

Unnamed: 0,record_id,date,sex,age,adhoc_pci,weight,height,race,clinical_presentation,time_from_mi_symptoms_onse,...,time_to_death_f5,time_to_acs_f5,time_to_stroke_f5,time_to_pci_f5,time_to_cabg_f5,hospitalization_f5,bleeding_f5,major_required_trans_f5,tlr_f5,tvr_f5
0,MNRI0001,2018-02-01,2.0,77.0,1.0,84.0,165.0,1.0,5.0,4.0,...,,,,,,,,,,
1,MNRI0002,2018-01-24,1.0,68.0,0.0,81.0,171.0,1.0,1.0,,...,,,,,,,,,,
2,MNRI0003,2018-01-24,1.0,62.0,0.0,74.0,180.0,1.0,4.0,,...,,,,,,,,,,
3,MNRI0004,2018-01-30,1.0,67.0,1.0,84.0,167.0,1.0,2.0,,...,,,,,,,,,,
4,MNRI0005,2018-01-30,1.0,57.0,0.0,103.0,174.0,1.0,1.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2053,TRCH0026,2019-03-11,1.0,67.0,1.0,90.0,174.0,1.0,2.0,,...,,,,,,,,,,
2054,TRCH0027,2019-03-18,1.0,69.0,1.0,60.0,174.0,1.0,3.0,1.0,...,,,,,,,,,,
2055,TRCH0028,2019-03-19,2.0,81.0,0.0,50.0,160.0,1.0,2.0,,...,,,,,,,,,,
2056,TRCH0029,2019-03-28,1.0,86.0,1.0,74.0,170.0,1.0,3.0,1.0,...,,,,,,,,,,


In [9]:
patient_present_cols = patient_info_cols + invention_cols + operation_cols
patient_present_df = pd.DataFrame({col_name: df[col_name] for col_name in patient_present_cols})

In [10]:
patient_present_df

Unnamed: 0,record_id,date,sex,age,adhoc_pci,weight,height,race,clinical_presentation,time_from_mi_symptoms_onse,...,sb_stent_sb_diametr,twostent_technique,sb_dilatation,stent_postdilatation,proximal_optimization,pot,pot_balloon_diametr,kissing_post,modified_kis,several_kissing
0,MNRI0001,2018-02-01,2.0,77.0,1.0,84.0,165.0,1.0,5.0,4.0,...,0.952381,7.0,0.0,1.0,0.0,,,1.0,0.0,0.0
1,MNRI0002,2018-01-24,1.0,68.0,0.0,81.0,171.0,1.0,1.0,,...,1.333333,,0.0,0.0,0.0,,,1.0,0.0,0.0
2,MNRI0003,2018-01-24,1.0,62.0,0.0,74.0,180.0,1.0,4.0,,...,1.153846,,0.0,1.0,0.0,,,0.0,0.0,
3,MNRI0004,2018-01-30,1.0,67.0,1.0,84.0,167.0,1.0,2.0,,...,1.285714,,0.0,0.0,1.0,1.0,5.0,0.0,0.0,
4,MNRI0005,2018-01-30,1.0,57.0,0.0,103.0,174.0,1.0,1.0,,...,1.590909,,1.0,0.0,0.0,,,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2053,TRCH0026,2019-03-11,1.0,67.0,1.0,90.0,174.0,1.0,2.0,,...,1.500000,,1.0,1.0,1.0,0.0,,1.0,1.0,0.0
2054,TRCH0027,2019-03-18,1.0,69.0,1.0,60.0,174.0,1.0,3.0,1.0,...,1.100000,,0.0,0.0,0.0,,,0.0,0.0,
2055,TRCH0028,2019-03-19,2.0,81.0,0.0,50.0,160.0,1.0,2.0,,...,1.000000,7.0,1.0,1.0,1.0,0.0,,1.0,1.0,0.0
2056,TRCH0029,2019-03-28,1.0,86.0,1.0,74.0,170.0,1.0,3.0,1.0,...,0.750000,,0.0,0.0,1.0,1.0,4.0,0.0,0.0,


In [11]:
def remove_columns_with_nan_threshold(df, threshold=250):
    """
    Удаление колонок из DataFrame с количеством NaN больше заданного порога.

    Параметры:
    df : pandas.DataFrame
        Исходный DataFrame, из которого будут удалены колонки.
    threshold : int
        Пороговое значение количества NaN для удаления колонки.

    Возвращает:
    pandas.DataFrame
        DataFrame после удаления колонок.
    """
    # Считаем количество NaN в каждой колонке
    nan_counts = df.isnull().sum()

    # Получаем список колонок, которые нужно удалить
    columns_to_drop = nan_counts[nan_counts > threshold].index

    # Удаляем колонки
    df_dropped = df.drop(columns=columns_to_drop)

    return df_dropped

In [12]:
patient_present_df = remove_columns_with_nan_threshold(patient_present_df, 500)

In [13]:
# patient_present_df = patient_present_df.drop(['record_id', 'date'], axis = 1)
patient_present_df = patient_present_df.drop(['date'], axis = 1)
patient_present_df

Unnamed: 0,record_id,sex,age,adhoc_pci,weight,height,race,clinical_presentation,diabet,hypertension,...,stent_type___6,stent_type___7,stent_type___9,stent_type___8,stent_distal_vessel_size,sb_stent_sb_diametr,sb_dilatation,stent_postdilatation,kissing_post,modified_kis
0,MNRI0001,2.0,77.0,1.0,84.0,165.0,1.0,5.0,0.0,1.0,...,0,0,0,0,1.111111,0.952381,0.0,1.0,1.0,0.0
1,MNRI0002,1.0,68.0,0.0,81.0,171.0,1.0,1.0,0.0,1.0,...,0,0,0,0,1.290323,1.333333,0.0,0.0,1.0,0.0
2,MNRI0003,1.0,62.0,0.0,74.0,180.0,1.0,4.0,0.0,1.0,...,0,0,0,0,1.000000,1.153846,0.0,1.0,0.0,0.0
3,MNRI0004,1.0,67.0,1.0,84.0,167.0,1.0,2.0,0.0,1.0,...,0,0,0,0,0.957447,1.285714,0.0,0.0,0.0,0.0
4,MNRI0005,1.0,57.0,0.0,103.0,174.0,1.0,1.0,0.0,1.0,...,0,0,0,0,1.250000,1.590909,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2053,TRCH0026,1.0,67.0,1.0,90.0,174.0,1.0,2.0,2.0,1.0,...,0,0,0,0,1.200000,1.500000,1.0,1.0,1.0,1.0
2054,TRCH0027,1.0,69.0,1.0,60.0,174.0,1.0,3.0,1.0,1.0,...,0,0,1,0,1.100000,1.100000,0.0,0.0,0.0,0.0
2055,TRCH0028,2.0,81.0,0.0,50.0,160.0,1.0,2.0,2.0,1.0,...,0,0,0,0,1.000000,1.000000,1.0,1.0,1.0,1.0
2056,TRCH0029,1.0,86.0,1.0,74.0,170.0,1.0,3.0,1.0,1.0,...,0,0,0,0,1.090909,0.750000,0.0,0.0,0.0,0.0


In [14]:
patient_present_df.replace(to_replace = [np.inf, -np.inf], value= None, inplace=True)

In [15]:
cols_to_drop = patient_present_df.columns[patient_present_df.nunique() <= 1]
patient_present_df = patient_present_df.drop(cols_to_drop, axis=1)
patient_present_df

Unnamed: 0,record_id,sex,age,adhoc_pci,weight,height,race,clinical_presentation,diabet,hypertension,...,stent_type___6,stent_type___7,stent_type___9,stent_type___8,stent_distal_vessel_size,sb_stent_sb_diametr,sb_dilatation,stent_postdilatation,kissing_post,modified_kis
0,MNRI0001,2.0,77.0,1.0,84.0,165.0,1.0,5.0,0.0,1.0,...,0,0,0,0,1.111111,0.952381,0.0,1.0,1.0,0.0
1,MNRI0002,1.0,68.0,0.0,81.0,171.0,1.0,1.0,0.0,1.0,...,0,0,0,0,1.290323,1.333333,0.0,0.0,1.0,0.0
2,MNRI0003,1.0,62.0,0.0,74.0,180.0,1.0,4.0,0.0,1.0,...,0,0,0,0,1.0,1.153846,0.0,1.0,0.0,0.0
3,MNRI0004,1.0,67.0,1.0,84.0,167.0,1.0,2.0,0.0,1.0,...,0,0,0,0,0.957447,1.285714,0.0,0.0,0.0,0.0
4,MNRI0005,1.0,57.0,0.0,103.0,174.0,1.0,1.0,0.0,1.0,...,0,0,0,0,1.25,1.590909,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2053,TRCH0026,1.0,67.0,1.0,90.0,174.0,1.0,2.0,2.0,1.0,...,0,0,0,0,1.2,1.5,1.0,1.0,1.0,1.0
2054,TRCH0027,1.0,69.0,1.0,60.0,174.0,1.0,3.0,1.0,1.0,...,0,0,1,0,1.1,1.1,0.0,0.0,0.0,0.0
2055,TRCH0028,2.0,81.0,0.0,50.0,160.0,1.0,2.0,2.0,1.0,...,0,0,0,0,1.0,1.0,1.0,1.0,1.0,1.0
2056,TRCH0029,1.0,86.0,1.0,74.0,170.0,1.0,3.0,1.0,1.0,...,0,0,0,0,1.090909,0.75,0.0,0.0,0.0,0.0


In [16]:
numerical = ['age', 'weight', 'height', 'ef', 'creatinine', 'ckd', 'angle', 'mb_length_proximal', 'proximal_diametr', 'distal_diametr', 'side_diametr',
            'stenosis_proximal', 'stenosis_distal', 'side_stenosis', 'minor_criteria', 'main_branch_rvd', 'stent_diameter', 'stent_length',
            'stent_distal_vessel_size', 'sb_stent_sb_diametr']
cathegorical = ['sex', 'race', 'clinical_presentation', 'bifurcation_location', 'stent_number', 'stent_number_bif', 'stent_technique',
               'stent_direction', ]
binary = ['diabet', 'adhoc_pci', 'hypertension', 'smoking', 'dyslipidemia', 'anemia', 'atrial_fibrilation', 'oac_use', 'if_yes_what_type___1',
         'if_yes_what_type___2', 'if_yes_what_type___3', 'if_yes_what_type___4',  'if_yes_what_type___6', 
          'mi_history', 'cerebrovascular_disease', 'peripheral_artery_disease', 'copd', 'history_of_cancer',
         'previous_pci', 'previous_cabg', 'single_vessel', 'trifurcation', 'several_biffurcations', 'calcium', 'trombosis', 'restenosis_reocclusion',
         'cto_bifurc', 'medina_proximal', 'medina_distal', 'medina_side', 'major_lm', 'major_non_lm', 'def', 'def_2', 'side_protection', 'main_predilatation',
         'side_predilat', 'kissing_predilitation', 'stenting', 'defered_stenting', 'stent_type___1', 'stent_type___2', 'stent_type___3', 'stent_type___4',
         'stent_type___5', 'stent_type___6', 'stent_type___7', 'stent_type___9', 'stent_type___8', 'sb_dilatation', 'stent_postdilatation',
         'kissing_post', 'modified_kis']

In [17]:
without_second_bif = df
adverse_events = without_second_bif['event_type_followup_f2___1'] \
| without_second_bif['event_type_followup_f2___2'] \
| without_second_bif['event_type_followup_f2_v2___1'] \
| without_second_bif['event_type_followup_f2_v2___2']
print(sum(adverse_events))
print(len(adverse_events))

146
1961


In [18]:
combined = zip(without_second_bif['event_type_followup_f2___1'],  
               without_second_bif['event_type_followup_f2___2'] * 2,
               without_second_bif['event_type_followup_f2_v2___1'] * 3, 
               without_second_bif['event_type_followup_f2_v2___2'] * 4)

combined_adverse_events = np.array([max(t) for t in combined])

In [19]:
unique, counts = np.unique(combined_adverse_events, return_counts=True)
value_counts = dict(zip(unique, counts))
print(value_counts)

{0: 1815, 1: 41, 2: 17, 3: 51, 4: 37}


In [20]:
id_col = patient_present_df['record_id']
patient_present_df = patient_present_df.drop('record_id', axis=1)

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(patient_present_df, combined_adverse_events, test_size=0.4, stratify=combined_adverse_events, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, stratify=y_test, random_state=42)

In [22]:
y_train[y_train != 0] = 1
y_test[y_test != 0] = 1
y_val[y_val != 0] = 1

In [23]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer

imputer_categorical = SimpleImputer(strategy='most_frequent')
X_train[cathegorical + binary] = imputer_categorical.fit_transform(X_train[cathegorical + binary])
X_test[cathegorical + binary] = imputer_categorical.transform(X_test[cathegorical + binary])
X_val[cathegorical + binary] = imputer_categorical.transform(X_val[cathegorical + binary])

imputer = IterativeImputer(random_state=0)
columns = list(X_train.columns)
X_train = pd.DataFrame(data = imputer.fit_transform(X_train), columns = columns)
X_test = pd.DataFrame(data = imputer.transform(X_test), columns = columns)
X_val = pd.DataFrame(data = imputer.transform(X_val), columns = columns)

X_train

Unnamed: 0,sex,age,adhoc_pci,weight,height,race,clinical_presentation,diabet,hypertension,smoking,...,stent_type___6,stent_type___7,stent_type___9,stent_type___8,stent_distal_vessel_size,sb_stent_sb_diametr,sb_dilatation,stent_postdilatation,kissing_post,modified_kis
0,2.0,68.0,0.0,76.000000,154.000000,1.0,1.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.857143,1.034483,0.0,1.0,0.0,0.0
1,1.0,67.0,0.0,80.000000,167.000000,1.0,5.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.000000,1.086957,0.0,0.0,0.0,0.0
2,1.0,52.0,1.0,100.000000,170.000000,1.0,3.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.937500,0.731707,0.0,0.0,0.0,0.0
3,1.0,75.0,0.0,87.000000,179.000000,1.0,1.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.071429,1.304348,0.0,1.0,0.0,0.0
4,2.0,65.0,1.0,86.614613,159.897683,1.0,3.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.757576,0.833333,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,1.0,57.0,0.0,90.000000,167.000000,2.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.290323,1.142857,0.0,0.0,1.0,0.0
1172,1.0,59.0,1.0,75.000000,168.000000,1.0,3.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.206897,1.346154,0.0,0.0,0.0,0.0
1173,1.0,69.0,0.0,66.000000,170.000000,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.166667,1.206897,0.0,0.0,1.0,1.0
1174,1.0,57.0,0.0,93.000000,185.000000,1.0,2.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.290323,1.333333,0.0,0.0,1.0,0.0


In [24]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output=False)
df_for_ohe = pd.concat([X_train[cathegorical], X_test[cathegorical], X_val[cathegorical]], ignore_index=True)
ohe.fit(df_for_ohe)

In [25]:
def process_ohe(dataframe, cat_cols, encoder):
    encoded_columns = pd.DataFrame(encoder.transform(dataframe[cat_cols]))

    encoded_columns.columns = encoder.get_feature_names_out(cat_cols)

    dataframe = pd.concat([dataframe, encoded_columns], axis=1)

    dataframe.drop(cat_cols, axis=1, inplace=True)

    return dataframe

In [26]:
X_train = process_ohe(X_train, cathegorical, ohe)
X_test = process_ohe(X_test, cathegorical, ohe)
X_val = process_ohe(X_val, cathegorical, ohe)
X_train

Unnamed: 0,age,adhoc_pci,weight,height,diabet,hypertension,smoking,dyslipidemia,anemia,atrial_fibrilation,...,stent_number_bif_3.0,stent_number_bif_4.0,stent_technique_0.0,stent_technique_1.0,stent_direction_1.0,stent_direction_2.0,stent_direction_3.0,stent_direction_4.0,stent_direction_5.0,stent_direction_6.0
0,68.0,0.0,76.000000,154.000000,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,67.0,0.0,80.000000,167.000000,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,52.0,1.0,100.000000,170.000000,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,75.0,0.0,87.000000,179.000000,2.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,65.0,1.0,86.614613,159.897683,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,57.0,0.0,90.000000,167.000000,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1172,59.0,1.0,75.000000,168.000000,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1173,69.0,0.0,66.000000,170.000000,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1174,57.0,0.0,93.000000,185.000000,2.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [27]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

feature_selector = SelectKBest(f_classif, k=40)
X_feature_selection = feature_selector.fit_transform(X_train, y_train)
X_feature_selection.shape

strong_cols = []

feature_scores = feature_selector.scores_
features = X_train.columns
features_scores_sorted = sorted(zip(features, feature_scores), key=lambda x: x[1], reverse=True)
for col in features_scores_sorted[:30]:
    strong_cols.append(col[0])

In [28]:
X_train = X_train[strong_cols]
X_test = X_test[strong_cols]
X_val = X_val[strong_cols]
X_train

Unnamed: 0,anemia,cerebrovascular_disease,ef,age,peripheral_artery_disease,single_vessel,ckd,copd,creatinine,stent_type___5,...,bifurcation_location_2.0,trifurcation,stent_number_1.0,main_predilatation,stent_type___3,side_predilat,medina_side,side_stenosis,distal_diametr,side_diametr
0,0.0,0.0,62.0,68.0,0.0,0.0,70.982905,0.0,96.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,80.000000,3.5,2.9
1,0.0,0.0,63.0,67.0,0.0,1.0,69.345077,0.0,97.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.000000,2.5,2.3
2,0.0,0.0,35.0,52.0,0.0,0.0,66.916820,0.0,109.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,3.2,4.1
3,0.0,0.0,55.0,75.0,0.0,1.0,33.745645,0.0,168.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.000000,2.8,2.3
4,0.0,0.0,48.0,65.0,0.0,1.0,65.051015,0.0,105.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,95.000000,3.3,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,0.0,0.0,58.0,57.0,0.0,0.0,77.315887,0.0,106.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.000000,3.1,3.5
1172,0.0,0.0,38.0,59.0,0.0,0.0,73.353636,0.0,97.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,8.553568,2.9,2.6
1173,1.0,0.0,65.0,69.0,1.0,0.0,90.590030,0.0,70.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,40.000000,3.0,2.9
1174,0.0,0.0,68.0,57.0,0.0,0.0,66.070134,0.0,107.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.000000,3.1,3.0


In [29]:
strong_cols

['anemia',
 'cerebrovascular_disease',
 'ef',
 'age',
 'peripheral_artery_disease',
 'single_vessel',
 'ckd',
 'copd',
 'creatinine',
 'stent_type___5',
 'if_yes_what_type___1',
 'atrial_fibrilation',
 'calcium',
 'minor_criteria',
 'mi_history',
 'adhoc_pci',
 'stent_length',
 'clinical_presentation_2.0',
 'def',
 'sb_dilatation',
 'bifurcation_location_2.0',
 'trifurcation',
 'stent_number_1.0',
 'main_predilatation',
 'stent_type___3',
 'side_predilat',
 'medina_side',
 'side_stenosis',
 'distal_diametr',
 'side_diametr']

In [30]:
corr_matrix = X_train.corr().abs()

upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

threshold = 0.4

to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > threshold)]

to_drop

['creatinine',
 'minor_criteria',
 'medina_side',
 'side_stenosis',
 'side_diametr']

In [31]:
X_train = X_train.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)
X_val = X_val.drop(columns=to_drop)

X_train

Unnamed: 0,anemia,cerebrovascular_disease,ef,age,peripheral_artery_disease,single_vessel,ckd,copd,stent_type___5,if_yes_what_type___1,...,clinical_presentation_2.0,def,sb_dilatation,bifurcation_location_2.0,trifurcation,stent_number_1.0,main_predilatation,stent_type___3,side_predilat,distal_diametr
0,0.0,0.0,62.0,68.0,0.0,0.0,70.982905,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.5
1,0.0,0.0,63.0,67.0,0.0,1.0,69.345077,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,2.5
2,0.0,0.0,35.0,52.0,0.0,0.0,66.916820,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.2
3,0.0,0.0,55.0,75.0,0.0,1.0,33.745645,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,2.8
4,0.0,0.0,48.0,65.0,0.0,1.0,65.051015,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,0.0,0.0,58.0,57.0,0.0,0.0,77.315887,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,3.1
1172,0.0,0.0,38.0,59.0,0.0,0.0,73.353636,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,2.9
1173,1.0,0.0,65.0,69.0,1.0,0.0,90.590030,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,3.0
1174,0.0,0.0,68.0,57.0,0.0,0.0,66.070134,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3.1


In [32]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
print(y_train.shape)

(1176, 25)
(392, 25)
(393, 25)
(1176,)


In [33]:
print(X_train.columns)

Index(['anemia', 'cerebrovascular_disease', 'ef', 'age',
       'peripheral_artery_disease', 'single_vessel', 'ckd', 'copd',
       'stent_type___5', 'if_yes_what_type___1', 'atrial_fibrilation',
       'calcium', 'mi_history', 'adhoc_pci', 'stent_length',
       'clinical_presentation_2.0', 'def', 'sb_dilatation',
       'bifurcation_location_2.0', 'trifurcation', 'stent_number_1.0',
       'main_predilatation', 'stent_type___3', 'side_predilat',
       'distal_diametr'],
      dtype='object')


# Syntetic data

In [34]:
X_train_with_ID = X_train.copy().reset_index(drop=False)
X_train_with_ID

Unnamed: 0,index,anemia,cerebrovascular_disease,ef,age,peripheral_artery_disease,single_vessel,ckd,copd,stent_type___5,...,clinical_presentation_2.0,def,sb_dilatation,bifurcation_location_2.0,trifurcation,stent_number_1.0,main_predilatation,stent_type___3,side_predilat,distal_diametr
0,0,0.0,0.0,62.0,68.0,0.0,0.0,70.982905,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.5
1,1,0.0,0.0,63.0,67.0,0.0,1.0,69.345077,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,2.5
2,2,0.0,0.0,35.0,52.0,0.0,0.0,66.916820,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.2
3,3,0.0,0.0,55.0,75.0,0.0,1.0,33.745645,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,2.8
4,4,0.0,0.0,48.0,65.0,0.0,1.0,65.051015,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,1171,0.0,0.0,58.0,57.0,0.0,0.0,77.315887,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,3.1
1172,1172,0.0,0.0,38.0,59.0,0.0,0.0,73.353636,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,2.9
1173,1173,1.0,0.0,65.0,69.0,1.0,0.0,90.590030,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,3.0
1174,1174,0.0,0.0,68.0,57.0,0.0,0.0,66.070134,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3.1


In [35]:
X_train_with_ID_and_target = X_train_with_ID.copy()
X_train_with_ID_and_target['target'] = y_train
X_train_with_ID_and_target

Unnamed: 0,index,anemia,cerebrovascular_disease,ef,age,peripheral_artery_disease,single_vessel,ckd,copd,stent_type___5,...,def,sb_dilatation,bifurcation_location_2.0,trifurcation,stent_number_1.0,main_predilatation,stent_type___3,side_predilat,distal_diametr,target
0,0,0.0,0.0,62.0,68.0,0.0,0.0,70.982905,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.5,0
1,1,0.0,0.0,63.0,67.0,0.0,1.0,69.345077,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,2.5,0
2,2,0.0,0.0,35.0,52.0,0.0,0.0,66.916820,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.2,0
3,3,0.0,0.0,55.0,75.0,0.0,1.0,33.745645,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,2.8,1
4,4,0.0,0.0,48.0,65.0,0.0,1.0,65.051015,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3.3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,1171,0.0,0.0,58.0,57.0,0.0,0.0,77.315887,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,3.1,0
1172,1172,0.0,0.0,38.0,59.0,0.0,0.0,73.353636,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,2.9,1
1173,1173,1.0,0.0,65.0,69.0,1.0,0.0,90.590030,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,3.0,1
1174,1174,0.0,0.0,68.0,57.0,0.0,0.0,66.070134,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3.1,0


In [35]:
from synthcity.plugins import Plugins

# Print the list of available plugins
print(Plugins().list())

                  variable OMP_PATH to the location of the header before importing keopscore or pykeops,
                  e.g. using os.environ: import os; os.environ['OMP_PATH'] = '/path/to/omp/header'


[2025-04-15T13:27:55.260059+0400][18547][CRITICAL] module disabled: /Users/ivan.petrov/HSE/.venv/lib/python3.12/site-packages/synthcity/plugins/generic/plugin_goggle.py


['tvae', 'image_cgan', 'great', 'dpgan', 'uniform_sampler', 'radialgan', 'privbayes', 'pategan', 'timegan', 'survae', 'adsgan', 'ddpm', 'fflows', 'arf', 'aim', 'dummy_sampler', 'decaf', 'survival_ctgan', 'nflow', 'image_adsgan', 'ctgan', 'survival_gan', 'timevae', 'marginal_distributions', 'bayesian_network', 'rtvae', 'survival_nflow']


# Оптимизация CatBoost на auc-roc для 10 фолдов

In [36]:
# X_train_new = pd.concat([X_train, syntetic_minority_dropped])
# X_train_new

In [37]:
# y_train_new = np.concatenate((y_train, syntetic_target))
# y_train_new

In [36]:
from sklearn.preprocessing import StandardScaler

scaller = StandardScaler()
scaller.fit(X_train)

In [37]:
import joblib

joblib.dump(scaller, "./scaler.save")

['./scaler.save']

In [38]:
X_train_folds = pd.concat([X_train, X_val])
y_train_k_fold = np.concatenate((y_train, y_val), axis=0)

In [39]:
scaled_features = scaller.transform(X_train_folds)

In [40]:
train_df = pd.DataFrame(data=scaled_features, columns=X_train_folds.columns)
train_df

Unnamed: 0,anemia,cerebrovascular_disease,ef,age,peripheral_artery_disease,single_vessel,ckd,copd,stent_type___5,if_yes_what_type___1,...,clinical_presentation_2.0,def,sb_dilatation,bifurcation_location_2.0,trifurcation,stent_number_1.0,main_predilatation,stent_type___3,side_predilat,distal_diametr
0,-0.223607,-0.386730,0.564508,0.435605,-0.271979,-0.873567,-0.280152,-0.257261,-0.092608,-0.141237,...,-0.472996,19.773720,-0.225695,-0.899573,-0.217244,-1.052391,-1.290994,-0.792975,2.383656,1.275464
1,-0.223607,-0.386730,0.659124,0.331590,-0.271979,1.144733,-0.380007,-0.257261,-0.092608,-0.141237,...,-0.472996,-0.050572,-0.225695,1.111639,-0.217244,0.950217,0.774597,-0.792975,-0.419524,-0.838327
2,-0.223607,-0.386730,-1.990120,-1.228627,-0.271979,-0.873567,-0.528053,-0.257261,-0.092608,-0.141237,...,-0.472996,-0.050572,-0.225695,-0.899573,-0.217244,-1.052391,-1.290994,1.261074,-0.419524,0.641327
3,-0.223607,-0.386730,-0.097803,1.163707,-0.271979,1.144733,-2.550440,-0.257261,-0.092608,7.080285,...,-0.472996,-0.050572,-0.225695,1.111639,-0.217244,0.950217,0.774597,-0.792975,-0.419524,-0.204189
4,-0.223607,-0.386730,-0.760114,0.123561,-0.271979,1.144733,-0.641808,-0.257261,-0.092608,-0.141237,...,-0.472996,-0.050572,-0.225695,-0.899573,-0.217244,0.950217,-1.290994,-0.792975,2.383656,0.852706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1563,-0.223607,-0.386730,0.659124,-1.124613,-0.271979,1.144733,-0.269584,-0.257261,-0.092608,-0.141237,...,-0.472996,-0.050572,-0.225695,-0.899573,-0.217244,0.950217,-1.290994,1.261074,-0.419524,-0.626948
1564,-0.223607,-0.386730,0.848356,1.475750,-0.271979,1.144733,-0.789773,-0.257261,-0.092608,-0.141237,...,2.114182,-0.050572,-0.225695,-0.899573,-0.217244,0.950217,-1.290994,-0.792975,2.383656,0.218569
1565,-0.223607,-0.386730,-0.003187,-0.292497,-0.271979,-0.873567,0.155904,-0.257261,-0.092608,-0.141237,...,-0.472996,-0.050572,-0.225695,1.111639,-0.217244,-1.052391,-1.290994,-0.792975,-0.419524,-0.204189
1566,-0.223607,2.585783,0.753740,0.539620,3.676754,1.144733,-0.637765,-0.257261,-0.092608,-0.141237,...,-0.472996,-0.050572,-0.225695,1.111639,-0.217244,-1.052391,-1.290994,-0.792975,-0.419524,-0.901740


In [48]:
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import pickle
import sys
import warnings
import json
import os

# Suppress warnings
warnings.filterwarnings("ignore")

def objective(params):
    # Convert parameters to proper format
    params['n_estimators'] = int(params['n_estimators'])
    params['max_depth'] = int(params['max_depth'])
    params['min_samples_split'] = int(params['min_samples_split'])
    params['min_samples_leaf'] = int(params['min_samples_leaf'])
    
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    roc_auc_scores = []
    f1_scores = []
    precision_scores = []
    recall_scores = []
    accuracy_scores = []

    for train_index, test_index in kf.split(X_train_folds, y_train_k_fold):
        # Get train and test data for this fold
        X_train = scaller.transform(X_train_folds.iloc[train_index])
        X_test = scaller.transform(X_train_folds.iloc[test_index])
        y_train = y_train_k_fold[train_index]
        y_test = y_train_k_fold[test_index]
        
        # Initialize classifier with current parameters
        classifier = RandomForestClassifier(
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            min_samples_split=params['min_samples_split'],
            min_samples_leaf=params['min_samples_leaf'],
            max_features=params['max_features'],
            class_weight='balanced',
            random_state=42
        )
        
        # Fit the model
        classifier.fit(X_train, y_train)
        
        # Make predictions
        predictions = classifier.predict_proba(X_test)[:, 1]
        y_pred = classifier.predict(X_test)
        
        # Calculate metrics
        roc_auc = roc_auc_score(y_test, predictions)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        accuracy = accuracy_score(y_test, y_pred)
        
        roc_auc_scores.append(roc_auc)
        f1_scores.append(f1)
        precision_scores.append(precision)
        recall_scores.append(recall)
        accuracy_scores.append(accuracy)

    mean_metrics = {
        'roc_auc': np.mean(roc_auc_scores),
        'f1': np.mean(f1_scores),
        'precision': np.mean(precision_scores),
        'recall': np.mean(recall_scores),
        'accuracy': np.mean(accuracy_scores),
        'roc_auc_scores': roc_auc_scores,
        'f1_scores': f1_scores,
        'precision_scores': precision_scores,
        'recall_scores': recall_scores,
        'accuracy_scores': accuracy_scores
    }
    
    print(f"Trial completed - AUC: {mean_metrics['roc_auc']:.4f}, Params: {params}")
    
    return {'loss': -mean_metrics['roc_auc'], 'status': STATUS_OK, 'params': params, 'mean_metrics': mean_metrics}

# Define the search space for Random Forest
space = {
    'n_estimators': hp.quniform('n_estimators', 100, 500, 100),  # 100, 200, 300, 400, 500
    'max_depth': hp.choice('max_depth', [4, 6, 8, 10]),         # Depth of trees
    'min_samples_split': hp.quniform('min_samples_split', 2, 10, 2),  # Minimum samples to split node
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 5, 1),  # Minimum samples in leaf node
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None])  # Feature subset strategy
}

# Run hyperparameter optimization with limited trials
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,  # Limit to 10 evaluations for efficiency
            trials=trials)

# Extract best parameters
best_trial = trials.best_trial['result']
best_params = best_trial['params']
best_metrics = best_trial['mean_metrics']

print("Hyperparameter optimization completed.")
print(f"Best parameters: {best_params}")
print(f"Best mean AUC: {best_metrics['roc_auc']:.4f}")

# Train final model with best hyperparameters on full dataset
best_model = RandomForestClassifier(
    n_estimators=int(best_params['n_estimators']),
    max_depth=int(best_params['max_depth']),
    min_samples_split=int(best_params['min_samples_split']),
    min_samples_leaf=int(best_params['min_samples_leaf']),
    max_features=best_params['max_features'],
    random_state=42
)

# Prepare the full dataset for final training
X_train_final = scaller.transform(X_train_folds)
y_train_final = y_train_k_fold

# Fit final model
best_model.fit(X_train_final, y_train_final)

# Save the best model
with open('scores_CV_RandomForest_original.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Save metrics and best parameters to files
metrics_and_params = {
    'best_parameters': {
        'n_estimators': int(best_params['n_estimators']),
        'max_depth': int(best_params['max_depth']),
        'min_samples_split': int(best_params['min_samples_split']),
        'min_samples_leaf': int(best_params['min_samples_leaf']),
        'max_features': best_params['max_features']
    },
    'evaluation_metrics': best_metrics
}

with open('scores_CV_RandomForest_original.json', 'w') as f:
    json.dump(metrics_and_params, f)

print("Best parameters, model, and evaluation metrics saved.")

Trial completed - AUC: 0.7218, Params: {'max_depth': 6, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 300}
Trial completed - AUC: 0.7360, Params: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 500}
Trial completed - AUC: 0.7129, Params: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 300}
Trial completed - AUC: 0.7443, Params: {'max_depth': 8, 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 4, 'n_estimators': 200}
Trial completed - AUC: 0.7224, Params: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Trial completed - AUC: 0.7255, Params: {'max_depth': 6, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 6, 'n_estimators': 300}
Trial completed - AUC: 0.7454, Params: {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 4, 'min_samples

In [49]:
X_test_scaled = scaller.transform(X_test)

# Make predictions
test_predictions = best_model.predict_proba(X_test_scaled)[:, 1]
test_pred_labels = best_model.predict(X_test_scaled)

# Calculate metrics
test_roc_auc = roc_auc_score(y_test, test_predictions)
test_f1 = f1_score(y_test, test_pred_labels, zero_division=0)
test_precision = precision_score(y_test, test_pred_labels, zero_division=0)
test_recall = recall_score(y_test, test_pred_labels, zero_division=0)
test_accuracy = accuracy_score(y_test, test_pred_labels)

print(f"Test ROC AUC: {test_roc_auc:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Add test metrics to the saved results
metrics_and_params['test_metrics'] = {
    'roc_auc': test_roc_auc,
    'f1': test_f1,
    'precision': test_precision,
    'recall': test_recall,
    'accuracy': test_accuracy
}

# Update the saved metrics file with test results
with open('scores_RandomForest_original.json', 'w') as f:
    json.dump(metrics_and_params, f)

print("Test metrics added to results file.")

Test ROC AUC: 0.6450
Test F1 Score: 0.0000
Test Precision: 0.0000
Test Recall: 0.0000
Test Accuracy: 0.9262
Test metrics added to results file.
