In [None]:

import pandas as pd
from itertools import combinations
from tqdm import tqdm 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from collections import defaultdict
from sklearn.model_selection import RandomizedSearchCV
from itertools import combinations
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [46]:
train_csv_path = r"train_data.csv"
test_csv_path = r"test_data.csv"

In [47]:
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [48]:
train_df.head(5)

Unnamed: 0,id,Birth_Date,Weight,Height,Urban_Rural,Occupation,Insurance_Type,Family_History,Cancer_Type,Stage_at_Diagnosis,...,Tumor_Size,Surgery_Date,Chemotherapy_Drugs,Radiation_Sessions,Immunotherapy,Targeted_Therapy,Recurrence_Status,Smoking_History,Alcohol_Use,label
0,1,1994-07-01,64.9,155.0cm,Urban,Unemployed,UEBMI,No,Breast,II,...,8.0,2024-10-19,"Paclitaxel,Docetaxel,Doxorubicin",16,No,Yes,NO,Never,Regular,1
1,2,1992-07-16,61.4,171.0cm,Urban,Factory Worker,UEBMI,Yes,Breast,I,...,10.0,2021-02-28,"Cyclophosphamide,Paclitaxel,Doxorubicin,Docetaxel",10,No,No,Yes,Former,Regular,1
2,3,1948-06-23,60.7,170.0cm,Rural,Unemployed,NRCMS,No,Stomach,IV,...,13.0,2022-09-25,"Fluorouracil,Cisplatin",21,Yes,No,NO,Former,Never,0
3,4,1954-11-26,70.2,171.0cm,Urban,Farmer,URBMI,Yes,Cervical,IV,...,3.0,2024-09-13,Cisplatin,10,No,Yes,NO,Never,Regular,1
4,5,1979-07-08,100.3,186.0cm,Rural,Office Worker,Self-pay,Yes,Lung,II,...,12.0,2023-12-08,"Gemcitabine,Carboplatin",6,Yes,No,Yes,Former,Never,0


In [49]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26473 entries, 0 to 26472
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  26473 non-null  int64  
 1   Birth_Date          26473 non-null  object 
 2   Weight              26473 non-null  float64
 3   Height              26473 non-null  object 
 4   Urban_Rural         26473 non-null  object 
 5   Occupation          26473 non-null  object 
 6   Insurance_Type      26473 non-null  object 
 7   Family_History      26473 non-null  object 
 8   Cancer_Type         26473 non-null  object 
 9   Stage_at_Diagnosis  26473 non-null  object 
 10  Diagnosis_Date      26473 non-null  object 
 11  Symptoms            25972 non-null  object 
 12  Tumor_Size          26473 non-null  float64
 13  Surgery_Date        18898 non-null  object 
 14  Chemotherapy_Drugs  22423 non-null  object 
 15  Radiation_Sessions  26473 non-null  int64  
 16  Immu

##### Binary nominal categorical features

In [50]:
# Label encoding is totally fine for binary features, even if they're not ordinal.
# Since they only have two values, there's no risk of introducing a false order 

print("Immunotherapy:", train_df["Immunotherapy"].unique())
print("Targeted_Therapy:", train_df["Targeted_Therapy"].unique())
print("Recurrence_Status:", train_df["Recurrence_Status"].unique())
print("Family_History:", train_df["Family_History"].unique())
print("Urban_Rural:", train_df["Urban_Rural"].unique())

Immunotherapy: ['No' 'Yes']
Targeted_Therapy: ['Yes' 'No']
Recurrence_Status: ['NO' 'Yes']
Family_History: ['No' 'Yes']
Urban_Rural: ['Urban' 'Rural']


In [51]:
def clean_yes_no_columns(df, columns):
    for col in columns:
        df[col] = df[col].str.strip().str.lower().map({'yes': 1, 'no': 0})
    return df

def clean_urban_rural_column(df, col):
    df[col] = df[col].str.strip().str.lower().map({'urban' : 1, 'rural': 0})
    return df

binary_columns = ["Immunotherapy", "Targeted_Therapy", "Recurrence_Status", "Family_History"]

# Apply the cleaning function
train_df = clean_yes_no_columns(train_df, binary_columns)
test_df = clean_yes_no_columns(test_df, binary_columns)
train_df = clean_urban_rural_column(train_df, "Urban_Rural")
test_df = clean_urban_rural_column(test_df, "Urban_Rural")

binary_columns += ["Urban_Rural"]

print(train_df[binary_columns].head())

   Immunotherapy  Targeted_Therapy  Recurrence_Status  Family_History  \
0              0                 1                  0               0   
1              0                 0                  1               1   
2              1                 0                  0               0   
3              0                 1                  0               1   
4              1                 0                  1               1   

   Urban_Rural  
0            1  
1            1  
2            0  
3            1  
4            0  


##### Ordinal categorical features

In [52]:
print("Stage_at_Diagnosis:", train_df["Stage_at_Diagnosis"].unique())
print("Smoking_History:", train_df["Smoking_History"].unique())
print("Alcohol_Use:", train_df["Alcohol_Use"].unique())

Stage_at_Diagnosis: ['II' 'I' 'IV' 'III']
Smoking_History: ['Never' 'Former' 'Current']
Alcohol_Use: ['Regular' 'Never' 'Occasional']


#### nonbinary nominal categorical features

In [53]:
occupation_label_rate = train_df.groupby("Occupation")["label"].mean().sort_values(ascending=False)

print(occupation_label_rate)

Occupation
Retired           0.509354
Farmer            0.503804
Office Worker     0.493036
Unemployed        0.488062
Factory Worker    0.434376
Name: label, dtype: float64


In [54]:
def encode_occupation(occupation):
    if occupation == "Retired":
        return 5
    elif occupation == "Farmer":
        return 5
    elif occupation == "Office Worker":
        return 4
    elif occupation == "Unemployed":
        return 4
    else:
        return 1

train_df["Occupation_risk"] = train_df["Occupation"].apply(encode_occupation)
test_df["Occupation_risk"] = test_df["Occupation"].apply(encode_occupation)


train_df["Occupation_risk"].head()

0    4
1    1
2    4
3    5
4    4
Name: Occupation_risk, dtype: int64

In [55]:
# Define mapping dictionaries
# stage_mapping = {'I': 1, 'II': 1, 'III': 2, 'IV': 3}
# smoking_mapping = {'Never': 0, 'Former': 0, 'Current': 1}
# alcohol_mapping = {'Never': 0, 'Occasional': 1, 'Regular': 2}

stage_mapping = {'I': 3, 'II': 3, 'III': 2, 'IV': 1}
smoking_mapping = {'Never': 5, 'Former': 6, 'Current': 1}
alcohol_mapping = {'Never': 4, 'Occasional': 5, 'Regular': 1}

train_df["Stage_at_Diagnosis"] = train_df["Stage_at_Diagnosis"].map(stage_mapping)
train_df["Smoking_History"] = train_df["Smoking_History"].map(smoking_mapping)
train_df["Alcohol_Use"] = train_df["Alcohol_Use"].map(alcohol_mapping)

test_df["Stage_at_Diagnosis"] = test_df["Stage_at_Diagnosis"].map(stage_mapping)
test_df["Smoking_History"] = test_df["Smoking_History"].map(smoking_mapping)
test_df["Alcohol_Use"] = test_df["Alcohol_Use"].map(alcohol_mapping)

print(train_df[["Stage_at_Diagnosis", "Smoking_History", "Alcohol_Use"]].head())

   Stage_at_Diagnosis  Smoking_History  Alcohol_Use
0                   3                5            1
1                   3                6            1
2                   1                6            4
3                   1                5            1
4                   3                6            4


In [56]:
tumor_size_label_rate = train_df.groupby("Alcohol_Use")["label"].mean().sort_values(ascending=False)

tumor_size_label_rate.head(50)

Alcohol_Use
5    0.533302
4    0.506496
1    0.387364
Name: label, dtype: float64

In [57]:
tumor_size_label_rate = train_df.groupby("Smoking_History")["label"].mean().sort_values(ascending=False)

tumor_size_label_rate.head(50)

Smoking_History
6    0.528948
5    0.511012
1    0.400618
Name: label, dtype: float64

In [58]:
tumor_size_label_rate = train_df.groupby("Stage_at_Diagnosis")["label"].mean().sort_values(ascending=False)

tumor_size_label_rate.head(50)

Stage_at_Diagnosis
3    0.593648
2    0.408820
1    0.342963
Name: label, dtype: float64

In [59]:
print("Weight range:")
print("Min:", train_df["Tumor_Size"].min())
print("Max:", train_df["Tumor_Size"].max())


Weight range:
Min: 0.5
Max: 15.0


In [60]:
tumor_size_label_rate = train_df.groupby("Tumor_Size")["label"].mean()#.sort_values(ascending=False)

tumor_size_label_rate.head(150)


Tumor_Size
0.5     0.423077
0.6     0.339623
0.7     0.444444
0.8     0.611940
0.9     0.508475
          ...   
14.6    0.358209
14.7    0.339286
14.8    0.492958
14.9    0.516667
15.0    0.379310
Name: label, Length: 146, dtype: float64

In [61]:
cancer_label_rate = train_df.groupby("Cancer_Type")["label"].mean().sort_values(ascending=False)

print(cancer_label_rate)

Cancer_Type
Breast        0.589909
Liver         0.551669
Cervical      0.527449
Colorectal    0.470946
Stomach       0.453249
Esophageal    0.445479
Lung          0.360735
Name: label, dtype: float64


In [62]:
def encode_cancer_group(cancer_type):
    if cancer_type == "Breast":
        return 10
    elif cancer_type == "Liver":
        return 8
    elif cancer_type == "Cervical":
        return 7
    elif cancer_type == "Colorectal":
        return 5
    elif cancer_type in ["Stomach", "Esophageal"]:
        return 4
    elif cancer_type == "Lung":
        return 1

train_df["Cancer_Type_Grouped"] = train_df["Cancer_Type"].apply(encode_cancer_group)
test_df["Cancer_Type_Grouped"] = test_df["Cancer_Type"].apply(encode_cancer_group)
#train_df = train_df.drop(columns=["Cancer_Type"])

train_df["Cancer_Type_Grouped"].head()

0    10
1    10
2     4
3     7
4     1
Name: Cancer_Type_Grouped, dtype: int64

In [63]:
#print("Occupation:", train_df["Occupation"].unique())
print("Insurance_Type:", train_df["Insurance_Type"].unique())
#print("Cancer_Type:", train_df["Cancer_Type"].unique())
# print("Symptoms:", train_df["Symptoms"].unique())

Insurance_Type: ['UEBMI' 'NRCMS' 'URBMI' 'Self-pay']


In [64]:
# Compute proportion of label==1 (died) for each insurance
Insurance_label_rate = train_df.groupby("Insurance_Type")["label"].mean().sort_values(ascending=False)

print(Insurance_label_rate)

Insurance_Type
Self-pay    0.496903
URBMI       0.485027
NRCMS       0.482305
UEBMI       0.478733
Name: label, dtype: float64


In [65]:
one_hot_columns = ["Insurance_Type", "Cancer_Type", "Occupation"]

one_hot_encoded = pd.get_dummies(train_df[one_hot_columns], prefix=one_hot_columns).astype(int)

train_df = train_df.drop(columns=one_hot_columns)
train_df = pd.concat([train_df, one_hot_encoded], axis=1)


one_hot_encoded = pd.get_dummies(test_df[one_hot_columns], prefix=one_hot_columns).astype(int)

test_df = test_df.drop(columns=one_hot_columns)
test_df = pd.concat([test_df, one_hot_encoded], axis=1)

train_df[one_hot_encoded.columns].head()

Unnamed: 0,Insurance_Type_NRCMS,Insurance_Type_Self-pay,Insurance_Type_UEBMI,Insurance_Type_URBMI,Cancer_Type_Breast,Cancer_Type_Cervical,Cancer_Type_Colorectal,Cancer_Type_Esophageal,Cancer_Type_Liver,Cancer_Type_Lung,Cancer_Type_Stomach,Occupation_Factory Worker,Occupation_Farmer,Occupation_Office Worker,Occupation_Retired,Occupation_Unemployed
0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1
1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0


In [66]:
# insurance_weight_map = {
#     "Insurance_Type_Self-pay": 1,
#     "Insurance_Type_URBMI": 0,
#     "Insurance_Type_NRCMS": 0,
#     "Insurance_Type_UEBMI": 0
# }

# # Reverse mapping to get risk groupings
# insruance_groups = defaultdict(list)
# for insurance, weight in insurance_weight_map.items():
#     insruance_groups[weight].append(insurance)

# # Create count columns for each risk group
# train_df["Self_pay"] = train_df[insruance_groups[1]].sum(axis=1)
# # train_df["not_Self_pay"] = train_df[insruance_groups[0]].sum(axis=1)

# #train_df = train_df.drop(columns=list(insurance_weight_map.keys()))

# train_df[["Self_pay"]].head()

In [67]:
train_df["Height"] = train_df["Height"].str.replace("cm", "").astype(float)
test_df["Height"] = test_df["Height"].str.replace("cm", "").astype(float)

print("Height range train:")
print("Min:", train_df["Height"].min())
print("Max:", train_df["Height"].max())

print("Weight range train:")
print("Min:", train_df["Weight"].min())
print("Max:", train_df["Weight"].max())

print("Height range test:")
print("Min:", test_df["Height"].min())
print("Max:", test_df["Height"].max())

print("Weight range test:")
print("Min:", test_df["Weight"].min())
print("Max:", test_df["Weight"].max())


Height range train:
Min: 150.0
Max: 195.0
Weight range train:
Min: 31.5
Max: 133.1
Height range test:
Min: 150.0
Max: 195.0
Weight range test:
Min: 31.5
Max: 133.1


In [68]:
# Create bins from 150 to 195 with step 5
bins = list(range(150, 200, 5))# [150, 155, ..., 195]
labels = [f"{((b - 150)//5)}" for b in bins[:-1]] 

train_df["Height_Group"] = pd.cut(train_df["Height"], bins=bins, labels=labels, right=False)
test_df["Height_Group"] = pd.cut(test_df["Height"], bins=bins, labels=labels, right=False)

#train_df = train_df.drop(columns=["Height"])

height_death_rate = train_df.groupby("Height_Group")["label"].mean().sort_values(ascending=False)
print(height_death_rate)

Height_Group
0    0.505208
2    0.499475
4    0.494398
6    0.490307
1    0.486847
8    0.480127
3    0.475742
5    0.474485
7    0.468893
Name: label, dtype: float64


  height_death_rate = train_df.groupby("Height_Group")["label"].mean().sort_values(ascending=False)


In [69]:
bins = np.arange(30, 135 + 5, 5)  # includes 135

labels = [f"{(b-30)/5}" for b in bins[:-1]] 

train_df["Weight_Group"] = pd.cut(train_df["Weight"], bins=bins, labels=labels, right=False)
test_df["Weight_Group"] = pd.cut(test_df["Weight"], bins=bins, labels=labels, right=False)

#train_df = train_df.drop(columns=["Weight"])

weight_death_rate = train_df.groupby("Weight_Group")["label"].mean().sort_values(ascending=False)
print(weight_death_rate)

Weight_Group
5.0     0.617021
7.0     0.588209
6.0     0.586976
4.0     0.576825
3.0     0.534063
8.0     0.520579
9.0     0.490426
10.0    0.469985
2.0     0.453150
20.0    0.450000
11.0    0.422668
12.0    0.401434
18.0    0.373832
19.0    0.359788
13.0    0.356312
17.0    0.354125
1.0     0.345272
15.0    0.341220
14.0    0.334270
16.0    0.308219
0.0     0.270270
Name: label, dtype: float64


  weight_death_rate = train_df.groupby("Weight_Group")["label"].mean().sort_values(ascending=False)


In [70]:
weight_death_rate = train_df.groupby("Weight_Group")["label"].mean()

def assign_weight_risk(death_rate):
    if death_rate < 0.31:
        return 0
    elif death_rate < 0.40:
        return 1
    elif death_rate < 0.50:
        return 2
    elif death_rate < 0.60:
        return 3
    else:
        return 4

weight_risk_map = weight_death_rate.apply(assign_weight_risk).to_dict()

train_df["Weight_Risk_Level"] = train_df["Weight_Group"].map(weight_risk_map)
test_df["Weight_Risk_Level"] = test_df["Weight_Group"].map(weight_risk_map)

#train_df = train_df.drop(columns=["Weight_Group"])
# test_df = test_df.drop(columns=["Weight_Group"])

train_df[["Weight_Risk_Level"]].head()

  weight_death_rate = train_df.groupby("Weight_Group")["label"].mean()


Unnamed: 0,Weight_Risk_Level
0,3
1,3
2,3
3,3
4,1


In [71]:
train_df["BMI"] = train_df["Weight"] / ((train_df["Height"] / 100) ** 2)
test_df["BMI"] = test_df["Weight"] / ((test_df["Height"] / 100) ** 2)

In [72]:
train_df["Chemotherapy_Drugs"] = train_df["Chemotherapy_Drugs"].fillna("")
test_df["Chemotherapy_Drugs"] = test_df["Chemotherapy_Drugs"].fillna("")

all_drugs = train_df["Chemotherapy_Drugs"].str.split(",").explode().str.strip()
unique_drugs = sorted(
    ["Drug_" + drug.replace(" ", "_") for drug in all_drugs.unique() if drug.strip() != ""]
)

print(unique_drugs)

['Drug_Carboplatin', 'Drug_Cisplatin', 'Drug_Cyclophosphamide', 'Drug_Docetaxel', 'Drug_Doxorubicin', 'Drug_Fluorouracil', 'Drug_Gemcitabine', 'Drug_Irinotecan', 'Drug_Leucovorin', 'Drug_Oxaliplatin', 'Drug_Paclitaxel', 'Drug_Sorafenib']


In [73]:
for drug_column in unique_drugs:
    original = drug_column.replace("Drug_", "").replace("_", " ")
    train_df[drug_column] = train_df["Chemotherapy_Drugs"].apply(
        lambda x: int(original in [d.strip() for d in x.split(",")])
    )
    test_df[drug_column] = test_df["Chemotherapy_Drugs"].apply(
        lambda x: int(original in [d.strip() for d in x.split(",")])
    )

train_df = train_df.drop(columns=["Chemotherapy_Drugs"])
test_df = test_df.drop(columns=["Chemotherapy_Drugs"])

train_df[unique_drugs].head()

Unnamed: 0,Drug_Carboplatin,Drug_Cisplatin,Drug_Cyclophosphamide,Drug_Docetaxel,Drug_Doxorubicin,Drug_Fluorouracil,Drug_Gemcitabine,Drug_Irinotecan,Drug_Leucovorin,Drug_Oxaliplatin,Drug_Paclitaxel,Drug_Sorafenib
0,0,0,0,1,1,0,0,0,0,0,1,0
1,0,0,1,1,1,0,0,0,0,0,1,0
2,0,1,0,0,0,1,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0,0,0


In [74]:
train_df["Symptoms"] = train_df["Symptoms"].fillna("")
test_df["Symptoms"] = test_df["Symptoms"].fillna("")

all_symptoms = train_df["Symptoms"].str.split(",").explode().str.strip()
unique_symptoms = sorted(
    ["Symptom_" + symptom.replace(" ", "_") for symptom in all_symptoms.unique() if symptom.strip() != ""]
)

print(unique_symptoms)


['Symptom_Blood_in_Stool', 'Symptom_Cough', 'Symptom_Fatigue', 'Symptom_Lump', 'Symptom_Nausea', 'Symptom_Pain', 'Symptom_Swelling', 'Symptom_Vomiting', 'Symptom_Weight_Loss']


In [75]:
for symptom in unique_symptoms:
    raw_symptom = symptom.replace("Symptom_", "").replace("_", " ")
    train_df[symptom] = train_df["Symptoms"].apply(
        lambda x: int(raw_symptom in [s.strip() for s in x.split(",")])
    )
    test_df[symptom] = test_df["Symptoms"].apply(
        lambda x: int(raw_symptom in [s.strip() for s in x.split(",")])
    )

train_df = train_df.drop(columns=["Symptoms"])
test_df = test_df.drop(columns=["Symptoms"])

train_df[unique_symptoms].head()


Unnamed: 0,Symptom_Blood_in_Stool,Symptom_Cough,Symptom_Fatigue,Symptom_Lump,Symptom_Nausea,Symptom_Pain,Symptom_Swelling,Symptom_Vomiting,Symptom_Weight_Loss
0,0,1,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,1,0
3,0,0,0,0,1,0,0,1,0
4,0,1,0,0,0,0,0,0,1


In [76]:
train_df.head()

Unnamed: 0,id,Birth_Date,Weight,Height,Urban_Rural,Family_History,Stage_at_Diagnosis,Diagnosis_Date,Tumor_Size,Surgery_Date,...,Drug_Sorafenib,Symptom_Blood_in_Stool,Symptom_Cough,Symptom_Fatigue,Symptom_Lump,Symptom_Nausea,Symptom_Pain,Symptom_Swelling,Symptom_Vomiting,Symptom_Weight_Loss
0,1,1994-07-01,64.9,155.0,1,0,3,2020-02-10,8.0,2024-10-19,...,0,0,1,0,0,0,0,0,0,1
1,2,1992-07-16,61.4,171.0,1,1,3,2014-08-17,10.0,2021-02-28,...,0,1,0,0,0,0,0,0,0,0
2,3,1948-06-23,60.7,170.0,0,0,1,2014-09-25,13.0,2022-09-25,...,0,0,0,0,0,1,0,0,1,0
3,4,1954-11-26,70.2,171.0,1,1,1,2021-01-04,3.0,2024-09-13,...,0,0,0,0,0,1,0,0,1,0
4,5,1979-07-08,100.3,186.0,0,1,3,2019-07-26,12.0,2023-12-08,...,0,0,1,0,0,0,0,0,0,1


In [77]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26473 entries, 0 to 26472
Data columns (total 60 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   id                         26473 non-null  int64   
 1   Birth_Date                 26473 non-null  object  
 2   Weight                     26473 non-null  float64 
 3   Height                     26473 non-null  float64 
 4   Urban_Rural                26473 non-null  int64   
 5   Family_History             26473 non-null  int64   
 6   Stage_at_Diagnosis         26473 non-null  int64   
 7   Diagnosis_Date             26473 non-null  object  
 8   Tumor_Size                 26473 non-null  float64 
 9   Surgery_Date               18898 non-null  object  
 10  Radiation_Sessions         26473 non-null  int64   
 11  Immunotherapy              26473 non-null  int64   
 12  Targeted_Therapy           26473 non-null  int64   
 13  Recurrence_Status          2647

In [78]:
train_df.to_csv(r"preprocessed_data.csv", index=False)

In [79]:
symptom_cols = [col for col in train_df.columns if col.startswith("Symptom_")]

symptom_death_rates = {
    symptom: train_df[train_df[symptom] == 1]["label"].mean()
    for symptom in symptom_cols
}

symptom_death_df = pd.DataFrame.from_dict(symptom_death_rates, orient="index", columns=["Death_Rate"])
symptom_death_df = symptom_death_df.sort_values(by="Death_Rate", ascending=False)

# Print result
print(symptom_death_df)

                        Death_Rate
Symptom_Blood_in_Stool    0.495470
Symptom_Cough             0.489619
Symptom_Weight_Loss       0.489619
Symptom_Fatigue           0.487145
Symptom_Pain              0.487145
Symptom_Lump              0.483967
Symptom_Swelling          0.483967
Symptom_Nausea            0.470701
Symptom_Vomiting          0.470701


In [80]:
symptom_weight_map = {
    "Symptom_Nausea": 1, #high risk
    "Symptom_Vomiting": 1,
    "Symptom_Swelling": 2,
    "Symptom_Lump": 2,
    "Symptom_Pain": 3,
    "Symptom_Fatigue": 3,
    "Symptom_Cough": 4,  
    "Symptom_Weight_Loss": 4,
    "Symptom_Blood_in_Stool": 5  #low risk
}

symptom_groups = defaultdict(list)
for symptom, weight in symptom_weight_map.items():
    symptom_groups[weight].append(symptom)

train_df["level_1_Risk_Symptoms_Count"] = train_df[symptom_groups[1]].sum(axis=1)
train_df["level_2_Risk_Symptoms_Count"] = train_df[symptom_groups[2]].sum(axis=1)
train_df["level_3_Risk_Symptoms_Count"] = train_df[symptom_groups[3]].sum(axis=1)
train_df["level_4_Risk_Symptoms_Count"] = train_df[symptom_groups[4]].sum(axis=1)
train_df["level_5_Risk_Symptoms_Count"] = train_df[symptom_groups[5]].sum(axis=1)

test_df["level_1_Risk_Symptoms_Count"] = test_df[symptom_groups[1]].sum(axis=1)
test_df["level_2_Risk_Symptoms_Count"] = test_df[symptom_groups[2]].sum(axis=1)
test_df["level_3_Risk_Symptoms_Count"] = test_df[symptom_groups[3]].sum(axis=1)
test_df["level_4_Risk_Symptoms_Count"] = test_df[symptom_groups[4]].sum(axis=1)
test_df["level_5_Risk_Symptoms_Count"] = test_df[symptom_groups[5]].sum(axis=1)


train_df[["level_1_Risk_Symptoms_Count",
            "level_2_Risk_Symptoms_Count", 
            "level_3_Risk_Symptoms_Count",
            "level_4_Risk_Symptoms_Count",
            "level_5_Risk_Symptoms_Count"
            ]].head()


Unnamed: 0,level_1_Risk_Symptoms_Count,level_2_Risk_Symptoms_Count,level_3_Risk_Symptoms_Count,level_4_Risk_Symptoms_Count,level_5_Risk_Symptoms_Count
0,0,0,0,2,0
1,0,0,0,0,1
2,2,0,0,0,0
3,2,0,0,0,0
4,0,0,0,2,0


In [81]:
drug_cols = [col for col in train_df.columns if col.startswith("Drug_")]

drug_death_rates = {
    drug: train_df[train_df[drug] == 1]["label"].mean()
    for drug in drug_cols
}

drug_death_df = pd.DataFrame.from_dict(drug_death_rates, orient="index", columns=["Death_Rate"])
drug_death_df = drug_death_df.sort_values(by="Death_Rate", ascending=False)

print(drug_death_df)

                       Death_Rate
Drug_Docetaxel           0.678844
Drug_Cyclophosphamide    0.677473
Drug_Doxorubicin         0.658591
Drug_Paclitaxel          0.525006
Drug_Sorafenib           0.505656
Drug_Leucovorin          0.474401
Drug_Irinotecan          0.471781
Drug_Oxaliplatin         0.465548
Drug_Cisplatin           0.456558
Drug_Gemcitabine         0.402018
Drug_Carboplatin         0.392639
Drug_Fluorouracil        0.381719


In [82]:
drug_weight_map = {
    "Drug_Gemcitabine": 1,  #high risk 
    "Drug_Carboplatin": 1,
    "Drug_Fluorouracil": 1,
    "Drug_Leucovorin": 3,  #mid_high risk
    "Drug_Irinotecan": 3,
    "Drug_Oxaliplatin": 3, 
    "Drug_Cisplatin": 3,
    "Drug_Paclitaxel": 4,  #mid_low risk
    "Drug_Sorafenib": 4,
    "Drug_Docetaxel": 8,   #low risk
    "Drug_Cyclophosphamide": 8,
    "Drug_Doxorubicin": 8
}

risk_groups = defaultdict(list)
for drug, weight in drug_weight_map.items():
    risk_groups[weight].append(drug)

train_df["Low_Risk_Drug_Count"] = train_df[risk_groups[1]].sum(axis=1)
train_df["Mid_Low_Risk_Drug_Count"] = train_df[risk_groups[3]].sum(axis=1)
train_df["Mid_High_Risk_Drug_Count"] = train_df[risk_groups[4]].sum(axis=1)
train_df["High_Risk_Drug_Count"] = train_df[risk_groups[8]].sum(axis=1)

test_df["Low_Risk_Drug_Count"] = test_df[risk_groups[1]].sum(axis=1)
test_df["Mid_Low_Risk_Drug_Count"] = test_df[risk_groups[3]].sum(axis=1)
test_df["Mid_High_Risk_Drug_Count"] = test_df[risk_groups[4]].sum(axis=1)
test_df["High_Risk_Drug_Count"] = test_df[risk_groups[8]].sum(axis=1)

train_df[["High_Risk_Drug_Count", "Mid_High_Risk_Drug_Count", "Mid_Low_Risk_Drug_Count", "Low_Risk_Drug_Count"]].head()

Unnamed: 0,High_Risk_Drug_Count,Mid_High_Risk_Drug_Count,Mid_Low_Risk_Drug_Count,Low_Risk_Drug_Count
0,2,1,0,0
1,3,1,0,0
2,0,0,1,1
3,0,0,1,0
4,0,0,0,2


#### new features age at diagnosis and age at surgery and days from diagnosis to surgery

All the rows with missing Age_at_Diagnosis have a Birth_Date of 1800-01-01, which is clearly a placeholder or invalid default — likely used to represent "unknown birth date" in the original data. So we consider them as outliers

In [83]:

train_df.loc[train_df["Birth_Date"] == "1800-01-01 00:00:00", "Birth_Date"] = "1800-01-01"
test_df.loc[test_df["Birth_Date"] == "1800-01-01 00:00:00", "Birth_Date"] = "1800-01-01"

In [84]:
train_df["Birth_Date"] = pd.to_datetime(train_df["Birth_Date"], errors="coerce")
train_df["Diagnosis_Date"] = pd.to_datetime(train_df["Diagnosis_Date"], errors="coerce")
train_df["Surgery_Date"] = pd.to_datetime(train_df["Surgery_Date"], errors="coerce")

test_df["Birth_Date"] = pd.to_datetime(test_df["Birth_Date"], errors="coerce")
test_df["Diagnosis_Date"] = pd.to_datetime(test_df["Diagnosis_Date"], errors="coerce")
test_df["Surgery_Date"] = pd.to_datetime(test_df["Surgery_Date"], errors="coerce")

invalid_birth_mask1 = train_df["Birth_Date"] == pd.Timestamp("1800-01-01")
invalid_birth_mask2 = test_df["Birth_Date"] == pd.Timestamp("1800-01-01")

train_df["Had_Surgery"] = train_df["Surgery_Date"].notna().astype(int)
test_df["Had_Surgery"] = test_df["Surgery_Date"].notna().astype(int)


train_df["Weeks_to_Surgery"] = ((train_df["Surgery_Date"] - train_df["Diagnosis_Date"]).dt.days // 7).fillna(-1).astype(int)
test_df["Weeks_to_Surgery"] = ((test_df["Surgery_Date"] - test_df["Diagnosis_Date"]).dt.days // 7).fillna(-1).astype(int)

train_df["Age_at_Diagnosis"] = ((train_df["Diagnosis_Date"] - train_df["Birth_Date"]).dt.days // 365)
train_df.loc[invalid_birth_mask1, "Age_at_Diagnosis"] = -1
train_df["Age_at_Diagnosis"] = train_df["Age_at_Diagnosis"].astype("Int64")

test_df["Age_at_Diagnosis"] = ((test_df["Diagnosis_Date"] - test_df["Birth_Date"]).dt.days // 365)
test_df.loc[invalid_birth_mask2, "Age_at_Diagnosis"] = -1
test_df["Age_at_Diagnosis"] = test_df["Age_at_Diagnosis"].astype("Int64")


train_df["Age_at_Surgery"] = ((train_df["Surgery_Date"] - train_df["Birth_Date"]).dt.days // 365).fillna(-1)
train_df.loc[invalid_birth_mask1, "Age_at_Surgery"] = -1
train_df["Age_at_Surgery"] = train_df["Age_at_Surgery"].astype("Int64")

test_df["Age_at_Surgery"] = ((test_df["Surgery_Date"] - test_df["Birth_Date"]).dt.days // 365).fillna(-1)
test_df.loc[invalid_birth_mask2, "Age_at_Surgery"] = -1
test_df["Age_at_Surgery"] = test_df["Age_at_Surgery"].astype("Int64")


# Drop the raw date columns 
train_df = train_df.drop(columns=["Birth_Date", "Diagnosis_Date", "Surgery_Date"])
test_df = test_df.drop(columns=["Birth_Date", "Diagnosis_Date", "Surgery_Date"])

print(train_df[["Age_at_Diagnosis", "Age_at_Surgery", "Weeks_to_Surgery"]].head())

   Age_at_Diagnosis  Age_at_Surgery  Weeks_to_Surgery
0                25              30               244
1                22              28               341
2                66              74               417
3                66              69               192
4                40              44               228


In [85]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26473 entries, 0 to 26472
Data columns (total 70 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   id                           26473 non-null  int64   
 1   Weight                       26473 non-null  float64 
 2   Height                       26473 non-null  float64 
 3   Urban_Rural                  26473 non-null  int64   
 4   Family_History               26473 non-null  int64   
 5   Stage_at_Diagnosis           26473 non-null  int64   
 6   Tumor_Size                   26473 non-null  float64 
 7   Radiation_Sessions           26473 non-null  int64   
 8   Immunotherapy                26473 non-null  int64   
 9   Targeted_Therapy             26473 non-null  int64   
 10  Recurrence_Status            26473 non-null  int64   
 11  Smoking_History              26473 non-null  int64   
 12  Alcohol_Use                  26473 non-null  int64   
 13  l

In [86]:
no_surgery_df = train_df[train_df["Had_Surgery"] == 0]

print(no_surgery_df[["id", "Had_Surgery", "Weeks_to_Surgery", "Age_at_Surgery"]].head(50))


          id  Had_Surgery  Weeks_to_Surgery  Age_at_Surgery
17868  17869            0                -1              -1
17869  17870            0                -1              -1
17870  17871            0                -1              -1
17871  17872            0                -1              -1
17872  17873            0                -1              -1
17873  17874            0                -1              -1
17874  17875            0                -1              -1
17876  17877            0                -1              -1
17877  17878            0                -1              -1
17878  17879            0                -1              -1
17879  17880            0                -1              -1
17881  17882            0                -1              -1
17883  17884            0                -1              -1
17885  17886            0                -1              -1
17886  17887            0                -1              -1
17887  17888            0               

#### new feature total chemou drugs

In [87]:
train_df["Total_Chemo_Score"] = sum(
    train_df[drug] * weight
    for drug, weight in drug_weight_map.items()
    if drug in train_df.columns
)

test_df["Total_Chemo_Score"] = sum(
    test_df[drug] * weight
    for drug, weight in drug_weight_map.items()
    if drug in test_df.columns
)

# Drop one-hot encoded drug columns
#train_df = train_df.drop(columns=[drug for drug in drug_weight_map if drug in train_df.columns])
#test_df = test_df.drop(columns=[drug for drug in drug_weight_map if drug in test_df.columns])

train_df[["Total_Chemo_Score"]].head()


Unnamed: 0,Total_Chemo_Score
0,20
1,28
2,4
3,3
4,2


#### new feature total symptom count

In [88]:
train_df["Total_Symptom_Score"] = sum(
    train_df[symptom] * weight
    for symptom, weight in symptom_weight_map.items()
    if symptom in train_df.columns
)

test_df["Total_Symptom_Score"] = sum(
    test_df[symptom] * weight
    for symptom, weight in symptom_weight_map.items()
    if symptom in test_df.columns
)

train_df = train_df.drop(columns=[symptom for symptom in symptom_weight_map if symptom in train_df.columns])
test_df = test_df.drop(columns=[symptom for symptom in symptom_weight_map if symptom in test_df.columns])

train_df[["Total_Symptom_Score"]].head()


Unnamed: 0,Total_Symptom_Score
0,8
1,5
2,2
3,2
4,8


#### new nonlinear features

In [89]:
interaction_features = [
    "Alcohol_Use",
    "Smoking_History",
    "Stage_at_Diagnosis",
    "Tumor_Size",
    "Age_at_Diagnosis",
    "Age_at_Surgery",
    "Total_Symptom_Score",
    "Total_Chemo_Score"
]

for r in [2, 8]:
    for combo in combinations(interaction_features, r):
        new_col = "_x_".join(combo)
        product1 = train_df[combo[0]]
        product2 = test_df[combo[0]]
        for col in combo[1:]:
            product1 *= train_df[col]
            product2 *= test_df[col]
        train_df[new_col] = product1
        test_df[new_col] = product2

train_df["Symptoms_per_Drug"] = train_df["Total_Symptom_Score"] / (train_df["Total_Chemo_Score"] + 1)
test_df["Symptoms_per_Drug"] = test_df["Total_Symptom_Score"] / (test_df["Total_Chemo_Score"] + 1)


train_df["Both_Therapies"] = train_df["Immunotherapy"] + train_df["Targeted_Therapy"]
test_df["Both_Therapies"] = test_df["Immunotherapy"] + test_df["Targeted_Therapy"]


In [90]:
train_df.to_csv(r"added_features.csv", index=False)

In [91]:
df = pd.read_csv("added_features.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26473 entries, 0 to 26472
Data columns (total 94 columns):
 #   Column                                                                                                                                         Non-Null Count  Dtype  
---  ------                                                                                                                                         --------------  -----  
 0   id                                                                                                                                             26473 non-null  int64  
 1   Weight                                                                                                                                         26473 non-null  float64
 2   Height                                                                                                                                         26473 non-null  float64
 3   Urban_Rural               

In [None]:
df = pd.read_csv("added_features.csv") 

exclude_cols = ["id", "label"]

# Select numeric columns excluding these
feature_cols = [
    col for col in df.columns
    if col not in exclude_cols
]

In [None]:
df = pd.read_csv("added_features.csv")

# Calculate correlation with label
correlations = {col: df[col].corr(df["label"]) for col in feature_cols}

correlation_df = pd.DataFrame.from_dict(correlations, orient="index", columns=["correlation_with_label"])

correlation_df["abs_correlation"] = correlation_df["correlation_with_label"].abs()

correlation_df = correlation_df.sort_values(by="abs_correlation", ascending=False)

correlation_df.reset_index(inplace=True)
correlation_df.rename(columns={"index": "Feature"}, inplace=True)
correlation_df.to_csv("feature_label_correlations.csv", index=False)


print("Top 10 strongest correlations (positive or negative):")
correlation_df.head(20)


🔝 Top 10 strongest correlations (positive or negative):


Unnamed: 0,Feature,correlation_with_label,abs_correlation
0,Stage_at_Diagnosis,0.214559,0.214559
1,Smoking_History,0.213315,0.213315
2,Smoking_History_x_Stage_at_Diagnosis,0.213315,0.213315
3,Weight_Risk_Level,0.181485,0.181485
4,Alcohol_Use,0.1803,0.1803
5,Alcohol_Use_x_Stage_at_Diagnosis,0.1803,0.1803
6,Cancer_Type_Grouped,0.14475,0.14475
7,High_Risk_Drug_Count,0.14065,0.14065
8,BMI,-0.138218,0.138218
9,Drug_Doxorubicin,0.137006,0.137006


In [None]:
threshold = 0

# Keep only features above the threshold
features_to_drop = correlation_df[correlation_df["abs_correlation"] < threshold].index.tolist()

df_dropped = df.drop(columns=features_to_drop)

df_dropped.to_csv("added_features.csv", index=False)

print(f"Dropped {len(features_to_drop)} features with abs(corr) < {threshold}")

✅ Dropped 0 features with abs(corr) < 0


In [None]:
df = pd.read_csv("added_features.csv") 

exclude_cols = ["id", "label"]

feature_cols = [
    col for col in df.columns
    if col not in exclude_cols
]


In [None]:
df = pd.read_csv("added_features.csv") 

feature_cols

top_corrs = []


for col1, col2 in tqdm(combinations(feature_cols, 2), total=len(feature_cols) * (len(feature_cols) - 1) // 2):
    corr = df[col1].corr(df[col2])
    if not np.isnan(corr):
        top_corrs.append((col1, col2, corr, abs(corr)))

corr_df = pd.DataFrame(top_corrs, columns=["Feature_1", "Feature_2", "Correlation", "Abs_Correlation"])

corr_df = corr_df.sort_values("Abs_Correlation", ascending=False)
corr_df.to_csv("feature_pairwise_correlations.csv", index=False)

corr_df.head(20)


100%|██████████| 4186/4186 [00:47<00:00, 88.39it/s] 


Unnamed: 0,Feature_1,Feature_2,Correlation,Abs_Correlation
3618,Age_at_Diagnosis,Age_at_Diagnosis_x_Total_Chemo_Score,1.0,1.0
3653,Age_at_Surgery,Age_at_Surgery_x_Total_Chemo_Score,1.0,1.0
996,Alcohol_Use,Alcohol_Use_x_Stage_at_Diagnosis,1.0,1.0
922,Smoking_History,Smoking_History_x_Stage_at_Diagnosis,1.0,1.0
3717,Total_Symptom_Score,Total_Symptom_Score_x_Total_Chemo_Score,1.0,1.0
30,Weight,Weight_Group,0.997718,0.997718
119,Height,Height_Group,0.99405,0.99405
3620,Age_at_Diagnosis,Age_at_Surgery_x_Total_Chemo_Score,0.95661,0.95661
4166,Age_at_Diagnosis_x_Total_Chemo_Score,Age_at_Surgery_x_Total_Chemo_Score,0.95661,0.95661
3651,Age_at_Surgery,Age_at_Diagnosis_x_Total_Chemo_Score,0.95661,0.95661


In [None]:
target_corr_df = pd.read_csv("feature_label_correlations.csv")
pairwise_corr_df = pd.read_csv("feature_pairwise_correlations.csv")

target_corr = dict(zip(target_corr_df["Feature"], target_corr_df["abs_correlation"]))

high_corr_pairs = pairwise_corr_df[pairwise_corr_df["Abs_Correlation"] > 0.9]

# Step 4: Determine which feature to drop from each pair
to_drop = set()
for _, row in high_corr_pairs.iterrows():
    f1, f2 = row["Feature_1"], row["Feature_2"]

    score1 = target_corr.get(f1, 0)
    score2 = target_corr.get(f2, 0)
    print(f"Feature {f1} with corr {score1} and Feature {f2} with corr {score2}")
    # Drop the one with lower correlation to label
    if score1 >= score2:
        to_drop.add(f2)
        print(f"We keep feature {f1}")
    else:
        to_drop.add(f1)
        print(f"We keep feature {f2}")

df = pd.read_csv("added_features.csv")

df_reduced = df.drop(columns=list(to_drop), errors="ignore")
test_df = test_df.drop(columns=list(to_drop), errors="ignore")
df_reduced.to_csv("final_features.csv", index=False)


print(f"Dropped {len(to_drop)} redundant features due to high inter-feature correlation.")
print(sorted(to_drop))



Feature Age_at_Diagnosis with corr 0.008944142404855 and Feature Age_at_Diagnosis_x_Total_Chemo_Score with corr 0.008944142404855
We keep feature Age_at_Diagnosis
Feature Age_at_Surgery with corr 0.0359947708038703 and Feature Age_at_Surgery_x_Total_Chemo_Score with corr 0.0359947708038703
We keep feature Age_at_Surgery
Feature Alcohol_Use with corr 0.1802996397218876 and Feature Alcohol_Use_x_Stage_at_Diagnosis with corr 0.1802996397218876
We keep feature Alcohol_Use
Feature Smoking_History with corr 0.2133149544472466 and Feature Smoking_History_x_Stage_at_Diagnosis with corr 0.2133149544472466
We keep feature Smoking_History
Feature Total_Symptom_Score with corr 0.1050056945571586 and Feature Total_Symptom_Score_x_Total_Chemo_Score with corr 0.1050056945571586
We keep feature Total_Symptom_Score
Feature Weight with corr 0.1225173605509144 and Feature Weight_Group with corr 0.1220815365885201
We keep feature Weight
Feature Height with corr 0.0150731958118355 and Feature Height_Group 

In [98]:
df = pd.read_csv("final_features.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26473 entries, 0 to 26472
Data columns (total 79 columns):
 #   Column                                                                                                                                         Non-Null Count  Dtype  
---  ------                                                                                                                                         --------------  -----  
 0   id                                                                                                                                             26473 non-null  int64  
 1   Weight                                                                                                                                         26473 non-null  float64
 2   Height                                                                                                                                         26473 non-null  float64
 3   Urban_Rural               

In [99]:
df = pd.read_csv("final_features.csv")
X = df.drop(columns=["label", "id"])
y = df["label"]

# Train/test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

param_distributions = {
    "n_estimators": [200, 300, 500],
    "max_depth": [5, 7, 10],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma": [0, 1, 5],
    "min_child_weight": [1, 3, 5],
    "reg_alpha": [0, 0.1, 0.5],  
    "reg_lambda": [1, 1.5, 2.0]  
}

model = xgb.XGBClassifier(eval_metric="logloss", use_label_encoder=False)

search = RandomizedSearchCV(
    model,
    param_distributions,
    n_iter=200,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    random_state=42
)

search.fit(X_train_scaled, y_train)

preds = search.predict(X_val_scaled)

acc = accuracy_score(y_val, preds)
prec = precision_score(y_val, preds, average='binary')  
rec = recall_score(y_val, preds, average='binary')
f1 = f1_score(y_val, preds, average='binary')

print(f"XGBoost Best Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1-Score: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_val, preds))



🔍 Tuning XGBoost...


KeyboardInterrupt: 

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(X_reduced, y, test_size=0.2, stratify=y, random_state=42)

# # Parameter grids (simplified for random search)
# param_distributions = {
#     "Logistic Regression": {
#         "classifier__C": [0.01, 0.1, 1, 10],
#         "classifier__solver": ["lbfgs"]
#     },
#     "Support Vector Machine": {
#         "classifier__C": [0.1, 1, 10],
#         "classifier__kernel": ["linear", "rbf"]
#     },
#     "Decision Tree": {
#         "classifier__max_depth": [5, 10, 20, None],
#         "classifier__min_samples_split": [2, 5, 10]
#     },
#     "Random Forest": {
#         "classifier__n_estimators": [100, 200],
#         "classifier__max_depth": [10, 20, None],
#         "classifier__min_samples_split": [2, 5]
#     },
#     "XGBoost": {
#         "classifier__n_estimators": [100, 200],
#         "classifier__max_depth": [3, 5, 7],
#         "classifier__learning_rate": [0.05, 0.1]
#     }
# }

# base_models = {
#     "Logistic Regression": LogisticRegression(max_iter=1000),
#     #"Support Vector Machine": SVC(),
#     "Decision Tree": DecisionTreeClassifier(),
#     "Random Forest": RandomForestClassifier(),
#     "XGBoost": xgb.XGBClassifier(eval_metric="logloss")
# }

# # Evaluate all using RandomizedSearchCV
# results = {}
# best_models = {}

# for name, model in base_models.items():
#     pipe = Pipeline([
#         ("scaler", StandardScaler()),
#         ("classifier", model)
#     ])
    
#     search = RandomizedSearchCV(
#         pipe,
#         param_distributions[name],
#         n_iter=1,  # Number of random combinations to try
#         cv=3,
#         scoring="accuracy",
#         n_jobs=-1,
#         random_state=42
#     )
    
#     search.fit(X_train, y_train)
#     best_models[name] = search.best_estimator_
#     preds = search.predict(X_val)
#     acc = accuracy_score(y_val, preds)
#     results[name] = acc
#     print(f" {name} best accuracy: {acc:.4f}")

# # Show best model overall
# best_model_name = max(results, key=results.get)
# print(f"Best tuned model: {best_model_name} ({results[best_model_name]:.4f})")


In [None]:
test_ids = test_df["id"]

X_test = test_df.drop(columns=["id"])

X_test_scaled = scaler.transform(X_test)

test_preds = search.best_estimator_.predict(X_test_scaled)

submission_df = pd.DataFrame({
    "id": test_ids,
    "label": test_preds
})

submission_df.to_csv("xgb_test_predictions.csv", index=False)

📁 Saved predictions to xgb_test_predictions.csv ✅
