In [1]:
import sys
import os

# Add project root to Python path
sys.path.append(os.path.abspath(".."))
from configs.bootstrap import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
PROJECT_ROOT = Path("..").resolve()

DATA_DIR = PROJECT_ROOT / "data" / "raw"
train_path = DATA_DIR / "train.csv"
test_path = DATA_DIR / "test.csv"

In [3]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [4]:
train_df["Heart Disease"]=train_df["Heart Disease"].map({"Presence":1,"Absence":0})

In [5]:
train_df.head(5)

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,1
1,1,52,1,1,125,325,0,2,171,0,0.0,1,0,3,0
2,2,56,0,2,160,188,0,2,151,0,0.0,1,0,3,0
3,3,44,0,3,134,229,0,2,150,0,1.0,2,0,3,0
4,4,58,1,4,140,234,0,2,125,1,3.8,2,3,3,1


## 1. DATA UNDERSTANDING 


In [119]:
columns=train_df.columns
columns

Index(['id', 'Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120', 'EKG results', 'Max HR',
       'Exercise angina', 'ST depression', 'Slope of ST', 'Number of vessels fluro', 'Thallium', 'Heart Disease'],
      dtype='str')

In [120]:
print("Data shape:", train_df.shape)

Data shape: (630000, 15)


In [121]:
print("No of Rows:",train_df.shape[0])
print("No of Cols:",train_df.shape[1])


No of Rows: 630000
No of Cols: 15


In [122]:
# no of columns are 14
# no of rows are 630000
# 13 features and 1 target variable

In [123]:
train_df.info()

<class 'pandas.DataFrame'>
RangeIndex: 630000 entries, 0 to 629999
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       630000 non-null  int64  
 1   Age                      630000 non-null  int64  
 2   Sex                      630000 non-null  int64  
 3   Chest pain type          630000 non-null  int64  
 4   BP                       630000 non-null  int64  
 5   Cholesterol              630000 non-null  int64  
 6   FBS over 120             630000 non-null  int64  
 7   EKG results              630000 non-null  int64  
 8   Max HR                   630000 non-null  int64  
 9   Exercise angina          630000 non-null  int64  
 10  ST depression            630000 non-null  float64
 11  Slope of ST              630000 non-null  int64  
 12  Number of vessels fluro  630000 non-null  int64  
 13  Thallium                 630000 non-null  int64  
 14  Heart Disease  

In [124]:
train_df["Sex"].value_counts()

Sex
1    450283
0    179717
Name: count, dtype: int64

In [125]:
train_df["Chest pain type"].value_counts()

Chest pain type
4    329179
3    197278
2     74941
1     28602
Name: count, dtype: int64

In [126]:
train_df["BP"].value_counts()

BP
120    116574
130    106259
140     99775
150     44272
110     42639
160     24133
125     20480
112     18905
128     18559
138     18523
118     12128
132     11669
108      9853
135      9264
145      8156
124      7046
100      5914
134      5568
152      4828
180      4269
105      4134
126      3873
122      3773
115      3738
136      3640
142      3603
178      1982
170      1944
102      1754
94       1711
174       780
144       728
146       684
172       679
101       674
104       665
106       651
200       646
165       632
192       615
158       599
155       587
117       577
156       552
129       507
123       502
148       493
154       108
111        81
114        52
175        39
109        36
141        28
168        21
103        19
149        17
127        13
131        12
116        11
133         9
147         8
162         3
99          3
96          1
95          1
184         1
Name: count, dtype: int64

In [127]:
# ID -> WILL USE AS INDEX 
# Age -> NUMERICAL   ( USEFULL COLUMN AS IT CAN BE USED TO FIND OUT THE AGE GROUP FOR HEART DISEASE PATIENTS WHICH CAN BE USEFUL FOR PREDICTION)
# Sex -> CATEGORICAL-> NUMERICAL ENCODED IN 0 AND 1 (0 FOR FEMALE AND 1 FOR MALE)

# ChestPainType -> CATEGORICAL -> ONE HOT ENCODED (4 CATEGORIES: ASY, ATA, NAP, TA) 
# 1.Typical angina
# 2.Atypical angina
# 3.Non-anginal pain
# 4.Asymptomatic -> Dominant CATEGORY 


# Bp (Blood Pressure) -> NUMERICAL (USEFULL COLUMN AS HIGH BLOOD PRESSURE CAN BE A RISK FACTOR FOR HEART DISEASE)


In [128]:
train_df.iloc[70]

id                         70.0000
Age                        64.0000
Sex                         0.0000
Chest pain type             3.0000
BP                        120.0000
Cholesterol               211.0000
FBS over 120                0.0000
EKG results                 0.0000
Max HR                    162.0000
Exercise angina             0.0000
ST depression               0.0000
Slope of ST                 1.0000
Number of vessels fluro     0.0000
Thallium                    7.0000
Heart Disease               1.0000
Name: 70, dtype: float64

In [6]:
# null value containing features.

def findNullData(train_df:pd.DataFrame)-> None:
    for column in columns:
        nullCount=train_df[column].isna().sum()
        if nullCount>0:
            print(f"{column} contains {nullCount} null values.")

findNullData(train_df)

NameError: name 'columns' is not defined

In [None]:
# duplicate value containing features.

def findDuplicareData(train_df:pd.DataFrame)-> None:
        duplicateCount=train_df.duplicated().sum()
        if duplicateCount>0:
            print(f"Data contains {duplicateCount} duplicate values.")

findDuplicareData(train_df)

In [7]:
def drop_column(*args):
    train_df:pd.DataFrame=args[0]
    column_names=args[1:]
    print(f"Dataframe shape before drop:{train_df.shape}")
    train_df.drop(columns=column_names if len(column_names)>1 else column_names[0],inplace=True)
    print(f"{column_names} sucessfully dropped.")
    print(f"Dataframe shape After drop:{train_df.shape}")



In [8]:
X=train_df.iloc[:,0:-1]
Y=train_df.iloc[:,-1]

In [9]:
X

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
0,0,58,1,4,152,239,0,0,158,1,3.6000,2,2,7
1,1,52,1,1,125,325,0,2,171,0,0.0000,1,0,3
2,2,56,0,2,160,188,0,2,151,0,0.0000,1,0,3
3,3,44,0,3,134,229,0,2,150,0,1.0000,2,0,3
4,4,58,1,4,140,234,0,2,125,1,3.8000,2,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
629995,629995,56,0,1,110,226,0,0,132,0,0.0000,1,0,7
629996,629996,54,1,4,128,249,1,2,150,0,0.0000,2,0,3
629997,629997,67,1,4,130,275,0,0,149,0,0.0000,1,2,7
629998,629998,52,1,4,140,199,0,2,157,0,0.0000,1,0,6


In [10]:
X.columns

Index(['id', 'Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120', 'EKG results', 'Max HR',
       'Exercise angina', 'ST depression', 'Slope of ST', 'Number of vessels fluro', 'Thallium'],
      dtype='object')

In [11]:
drop_column(X,X.columns[0])

Dataframe shape before drop:(630000, 14)
('id',) sucessfully dropped.
Dataframe shape After drop:(630000, 13)


In [12]:
X

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
0,58,1,4,152,239,0,0,158,1,3.6000,2,2,7
1,52,1,1,125,325,0,2,171,0,0.0000,1,0,3
2,56,0,2,160,188,0,2,151,0,0.0000,1,0,3
3,44,0,3,134,229,0,2,150,0,1.0000,2,0,3
4,58,1,4,140,234,0,2,125,1,3.8000,2,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
629995,56,0,1,110,226,0,0,132,0,0.0000,1,0,7
629996,54,1,4,128,249,1,2,150,0,0.0000,2,0,3
629997,67,1,4,130,275,0,0,149,0,0.0000,1,2,7
629998,52,1,4,140,199,0,2,157,0,0.0000,1,0,6


In [13]:
Y

0         1
1         0
2         0
3         0
4         1
         ..
629995    0
629996    0
629997    1
629998    1
629999    0
Name: Heart Disease, Length: 630000, dtype: int64

2. Model Building

In [17]:
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, confusion_matrix

In [141]:
N_SPLITS = 5
print("\nTarget Distribution:")
print(train_df["Heart Disease"].value_counts(normalize=True))



Target Distribution:
Heart Disease
0   0.5517
1   0.4483
Name: proportion, dtype: float64


In [15]:
X_test = test_df.drop(columns=["id"])

In [153]:
skf = StratifiedKFold(
    n_splits=N_SPLITS,
    shuffle=True,
)

oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

train_scores = []
val_scores = []
models=[]

for fold, (train_idx, val_idx) in enumerate(skf.split(X, Y)):
    
    print(f"\n========== Fold {fold+1} ==========")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = Y.iloc[train_idx], Y.iloc[val_idx]
    

    model = LGBMClassifier(
        n_estimators=2000,
        learning_rate=0.03,
        num_leaves=64,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=1.0,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1
    )
    
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        eval_metric=["accuracy", "auc"],
        callbacks=[]
    )
    models.append(model)
    
    # Predictions
    train_pred = model.predict_proba(X_train)[:, 1]
    val_pred = model.predict_proba(X_val)[:, 1]
    
    # Store OOF
    oof_preds[val_idx] = val_pred
    
    # Store test preds
    test_preds += model.predict_proba(X_test)[:, 1] / N_SPLITS
    
    # Scores
    train_auc = roc_auc_score(y_train, train_pred)
    val_auc = roc_auc_score(y_val, val_pred)
    
    train_scores.append(train_auc)
    val_scores.append(val_auc)
    
    print(f"Train AUC: {train_auc:.5f}")
    print(f"Valid AUC: {val_auc:.5f}")


# ==========================================
# 7. FINAL CV RESULTS
# ==========================================
print("\n==============================")
print(f"Mean Train AUC: {np.mean(train_scores):.5f}")
print(f"Mean Valid AUC: {np.mean(val_scores):.5f}")
print(f"OOF AUC: {roc_auc_score(y, oof_preds):.5f}")
print("==============================")


[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021282 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 415
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448339 -> initscore=-0.207383
[LightGBM] [Info] Start training from score -0.207383
Train AUC: 0.96272
Valid AUC: 0.95478

[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017580 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 418
[LightGBM] [Info] Number of data points in the train set: 504000, nu

NameError: name 'y' is not defined

## TRAIN MODEL USING DIFFERENT PARAMS

#### 1 XgBoost

In [19]:
from xgboost import XGBClassifier

xgb_oof = np.zeros(len(X))
xgb_test = np.zeros(len(X_test))

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, val_idx in skf.split(X, Y):
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = Y.iloc[train_idx], Y.iloc[val_idx]

    model = XGBClassifier(
        n_estimators=2000,
        learning_rate=0.04729251623266966,
        max_depth=4,
        subsample=0.8806643349439545,
        colsample_bytree=0.7007181431585127,
        reg_alpha=1.1217471989768009,
        reg_lambda=0.3372236757990388,
        eval_metric="auc",
        random_state=42,
        n_jobs=-1,
        tree_method="hist",
    )

    model.fit(X_train, y_train)

    xgb_oof[val_idx] = model.predict_proba(X_val)[:,1]
    xgb_test += model.predict_proba(X_test)[:,1] / 5

print("XGB OOF AUC:", roc_auc_score(Y, xgb_oof))


XGB OOF AUC: 0.9553567502868096


In [29]:
from catboost import CatBoostClassifier

cat_oof = np.zeros(len(X))
cat_test = np.zeros(len(X_test))

for train_idx, val_idx in skf.split(X, Y):
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = Y.iloc[train_idx], Y.iloc[val_idx]

    model = CatBoostClassifier(
        iterations=2000,
        learning_rate=0.09963870125051141,
        depth=4,
        l2_leaf_reg=3.9620956129798675,
        loss_function="Logloss",
        eval_metric="AUC",
        random_seed=42,
        verbose=0
    )

    model.fit(X_train, y_train)

    cat_oof[val_idx] = model.predict_proba(X_val)[:,1]
    cat_test += model.predict_proba(X_test)[:,1] / 5

print("CatBoost OOF AUC:", roc_auc_score(Y, cat_oof))


CatBoost OOF AUC: 0.9554395246337422


RANDOM SEARCH CV:
Best Params: {'model__subsample': 0.7, 'model__reg_lambda': 0.5, 'model__reg_alpha': 0.5, 'model__num_leaves': 31, 'model__n_estimators': 1600, 'model__min_child_samples': 40, 'model__max_depth': 8, 'model__learning_rate': 0.02, 'model__colsample_bytree': 0.7}

OPTUNA:
Best AUC: 0.9553633184913405
Best Params: {'learning_rate': 0.04689367370315809, 'num_leaves': 49, 'max_depth': 4, 'min_child_samples': 32, 'subsample': 0.7223117664101711, 'colsample_bytree': 0.8352262295510223, 'reg_alpha': 2.248822478298761, 'reg_lambda': 0.17728212280528755}


In [21]:
from lightgbm import LGBMClassifier

lgb_oof = np.zeros(len(X))
lgb_test = np.zeros(len(X_test))

for train_idx, val_idx in skf.split(X, Y):
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = Y.iloc[train_idx], Y.iloc[val_idx]

  
    model = LGBMClassifier(
        n_estimators=1600,
        learning_rate=0.04689367370315809,
        num_leaves=49,
        subsample=0.7223117664101711,
        colsample_bytree=0.8352262295510223,
        reg_alpha= 2.248822478298761,
        reg_lambda= 0.17728212280528755,
        random_state=42,
        n_jobs=-1,
        min_child_samples=32,
        max_depth=4
        
        
    )
    
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        eval_metric=["accuracy", "auc"],
        callbacks=[]
    )

    lgb_oof[val_idx] = model.predict_proba(X_val)[:,1]
    lgb_test += model.predict_proba(X_test)[:,1] / 5

print("LGBM OOF AUC:", roc_auc_score(Y, lgb_oof))


[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019468 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 422
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448339 -> initscore=-0.207383
[LightGBM] [Info] Start training from score -0.207383
[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 417
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 13
[LightGBM] [In

In [22]:
os.getcwd()

'c:\\Users\\HP\\Documents\\Kaggle_Compition_Bacancy\\notebooks'

In [28]:
np.save("./xgb_oof.npy",xgb_oof)
np.save("./xgb_test.npy",xgb_test)
np.save("./lgb_oof.npy",xgb_oof)
np.save("./lgb_test.npy",xgb_test)


In [None]:
np.save("./xgb_oof.npy",xgb_oof)

In [210]:
oof_matrix = np.column_stack([
    lgb_oof,
    xgb_oof,
    cat_oof
])

test_matrix = np.column_stack([
    lgb_test,
    xgb_test,
    cat_test
])


In [204]:
final_test_preds = test_matrix.mean(axis=1)


In [205]:
final_test_preds

array([0.96017598, 0.00914352, 0.98751238, ..., 0.0464447 , 0.18398643,
       0.02569946], shape=(270000,))

In [206]:
test_preds

array([0.96687972, 0.01142295, 0.98669502, ..., 0.04504858, 0.16601311,
       0.02380722], shape=(270000,))

In [207]:
submission_path= PROJECT_ROOT / "data" / "raw" / "sample_submission.csv"
submission_df = pd.read_csv(submission_path) 

In [208]:
submission_df["Heart Disease"] = test_preds
submission_df.to_csv("xg-light-cat-1.csv", index=False)

print("Submission saved: xg-light-cat-1.csv")

Submission saved: xg-light-cat-1.csv


In [152]:
submission_df

Unnamed: 0,id,Heart Disease
0,630000,0.9674
1,630001,0.0108
2,630002,0.9878
3,630003,0.0080
4,630004,0.1745
...,...,...
269995,899995,0.1274
269996,899996,0.6564
269997,899997,0.0464
269998,899998,0.1691


## CAT/LGB/XG COMBINE

In [211]:
oof_matrix = np.column_stack([
    lgb_oof,
    xgb_oof,
    cat_oof
])

test_matrix = np.column_stack([
    lgb_test,
    xgb_test,
    cat_test
])

final_test_preds = test_matrix.mean(axis=1)


In [212]:
submission_path= PROJECT_ROOT / "data" / "raw" / "sample_submission.csv"
submission_df = pd.read_csv(submission_path) 

In [213]:
submission_df["Heart Disease"] = final_test_preds
submission_df.to_csv("optuna-cat-lgb-xg-boost-combine.csv", index=False)

print("Submission saved: lgbm_baseline.csv")

Submission saved: lgbm_baseline.csv


In [None]:
oof_matrix = np.column_stack([
    lgb_oof,
    xgb_oof,
    cat_oof
])

test_matrix = np.column_stack([
    lgb_test,
    xgb_test,
    cat_test
])
from sklearn.linear_model import LogisticRegression

meta_model = LogisticRegression(    
    solver="lbfgs",
    max_iter=1000,
    random_state=42)

meta_model.fit(oof_matrix, Y)

final_test_preds = meta_model.predict_proba(test_matrix)[:,1]

print("Stacked OOF AUC:", roc_auc_score(Y, meta_model.predict_proba(oof_matrix)[:,1]))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Stacked OOF AUC: 0.9540783550479291


In [48]:
final_test_preds.shape

(270000,)

In [49]:
submission_path= PROJECT_ROOT / "data" / "raw" / "sample_submission.csv"
submission_df = pd.read_csv(submission_path) 

In [50]:
submission_df["Heart Disease"] = final_test_preds
submission_df.to_csv("stack3-logistic-cat-lgb-xg-boost-combine.csv", index=False)

print("Submission saved: lgbm_baseline.csv")

Submission saved: lgbm_baseline.csv


In [177]:
submission_df

Unnamed: 0,id,Heart Disease
0,630000,0.9503
1,630001,0.0385
2,630002,0.9606
3,630003,0.0378
4,630004,0.1267
...,...,...
269995,899995,0.1079
269996,899996,0.7363
269997,899997,0.0494
269998,899998,0.1157


In [1]:
d={"model":"mean"}


In [4]:
d.get("model1",["median"])

['median']

## HISTGRADIENT BOOST META

In [54]:
xgb_oof_loaded=np.load(r"C:\Users\HP\Documents\Kaggle_Compition_Bacancy\submissions\submission11\oof_preds_xgb.npy")
cat_oof_loaded=np.load(r"C:\Users\HP\Documents\Kaggle_Compition_Bacancy\submissions\submission12\models\oof_preds_cat.npy")

xgb_test_loaded=np.load(r"C:\Users\HP\Documents\Kaggle_Compition_Bacancy\submissions\submission11\test_preds_xgb.npy")
cat_test_loaded=np.load(r"C:\Users\HP\Documents\Kaggle_Compition_Bacancy\submissions\submission12\models\test_preds_cat.npy")

oof_matrix_loaded=np.column_stack([
    xgb_oof_loaded,
    cat_oof_loaded])

test_matrix_loaded=np.column_stack([
    xgb_test_loaded,
    cat_test_loaded])


from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score


# =========================
# Meta Model
# =========================
meta_model = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_iter=500,
    max_depth=3,
    max_leaf_nodes=15,
    l2_regularization=0.5,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=20,
    random_state=42
)

meta_model.fit(oof_matrix_loaded, Y)

# OOF stacked score
stacked_oof = meta_model.predict_proba(oof_matrix_loaded)[:, 1]
print("Stacked OOF AUC:", roc_auc_score(Y, stacked_oof))

# Final test prediction
final_test_preds = meta_model.predict_proba(test_matrix_loaded)[:, 1]


Stacked OOF AUC: 0.9555301563895783


In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# Build OOF matrix (add hgb if you have it)
oof_matrix = np.column_stack([
    lgb_oof,
    xgb_oof,
    cat_oof
])

test_matrix = np.column_stack([
    lgb_test,
    xgb_test,
    cat_test
])

# =========================
# Meta Model
# =========================
meta_model = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_iter=500,
    max_depth=3,
    max_leaf_nodes=15,
    l2_regularization=0.5,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=20,
    random_state=42
)

meta_model.fit(oof_matrix, Y)

# OOF stacked score
stacked_oof = meta_model.predict_proba(oof_matrix)[:, 1]
print("Stacked OOF AUC:", roc_auc_score(Y, stacked_oof))

# Final test prediction
final_test_preds = meta_model.predict_proba(test_matrix)[:, 1]


Stacked OOF AUC: 0.955546213220738
Stacked OOF AUC: 0.955546213220738


In [111]:
import mlflow
import pathlib

BASE_PATH = pathlib.Path(
    r"C:\Users\HP\Documents\Kaggle_Compition_Bacancy\src\mlruns"
)

BASE_PATH.mkdir(parents=True, exist_ok=True)

mlflow.set_tracking_uri(str(BASE_PATH))


In [None]:
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    log_loss,
    confusion_matrix,
    classification_report,
    roc_curve
)

def log_stacking_run(
    meta_model,
    Y_true,
    stacked_oof,
    final_test_preds,
    oof_matrix,
    test_matrix,
    base_models=("lgb", "xgb", "cat"),
    threshold=0.5
):



    mlflow.set_experiment("stacking_experiment_histgradient")

    with mlflow.start_run(run_name="hist_meta_stacking"):

        # =========================
        # Log Meta Model Parameters
        # =========================
        mlflow.log_params(meta_model.get_params())

        # =========================
        # Log Base Model Info
        # =========================
        mlflow.log_param("base_models", "_".join(base_models))
        mlflow.log_metric("num_base_models", len(base_models))

        # =========================
        # Metrics
        # =========================
        y_pred_labels = (stacked_oof > threshold).astype(int)

        metrics = {
            "stacked_roc_auc": roc_auc_score(Y_true, stacked_oof),
            "stacked_accuracy": accuracy_score(Y_true, y_pred_labels),
            "stacked_precision": precision_score(Y_true, y_pred_labels),
            "stacked_recall": recall_score(Y_true, y_pred_labels),
            "stacked_f1": f1_score(Y_true, y_pred_labels),
            "stacked_log_loss": log_loss(Y_true, stacked_oof),
        }

        mlflow.log_metrics(metrics)

        # =========================
        # Threshold analysis
        # =========================
        for t in [0.3, 0.4, 0.5, 0.6, 0.7]:
            preds = (stacked_oof > t).astype(int)
            mlflow.log_metric(f"stacked_f1_at_{t}", f1_score(Y_true, preds))

        # =========================
        # Confusion Matrix
        # =========================
        cm = confusion_matrix(Y_true, y_pred_labels)
        np.save("stack_confusion_matrix.npy", cm)
        mlflow.log_artifact("stack_confusion_matrix.npy")

        # =========================
        # Classification Report
        # =========================
        report = classification_report(Y_true, y_pred_labels)
        with open("stack_classification_report.txt", "w") as f:
            f.write(report)
        mlflow.log_artifact("stack_classification_report.txt")

        # =========================
        # ROC Curve
        # =========================
        fpr, tpr, _ = roc_curve(Y_true, stacked_oof)
        roc_data = np.vstack((fpr, tpr)).T
        np.save("stack_roc_curve.npy", roc_data)
        mlflow.log_artifact("stack_roc_curve.npy")

        # =========================
        # Save OOF + Test matrices
        # =========================
        np.save("stack_oof_matrix.npy", oof_matrix)
        np.save("stack_test_matrix.npy", test_matrix)

        mlflow.log_artifact("stack_oof_matrix.npy")
        mlflow.log_artifact("stack_test_matrix.npy")

        # =========================
        # Save stacked predictions
        # =========================
        np.save("stacked_oof_preds.npy", stacked_oof)
        np.save("stacked_test_preds.npy", final_test_preds)

        mlflow.log_artifact("stacked_oof_preds.npy")
        mlflow.log_artifact("stacked_test_preds.npy")

        # =========================
        # Log Meta Model
        # =========================
        mlflow.sklearn.log_model(meta_model, "hist_meta_model")

    print("✅ Stacking run logged to MLflow")


Saved stacking outputs successfully


In [117]:
import numpy as np
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    log_loss,
    confusion_matrix,
    classification_report,
    roc_curve
)
import os

def save_stacking_logs(
    meta_model,
    Y_true,
    stacked_oof,
    final_test_preds,
    oof_matrix,
    test_matrix,
    base_models=("lgb", "xgb", "cat"),
    thresholds=(0.3, 0.4, 0.5, 0.6, 0.7),
    save_dir="stacking_logs"
):
    """
    Save stacking run results locally:
    - metrics
    - threshold-based F1
    - confusion matrix
    - classification report
    - ROC curve
    - OOF/test matrices
    - stacked predictions
    - meta model parameters
    """

    os.makedirs(save_dir, exist_ok=True)

    # =========================
    # Main metrics at 0.5
    # =========================
    y_pred_labels = (stacked_oof > 0.5).astype(int)

    metrics = {
        "roc_auc": roc_auc_score(Y_true, stacked_oof),
        "accuracy": accuracy_score(Y_true, y_pred_labels),
        "precision": precision_score(Y_true, y_pred_labels),
        "recall": recall_score(Y_true, y_pred_labels),
        "f1": f1_score(Y_true, y_pred_labels),
        "log_loss": log_loss(Y_true, stacked_oof),
    }

    # Save metrics
    metrics_file = os.path.join(save_dir, "metrics.txt")
    with open(metrics_file, "w") as f:
        for k, v in metrics.items():
            f.write(f"{k}: {v}\n")
    print(f"✅ Metrics saved at {metrics_file}")

    # =========================
    # Threshold-based F1
    # =========================
    threshold_file = os.path.join(save_dir, "threshold_f1.txt")
    with open(threshold_file, "w") as f:
        for t in thresholds:
            preds_t = (stacked_oof > t).astype(int)
            f1_t = f1_score(Y_true, preds_t)
            f.write(f"F1 at threshold {t}: {f1_t}\n")
    print(f"✅ Threshold F1 saved at {threshold_file}")

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(Y_true, y_pred_labels)
    cm_file = os.path.join(save_dir, "confusion_matrix.npy")
    np.save(cm_file, cm)
    print(f"✅ Confusion matrix saved at {cm_file}")

    # =========================
    # Classification Report
    # =========================
    report = classification_report(Y_true, y_pred_labels)
    report_file = os.path.join(save_dir, "classification_report.txt")
    with open(report_file, "w") as f:
        f.write(report)
    print(f"✅ Classification report saved at {report_file}")

    # =========================
    # ROC Curve
    # =========================
    fpr, tpr, _ = roc_curve(Y_true, stacked_oof)
    roc_data = np.vstack((fpr, tpr)).T
    roc_file = os.path.join(save_dir, "roc_curve.npy")
    np.save(roc_file, roc_data)
    print(f"✅ ROC curve saved at {roc_file}")

    # =========================
    # Save matrices & predictions
    # =========================
    np.save(os.path.join(save_dir, "oof_matrix.npy"), oof_matrix)
    np.save(os.path.join(save_dir, "test_matrix.npy"), test_matrix)
    np.save(os.path.join(save_dir, "stacked_oof_preds.npy"), stacked_oof)
    np.save(os.path.join(save_dir, "stacked_test_preds.npy"), final_test_preds)
    print(f"✅ OOF/Test matrices and predictions saved in {save_dir}")

    # =========================
    # Save meta model parameters
    # =========================
    params_file = os.path.join(save_dir, "meta_model_params.txt")
    with open(params_file, "w") as f:
        for k, v in meta_model.get_params().items():
            f.write(f"{k}: {v}\n")
    print(f"✅ Meta model parameters saved at {params_file}")


In [116]:
save_stacking_logs(
    meta_model=meta_model,
    Y_true=Y,
    stacked_oof=stacked_oof,
    final_test_preds=final_test_preds,
    oof_matrix=oof_matrix,
    test_matrix=test_matrix
)


✅ Metrics saved at stacking_logs\metrics.txt
✅ Threshold F1 saved at stacking_logs\threshold_f1.txt
✅ Confusion matrix saved at stacking_logs\confusion_matrix.npy
✅ Classification report saved at stacking_logs\classification_report.txt
✅ ROC curve saved at stacking_logs\roc_curve.npy
✅ OOF/Test matrices and predictions saved in stacking_logs
✅ Meta model parameters saved at stacking_logs\meta_model_params.txt


In [67]:
suspected_noise = (
    ((xgb_oof_loaded > 0.9) & (Y == 0)) |
    ((xgb_oof_loaded < 0.1) & (Y == 1))
)


In [68]:
print("Noise %:", suspected_noise.mean())


Noise %: 0.019668253968253967


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import numpy as np

meta_oof = np.zeros(len(Y))
meta_test_preds = np.zeros(len(test_matrix))

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, val_idx in skf.split(oof_matrix, Y):
    
    X_train_meta, X_val_meta = oof_matrix[train_idx], oof_matrix[val_idx]
    y_train_meta, y_val_meta = Y.iloc[train_idx], Y.iloc[val_idx]
    
    meta_model = HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_iter=500,
        max_depth=3,
        max_leaf_nodes=15,
        l2_regularization=0.5,
        random_state=42
    )
    
    # Train
    meta_model.fit(X_train_meta, y_train_meta)
    
    # OOF prediction
    meta_oof[val_idx] = meta_model.predict_proba(X_val_meta)[:,1]
    
    # Test prediction (average across folds)
    meta_test_preds += meta_model.predict_proba(test_matrix)[:,1] / skf.n_splits


print("Proper Stacked OOF AUC:", roc_auc_score(Y, meta_oof))


Proper Stacked OOF AUC: 0.9553963460591419


# 2 LEVEL STACKING

In [78]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
import numpy as np

# Containers
meta1_oof = np.zeros((len(Y), 3))   # 3 meta models
meta1_test = np.zeros((len(test_matrix), 3))

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(oof_matrix, Y)):
    
    X_train, X_val = oof_matrix[train_idx], oof_matrix[val_idx]
    y_train, y_val = Y.iloc[train_idx], Y.iloc[val_idx]
    
    # -------------------
    # 1️⃣ HistGradientBoosting
    # -------------------
    hist_model = HistGradientBoostingClassifier(
          learning_rate=0.05,
    max_iter=500,
    max_depth=3,
    max_leaf_nodes=15,
    l2_regularization=0.5,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=20,
    random_state=42
    )
    
    hist_model.fit(X_train, y_train)
    
    meta1_oof[val_idx, 0] = hist_model.predict_proba(X_val)[:,1]
    meta1_test[:, 0] += hist_model.predict_proba(test_matrix)[:,1] / skf.n_splits
    
    
    # -------------------
    # 2️⃣ Logistic Regression
    # -------------------
    log_model = LogisticRegression(
           solver="lbfgs",
    max_iter=1000,
    random_state=42
    )
    
    log_model.fit(X_train, y_train)
    
    meta1_oof[val_idx, 1] = log_model.predict_proba(X_val)[:,1]
    meta1_test[:, 1] += log_model.predict_proba(test_matrix)[:,1] / skf.n_splits
    
    
    # -------------------
    # 3️⃣ LightGBM
    # -------------------
    lgb_model = LGBMClassifier(
      n_estimators=1600,
        learning_rate=0.04689367370315809,
        num_leaves=49,
        subsample=0.7223117664101711,
        colsample_bytree=0.8352262295510223,
        reg_alpha= 2.248822478298761,
        reg_lambda= 0.17728212280528755,
        random_state=42,
        n_jobs=-1,
        min_child_samples=32,
        max_depth=4
    )
    
    lgb_model.fit(X_train, y_train)
    
    meta1_oof[val_idx, 2] = lgb_model.predict_proba(X_val)[:,1]
    meta1_test[:, 2] += lgb_model.predict_proba(test_matrix)[:,1] / skf.n_splits


print("Level-1 Stacked AUC:", roc_auc_score(Y, meta1_oof.mean(axis=1)))


[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008479 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448339 -> initscore=-0.207383
[LightGBM] [Info] Start training from score -0.207383




[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006545 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448339 -> initscore=-0.207383
[LightGBM] [Info] Start training from score -0.207383




[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003253 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448339 -> initscore=-0.207383
[LightGBM] [Info] Start training from score -0.207383




[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001605 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448339 -> initscore=-0.207383
[LightGBM] [Info] Start training from score -0.207383




[LightGBM] [Info] Number of positive: 225964, number of negative: 278036
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007513 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448341 -> initscore=-0.207375
[LightGBM] [Info] Start training from score -0.207375




Level-1 Stacked AUC: 0.9553912121763006


In [79]:
final_oof = np.zeros(len(Y))
final_test = np.zeros(len(test_matrix))

for fold, (train_idx, val_idx) in enumerate(skf.split(meta1_oof, Y)):
    
    X_train_final, X_val_final = meta1_oof[train_idx], meta1_oof[val_idx]
    y_train_final, y_val_final = Y.iloc[train_idx], Y.iloc[val_idx]
    
    final_model = HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_iter=500,
        max_depth=3,
        max_leaf_nodes=15,
        l2_regularization=0.5,
        random_state=42
    )
    
    final_model.fit(X_train_final, y_train_final)
    
    final_oof[val_idx] = final_model.predict_proba(X_val_final)[:,1]
    final_test += final_model.predict_proba(meta1_test)[:,1] / 5


print("Final Level-2 Stacked AUC:", roc_auc_score(Y, final_oof))


Final Level-2 Stacked AUC: 0.955348356387371


In [80]:
submission_path= PROJECT_ROOT / "data" / "raw" / "sample_submission.csv"
submission_df = pd.read_csv(submission_path) 

In [81]:
submission_df["Heart Disease"] = final_test
submission_df.to_csv("2level-stack-histgradient-folds-cat-xg-boost-combine.csv", index=False)

print("Submission saved: 2level-stack-histgradient-folds-cat-xg-boost-combine.csv")

Submission saved: 2level-stack-histgradient-folds-cat-xg-boost-combine.csv


In [82]:
submission_df

Unnamed: 0,id,Heart Disease
0,630000,0.9564
1,630001,0.0064
2,630002,0.9889
3,630003,0.0061
4,630004,0.2120
...,...,...
269995,899995,0.1465
269996,899996,0.6816
269997,899997,0.0509
269998,899998,0.1761


# LGB AS META MODEL

In [178]:
from lightgbm import LGBMClassifier

meta_model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=8,
    random_state=42
)

meta_model.fit(oof_matrix, Y)

final_test_preds = meta_model.predict_proba(test_matrix)[:,1]




In [179]:
final_test_preds.shape

(270000,)

In [180]:
submission_path= PROJECT_ROOT / "data" / "raw" / "sample_submission.csv"
submission_df = pd.read_csv(submission_path) 

In [181]:
submission_df["Heart Disease"] = final_test_preds
submission_df.to_csv("stack-LGBOOST-cat-lgb-xg-boost-combine.csv", index=False)

print("Submission saved: stack-LGBOOST-cat-lgb-xg-boost-combine.csv")

Submission saved: stack-LGBOOST-cat-lgb-xg-boost-combine.csv


In [182]:
submission_df

Unnamed: 0,id,Heart Disease
0,630000,0.9624
1,630001,0.0048
2,630002,0.9850
3,630003,0.0040
4,630004,0.2021
...,...,...
269995,899995,0.1673
269996,899996,0.6614
269997,899997,0.0411
269998,899998,0.1827


## RIDGE META

In [184]:
from sklearn.linear_model import Ridge

meta_model = Ridge(alpha=1.0)
meta_model.fit(oof_matrix, Y)

final_test_preds = meta_model.predict(test_matrix)


In [185]:
final_test_preds

array([0.95660234, 0.00744344, 0.98867252, ..., 0.04535912, 0.18142654,
       0.02603861], shape=(270000,))

In [187]:
final_test_preds.shape

(270000,)

In [188]:
submission_path= PROJECT_ROOT / "data" / "raw" / "sample_submission.csv"
submission_df = pd.read_csv(submission_path) 

In [189]:
submission_df["Heart Disease"] = final_test_preds
submission_df.to_csv("stack-Ridge-cat-lgb-xg-boost-combine.csv", index=False)

print("Submission saved: stack-LGBOOST-cat-lgb-xg-boost-combine.csv")

Submission saved: stack-LGBOOST-cat-lgb-xg-boost-combine.csv


In [190]:
oof_df = pd.DataFrame({
    "lgb": lgb_oof,
    "xgb": xgb_oof,
    "cat": cat_oof,
    "target": Y
})

print(oof_df.corr())


          lgb    xgb    cat  target
lgb    1.0000 0.9983 0.9980  0.8187
xgb    0.9983 1.0000 0.9985  0.8188
cat    0.9980 0.9985 1.0000  0.8198
target 0.8187 0.8188 0.8198  1.0000


In [191]:
from scipy.stats import rankdata

blend = (
    rankdata(lgb_oof) +
    rankdata(xgb_oof) +
    rankdata(cat_oof    )
)

print("Rank Blend AUC:",
      roc_auc_score(Y, blend))


Rank Blend AUC: 0.9553419607763806


## USING LIGHT BOOST 5 MODELS

In [192]:
models

[LGBMClassifier(colsample_bytree=0.8, learning_rate=0.03, n_estimators=2000,
                n_jobs=-1, num_leaves=64, random_state=42, reg_alpha=1.0,
                reg_lambda=1.0, subsample=0.8),
 LGBMClassifier(colsample_bytree=0.8, learning_rate=0.03, n_estimators=2000,
                n_jobs=-1, num_leaves=64, random_state=42, reg_alpha=1.0,
                reg_lambda=1.0, subsample=0.8),
 LGBMClassifier(colsample_bytree=0.8, learning_rate=0.03, n_estimators=2000,
                n_jobs=-1, num_leaves=64, random_state=42, reg_alpha=1.0,
                reg_lambda=1.0, subsample=0.8),
 LGBMClassifier(colsample_bytree=0.8, learning_rate=0.03, n_estimators=2000,
                n_jobs=-1, num_leaves=64, random_state=42, reg_alpha=1.0,
                reg_lambda=1.0, subsample=0.8),
 LGBMClassifier(colsample_bytree=0.8, learning_rate=0.03, n_estimators=2000,
                n_jobs=-1, num_leaves=64, random_state=42, reg_alpha=1.0,
                reg_lambda=1.0, subsample=0.8)]

In [197]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as np

# Reshape OOF for sklearn
X_meta = oof_preds.reshape(-1, 1)

meta_model = LogisticRegression(
    solver="lbfgs",
    max_iter=1000,
    random_state=42
)

meta_model.fit(X_meta, Y)

# OOF stacked predictions
stack_oof = meta_model.predict_proba(X_meta)[:, 1]

print("Original OOF AUC:", roc_auc_score(Y, oof_preds))
print("Meta (Calibrated) OOF AUC:", roc_auc_score(Y, stack_oof))

# Apply to test
X_test_meta = test_preds.reshape(-1, 1)
stack_test_preds = meta_model.predict_proba(X_test_meta)[:, 1]


Original OOF AUC: 0.9549529328880091
Meta (Calibrated) OOF AUC: 0.9549529328880091


In [198]:
stack_test_preds

array([0.95122285, 0.04230353, 0.95676016, ..., 0.0518908 , 0.10580823,
       0.04561972], shape=(270000,))

In [199]:
submission_path= PROJECT_ROOT / "data" / "raw" / "sample_submission.csv"
submission_df = pd.read_csv(submission_path) 

In [200]:
submission_df["Heart Disease"] = stack_test_preds
submission_df.to_csv("stack-lightboost-5models-cat-lgb-xg-boost-combine.csv", index=False)

print("Submission saved: stack-lightboost-5models-cat-lgb-xg-boost-combine.csv")

Submission saved: stack-lightboost-5models-cat-lgb-xg-boost-combine.csv
