In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, make_scorer, accuracy_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from scipy.stats import mode

In [2]:
df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [3]:
arr = df.values.astype(float)
finite_mask = np.isfinite(arr)
col_min = np.nanmin(np.where(finite_mask, arr, np.nan), axis=0)
col_max = np.nanmax(np.where(finite_mask, arr, np.nan), axis=0)
arr[arr == -np.inf] = col_min[np.where(arr == -np.inf)[1]]
arr[arr == np.inf]  = col_max[np.where(arr == np.inf)[1]]
df_fixed = pd.DataFrame(arr, columns=df.columns)
df = df_fixed

In [4]:
arr = test_df.values.astype(float)
finite_mask = np.isfinite(arr)
col_min = np.nanmin(np.where(finite_mask, arr, np.nan), axis=0)
col_max = np.nanmax(np.where(finite_mask, arr, np.nan), axis=0)
arr[arr == -np.inf] = col_min[np.where(arr == -np.inf)[1]]
arr[arr == np.inf]  = col_max[np.where(arr == np.inf)[1]]
test_df_fixed = pd.DataFrame(arr, columns=test_df.columns)
test_df = test_df_fixed

In [5]:
cols = df.columnscols = df.columns
test_cols = test_df.columns
imputer = SimpleImputer(strategy='mean')
df[cols] = imputer.fit_transform(df[cols])
test_df[test_cols] = imputer.fit_transform(test_df[test_cols])
test_cols = test_df.columns
imputer = SimpleImputer(strategy='mean')
df[cols] = imputer.fit_transform(df[cols])
test_df[test_cols] = imputer.fit_transform(test_df[test_cols])

  cols = df.columnscols = df.columns


In [6]:
df = df.drop(columns = 'id')
test_id = test_df['id']
test_df = test_df.drop(columns = 'id')

In [7]:
Y = df['Y']
X = df.drop(columns = 'Y')

In [8]:
cols = X.columns
test_cols = test_df.columns
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X, columns=cols)
test_df = scaler.fit_transform(test_df)
test_df = pd.DataFrame(test_df, columns=test_cols)

In [9]:


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
test_preds = []  # if you later have test data

for fold, (train_idx, val_idx) in enumerate(skf.split(X, Y), 1):
    print(f"\n----- Fold {fold} -----")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = Y.iloc[train_idx], Y.iloc[val_idx]


    model = CatBoostClassifier(
        iterations=5000,
        learning_rate=0.01,
        depth=6,
        eval_metric="AUC",
        random_seed=42,
        verbose=200,
        class_weights=[1, (len(Y)-sum(Y))/sum(Y)],
        use_best_model=True
        )
    model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)

    # Out-of-fold predictions
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]

    # Evaluate
    acc = accuracy_score(y_val, (oof_preds[val_idx] > 0.5).astype(int))
    auc = roc_auc_score(y_val, oof_preds[val_idx])
    print(f"Fold {fold} Accuracy: {acc:.4f}, ROC AUC: {auc:.4f}")

# Overall performance
final_acc = accuracy_score(Y, (oof_preds > 0.5).astype(int))
final_auc = roc_auc_score(Y, oof_preds)

print(f"\nOverall Accuracy: {final_acc:.4f}")
print(f"Overall ROC AUC:  {final_auc:.4f}")


----- Fold 1 -----
0:	test: 0.7755322	best: 0.7755322 (0)	total: 172ms	remaining: 14m 20s
200:	test: 0.7915561	best: 0.7915561 (200)	total: 9.25s	remaining: 3m 40s
400:	test: 0.7931822	best: 0.7931822 (400)	total: 18.4s	remaining: 3m 30s
600:	test: 0.7935286	best: 0.7935349 (595)	total: 28.3s	remaining: 3m 27s
800:	test: 0.7937426	best: 0.7937426 (800)	total: 37.3s	remaining: 3m 15s
1000:	test: 0.7939969	best: 0.7939988 (999)	total: 50.7s	remaining: 3m 22s
1200:	test: 0.7939532	best: 0.7940559 (1069)	total: 1m 3s	remaining: 3m 20s
1400:	test: 0.7940797	best: 0.7940816 (1381)	total: 1m 13s	remaining: 3m 8s
1600:	test: 0.7941215	best: 0.7941921 (1501)	total: 1m 22s	remaining: 2m 55s
1800:	test: 0.7940272	best: 0.7941921 (1501)	total: 1m 30s	remaining: 2m 40s
2000:	test: 0.7939501	best: 0.7941921 (1501)	total: 1m 38s	remaining: 2m 27s
2200:	test: 0.7936386	best: 0.7941921 (1501)	total: 1m 46s	remaining: 2m 14s
2400:	test: 0.7936287	best: 0.7941921 (1501)	total: 1m 53s	remaining: 2m 3s
26

In [10]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(test_df))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, Y), 1):
    print(f"\n----- Fold {fold} -----")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = Y.iloc[train_idx], Y.iloc[val_idx]

    model = CatBoostClassifier(
        iterations=5000,
        learning_rate=0.05,
        depth=6,
        eval_metric="AUC",
        random_seed=42,
        verbose=200,
        class_weights=[1, (len(Y)-sum(Y))/sum(Y)],
        use_best_model=True
        )
    model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)

    # OOF validation predictions (probabilities)
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]

    # Average test predictions across folds (probabilities)
    test_preds += model.predict_proba(test_df)[:, 1] / skf.n_splits

    # Metrics for this fold
    acc = accuracy_score(y_val, (oof_preds[val_idx] > 0.5).astype(int))
    auc = roc_auc_score(y_val, oof_preds[val_idx])
    print(f"Fold {fold} Accuracy: {acc:.4f}, ROC AUC: {auc:.4f}")

# Overall metrics
final_acc = accuracy_score(Y, (oof_preds > 0.5).astype(int))
final_auc = roc_auc_score(Y, oof_preds)
print(f"\nOverall Accuracy: {final_acc:.4f}")
print(f"Overall ROC AUC:  {final_auc:.4f}")



# Save to CSV
submission = pd.DataFrame({
    "id": test_id,     # replace with your test IDs if available
    "prediction": test_preds
})
submission.to_csv("submission.csv", index=False)

print("\n✅ Predictions saved to submission.csv")


----- Fold 1 -----
0:	test: 0.7755322	best: 0.7755322 (0)	total: 23.9ms	remaining: 1m 59s
200:	test: 0.7939720	best: 0.7940050 (177)	total: 7.52s	remaining: 2m 59s
400:	test: 0.7929694	best: 0.7941045 (218)	total: 15.2s	remaining: 2m 54s
600:	test: 0.7919837	best: 0.7941045 (218)	total: 22.8s	remaining: 2m 46s
800:	test: 0.7905032	best: 0.7941045 (218)	total: 30.2s	remaining: 2m 38s
1000:	test: 0.7894921	best: 0.7941045 (218)	total: 37.7s	remaining: 2m 30s
1200:	test: 0.7880870	best: 0.7941045 (218)	total: 45.2s	remaining: 2m 23s
1400:	test: 0.7866257	best: 0.7941045 (218)	total: 52.5s	remaining: 2m 14s
1600:	test: 0.7858794	best: 0.7941045 (218)	total: 1m	remaining: 2m 7s
1800:	test: 0.7852002	best: 0.7941045 (218)	total: 1m 7s	remaining: 1m 59s
2000:	test: 0.7839992	best: 0.7941045 (218)	total: 1m 14s	remaining: 1m 51s
2200:	test: 0.7827738	best: 0.7941045 (218)	total: 1m 22s	remaining: 1m 44s
2400:	test: 0.7817413	best: 0.7941045 (218)	total: 1m 29s	remaining: 1m 36s
2600:	test: 0.