In [69]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
# Load the raw competition tables.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Cast the target to int first, then map +/-inf to NaN in both tables.
df_train = df_train.assign(Y=df_train['Y'].astype(int))
df_train = df_train.replace([np.inf, -np.inf], np.nan)
df_test = df_test.replace([np.inf, -np.inf], np.nan)

# Target vector and feature matrix (identifier and target columns removed).
y = df_train['Y']
X = df_train.drop(columns=['id', 'Y'])

# Per-column medians of the training features, and a copy of X with NaNs filled by them.
median_values = X.median()
X_imputed = X.fillna(value=median_values)

# Class balance of the target: absolute counts and percentage shares.
counts = y.value_counts()
percentages = y.value_counts(normalize=True) * 100

print("Counts:\n", counts)
print("\nPercentages:\n", percentages)


Counts:
 Y
0    48732
1    14361
Name: count, dtype: int64

Percentages:
 Y
0    77.238362
1    22.761638
Name: proportion, dtype: float64


In [57]:
# --- Modeling and Evaluation (Gradient Boosting) ---

# Baseline gradient-boosting model and a seeded, shuffled 5-fold stratified splitter.
model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

auc_scores = []

# Split the un-imputed features X and impute inside each fold. Filling NaNs with
# medians computed on the whole training set (X_imputed) lets validation rows
# influence the preprocessing of the rows the model is trained on.
for train_index, val_index in skf.split(X, y):
    X_train_raw, X_val_raw = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Medians come from the training split only and are applied to both splits.
    fold_medians = X_train_raw.median()
    X_train = X_train_raw.fillna(fold_medians)
    X_val = X_val_raw.fillna(fold_medians)

    # Refit on this fold's training split.
    model.fit(X_train, y_train)

    # Score the positive-class probabilities on the held-out split.
    y_pred_proba = model.predict_proba(X_val)[:, 1]
    auc_scores.append(roc_auc_score(y_val, y_pred_proba))

print(f"Mean AUC-ROC: {np.mean(auc_scores):.4f}")

Mean AUC-ROC: 0.7921


In [61]:
# --- Final Prediction and Submission Pipeline (Requires test.csv) ---

def create_submission(model, X_train, y_train, median_values=None,
                      test_path='test.csv', output_path='submission.csv'):
    """Fit ``model`` and write positive-class probabilities for the test set to CSV.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
    X_train, y_train : training features and target passed straight to ``model.fit``.
        Pass the complete training set here; the function does not check this.
    median_values : Series or dict of per-column fill values, or None.
        Used to fill NaNs in the test features. Pass ``None`` (the default) to
        leave NaNs in place, e.g. for a model that was trained on un-imputed data.
    test_path : path of the test CSV; it must contain an ``id`` column.
    output_path : path of the CSV to write, with columns ``id`` and ``Target``.

    Returns
    -------
    None. If ``test_path`` does not exist, an error message is printed and no
    file is written (the model has already been fitted at that point).
    """
    # 1. Fit the final model on the training data supplied by the caller
    model.fit(X_train, y_train)

    # 2. Load the test data
    try:
        df_test = pd.read_csv(test_path)
    except FileNotFoundError:
        print(f"Error: {test_path} not found. Cannot generate submission file.")
        return

    # 3. Prepare test features: drop the identifier and map +/-inf to NaN
    X_test = df_test.drop('id', axis=1).replace([np.inf, -np.inf], np.nan)

    # Impute only when fill values are given, so the test features can be kept
    # consistent with however the training features were prepared.
    if median_values is not None:
        X_test = X_test.fillna(median_values)

    # 4. Predict the probability of the positive class (second predict_proba column)
    test_pred_proba = model.predict_proba(X_test)[:, 1]

    # 5. Assemble and save the submission
    submission = pd.DataFrame({
        'id': df_test['id'],
        'Target': test_pred_proba
    })
    submission.to_csv(output_path, index=False)
    print(f"Submission file '{output_path}' created successfully.")

# Example function call (You would run this after finding the best hyperparameters)
# create_submission(model, X_imputed, y, median_values)

In [64]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
# You must install this package: pip install catboost
from catboost import CatBoostClassifier 

# --- 1. Load and Prepare Data ---
# Reload the training table, cast the target to int and map +/-inf to NaN.
# No imputation here: the features are handed to CatBoost with their NaNs.
df_train = pd.read_csv('train.csv')
df_train['Y'] = df_train['Y'].astype(int)
df_train = df_train.replace([np.inf, -np.inf], np.nan)

y = df_train['Y']
X = df_train.drop(columns=['id', 'Y'])

# --- 2. Cross-Validation with CatBoost ---

catboost_params = dict(
    iterations=500,             # upper bound on the number of trees
    learning_rate=0.05,
    depth=6,
    loss_function='Logloss',    # objective that is optimised (binary classification)
    eval_metric='AUC',          # metric tracked on eval_set; drives early stopping
    random_seed=42,
    verbose=0,                  # keep training silent
    allow_writing_files=False,  # do not write training artefacts to disk
)
model_cat = CatBoostClassifier(**catboost_params)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores_cat = []

print("Starting 5-Fold Stratified Cross-Validation with CatBoost...")

for train_index, val_index in skf.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    # NOTE(review): the held-out fold is used both as the early-stopping eval_set
    # and for the reported AUC, so the fold scores may be slightly optimistic.
    model_cat.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=20)

    # AUC of the positive-class probabilities on the held-out fold.
    y_pred_proba = model_cat.predict_proba(X_val)[:, 1]
    auc_scores_cat.append(roc_auc_score(y_val, y_pred_proba))

# Average the per-fold scores and report.
mean_auc_cat = np.mean(auc_scores_cat)
print("\n--- CatBoost Cross-Validation Results ---")
print(f"Mean AUC-ROC (CatBoost): {mean_auc_cat:.4f}")

Starting 5-Fold Stratified Cross-Validation with CatBoost...

--- CatBoost Cross-Validation Results ---
Mean AUC-ROC (CatBoost): 0.7928


In [65]:
# Fit the final CatBoost model on the full training set (X, y). X_train / y_train
# are leftovers from the CV loop and hold only the last fold's training split.
# create_submission fills test NaNs with median_values, so the training features
# get the same imputation to keep train and test preprocessing consistent.
create_submission(model_cat, X.fillna(median_values), y, median_values)

Submission file 'submission.csv' created successfully.


In [66]:
# Read back the file written by create_submission and show its column names.
sub_df = pd.read_csv('submission.csv')
sub_df.columns

Index(['id', 'Target'], dtype='object')

In [68]:
# Rename the probability column to 'Y' and turn it into hard 0/1 labels:
# values strictly above 0.5 become 1, everything else 0.
sub_df = (
    sub_df
    .rename(columns={'Target': 'Y'})
    .assign(Y=lambda frame: (frame['Y'] > 0.5).astype(int))
)

# Save the labelled submission under a separate (capitalised) file name.
sub_df.to_csv('Submission.csv', index=False)