In [1]:
# imports
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold 
from sklearn.base import clone
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
# Notebook-wide default matplotlib figure size, given as (width, height) in inches.
plt.rcParams["figure.figsize"] = (20,10)

# Load data
def load_data(competition_dir='../input/playground-series-s4e10',
              original_path='../input/loan-approval-prediction/credit_risk_dataset.csv'):
    """Read the competition files plus the original dataset and split off the target.

    Parameters
    ----------
    competition_dir : str or path-like, optional
        Directory containing ``train.csv``, ``test.csv`` and
        ``sample_submission.csv``. Defaults to the Kaggle input folder the
        notebook was written against.
    original_path : str or path-like, optional
        CSV file with the original credit-risk data whose rows are appended
        to the competition training rows.

    Returns
    -------
    X_train : pandas.DataFrame
        Competition rows followed by the original rows, without ``loan_status``.
    X_test : pandas.DataFrame
        Test features, indexed by ``id``.
    y_train : pandas.Series
        ``loan_status`` for every row of ``X_train`` (same order).
    submission : pandas.DataFrame
        Sample submission, indexed by ``id``.

    Notes
    -----
    The competition rows are indexed by ``id`` while the original rows keep
    their default positional index, so index labels of ``X_train`` /
    ``y_train`` are not guaranteed to be unique. Use positional access
    (``.iloc``) on them.
    """
    train = pd.read_csv(f"{competition_dir}/train.csv", index_col='id')
    test = pd.read_csv(f"{competition_dir}/test.csv", index_col='id')
    submission = pd.read_csv(f"{competition_dir}/sample_submission.csv", index_col='id')
    original_data = pd.read_csv(original_path)

    # Columns are aligned by name, so the original file may order them differently.
    train_df = pd.concat([train, original_data])
    X_train = train_df.drop(['loan_status'], axis=1)
    y_train = train_df['loan_status']
    X_test = test

    return X_train, X_test, y_train, submission

# Data Preprocessing for CatBoost
def preprocess_catboost(df_train, df_test):
    """Prepare train/test features for CatBoost without touching the inputs.

    Numeric columns are mean-imputed (the imputer is fitted on the training
    frame only and then applied to the test frame). Afterwards *every*
    column, numeric ones included, is converted to its string form and then
    to the pandas ``category`` dtype.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Feature frames. Numeric columns are detected on ``df_train`` and must
        also exist in ``df_test``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New all-categorical train and test frames; the arguments are left
        unmodified.
    """
    # Copy first: the column assignments below would otherwise write into the
    # frames owned by the caller.
    df_train = df_train.copy()
    df_test = df_test.copy()

    numeric_columns = df_train.select_dtypes(include=['number']).columns

    # Handle missing values (skipped when there is nothing numeric to impute).
    if len(numeric_columns) > 0:
        imputer = SimpleImputer(strategy='mean')
        df_train[numeric_columns] = imputer.fit_transform(df_train[numeric_columns])
        df_test[numeric_columns] = imputer.transform(df_test[numeric_columns])

    # Convert every column to a string-valued categorical.
    df_train = df_train.astype(str).astype('category')
    df_test = df_test.astype(str).astype('category')

    return df_train, df_test

# Data Preprocessing for XGBoost
def preprocess_xgboost(df_train, df_test):
    """Prepare train/test features for XGBoost without touching the inputs.

    Numeric columns are mean-imputed and object/category columns are
    ordinal-encoded. Both transformers are fitted on the training frame only
    and then applied to the test frame; categories that appear only in the
    test frame are encoded as NaN.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Feature frames. Column types are detected on ``df_train``; the same
        columns must exist in ``df_test``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New, fully numeric train and test frames; the arguments are left
        unmodified.
    """
    # Copy first: the column assignments below would otherwise write into the
    # frames owned by the caller.
    df_train = df_train.copy()
    df_test = df_test.copy()

    # Detect both column groups before any column is overwritten.
    numeric_columns = df_train.select_dtypes(include=['number']).columns
    categorical_columns = df_train.select_dtypes(include=['object', 'category']).columns

    # Handle missing values (skipped when there is nothing numeric to impute).
    if len(numeric_columns) > 0:
        imputer = SimpleImputer(strategy='mean')
        df_train[numeric_columns] = imputer.fit_transform(df_train[numeric_columns])
        df_test[numeric_columns] = imputer.transform(df_test[numeric_columns])

    # Convert categorical columns using OrdinalEncoder (skipped when there are none).
    if len(categorical_columns) > 0:
        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
        df_train[categorical_columns] = encoder.fit_transform(df_train[categorical_columns])
        df_test[categorical_columns] = encoder.transform(df_test[categorical_columns])

    return df_train, df_test

def get_catboost_predictions(X, y, X_test, n_splits=10, random_state=2, task_type='GPU'):
    """Train one CatBoost classifier per stratified fold and predict the test set with each.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features. Columns with ``category`` dtype are passed to
        CatBoost as categorical features.
    y : pandas.Series
        Binary target aligned positionally with ``X``.
    X_test : pandas.DataFrame
        Test features with the same columns as ``X``.
    n_splits : int, optional
        Number of stratified folds.
    random_state : int, optional
        Seed used for both the fold shuffling and the CatBoost models.
    task_type : str, optional
        CatBoost processing unit (``'GPU'`` or ``'CPU'``). Defaults to
        ``'GPU'``; pass ``'CPU'`` on machines without a GPU.

    Returns
    -------
    predictions : pandas.DataFrame
        One column per fold (labelled with the fold number, starting at 0)
        holding the positive-class probability for every row of ``X_test``,
        in ``X_test`` row order with a default positional index.
    fold_scores : list of float
        Validation ROC AUC of each fold, in fold order.

    Notes
    -----
    Each fold's validation part is used both for early stopping and for the
    reported AUC, so the printed scores can be optimistic.
    """
    # Positions of the categorical columns (avoid shadowing the builtin `type`).
    categorical_indexes = [
        position for position, dtype in enumerate(X.dtypes) if dtype == 'category'
    ]

    model = CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='AUC',
        iterations=5000,
        early_stopping_rounds=200,
        task_type=task_type,
        random_seed=random_state,
        verbose=False
    )

    fold_predictions = {}
    fold_scores = []

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"CatBoost - Training Fold {fold + 1}")

        # Positional selection: index labels of X / y need not be unique.
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fresh, unfitted copy of the template model for every fold.
        fold_model = clone(model)
        train_pool = Pool(X_train, y_train, cat_features=categorical_indexes)
        val_pool = Pool(X_val, y_val, cat_features=categorical_indexes)

        fold_model.fit(X=train_pool, eval_set=val_pool)

        # Validate
        val_pred = fold_model.predict_proba(X_val)[:, 1]
        fold_score = roc_auc_score(y_val, val_pred)
        fold_scores.append(fold_score)
        print(f"AUC score: {fold_score:.6f}")

        # Predict test
        fold_predictions[fold] = fold_model.predict_proba(X_test)[:, 1]

    # Build the frame once; columns are the fold numbers, rows are positional.
    predictions = pd.DataFrame(fold_predictions)

    print("\nCatBoost Results:")
    print(f"Mean AUC: {np.mean(fold_scores):.6f} (±{np.std(fold_scores):.6f})")

    return predictions, fold_scores

def get_xgboost_predictions(X, y, X_test, n_splits=10, random_state=2):
    """Train one XGBoost classifier per stratified fold and predict the test set with each.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features.
    y : pandas.Series
        Binary target aligned positionally with ``X``.
    X_test : pandas.DataFrame
        Test features with the same columns as ``X``.
    n_splits : int, optional
        Number of stratified folds.
    random_state : int, optional
        Seed used for both the fold shuffling and the XGBoost models.

    Returns
    -------
    predictions : pandas.DataFrame
        One column per fold (labelled with the fold number, starting at 0)
        holding the positive-class probability for every row of ``X_test``,
        in ``X_test`` row order with a default positional index.
    fold_scores : list of float
        Validation ROC AUC of each fold, in fold order.

    Notes
    -----
    Each fold's validation part is used both for early stopping and for the
    reported AUC, so the printed scores can be optimistic.
    """
    # `use_label_encoder` is deliberately not passed: it is deprecated, and
    # XGBoost releases that accept `early_stopping_rounds` in the constructor
    # already behave as if it were False (newer ones warn that it is unused).
    model = XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        enable_categorical=True,
        n_estimators=5000,
        early_stopping_rounds=200,
        tree_method='hist',
        random_state=random_state
    )

    fold_predictions = {}
    fold_scores = []

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"XGBoost - Training Fold {fold + 1}")

        # Positional selection: index labels of X / y need not be unique.
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fresh, unfitted copy of the template model for every fold.
        fold_model = clone(model)
        fold_model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )

        # Validate
        val_pred = fold_model.predict_proba(X_val)[:, 1]
        fold_score = roc_auc_score(y_val, val_pred)
        fold_scores.append(fold_score)
        print(f"AUC score: {fold_score:.6f}")

        # Predict test
        fold_predictions[fold] = fold_model.predict_proba(X_test)[:, 1]

    # Build the frame once; columns are the fold numbers, rows are positional.
    predictions = pd.DataFrame(fold_predictions)

    print("\nXGBoost Results:")
    print(f"Mean AUC: {np.mean(fold_scores):.6f} (±{np.std(fold_scores):.6f})")

    return predictions, fold_scores

def calculate_weighted_predictions(predictions_df, fold_scores):
    """Blend per-fold prediction columns into one vector, weighted by fold score.

    The scores are normalised to sum to 1 and the i-th score weights the i-th
    column *by position*, so the column labels of ``predictions_df`` do not
    matter.

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        One column of predictions per fold/model.
    fold_scores : sequence of float
        One score per column of ``predictions_df``, in column order.

    Returns
    -------
    pandas.Series
        Weighted average of the columns, indexed like ``predictions_df``.

    Raises
    ------
    ValueError
        If the number of scores differs from the number of columns, or if the
        scores sum to zero (weights would be undefined).
    """
    weights = np.asarray(fold_scores, dtype=float)
    if weights.ndim != 1 or len(weights) != predictions_df.shape[1]:
        raise ValueError(
            f"expected one score per prediction column "
            f"({predictions_df.shape[1]}), got {weights.size}"
        )

    total = weights.sum()
    if total == 0:
        raise ValueError("fold scores sum to zero; cannot derive weights")
    weights = weights / total

    # (rows x folds) @ (folds,) -> weighted sum over folds for every row.
    blended = predictions_df.to_numpy(dtype=float) @ weights
    return pd.Series(blended, index=predictions_df.index)



In [2]:
# Load the training features/target, the test features and the sample submission
X_train, X_test, y_train, submission = load_data()

# CatBoost: preprocess copies of the raw frames (so the raw frames stay available
# for the XGBoost preprocessing below), then train 10 folds and predict the test set
X_train_cat, X_test_cat = preprocess_catboost(X_train.copy(), X_test.copy())
catboost_predictions, catboost_scores = get_catboost_predictions(X_train_cat, y_train, X_test_cat, n_splits=10)

# XGBoost: same procedure with its own preprocessing of fresh copies
X_train_xgb, X_test_xgb = preprocess_xgboost(X_train.copy(), X_test.copy())
xgboost_predictions, xgboost_scores = get_xgboost_predictions(X_train_xgb, y_train, X_test_xgb, n_splits=10)

# Disabled alternative blend, kept for reference: weight the folds within each model ...
# catboost_weighted = calculate_weighted_predictions(catboost_predictions, catboost_scores)
# xgboost_weighted = calculate_weighted_predictions(xgboost_predictions, xgboost_scores)

# ... then take the plain average of the two models
# final_predictions = (catboost_weighted + xgboost_weighted) / 2


FileNotFoundError: [Errno 2] No such file or directory: '../input/playground-series-s4e10/train.csv'

In [45]:
# Relabel the XGBoost fold columns so they continue after the CatBoost ones;
# the combined frame then has unique, consecutive integer column labels.
xgboost_predictions_adjusted = xgboost_predictions.rename(
    columns=lambda col: len(catboost_predictions.columns) + int(col)
)
predictions = pd.concat(
    [catboost_predictions, xgboost_predictions_adjusted],
    axis=1,
)
display(predictions)

# One score per column, in the same order as the columns above.
scores = catboost_scores + xgboost_scores
weighted = calculate_weighted_predictions(predictions, scores)
final_predictions = weighted

# Write the blended probabilities into the sample submission and save it.
submission['loan_status'] = final_predictions.values
submission.to_csv('submission.csv')
print("\nSubmission file created")
display(submission)

Unnamed: 0,0,1,2,3
0,0.999897,0.999958,0.999901,0.999909
1,0.016581,0.018846,0.094697,0.024023
2,0.444118,0.326018,0.589680,0.644720
3,0.006274,0.004125,0.006926,0.008077
4,0.024523,0.022847,0.058435,0.048696
...,...,...,...,...
39093,0.084150,0.109105,0.074059,0.060577
39094,0.006141,0.003326,0.010680,0.003946
39095,0.003085,0.000965,0.005762,0.006059
39096,0.267411,0.198676,0.107545,0.434555



Submission file created


Unnamed: 0_level_0,loan_status
id,Unnamed: 1_level_1
58645,0.999916
58646,0.038442
58647,0.500557
58648,0.006345
58649,0.038553
...,...
97738,0.082046
97739,0.006017
97740,0.003958
97741,0.251916
