## Imports

In [5]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold 
from sklearn.base import clone

from catboost import CatBoostClassifier, Pool

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,10) # make plots bigger

## Load Data

In [6]:
# Load data
train = pd.read_csv('../input/playground-series-s4e10/train.csv', index_col = 'id')
test = pd.read_csv('../input/playground-series-s4e10/test.csv', index_col = 'id')
submission = pd.read_csv('../input/playground-series-s4e10/sample_submission.csv', index_col = 'id')
original_data = pd.read_csv('../input/loan-approval-prediction/credit_risk_dataset.csv')

# Display sample data
display(train.head(2))
display(test.head(2))
display(submission.head(2))
display(original_data.head(2))

train_df = pd.concat([train,original_data])
X_train = train_df.drop(['loan_status'], axis=1)
y_train = train_df['loan_status']
X_test = test

X_test.info()

Unnamed: 0_level_0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0


Unnamed: 0_level_0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2
58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.1,Y,4


Unnamed: 0_level_0,loan_status
id,Unnamed: 1_level_1
58645,0.5
58646,0.5


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2


<class 'pandas.core.frame.DataFrame'>
Index: 39098 entries, 58645 to 97742
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  39098 non-null  int64  
 1   person_income               39098 non-null  int64  
 2   person_home_ownership       39098 non-null  object 
 3   person_emp_length           39098 non-null  float64
 4   loan_intent                 39098 non-null  object 
 5   loan_grade                  39098 non-null  object 
 6   loan_amnt                   39098 non-null  int64  
 7   loan_int_rate               39098 non-null  float64
 8   loan_percent_income         39098 non-null  float64
 9   cb_person_default_on_file   39098 non-null  object 
 10  cb_person_cred_hist_length  39098 non-null  int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 3.6+ MB


## Data Preprocessing

In [7]:
numeric_columns = X_train.select_dtypes(include=['number']).columns

# Data Preprocessing
def preprocess_data(df_train, df_test):
    
    # Handle missing values in numeric data (fill missing values with the mean)
    imputer = SimpleImputer(strategy='mean')  # Create an imputer
    df_train[numeric_columns] = imputer.fit_transform(df_train[numeric_columns])
    df_test[numeric_columns] = imputer.transform(df_test[numeric_columns])   
    
    # convert all columns to category (gives better results for CatBoost)
    df_train[df_train.columns] = df_train.astype(str).astype('category')
    df_test[df_test.columns] = df_test.astype(str).astype('category')
           
    return df_train, df_test
    
    
# Preprocess the train and test data
X_train, X_test = preprocess_data(X_train, X_test)

# Display the first few rows to check the result
display(X_train.head(5))
display(X_test.head(5))

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,37.0,35000.0,RENT,0.0,EDUCATION,B,6000.0,11.49,0.17,N,14.0
1,22.0,56000.0,OWN,6.0,MEDICAL,C,4000.0,13.35,0.07,N,2.0
2,29.0,28800.0,OWN,8.0,PERSONAL,A,6000.0,8.9,0.21,N,10.0
3,30.0,70000.0,RENT,14.0,VENTURE,B,12000.0,11.11,0.17,N,5.0
4,22.0,60000.0,RENT,2.0,MEDICAL,A,6000.0,6.92,0.1,N,3.0


Unnamed: 0_level_0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
58645,23.0,69000.0,RENT,3.0,HOMEIMPROVEMENT,F,25000.0,15.76,0.36,N,2.0
58646,26.0,96000.0,MORTGAGE,6.0,PERSONAL,C,10000.0,12.68,0.1,Y,4.0
58647,26.0,30000.0,RENT,5.0,VENTURE,E,4000.0,17.19,0.13,Y,2.0
58648,33.0,50000.0,RENT,4.0,DEBTCONSOLIDATION,A,7000.0,8.9,0.14,N,7.0
58649,26.0,102000.0,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000.0,16.32,0.15,Y,4.0


## Model

In [8]:
categorical_indexes = [index for index, type in enumerate(X_train.dtypes) if type == 'object' or type == 'category']

def get_model_oof_predictions(X, y, model, n_splits=2, random_state=2):

    # Initialize StratifiedKFold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    
    # Convert X and y to numpy arrays
    X_array = X.values
    y_array = y.values
    
    # Initialize results dictionary
    results = {
        'oof_predictions': np.zeros(len(X)),
        'y_val' : np.zeros(len(X)),
        'fold_scores': [],
        'mean_auc': 0.0,
        'oof_models' : []
    }
    
    # Perform k-fold cross-validation
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_array,y_array), 1):
        # Split data
        X_train, X_val = X_array[train_idx], X_array[val_idx]
        y_train, y_val = y_array[train_idx], y_array[val_idx]
        
        # Train and predict
        print(f"Training - Fold {fold}")
            
        # Clone and train the model for this fold
        fold_model = clone(model)
    
        X_train_pool = Pool(X_train, y_train, cat_features=categorical_indexes)
        X_valid_pool = Pool(X_val, y_val, cat_features=categorical_indexes)

        fold_model.fit(X=X_train_pool, eval_set=X_valid_pool)

        # Make predictions on validation fold
        val_pred = fold_model.predict_proba(X_val)[:, 1]
        results['oof_predictions'][val_idx] = val_pred
        results['y_val'][val_idx] = y_val # save for further analysis
            
        # Calculate and store fold score
        fold_score = roc_auc_score(y_val, val_pred)
        results['fold_scores'].append(fold_score)
        print(f"AUC score: {fold_score}")
            
        # store model for further use for submission
        results['oof_models'].append(fold_model)
    
    # Calculate overall score
    mean_auc = roc_auc_score(y_array, results['oof_predictions'])
    results['mean_auc'] = mean_auc

    fold_scores = results['fold_scores']
    print(f"\n Results:")
    print(f"Fold AUC scores: {', '.join([f'{score:.8f}' for score in fold_scores])}")
    print(f"Mean AUC: {mean_auc:.8f} (±{np.std(fold_scores):.8f})")
    
    return results 

model = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    #learning_rate=0.05,
    iterations=5000,
    early_stopping_rounds=200,
    #depth=9,
    #random_strength=0,
    #l2_leaf_reg=0.5,
    task_type='GPU',
    random_seed=2,
    verbose=False
)

# Get OOF predictions
results = get_model_oof_predictions(X_train, y_train, model)

Training - Fold 1


Default metric period is 5 because AUC is/are not implemented for GPU


AUC score: 0.9630878283967378
Training - Fold 2


Default metric period is 5 because AUC is/are not implemented for GPU


AUC score: 0.9627640758691379

 Results:
Fold AUC scores: 0.96308783, 0.96276408
Mean AUC: 0.96290347 (±0.00016188)


In [17]:
results['oof_predictions'][0]

0.03384140450287685

In [12]:
for fold_id, y_val in enumerate(results['y_val']):
    display(fold_id,y_val[fold_id])
    residuals = y_val - result['oof_predictions'][fold_id]
    display(residuals)
    sns.displot(residuals, kde=True)

IndexError: invalid index to scalar variable.

### Get OOF Predictions for test data

In [None]:
predictions = pd.DataFrame()
for fold_id, model in enumerate(results['oof_models']):
    predictions[fold_id] = model.predict_proba(X_test)[:, 1]
    
predictions
    

In [None]:
# # Create submission - averaging
# submission['loan_status'] = predictions.mean(axis=1).values
# submission.to_csv('submission.csv')
# submission.head()

In [None]:
# weighted average
def calculate_weighted_predictions(predictions_df, results):

    # Get fold scores from results
    weights = np.array(results['fold_scores'])
    
    # Normalize weights so they sum to 1
    weights = weights / weights.sum()
    
#     print(weights)
    
    print("Fold weights:")
    for fold_idx, weight in enumerate(weights, 1):
        print(f"Fold {fold_idx}: {weight:.8f} (AUC: {results['fold_scores'][fold_idx-1]:.8f})")
    
    # Calculate weighted average for each row
    weighted_predictions = np.zeros(len(predictions_df))
    for fold_idx, weight in enumerate(weights):
        weighted_predictions += predictions_df[fold_idx] * weight
        
    return weighted_predictions

# Calculate weighted predictions using the results from get_model_oof_predictions
pred = calculate_weighted_predictions(predictions, results)

# Save submission
submission['loan_status'] = pred.values
submission.to_csv('submission.csv')
submission.head()