In [10]:
# Import Libraries
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import trange
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.base import clone
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
import gc

import warnings
warnings.filterwarnings("ignore")

# Load the data
# NOTE(review): these are absolute paths on one machine; anyone else running the
# notebook has to edit them before this cell works.
test_data = pd.read_csv("/Users/raghavgarg/Downloads/playground-series-s4e10/test.csv")
train_data = pd.read_csv("/Users/raghavgarg/Downloads/playground-series-s4e10/train.csv")
# Loaded here but not referenced again in the visible part of this notebook.
original_data = pd.read_csv("/Users/raghavgarg/Downloads/credit_risk_dataset.csv")

# Preprocessing function
def preprocess(X):
    """Engineer features, then impute, scale and one-hot encode a raw loan frame.

    The transformer is fitted on the frame passed in, so each call learns its
    own medians, scaling statistics and category levels from that frame alone.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature columns. The code reads ``person_age``, ``person_income``,
        ``person_emp_length``, ``loan_amnt``, ``loan_int_rate``, ``loan_grade``,
        ``cb_person_default_on_file``, ``person_home_ownership`` and
        ``loan_intent``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Imputed and scaled numerical columns followed by the one-hot columns,
        on a fresh default integer index.
    """
    categorical_cols = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
    late_ratio_cols = ['person_income_to_age', 'loan_amnt_to_income_ratio', 'emp_length_to_age_ratio']

    frame = X.copy()

    # First batch of derived features
    frame['age_income_interaction'] = frame['person_age'] * frame['person_income']
    frame['loan_to_emp_length_ratio'] = frame['loan_amnt'] / (frame['person_emp_length'] + 1)
    frame['monthly_debt'] = frame['loan_amnt'] * (1 + frame['loan_int_rate']) / 12
    frame['dti_ratio'] = frame['monthly_debt'] / (frame['person_income'] / 12)
    defaulted_before = frame['cb_person_default_on_file'] == 'Y'
    grade_c_to_e = frame['loan_grade'].isin(['C', 'D', 'E'])
    frame['risk_flag'] = np.where(defaulted_before & grade_c_to_e, 1, 0)

    # Everything present at this point that is not categorical is numerical
    numerical_cols = [name for name in frame.columns if name not in categorical_cols]

    # Second batch of ratios; the +1 keeps the denominators away from zero
    frame['person_income_to_age'] = frame['person_income'] / (frame['person_age'] + 1)
    frame['loan_amnt_to_income_ratio'] = frame['loan_amnt'] / (frame['person_income'] + 1)
    frame['emp_length_to_age_ratio'] = frame['person_emp_length'] / (frame['person_age'] + 1)
    numerical_cols = numerical_cols + late_ratio_cols

    # Numerical branch: median imputation followed by standard scaling
    numerical_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    # Categorical branch: most-frequent imputation followed by dense one-hot encoding
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
    preprocessor = ColumnTransformer(transformers=[
        ('num', Pipeline(steps=numerical_steps), numerical_cols),
        ('cat', Pipeline(steps=categorical_steps), categorical_cols),
    ])

    transformed = preprocessor.fit_transform(frame)

    # Output columns come out in transformer order: numerical first, then one-hot
    onehot_encoder = preprocessor.named_transformers_['cat']['onehot']
    all_col_names = numerical_cols + list(onehot_encoder.get_feature_names_out(categorical_cols))

    return pd.DataFrame(transformed, columns=all_col_names)

def down_sampling(X, y, i):
    """Balance a binary data set by down-sampling class 0 to the size of class 1.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature rows. Anything ``pd.DataFrame`` accepts (DataFrame, 2-D
        ndarray, list of rows) works.
    y : pandas.Series or array-like
        Labels aligned row-for-row with ``X``. Rows labelled 0 are treated as
        the majority class and rows labelled 1 as the minority class.
    i : int
        Seed passed to ``DataFrame.sample`` so each call draws a reproducible
        subset of the majority rows.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The sampled class-0 rows followed by every class-1 rows, and the
        matching labels. Both carry the same positional (0..n-1) index labels
        of the rows they were taken from.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when class 1 has more rows than class 0.
    """
    # Coerce both inputs so lists and NumPy arrays work as well as pandas
    # objects, and give them the same 0..n-1 index so masks and labels align.
    X = pd.DataFrame(X).reset_index(drop=True)
    y = pd.Series(y).reset_index(drop=True)

    # Separate majority and minority classes
    minority_mask = y == 1
    majority_class = X[y == 0]
    minority_class = X[minority_mask]

    # Down-sample majority class to the minority size
    majority_sample = majority_class.sample(len(minority_class), random_state=i)

    # Combine the down-sampled majority class with the minority class
    X_minimal = pd.concat([majority_sample, minority_class], axis=0)
    # Select labels by index label so they line up with X_minimal row for row
    y_minimal = pd.concat([y.loc[majority_sample.index], y[minority_mask]])

    return X_minimal, y_minimal
    
# Define models
# XGBoost template; cross_validate clones it for every fold/bag.
xgb_model = XGBClassifier(
    n_estimators=2000,
    early_stopping_rounds=100,
    eval_metric='auc',
    max_bin=262143,
    n_jobs=4,
    random_state=0,
)

# LightGBM hyper-parameters, kept as a dict so they can be reused to build
# further LGBMClassifier instances.
lgbm_params = dict(
    objective='binary',
    n_estimators=3000,
    metric='binary_logloss',
    boosting_type='gbdt',
    random_state=42,
    learning_rate=0.0322942967545754,
    num_leaves=24,
    max_depth=15,
    min_data_in_leaf=25,
    feature_fraction=0.6236144085285287,
    bagging_fraction=0.9596685778433888,
    bagging_freq=3,
    verbose=-1,
)
# LightGBM template; cross_validate clones it for every fold/bag.
lgbm_model = lgb.LGBMClassifier(**lgbm_params)

def cross_validate(n_splits=5, n_bags=3, model_name='xgb'):
    """Compute out-of-fold and test predictions for a model.

    Reads the module-level ``X``, ``y`` and ``test_data`` as well as the
    ``xgb_model`` / ``lgbm_model`` templates. For every stratified fold,
    ``n_bags`` models are trained, each on a differently seeded down-sample of
    the training part of the fold, and evaluated on the untouched validation
    part.

    Parameters
    ----------
    n_splits : int
        Number of stratified folds.
    n_bags : int
        Number of down-sampled models trained per fold.
    model_name : {'xgb', 'lgb'}
        Which template to clone and fit.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``oof_preds`` holds, for each training row, the bag-averaged
        probability predicted while the row was in the validation fold.
        ``test_preds`` is the test probability averaged over all
        ``n_splits * n_bags`` models.

    Raises
    ------
    ValueError
        If ``model_name`` is not ``'xgb'`` or ``'lgb'``.
    """
    # Reject unknown names up front; otherwise no model would be created and
    # the first fold would fail with an unbound-variable error.
    if model_name not in ('xgb', 'lgb'):
        raise ValueError(f"model_name must be 'xgb' or 'lgb', got {model_name!r}")

    start_time = datetime.datetime.now()
    scores = []
    oof_preds = np.zeros(len(y))
    test_preds = np.zeros(len(test_data))

    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=63)
    n_models = kfold.get_n_splits() * n_bags
    for fold, (train_index, valid_index) in enumerate(kfold.split(X, y)):
        # The fold slices are the same for every bag, so take them once.
        X_fold_train = X.iloc[train_index]
        y_fold_train = y[train_index]  # y is indexed positionally (NumPy-style)
        X_val = X.iloc[valid_index]
        y_val = y[valid_index]

        for i in range(n_bags):
            # A distinct seed per (fold, bag) gives each bag its own majority sample.
            X_train, y_train = down_sampling(X_fold_train, y_fold_train, 10 * fold + i)

            if model_name == 'lgb':
                m = clone(lgbm_model)
                m.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='auc')
            else:
                m = clone(xgb_model)
                m.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

            y_pred = m.predict_proba(X_val)[:, 1]
            score = roc_auc_score(y_val, y_pred)
            print(f"# Fold {fold}, bag {i}: ROC-AUC-Score={score:.5f}")
            scores.append(score)
            oof_preds[valid_index] += y_pred / n_bags
            test_preds += m.predict_proba(test_data)[:, 1] / n_models

            # Release the fitted model before training the next one.
            del m
            gc.collect()

    elapsed_time = datetime.datetime.now() - start_time
    print(f"#ROC-AUC mean: {np.mean(scores):.7f} (+- {np.std(scores):.7f})"
          f"#   elapsed time:   {int(np.round(elapsed_time.total_seconds() / 60))} min")

    return oof_preds, test_preds

def cross_validate_catboost(X, y, test_data, n_splits=5, n_bags=3):
    """Compute out-of-fold and test predictions with bagged CatBoost models.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features (indexed positionally with ``.iloc``).
    y : numpy.ndarray
        Training labels, indexed positionally.
    test_data : pandas.DataFrame
        Features to predict for the submission.
    n_splits : int
        Number of stratified folds.
    n_bags : int
        Number of models trained per fold. Each bag uses a different
        ``random_seed`` (42, 43, ...); with one shared seed every bag would be
        fitted on identical data with identical settings and the average would
        add nothing over a single model.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        Bag-averaged out-of-fold probabilities for every training row, and test
        probabilities averaged over bags and then over folds.
    """
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    oof_preds = np.zeros(X.shape[0])
    # One column per fold; averaged across columns at the end.
    test_preds = np.zeros((test_data.shape[0], n_splits))

    for fold, (train_index, valid_index) in enumerate(kf.split(X, y)):
        X_train = X.iloc[train_index]
        X_val = X.iloc[valid_index]

        # Use standard indexing for y since it's a NumPy array
        y_train = y[train_index]
        y_val = y[valid_index]

        for i in range(n_bags):
            # NOTE(review): logging_level='Silent' here and verbose=100 in fit()
            # both control logging; confirm CatBoost accepts the combination.
            model = CatBoostClassifier(
                iterations=5000,
                learning_rate=0.03,
                depth=6,
                eval_metric='AUC',
                random_seed=42 + i,  # vary the seed so the bags actually differ
                logging_level='Silent'
            )

            model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100, verbose=100)

            # OOF predictions
            oof_preds[valid_index] += model.predict_proba(X_val)[:, 1] / n_bags

            # Test predictions
            test_preds[:, fold] += model.predict_proba(test_data)[:, 1] / n_bags

    return oof_preds, test_preds.mean(axis=1)

def blend_predictions(weights, oof_preds):
    """Return the weighted sum of several prediction vectors.

    ``weights`` and ``oof_preds`` are paired positionally; each prediction
    vector is scaled by its weight and the scaled vectors are added element-wise.
    """
    scaled = []
    for weight, preds in zip(weights, oof_preds):
        scaled.append(weight * preds)
    return np.sum(scaled, axis=0)

def evaluate_blend(oof_preds, y, weights):
    """Score a weighted blend of prediction vectors with ROC-AUC against ``y``."""
    return roc_auc_score(y, blend_predictions(weights, oof_preds))

def hill_climb_blend(oof_preds, y, max_iterations=100, step=0.01):
    """Search for blend weights that maximise ROC-AUC by greedy hill climbing.

    Starting from equal weights, each pass visits every model in turn and tries
    nudging its weight up and then down by ``step`` (re-normalising so the
    weights sum to one). A nudge is kept only when it strictly improves the
    blended ROC-AUC.

    Parameters
    ----------
    oof_preds : sequence of array-like
        One prediction vector per model.
    y : array-like
        True labels the blended predictions are scored against.
    max_iterations : int
        Maximum number of passes over the models.
    step : float
        Amount added to / subtracted from a weight before re-normalising.

    Returns
    -------
    tuple of (numpy.ndarray, float)
        The best weights found and their ROC-AUC.
    """
    # Start with equal weights
    num_models = len(oof_preds)
    best_weights = np.ones(num_models) / num_models
    best_score = evaluate_blend(oof_preds, y, best_weights)

    for _ in range(max_iterations):
        improved = False
        for i in range(num_models):
            # Try increasing the weight first, then decreasing it; the second
            # attempt starts from whatever is currently best.
            for delta in (step, -step):
                candidate = best_weights.copy()
                candidate[i] += delta
                candidate = candidate / np.sum(candidate)  # Normalize weights

                candidate_score = evaluate_blend(oof_preds, y, candidate)
                if candidate_score > best_score:
                    best_score = candidate_score
                    best_weights = candidate
                    improved = True

        # The search is deterministic, so a pass without any improvement means
        # every later pass would repeat the same rejected moves.
        if not improved:
            break

    return best_weights, best_score

# Preprocess the training and test data
X = preprocess(train_data.drop(columns=['id', 'loan_status']))
y = train_data['loan_status'].values
# Keep the real test ids before `test_data` is replaced by its preprocessed
# version: the preprocessed frame has no `id` column and a fresh 0..n-1 index,
# so its index must not be used as the submission id.
test_ids = test_data['id'].values
# cross_validate() reads the module-level `test_data`, so the name is rebound
# to the preprocessed frame here.
test_data = preprocess(test_data.drop(columns=['id']))

# Cross-validate models and get OOF predictions
oof_xgb, test_xgb = cross_validate(n_splits=5, n_bags=3, model_name='xgb')
oof_lgbm, test_lgbm = cross_validate(n_splits=5, n_bags=3, model_name='lgb')
oof_cat, test_cat = cross_validate_catboost(X, y, test_data, n_splits=5, n_bags=3)

# Combine OOF predictions (the order must match `test_preds` below)
oof_preds = [oof_cat, oof_xgb, oof_lgbm]

# Optimize weights using hill climbing
optimal_weights, best_score = hill_climb_blend(oof_preds, y, max_iterations=100)
print("Optimal weights:", optimal_weights)
print("Best ROC-AUC Score:", best_score)

# Generate final predictions using the optimal weights
test_preds = [test_cat, test_xgb, test_lgbm]
final_predictions = blend_predictions(optimal_weights, test_preds)

# Prepare submission DataFrame
submission = pd.DataFrame({
    'id': test_ids,
    'loan_status': final_predictions
})

# Save submission file
submission.to_csv('subm.csv', index=False)

# Fold 0, bag 0: ROC-AUC-Score=0.94884
# Fold 0, bag 1: ROC-AUC-Score=0.94943
# Fold 0, bag 2: ROC-AUC-Score=0.94879
# Fold 1, bag 0: ROC-AUC-Score=0.95041
# Fold 1, bag 1: ROC-AUC-Score=0.95075
# Fold 1, bag 2: ROC-AUC-Score=0.95045
# Fold 2, bag 0: ROC-AUC-Score=0.95259
# Fold 2, bag 1: ROC-AUC-Score=0.95528
# Fold 2, bag 2: ROC-AUC-Score=0.95313
# Fold 3, bag 0: ROC-AUC-Score=0.94946
# Fold 3, bag 1: ROC-AUC-Score=0.95204
# Fold 3, bag 2: ROC-AUC-Score=0.95247
# Fold 4, bag 0: ROC-AUC-Score=0.95462
# Fold 4, bag 1: ROC-AUC-Score=0.95565
# Fold 4, bag 2: ROC-AUC-Score=0.95456
#ROC-AUC mean: 0.9518991 (+- 0.0022951)#   elapsed time:   0 min
# Fold 0, bag 0: ROC-AUC-Score=0.94955
# Fold 0, bag 1: ROC-AUC-Score=0.94924
# Fold 0, bag 2: ROC-AUC-Score=0.94846
# Fold 1, bag 0: ROC-AUC-Score=0.95252
# Fold 1, bag 1: ROC-AUC-Score=0.95253
# Fold 1, bag 2: ROC-AUC-Score=0.95027
# Fold 2, bag 0: ROC-AUC-Score=0.95158
# Fold 2, bag 1: ROC-AUC-Score=0.95315
# Fold 2, bag 2: ROC-AUC-Score=0.95501

In [15]:
# Import Libraries
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import trange
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.base import clone
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
import optuna
import gc

import warnings
warnings.filterwarnings("ignore")

# Load the data
# NOTE(review): these are absolute paths on one machine; anyone else running the
# notebook has to edit them before this cell works.
test_data = pd.read_csv("/Users/raghavgarg/Downloads/playground-series-s4e10/test.csv")
train_data = pd.read_csv("/Users/raghavgarg/Downloads/playground-series-s4e10/train.csv")
# Loaded here but not referenced again in the visible part of this notebook.
original_data = pd.read_csv("/Users/raghavgarg/Downloads/credit_risk_dataset.csv")

# Preprocessing function (refined)
def preprocess(X):
    """Engineer features, then impute, scale and one-hot encode a raw loan frame.

    The transformer is fitted on the frame passed in, so each call learns its
    own medians, scaling statistics and category levels from that frame alone.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature columns. The code reads ``person_age``, ``person_income``,
        ``person_emp_length``, ``loan_amnt``, ``loan_int_rate``, ``loan_grade``,
        ``cb_person_default_on_file``, ``person_home_ownership`` and
        ``loan_intent``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Imputed and scaled numerical columns (raw and engineered) followed by
        the one-hot columns, on a fresh default integer index.
    """
    X = X.copy()
    # Creating derived features
    X['age_income_interaction'] = X['person_age'] * X['person_income']
    X['loan_to_emp_length_ratio'] = X['loan_amnt'] / (X['person_emp_length'] + 1)
    monthly_income = X['person_income'] / 12
    X['monthly_debt'] = X['loan_amnt'] * (1 + X['loan_int_rate']) / 12
    X['dti_ratio'] = X['monthly_debt'] / monthly_income
    X['risk_flag'] = np.where((X['cb_person_default_on_file'] == 'Y') & (X['loan_grade'].isin(['C', 'D', 'E'])), 1, 0)

    # Feature engineering (the +1 keeps the denominators away from zero)
    X['person_income_to_age'] = X['person_income'] / (X['person_age'] + 1)
    X['loan_amnt_to_income_ratio'] = X['loan_amnt'] / (X['person_income'] + 1)
    X['emp_length_to_age_ratio'] = X['person_emp_length'] / (X['person_age'] + 1)

    # Categorical and numerical columns.
    # The numerical list is built only after *all* engineered columns exist:
    # ColumnTransformer drops every column that is not listed for a transformer,
    # so a list taken earlier would silently discard the three ratios above.
    categorical_cols = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
    numerical_cols = [col for col in X.columns if col not in categorical_cols]

    # Preprocessing for numerical data
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    X_transformed = preprocessor.fit_transform(X)
    # Output columns come out in transformer order: numerical first, then one-hot
    cat_col_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols)
    all_col_names = numerical_cols + list(cat_col_names)
    X_transformed_df = pd.DataFrame(X_transformed, columns=all_col_names)

    return X_transformed_df

def down_sampling(X, y, i):
    """Shrink class 0 to the size of class 1 and return the balanced subset.

    ``X`` and ``y`` are coerced to a DataFrame / Series on a shared 0..n-1
    index. ``i`` seeds the random draw of class-0 rows. The result lists the
    sampled class-0 rows first, followed by every class-1 row, with the labels
    in the same order.
    """
    features = pd.DataFrame(X).reset_index(drop=True)
    labels = pd.Series(y).reset_index(drop=True)

    is_minority = labels == 1
    minority_rows = features[is_minority]
    sampled_majority = features[labels == 0].sample(len(minority_rows), random_state=i)

    balanced_X = pd.concat([sampled_majority, minority_rows], axis=0)
    # After the reset the index labels equal positions, so .iloc picks the
    # labels that belong to the sampled rows.
    balanced_y = pd.concat([labels.iloc[sampled_majority.index], labels[is_minority]])
    return balanced_X, balanced_y


# Optuna Hyperparameter Optimization for XGBoost
def objective(trial):
    """Optuna objective: mean ROC-AUC of an XGBoost configuration over 5 folds.

    Reads the module-level ``X`` and ``y``. In every fold the training part is
    balanced with ``down_sampling`` (fixed seed 10) and the model is scored on
    the untouched validation part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        ROC-AUC averaged over the folds (the study maximises it).
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10)
    }
    model = XGBClassifier(**params, random_state=42, n_jobs=4)
    kf = StratifiedKFold(n_splits=5)
    fold_scores = []
    for train_idx, val_idx in kf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        X_train, y_train = down_sampling(X_train, y_train, 10)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)[:, 1]
        fold_scores.append(roc_auc_score(y_val, preds))
    # Divide by the number of folds actually run instead of a hard-coded 5
    return sum(fold_scores) / len(fold_scores)

# Preprocess first: the Optuna objective reads the module-level X and y, so
# they must exist (and be the matrix the stack is trained on) before the study
# runs.
X = preprocess(train_data.drop(columns=['id', 'loan_status']))
y = train_data['loan_status'].values
test_data_processed = preprocess(test_data.drop(columns=['id']))

# Run optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)
best_params = study.best_params
print("Best XGBoost params:", best_params)

# Models
xgb_model = XGBClassifier(**best_params, random_state=42, n_jobs=4)
# NOTE(review): lgbm_params is not defined in this cell; it has to be left over
# from the earlier cell that defines it.
lgbm_model = lgb.LGBMClassifier(**lgbm_params)
catboost_model = CatBoostClassifier(
    iterations=5000, learning_rate=0.03, depth=6, eval_metric='AUC', random_seed=42, logging_level='Silent')

# Stacking: the three boosters feed a logistic-regression meta-model
stacked_model = StackingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('lgbm', lgbm_model),
        ('catboost', catboost_model)],
    final_estimator=LogisticRegression(),
    cv=5, n_jobs=-1
)

# Train
stacked_model.fit(X, y)

# Predictions
test_preds = stacked_model.predict_proba(test_data_processed)[:, 1]

# Prepare submission (ids come from the raw test frame, which is left untouched)
submission = pd.DataFrame({
    'id': test_data['id'],
    'loan_status': test_preds
})

# Save submission
submission.to_csv('submission_stacked.csv', index=False)


[I 2024-10-17 22:21:44,251] A new study created in memory with name: no-name-aeb45f36-cd07-49f5-9a83-3e032ac435ef
[I 2024-10-17 22:21:51,820] Trial 0 finished with value: 0.9477904078511632 and parameters: {'n_estimators': 1610, 'learning_rate': 0.04907530236412198, 'max_depth': 10, 'colsample_bytree': 0.5329081859610444, 'subsample': 0.8568878497559596, 'min_child_weight': 9}. Best is trial 0 with value: 0.9477904078511632.
[I 2024-10-17 22:22:01,649] Trial 1 finished with value: 0.9515640951916625 and parameters: {'n_estimators': 2013, 'learning_rate': 0.013437528938009294, 'max_depth': 10, 'colsample_bytree': 0.35742230234870503, 'subsample': 0.6881811150449835, 'min_child_weight': 6}. Best is trial 1 with value: 0.9515640951916625.
[I 2024-10-17 22:22:04,223] Trial 2 finished with value: 0.9521270849294551 and parameters: {'n_estimators': 1205, 'learning_rate': 0.04017326723949277, 'max_depth': 3, 'colsample_bytree': 0.5233803401607108, 'subsample': 0.7868812640268696, 'min_child_w

Best XGBoost params: {'n_estimators': 1122, 'learning_rate': 0.07499910416628376, 'max_depth': 4, 'colsample_bytree': 0.36405528872896836, 'subsample': 0.9525207123967265, 'min_child_weight': 5}


In [14]:
# Show the column names of the raw test frame (it still carries its `id` column here).
test_data.columns

Index(['id', 'person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file',
       'cb_person_cred_hist_length'],
      dtype='object')