### Loading in Data

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier 
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.preprocessing import OrdinalEncoder
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, roc_auc_score, roc_curve,confusion_matrix, classification_report 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
import statsmodels.api as sm
import catboost as cb
from catboost import CatBoostRegressor  
from catboost import CatBoostClassifier 
# Warnings Management
import warnings  # To suppress warnings which can clutter the notebook
warnings.filterwarnings('ignore')

In [3]:
train_df = pd.read_csv("application_train.csv")
test_df = pd.read_csv("application_test.csv")
previous_application = pd.read_csv("previous_application.csv")
POS_CASH_balance = pd.read_csv("POS_CASH_balance.csv")
bureau = pd.read_csv("bureau.csv")

In [4]:
class_counts = train_df['TARGET'].value_counts()
# Determine the majority class and its proportion in the dataset
majority_class_proportion = class_counts.max() / train_df.shape[0] * 100
print(f"Majority Class Performance Benchmark: {majority_class_proportion}%")

Majority Class Performance Benchmark: 91.92711805431351%


In [5]:
values =train_df.isna().sum() / len(train_df)## Checking for the percent of missing values in each column
values.sort_values(ascending=False)
def drop_columns(df, threshold=.5,exempt_columns=['TARGET']): ## Seting threshold 
    missing = df.isna().sum() / len(df)
    columns_to_drop = missing[(values > threshold)].index.tolist()
#     existing = [col for col in columns_to_drop if col in df.columns]
    df_cleaned = df.drop(columns=columns_to_drop)
    return df_cleaned  
train_df = drop_columns(train_df) 
test_df = drop_columns(test_df)

In [6]:
def impute(df):
    for column in df.columns:
        if df[column].dtype == 'O': 
            mode_value = df[column].mode().iloc[0]
            df[column] = df[column].fillna(mode_value)  ## Impute the mode value for object columns
        else:
            median_value = df[column].median()
            df[column] = df[column].fillna(median_value) ## Impute the median value for int or num columns
    return df
train_df = impute(train_df)
test_df = impute(test_df)

In [7]:
## Merging the application data
prev_app_agg = previous_application.groupby('SK_ID_CURR').agg({'AMT_APPLICATION': ['mean'],
                                                               'AMT_CREDIT': ['mean'],
                                                               'DAYS_DECISION': ['mean'],
                                                               'CNT_PAYMENT': ['mean']})
prev_app_agg.columns = ['PREVAPP_' + ('_'.join(col).upper()) for col in prev_app_agg.columns.values]

application_train = train_df.merge(prev_app_agg, on='SK_ID_CURR', how='left')
application_test = test_df.merge(prev_app_agg, on='SK_ID_CURR', how='left')

In [8]:
## Merging Cash 
POS_CASH_agg = POS_CASH_balance.groupby('SK_ID_CURR').agg({'MONTHS_BALANCE': ['max'],
                                                           'CNT_INSTALMENT': ['mean'],
                                                           'SK_DPD': ['max']})
POS_CASH_agg.columns = ['POSCASH_' + ('_'.join(col).upper()) for col in POS_CASH_agg.columns.values]

application_train = application_train.merge(POS_CASH_agg, on='SK_ID_CURR', how='left')
application_test = application_test.merge(POS_CASH_agg, on='SK_ID_CURR', how='left')

In [9]:
bureau_agg = bureau.groupby('SK_ID_CURR').agg({'DAYS_CREDIT': ['mean', 'min', 'max'],
                                               'CREDIT_DAY_OVERDUE': ['max'],
                                               'DAYS_CREDIT_ENDDATE': ['mean'],
                                               'DAYS_CREDIT_UPDATE': ['mean']})
# Flattening the multi-level columns
bureau_agg.columns = ['BUREAU_' + ('_'.join(col).upper()) for col in bureau_agg.columns.values]

# Merging aggregated bureau data with main application data
application_train = application_train.merge(bureau_agg, on='SK_ID_CURR', how='left')
application_test = application_test.merge(bureau_agg, on='SK_ID_CURR', how='left')

In [10]:
app_train = impute(application_train)
app_test = impute(application_test)

In [11]:
## Adding Division and Subtraction Based Features
application_train['NEW_EXT_SOURCE_3_DIVIDE'] = application_train['EXT_SOURCE_3'] / (application_train['AMT_CREDIT'] + 0.01) # Avoid division by zero
application_test['NEW_EXT_SOURCE_3_DIVIDE'] = application_test['EXT_SOURCE_3'] / (application_test['AMT_CREDIT'] + 0.01)
## Adding in Yearly Interest Rate Calculations 
app_train['YEARLY_INTEREST_RATE'] = (app_train['AMT_ANNUITY'] * 12) / app_train['AMT_CREDIT']
app_test['YEARLY_INTEREST_RATE'] = (app_test['AMT_ANNUITY'] * 12) / app_test['AMT_CREDIT']
## Adding in Ratio's
app_train['INCOME_TO_ANNUITY_RATIO'] = app_train['AMT_INCOME_TOTAL'] / app_train['AMT_ANNUITY']
app_train['INCOME_TO_CREDIT_RATIO'] = app_train['AMT_INCOME_TOTAL'] / app_train['AMT_CREDIT']

app_test['INCOME_TO_ANNUITY_RATIO'] = app_test['AMT_INCOME_TOTAL'] / app_test['AMT_ANNUITY']
app_test['INCOME_TO_CREDIT_RATIO'] = app_test['AMT_INCOME_TOTAL'] / app_test['AMT_CREDIT']

In [12]:
kf = KFold(n_splits=5, shuffle=True, random_state=42) # initialize K fold
X = app_train
def get_categorical_features(df): ## Creating a function to handle categorical features and then pass them on to the cat features in CatBoost
    cat_features = [col for col in df.columns if df[col].dtype == 'object' or df[col].dtype.name == 'category']
    return cat_features
cat_features = get_categorical_features(app_train)
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')  # Creating an encoder object
X_encoded = pd.DataFrame(encoder.fit_transform(X[cat_features]))  # Encoding categorical data
X_encoded.columns = encoder.get_feature_names_out(cat_features)  # Naming the new columns

# Drop original categorical columns and concatenate the new encoded columns
X = X.drop(columns=cat_features)
X = pd.concat([X, X_encoded], axis=1)
# Select features and target; replace 'feature1', 'feature2', ..., 'target' with your actual column names
y = X['TARGET']
X['EXT_SOURCE_AVG'] = (X['EXT_SOURCE_2'] + X['EXT_SOURCE_3']) / 2
X = X[['EXT_SOURCE_AVG', 'YEARLY_INTEREST_RATE', 'DAYS_BIRTH', 
       'AMT_GOODS_PRICE', 'PREVAPP_CNT_PAYMENT_MEAN',
      'POSCASH_CNT_INSTALMENT_MEAN', 'DAYS_EMPLOYED']]
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:

X_train = sm.add_constant(X_train)
# Fit logistic regression model using statsmodels
logit_model = sm.Logit(y_train, X_train)
result = logit_model.fit()
print(result.summary())

#logreg = LogisticRegression()

# Train the model
#logreg.fit(X_train, y_train)

Optimization terminated successfully.
         Current function value: 0.258043
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                 TARGET   No. Observations:               215257
Model:                          Logit   Df Residuals:                   215249
Method:                           MLE   Df Model:                            7
Date:                Wed, 01 May 2024   Pseudo R-squ.:                 0.08149
Time:                        13:00:01   Log-Likelihood:                -55546.
converged:                       True   LL-Null:                       -60473.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const                           0.4067      0.053      7.731      0.000   

In [14]:
kf = KFold(n_splits=5, shuffle=True, random_state=42) # initialize K fold

X = app_train
y = X.pop('TARGET')

cv_results = [] ## Set up empty data frame
cv_auc_scores = []
cv_accuracy_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index] ## Create for loop for the folds
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

def get_categorical_features(df): ## Creating a function to handle categorical features and then pass them on to the cat features in CatBoost
    cat_features = [col for col in df.columns if df[col].dtype == 'object' or df[col].dtype.name == 'category']
    return cat_features 

cat_features = get_categorical_features(app_train)


model = cb.CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',  
    eval_metric='AUC',       
    custom_metric=['AUC','Accuracy'],
    random_seed=42,
    verbose=200
)

model.fit(
    X_train, y_train,
    cat_features=cat_features, 
    eval_set=(X_test, y_test) ## Run and test the model
)

0:	test: 0.6436110	best: 0.6436110 (0)	total: 218ms	remaining: 1m 48s
200:	test: 0.7664620	best: 0.7664620 (200)	total: 54.4s	remaining: 1m 20s
400:	test: 0.7693961	best: 0.7694594 (398)	total: 1m 42s	remaining: 25.4s
499:	test: 0.7699555	best: 0.7699555 (499)	total: 2m 6s	remaining: 0us

bestTest = 0.769955465
bestIteration = 499



<catboost.core.CatBoostClassifier at 0x154205ad0>

## Evaluate Final Model

In [15]:
best_iteration = model.get_best_iteration()
eval_results = model.get_evals_result()
try:
    # Try the expected key for a single validation set
    auc = eval_results['validation']['AUC'][best_iteration]
except KeyError:
    # Log an error message and continue with a default value or alternative action
    print("KeyError encountered. Adjust the key based on your eval_results structure.")
    auc = None  # You might choose to set a default value or take other actions
    
# Calculate accuracy only if auc could be retrieved
if auc is not None:
    accuracy = model.score(X_test, y_test)
    # Append metrics to lists
    cv_auc_scores.append(auc)
    cv_accuracy_scores.append(accuracy)

    
    print(f"Fold AUC: {auc}, Fold Accuracy: {accuracy}")
else:
    # Handle the case where AUC couldn't be determined
    print("AUC could not be determined for this fold due to KeyError.")

if cv_auc_scores and cv_accuracy_scores:  # Ensure the lists aren't empty
    # Calculate and print the average AUC and accuracy across all folds
    average_auc = np.mean(cv_auc_scores)
    average_accuracy = np.mean(cv_accuracy_scores)
    print(f"Average AUC across folds: {average_auc}")
    print(f"Average Accuracy across folds: {average_accuracy}")
else:
    print("No AUC or accuracy scores were collected.")

Fold AUC: 0.7699554649761445, Fold Accuracy: 0.9197424474000846
Average AUC across folds: 0.7699554649761445
Average Accuracy across folds: 0.9197424474000846
