In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

In [2]:
# Read the two CSV splits; the first path feeds train_df, the second test_df.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

# Check data

In [2]:
class LoanDataAnalyzer:
    """Summarise and plot the categorical columns of a loan DataFrame.

    The frame must contain the columns ``person_home_ownership``,
    ``loan_intent``, ``loan_grade`` and ``cb_person_default_on_file``.
    """

    def __init__(self, dataframe):
        """Keep a reference to ``dataframe``; counts are computed on demand."""
        self.df = dataframe
        # Flipped by _compute_counts(). Lets get_summary() and plot_data() be
        # called without a prior analyze_data() instead of raising
        # AttributeError on the missing count attributes.
        self._analyzed = False

    def _compute_counts(self):
        """Compute the per-category counts and unique values from ``self.df``."""
        self.home_ownership = self.df["person_home_ownership"].value_counts()
        self.loan_intent = self.df["loan_intent"].value_counts()
        self.loan_grade = self.df["loan_grade"].value_counts()
        self.cb_person_default_on_file = self.df["cb_person_default_on_file"].value_counts()

        self.unique_home_ownership = self.df["person_home_ownership"].unique()
        self.unique_loan_intent = self.df["loan_intent"].unique()

        self._analyzed = True

    def _ensure_analyzed(self):
        """Run the computation once if it has not happened yet."""
        if not getattr(self, "_analyzed", False):
            self._compute_counts()

    def analyze_data(self):
        """(Re)compute all counts from the current frame and return the summary.

        Returns:
            dict: see :meth:`get_summary`.
        """
        self._compute_counts()
        return self.get_summary()

    def get_summary(self):
        """Return the computed statistics keyed by a human-readable label.

        Returns:
            dict: four ``value_counts`` Series ("Home Ownership",
            "Loan Intent", "Loan Grade", "Default Status") and two arrays of
            unique values ("Unique Home Ownership", "Unique Loan Intent").
        """
        self._ensure_analyzed()
        summary = {
            "Home Ownership": self.home_ownership,
            "Loan Intent": self.loan_intent,
            "Loan Grade": self.loan_grade,
            "Default Status": self.cb_person_default_on_file,
            "Unique Home Ownership": self.unique_home_ownership,
            "Unique Loan Intent": self.unique_loan_intent
        }
        return summary

    def plot_data(self):
        """Draw one bar chart per categorical column in a single 1x4 figure."""
        self._ensure_analyzed()

        panels = [
            (self.home_ownership, 'Home Ownership'),
            (self.loan_intent, 'Loan Intent'),
            (self.loan_grade, 'Loan Grade'),
            (self.cb_person_default_on_file, 'Default Status'),
        ]

        # Explicit axes instead of plt.subplot(): each Series is drawn on the
        # axis it is paired with, independent of pyplot's "current axes".
        _, axes = plt.subplots(1, 4, figsize=(18, 9))
        for ax, (counts, title) in zip(axes, panels):
            counts.plot(kind='bar', ax=ax)
            ax.set_title(title)

In [None]:
# Build one analyzer per split, then compute both summaries.
train_analyzer, test_analyzer = LoanDataAnalyzer(train_df), LoanDataAnalyzer(test_df)
train_summary = train_analyzer.analyze_data()
test_summary = test_analyzer.analyze_data()

# sep="\n" puts the heading and the summary on separate lines.
print("Train Data Summary:", train_summary, sep="\n")
print("\nTest Data Summary:", test_summary, sep="\n")

In [None]:
# One figure per split: train first, then test.
for split_analyzer in (train_analyzer, test_analyzer):
    split_analyzer.plot_data()

# Process Categorical Variable
- one-hot encoding
- label encoding

In [3]:
# "loan_status" is the label; both it and "id" are excluded from the model inputs.
train_df_target = train_df.loc[:, "loan_status"]
train_df_features = train_df.drop(columns=["loan_status", "id"])

# The test split has no label column to remove, only "id".
test_df_features = test_df.drop(columns=["id"])

In [4]:
class LoanFeatureEncoder:
    """One-hot encode nominal loan columns and integer-encode the ordinal ones.

    The one-hot encoder is fitted on the frame handed to this instance, so two
    instances (e.g. train and test) only produce identical columns when both
    frames contain the same category values.
    """

    def __init__(self, dataframe):
        """Store ``dataframe`` and create the encoders.

        Args:
            dataframe: frame holding the columns to encode. It is never
                modified; ``encode_features`` works on a copy.
        """
        self.df = dataframe
        self.oh_encoder = OneHotEncoder(sparse_output=False)
        # Retained so existing code reading this attribute keeps working; the
        # ordinal columns are encoded from the explicit orderings instead.
        self.label_encoder = LabelEncoder()
        self.encoded_features = None

    @staticmethod
    def _ordinal_codes(values, ordering):
        """Map ``values`` to their position in ``ordering`` (int64; -1 if absent)."""
        categorical = pd.Categorical(values, categories=list(ordering), ordered=True)
        return categorical.codes.astype("int64")

    def encode_features(self, oh_features, grade_ordering, default_ordering):
        """Return a new frame with all categorical columns turned numeric.

        Args:
            oh_features: names of the columns to one-hot encode.
            grade_ordering: ``loan_grade`` values from lowest to highest code.
            default_ordering: ``cb_person_default_on_file`` values from lowest
                to highest code.

        Returns:
            pandas.DataFrame: the remaining original columns followed by the
            one-hot columns. ``loan_grade`` and ``cb_person_default_on_file``
            are replaced by the index of each value in its ordering; values
            missing from the ordering become ``-1``.
        """
        source = self.df

        # one-hot encode the categorical features
        features_encoded = self.oh_encoder.fit_transform(source[oh_features])
        encoded_df = pd.DataFrame(
            features_encoded,
            columns=self.oh_encoder.get_feature_names_out(oh_features),
            # Reuse the source index: concat aligns on labels, so a default
            # RangeIndex here would misalign rows for any other index.
            index=source.index,
        )

        # drop() without inplace leaves the caller's frame untouched, so the
        # encoding cell can be re-run.
        encoded = pd.concat([source.drop(columns=oh_features), encoded_df], axis=1)

        # Encode the ordinal features by position in the supplied ordering.
        # A fitted LabelEncoder would number only the classes present in this
        # frame, so the same grade could get different codes in train and test.
        encoded["loan_grade"] = self._ordinal_codes(encoded["loan_grade"], grade_ordering)
        encoded["cb_person_default_on_file"] = self._ordinal_codes(
            encoded["cb_person_default_on_file"], default_ordering
        )

        self.df = encoded
        self.encoded_features = encoded
        return self.encoded_features



In [5]:
# Columns expanded into one indicator column per category.
oh_features = [
    "person_home_ownership",
    "loan_intent",
]
# Ordinal columns: values listed in the order passed to the encoder.
grade_ordering = ["A", "B", "C", "D", "E", "F", "G"]
default_ordering = ["N", "Y"]

In [None]:
# Encode the training predictors and print the resulting column dtypes.
train_encoder = LoanFeatureEncoder(train_df_features)
train_encoded_df_feature = train_encoder.encode_features(
    oh_features=oh_features,
    grade_ordering=grade_ordering,
    default_ordering=default_ordering,
)
print(train_encoded_df_feature.dtypes)

In [None]:
test_encoder = LoanFeatureEncoder(test_df_features)
test_encoded_df_feature = test_encoder.encode_features(oh_features, grade_ordering, default_ordering)

# This encoder is fitted on the test frame alone, so its one-hot columns can
# differ from the training matrix when a category is missing from (or only
# present in) the test split. Align to the training columns: indicators the
# test split lacks are filled with 0, and columns the models never saw are
# dropped, so predict() receives the layout the models were trained on.
test_encoded_df_feature = test_encoded_df_feature.reindex(
    columns=train_encoded_df_feature.columns, fill_value=0.0
)
print(test_encoded_df_feature.dtypes)

# Model Training

In [8]:
# Hold out 20% of the labelled rows for evaluation. stratify keeps the class
# ratio of loan_status the same in both parts, so the hold-out metrics are not
# skewed by an unlucky draw of the minority class.
feature_train, feature_test, target_train, target_test = train_test_split(
    train_encoded_df_feature,
    train_df_target,
    test_size=0.2,
    random_state=42,
    stratify=train_df_target,
)

# Oversample only the training part; the hold-out keeps its original
# class distribution.
smote = SMOTE(random_state=42)
feature_train, target_train = smote.fit_resample(feature_train, target_train)

In [33]:
# grid search
def grid_search(model, param, feature_train, target_train):
    """Run a 5-fold ROC-AUC grid search and report the best parameter set.

    Args:
        model: estimator to tune.
        param: parameter grid passed to ``GridSearchCV``.
        feature_train: feature matrix used for the cross-validated search.
        target_train: labels matching ``feature_train``.

    Returns:
        dict: the best parameter combination (also printed), so it can be
        fed straight into a model constructor instead of being copied by hand.
    """
    # Named `search` so the local does not shadow this function's own name.
    search = GridSearchCV(model, param, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1)
    search.fit(feature_train, target_train)

    best_params = search.best_params_
    print(best_params)
    return best_params

In [None]:
# Tune XGBoost over a 3x3x3x3x3 grid (243 combinations).
# NOTE(review): the search is given the full encoded training frame, which
# includes the rows train_test_split later uses as hold-out. Confirm this is
# intended.
xgb = XGBClassifier(objective='binary:logistic', eval_metric='auc', random_state=42)
xgb_param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.3, 0.5],
    n_estimators=[100, 200, 300],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)
grid_search(
    model=xgb,
    param=xgb_param_grid,
    feature_train=train_encoded_df_feature,
    target_train=train_df_target,
)

- xgboost {'colsample_bytree': 0.8, 'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}

In [None]:
# Tune CatBoost over a 3x3x3x3x3 grid (243 combinations).
# NOTE(review): the search is given the full encoded training frame, which
# includes the rows train_test_split later uses as hold-out. Confirm this is
# intended.
cat = CatBoostClassifier(eval_metric='AUC', random_state=42)
cat_param_grid = dict(
    depth=[3, 5, 7],
    learning_rate=[0.1, 0.3, 0.5],
    iterations=[100, 200, 300],
    subsample=[0.6, 0.8, 1.0],
    colsample_bylevel=[0.6, 0.8, 1.0],
)

grid_search(
    model=cat,
    param=cat_param_grid,
    feature_train=train_encoded_df_feature,
    target_train=train_df_target,
)

In [None]:
# Tune LightGBM over a 3x3x3x3x3 grid (243 combinations).
# - `eval_metric` is an argument of LGBMClassifier.fit(), not of the
#   constructor; the booster-level metric is configured through `metric`.
# - subsample_freq=1 switches bagging on. With the default subsample_freq=0
#   LightGBM ignores `subsample`, so the three subsample values in the grid
#   would all train the same model.
# NOTE(review): the search is given the full encoded training frame, which
# includes the rows train_test_split later uses as hold-out. Confirm this is
# intended.
lgb = LGBMClassifier(objective='binary', metric='auc', subsample_freq=1, random_state=42)
lgb_param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.3, 0.5],
    'n_estimators': [100, 200, 300],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}
grid_search(lgb, lgb_param_grid, feature_train=train_encoded_df_feature, target_train=train_df_target)

- catboost {'colsample_bylevel': 1.0, 'depth': 3, 'iterations': 300, 'learning_rate': 0.5, 'subsample': 1.0}

In [64]:
# XGBoost model, using the parameter set recorded from the grid search.
param = dict(
    max_depth=3,
    learning_rate=0.3,
    n_estimators=100,
    subsample=0.8,
    colsample_bytree=0.8,
)

xgb = XGBClassifier(objective='binary:logistic', eval_metric='auc', random_state=42, **param)

# fit() returns the estimator itself, so XGB_model and xgb are the same object.
XGB_model = xgb.fit(X=feature_train, y=target_train)

In [None]:
# CatBoost model, using the parameter set recorded from the grid search.
best_params = dict(
    depth=3,
    learning_rate=0.5,
    iterations=300,
    subsample=1.0,
    colsample_bylevel=1.0,
)
cat = CatBoostClassifier(
    eval_metric='AUC',
    random_state=42,
    verbose=10,
    **best_params,
)

# fit() returns the estimator itself, so CAT_model and cat are the same object.
CAT_model = cat.fit(X=feature_train, y=target_train)

In [None]:
# LightGBM model with library-default hyperparameters.
lgb = LGBMClassifier(objective='binary', random_state=42)
# fit() returns the estimator itself, so LGB_model and lgb are the same object.
LGB_model = lgb.fit(X=feature_train, y=target_train)

# Stack Model

In [None]:
def _fit_and_oof(estimator):
    """Fit ``estimator`` on the training split, then return its 5-fold
    out-of-fold probability for class 1 on that same split."""
    estimator.fit(feature_train, target_train)
    oof_proba = cross_val_predict(estimator, feature_train, target_train, cv=5, method='predict_proba')
    return oof_proba[:, 1]


# Level-0 learners. Each one stays fitted on the whole training split (used
# later for hold-out/test predictions), while its out-of-fold probabilities
# become one input column of the meta-learner.
LGB_model = LGBMClassifier(objective='binary', random_state=42)
lgb_pred_train = _fit_and_oof(LGB_model)

xgb_param = dict(
    max_depth=3,
    learning_rate=0.3,
    n_estimators=100,
    subsample=0.8,
    colsample_bytree=0.8,
)
XGB_model = XGBClassifier(objective='binary:logistic', eval_metric='auc', random_state=42, **xgb_param)
xgb_pred_train = _fit_and_oof(XGB_model)

cat_params = dict(
    depth=3,
    learning_rate=0.5,
    iterations=300,
    subsample=1.0,
    colsample_bylevel=1.0,
)
CAT_model = CatBoostClassifier(eval_metric='AUC', random_state=42, verbose=10, **cat_params)
cat_pred_train = _fit_and_oof(CAT_model)

# Column order (lgb, cat, xgb) must match every matrix later passed to the
# meta-learner.
stacked_train = np.column_stack((lgb_pred_train, cat_pred_train, xgb_pred_train))


In [None]:
# Meta-learner: logistic regression over the three base-model probabilities.
stacked_model = LogisticRegression(random_state=42)
stacked_model.fit(X=stacked_train, y=target_train)

In [11]:
# stack the valid data
# Class-1 probabilities of each fitted base model on the hold-out split.
# rf_pred_test holds the LightGBM output (the name is historical).
rf_pred_test, xgb_pred_test, cat_pred_test = (
    fitted_model.predict_proba(feature_test)[:, 1]
    for fitted_model in (LGB_model, XGB_model, CAT_model)
)

# Same column order as stacked_train: lgb, cat, xgb.
stacked_test = np.column_stack((rf_pred_test, cat_pred_test, xgb_pred_test))

# Voting Model
- Hard Voting
- Soft Voting

In [9]:
# Unfitted base learners passed to the voting ensembles.
xgb_param = dict(
    max_depth=3,
    learning_rate=0.3,
    n_estimators=100,
    subsample=0.8,
    colsample_bytree=0.8,
)
xgb = XGBClassifier(objective='binary:logistic', eval_metric='auc', random_state=42, **xgb_param)

cat_params = dict(
    depth=3,
    learning_rate=0.5,
    iterations=300,
    subsample=1.0,
    colsample_bylevel=1.0,
)
cat = CatBoostClassifier(eval_metric='AUC', random_state=42, verbose=10, **cat_params)

lgb = LGBMClassifier(objective='binary', random_state=42)


In [None]:
# Hard voting: the ensemble predicts the majority class label.
hard_voting_members = [('xgb', xgb), ('cat', cat), ('lgb', lgb)]
hard_voting = VotingClassifier(estimators=hard_voting_members, voting='hard')
hard_voting.fit(X=feature_train, y=target_train)

In [None]:
# Soft voting: the ensemble averages the predicted class probabilities.
soft_voting_members = [('xgb', xgb), ('cat', cat), ('lgb', lgb)]
soft_voting = VotingClassifier(estimators=soft_voting_members, voting='soft')
soft_voting.fit(X=feature_train, y=target_train)

# Evaluate Model

In [13]:
class ModelEvaluator:
    """Report classification metrics and plots for a fitted binary classifier."""

    def __init__(self, model, feature_test, target_test):
        """Store the fitted model and the evaluation data.

        Args:
            model: fitted estimator exposing ``predict`` and, optionally,
                ``predict_proba``.
            feature_test: features to evaluate on.
            target_test: true labels matching ``feature_test``.
        """
        self.model = model
        self.feature_test = feature_test
        self.target_test = target_test
        self.predictions = None
        self.target_prob = None

    def _predict(self):
        """Compute and cache the label predictions and, if available, P(class 1)."""
        self.predictions = self.model.predict(self.feature_test)
        # Not every classifier can produce probabilities (a hard-voting
        # ensemble cannot). Leave target_prob as None instead of crashing.
        if hasattr(self.model, "predict_proba"):
            self.target_prob = self.model.predict_proba(self.feature_test)[:, 1]
        else:
            self.target_prob = None

    def evaluate(self):
        """Predict on the evaluation data and print accuracy, the
        classification report and the confusion matrix."""
        self._predict()

        accuracy = accuracy_score(self.target_test, self.predictions)

        print("Accuracy:", accuracy)
        print(classification_report(self.target_test, self.predictions))
        print(confusion_matrix(self.target_test, self.predictions))

    def plot_roc_curve(self):
        """Plot the ROC curve with its AUC in the legend.

        Raises:
            ValueError: if the model does not expose ``predict_proba``.
        """
        # Allow plotting without a prior evaluate() call.
        if self.predictions is None:
            self._predict()
        if self.target_prob is None:
            raise ValueError(
                "ROC curve needs class probabilities, but the model does not expose predict_proba"
            )

        # roc curve
        fpr, tpr, _ = roc_curve(self.target_test, self.target_prob)
        roc_auc = roc_auc_score(self.target_test, self.target_prob)
        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
        # Diagonal = performance of a random classifier.
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc="lower right")
        plt.show()

    def plot_confusion_matrix(self):
        """Plot the confusion matrix of the label predictions as a heatmap."""
        # Allow plotting without a prior evaluate() call.
        if self.predictions is None:
            self._predict()

        # confusion matrix
        cm = confusion_matrix(self.target_test, self.predictions)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()
        

In [None]:
# Evaluate the soft-voting ensemble on the hold-out split.
train_evaluator = ModelEvaluator(
    model=soft_voting,
    feature_test=feature_test,
    target_test=target_test,
)
# evaluate() must run first: it produces the predictions the plots read.
for report_step in (
    train_evaluator.evaluate,
    train_evaluator.plot_roc_curve,
    train_evaluator.plot_confusion_matrix,
):
    report_step()

In [None]:
# Evaluate the stacked meta-learner; its inputs are the base-model
# probabilities for the hold-out split, not the raw features.
train_evaluator = ModelEvaluator(
    model=stacked_model,
    feature_test=stacked_test,
    target_test=target_test,
)
# evaluate() must run first: it produces the predictions the plots read.
for report_step in (
    train_evaluator.evaluate,
    train_evaluator.plot_roc_curve,
    train_evaluator.plot_confusion_matrix,
):
    report_step()

# Prediction

In [21]:
# stack the test data
# Class-1 probabilities of each fitted base model on the encoded test split.
# rf_pred_test holds the LightGBM output (the name is historical).
rf_pred_test, xgb_pred_test, cat_pred_test = (
    fitted_model.predict_proba(test_encoded_df_feature)[:, 1]
    for fitted_model in (LGB_model, XGB_model, CAT_model)
)

# Same column order as stacked_train: lgb, cat, xgb.
stacked_test_data = np.column_stack((rf_pred_test, cat_pred_test, xgb_pred_test))

In [20]:
# Predict labels for the test split with the soft-voting ensemble.
test_pred = soft_voting.predict(test_encoded_df_feature)
# Alternative: use the stacked meta-learner instead.
# test_pred = stacked_model.predict(stacked_test_data)
test_df["loan_status"] = test_pred

# to_csv() does not create missing directories, so make sure the target
# folder exists before writing.
Path('./output').mkdir(parents=True, exist_ok=True)
test_df.to_csv('./output/soft_predictions.csv', columns=["id", "loan_status"], index=False)