In [2]:
import warnings
from itertools import combinations

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import scipy.stats as stats
import seaborn as sns
from category_encoders.woe import WOEEncoder
from scipy.stats import skew
from scipy.stats import kurtosis
from sklearn import preprocessing
from sklearn import svm
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
warnings.filterwarnings("ignore")


import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))



In [3]:
# Read the HMEQ CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv("/kaggle/input/hmeq-data/hmeq.csv")

In [4]:
# Work on a copy so the raw `data` frame stays unmodified, then preview it.
data_copy = data.copy(deep=True)
data_copy.head(n=5)

In [5]:
# (rows, columns) of the working copy.
data_copy.shape

# Train/test split before preprocessing

In [6]:
# Count rows that are exact duplicates of an earlier row.
data_copy.duplicated().sum()

In [7]:
# Separate the target ("BAD") from the features without mutating `data_copy`.
# Selecting/dropping instead of `DataFrame.pop` keeps this cell idempotent:
# `pop` removes the column in place, so a second run would raise KeyError.
y = data_copy["BAD"]
X = data_copy.drop(columns=["BAD"])

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Sanity check: feature/target shapes of the train and test partitions.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

# Feature engineering

1. Drop duplicate rows
2. Impute missing values
3. Handle outliers (robust scaler)
4. Encode categorical data

In [16]:
# Feature columns grouped by type; the order here fixes the feature order
# seen by the transformers that consume these lists.
categorical_cols = ['REASON', 'JOB']
numerical_cols = [
    'LOAN', 'DEBTINC', 'DELINQ', 'MORTDUE', 'YOJ',
    'CLNO', 'DEROG', 'CLAGE', 'NINQ', 'VALUE',
]

In [17]:
# Numerical features: fill missing values with the column median, then scale
# with RobustScaler to limit the influence of outliers.
num_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="median")),
        ("scaler", RobustScaler()),
    ]
)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the new spelling first and fall back to the
# old one so the notebook runs on either side of the rename.
try:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform time.
cat_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", onehot_encoder),
    ]
)

# Full feature engineering: route each column group to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, numerical_cols),
        ("cat", cat_transformer, categorical_cols),
    ]
)

# Model

In [61]:
# Full training pipelines: preprocessing followed by a classifier.
# Each pipeline gets its own clone of the preprocessor. Pipeline does not copy
# its steps, so sharing one instance would let fitting one pipeline refit the
# transformers the other pipeline relies on.
clf_logReg = Pipeline(
    steps=[("preprocessor", clone(preprocessor)), ("classifier", LogisticRegression())]
)

# probability=True enables predict_proba (used for the ROC curve). The
# probability estimates come from an internal cross-validation that shuffles
# the data, so random_state is pinned to keep them reproducible.
clf_svm = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", svm.SVC(probability=True, random_state=42)),
    ]
)

In [62]:
# Fit both pipelines (preprocessing + classifier) on the training split only.
clf_logReg.fit(X_train, y_train)
clf_svm.fit(X_train, y_train)

In [63]:
# Test-set accuracy of the logistic-regression pipeline.
y_pred = clf_logReg.predict(X_test)
print(f"Accuracy score logistic regression {accuracy_score(list(y_test), list(y_pred))}")

# Test-set accuracy of the SVM pipeline (`y_pred` is reused and overwritten).
y_pred = clf_svm.predict(X_test)
print(f"Accuracy score svm {accuracy_score(list(y_test), list(y_pred))}")

In [64]:
from sklearn.metrics import roc_curve, auc


def plot_roc_curve(model, X_test, y_test):
    """Plot the ROC curve of a fitted binary classifier on held-out data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    X_test : features passed to ``model.predict_proba``.
    y_test : true binary labels aligned with ``X_test``.

    Returns
    -------
    None. The curve is drawn on a new 10x8 matplotlib figure, with the AUC
    shown in the legend.
    """
    # Column 1 of predict_proba is used as the positive-class score.
    positive_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _thresholds = roc_curve(y_test, positive_scores)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(10, 8))
    # Line style is given through keywords only: combining a format string
    # with color=/linestyle= makes matplotlib warn about redundant definitions.
    plt.plot(
        fpr,
        tpr,
        color='darkorange',
        lw=2,
        linestyle='--',
        label='ROC curve (area = %0.2f)' % roc_auc,
    )
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], '--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.title('ROC Curve')


In [65]:
# Draw one ROC figure per fitted pipeline, evaluated on the held-out test split.
for fitted_pipeline in (clf_logReg, clf_svm):
    plot_roc_curve(fitted_pipeline, X_test, y_test)

# Using the Weight of Evidence (WoE) / Information Value (IV) encoder

In [42]:
# Start the WoE experiment from a fresh copy of the raw data.
data_woe = data.copy()

In [43]:
# Drop exact duplicate rows, renumber the index from 0, then confirm that
# no duplicates remain.
data_woe = data_woe.drop_duplicates().reset_index(drop=True)
data_woe.duplicated().sum()

In [44]:
# Separate the target ("BAD") from the features without mutating `data_woe`.
# `DataFrame.pop` would remove the column in place, which makes this cell fail
# with KeyError on a second run and changes what any earlier cell sees if it
# is re-run afterwards.
y_woe = data_woe["BAD"]
X_woe = data_woe.drop(columns=["BAD"])

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
X_train_woe, X_test_woe, y_train_woe, y_test_woe = train_test_split(
    X_woe, y_woe, random_state=42, test_size=0.2
)

In [45]:
# Sanity check: shapes of the WoE train/test partitions.
X_train_woe.shape, X_test_woe.shape, y_train_woe.shape, y_test_woe.shape

In [46]:
# Encode each feature column on its own: fit a WOEEncoder on the training
# split, reuse the fitted encoder on the test split, and store the result in
# a new "woe_"-prefixed column next to the original.
for col in categorical_cols + numerical_cols:
    woe = WOEEncoder(cols=[col], regularization=1, random_state=42)
    encoded_name = 'woe_' + col

    X_train_woe[encoded_name] = woe.fit_transform(X_train_woe[col], y_train_woe)
    X_test_woe[encoded_name] = woe.transform(X_test_woe[col])

In [47]:
# Build the WoE column names from the same column lists that were encoded,
# instead of slicing the last 12 columns by position. This stays correct if
# the column lists change or extra columns are appended to the frame.
woe_cols = ['woe_' + col for col in categorical_cols + numerical_cols]
woe_cols

In [48]:
# Compare the raw LOAN values with their WoE-encoded counterpart.
X_train_woe[["LOAN","woe_LOAN"]]

In [66]:
# Train both classifiers on the WoE-encoded columns only. `woe_cols` already
# holds exactly those columns, so no extra positional slice is needed.
clf_logReg_woe = LogisticRegression(random_state=42).fit(X_train_woe[woe_cols], y_train_woe)

# probability=True enables predict_proba (used for the ROC curve). Its internal
# cross-validation shuffles the data, so random_state is pinned to keep the
# probability estimates reproducible.
clf_svm_woe = svm.SVC(probability=True, random_state=42).fit(X_train_woe[woe_cols], y_train_woe)

In [67]:
# Predict on the WoE-encoded test columns. `woe_cols` is already the full
# feature list, so the redundant magic-number slice is dropped.
y_pre_logReg_woe = clf_logReg_woe.predict(X_test_woe[woe_cols])
y_pre_svm_woe = clf_svm_woe.predict(X_test_woe[woe_cols])

In [68]:
# Test-set accuracy of the two models trained on WoE-encoded features.
print(f"Accuracy score logistic regression {accuracy_score(list(y_test_woe), list(y_pre_logReg_woe))}")
print(f"Accuracy score svm {accuracy_score(list(y_test_woe), list(y_pre_svm_woe))}")

In [69]:
# ROC curves for the WoE models on the held-out test split. `woe_cols` is
# already the full feature list, so the redundant magic-number slice is dropped.
plot_roc_curve(clf_logReg_woe, X_test_woe[woe_cols], y_test_woe)
plot_roc_curve(clf_svm_woe, X_test_woe[woe_cols], y_test_woe)