In [1]:
import numpy as np 
import pandas as pd 
import pandas as pd
import missingno as msno
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.stats import skew
from scipy.stats import kurtosis
import scipy.stats as stats
from sklearn import preprocessing
from scipy.stats import chi2_contingency

In [2]:
# Read hmeq.csv straight from its raw GitHub URL (needs network access).
hmeq_path = "https://raw.githubusercontent.com/michstg/creditscoring-hmeq-machinelearning-Xtree-KNN/main/hmeq.csv"
data = pd.read_csv(hmeq_path)

In [3]:
# Explore a copy so the frame loaded into `data` stays unmodified.
data_copy = data.copy()
data_copy.shape  # (rows, columns)

In [6]:
# Preview the first rows.
data_copy.head()

# Meaning of all data fields
* BAD: the target variable; contains a binary value, 0 or 1. 0 = not bad risk (no default), 1 = bad risk (default) -> categorical
* LOAN: Amount of the loan. -> numerical
* MORTDUE: Amount due on the existing mortgage. -> numerical
* VALUE: Current value of the property. -> numerical
* REASON: Reason for the loan request (HomeImp = home improvement, DebtCon = debt consolidation). -> categorical
* JOB: Six occupational categories. -> categorical
* YOJ: Years at present job. -> numerical
* DEROG: Number of major derogatory reports. -> numerical
* DELINQ: Number of delinquent credit lines. -> numerical
* CLAGE: Age of the oldest credit line in months. -> numerical
* NINQ: Number of recent credit inquiries. -> numerical
* CLNO: Number of existing credit lines. -> numerical
* DEBTINC: Debt-to-income ratio. -> numerical

# Check duplicate

In [7]:
# keep=False flags every member of a duplicate group, so this counts all rows
# that have at least one exact duplicate elsewhere in the frame.
data_copy.duplicated(keep=False).sum()

# Check missing value

In [8]:
# missingno bar chart: per-column completeness of the data.
msno.bar(data_copy)

In [49]:
# Percentage of missing values in each column.
x = data_copy.isnull().sum()/len(data_copy)*100

In [52]:
# Horizontal bars: one bar per column, length = value stored in `x`.
x.plot.barh()

* Columns with missing values: 'MORTDUE', 'VALUE', 'REASON', 'JOB', 'YOJ', 'DEROG',
       'DELINQ', 'CLAGE', 'NINQ', 'CLNO', 'DEBTINC'
* Every column has less than 30% missing values.

# Type of data

**target variable is BAD column**

In [10]:
# Peek at the first values of the target column.
data_copy["BAD"].head()

In [11]:
# Number of rows per target class.
data_copy["BAD"].value_counts()

In [12]:
# Bar chart of the class counts of the target column.
bad_counts = data_copy["BAD"].value_counts()
bad_counts.plot.bar()

In [13]:
# Column dtypes and non-null counts.
data_copy.info()

In [14]:
# Column names, used below to split categorical from numerical fields.
data_copy.columns

In [15]:
categorical = ['REASON', 'JOB', 'BAD']
# Keep the DataFrame's own column order. A set difference has no stable order
# for strings (it can change on every kernel restart), which would make any
# positional lookup such as `numerical[0]` point at a different column per run.
numerical = [column for column in data_copy.columns if column not in categorical]

In [16]:
# Show which columns fall into each group.
print(f"categorical data: {categorical}")
print(f"numerical data: {numerical}")

# Summary of numerical data

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data_copy.describe()

* 'CLNO', 'DEROG', 'DELINQ', 'CLAGE', 'VALUE', 'YOJ', 'MORTDUE', 'NINQ', 'DEBTINC' have count < 5960 -> they contain missing values.
* LOAN, MORTDUE, VALUE, DEROG, CLAGE, DELINQ, NINQ, CLNO, DEBTINC have a maximum value quite far from the 75th percentile -> they appear to be skewed.

check normal distribution 

In [18]:
# One 50-bin histogram per numeric column, drawn on a single large figure.
data_copy.hist(bins=50, figsize=(20, 15))

In [19]:
def descriptive_summary_n(data, column_name):
    """Plot a histogram and a boxplot of one numerical column and print its statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the column to summarise.
    column_name : str
        Name of a numerical column of ``data``.

    The histogram (left) marks the mean, the median and the 25th/75th
    percentiles; the boxplot (right) marks the mean. After the figure, the
    output of ``describe()`` plus the skewness and kurtosis of the non-missing
    values (``bias=True``) are printed. Returns ``None``.
    """
    values = data[column_name]
    # Missing values are excluded before computing skewness and kurtosis.
    non_missing = values.dropna()

    mean_value = values.mean()
    median_value = values.median()
    # Ask for the quartiles directly instead of `describe()[4]` / `describe()[6]`:
    # an integer key on a label-indexed Series is a deprecated positional fallback.
    q25 = values.quantile(0.25)
    q75 = values.quantile(0.75)

    fig, axs = plt.subplots(1, 2, figsize=(19, 6))

    # Left panel: histogram with location markers.
    axs[0].hist(values, bins=50)
    axs[0].axvline(mean_value, color='red', label=f"Mean: {round(mean_value,2)}")
    axs[0].axvline(median_value, color='green', label=f"Median: {round(median_value,2)}")
    axs[0].axvline(x=q25, color='r', linestyle='--', label=f"25th percentile: {q25}")
    axs[0].axvline(x=q75, color='r', linestyle='--', label=f"75th percentile: {q75}")
    axs[0].legend()

    # Right panel: boxplot with a vertical line at the mean.
    sns.boxplot(data=data, x=column_name, ax=axs[1])
    axs[1].axvline(mean_value, color='red')

    fig.suptitle(f"Histogram and Boxplot of {column_name}")
    plt.show()

    print(f"{values.describe().round(3)}\n")
    print(f"Skewness: {skew(non_missing, bias=True)}")
    print(f"Kurtosis: {kurtosis(non_missing, bias=True)}")

In [20]:
# Refer to the column by name rather than by its position in `numerical`, so
# this cell always summarises the column that the notes below describe.
descriptive_summary_n(data_copy, "CLNO")

Data is mostly distributed between 15 and 26. The distribution is fairly close to normal, with skewness near 0 (0.77) -> no modification needed.

In [21]:
# Refer to the column by name rather than by its position in `numerical`, so
# this cell always summarises the column that the notes below describe.
descriptive_summary_n(data_copy, "DEBTINC")

Data is mostly distributed between 29 and 39; some values above 100 may be outliers -> clip.

In [23]:
# Refer to the column by name rather than by its position in `numerical`, so
# this cell always summarises the column that the notes below describe.
descriptive_summary_n(data_copy, "DELINQ")

In [58]:
# Frequency of each distinct DELINQ value.
data_copy["DELINQ"].value_counts()

Most values are 0; the data contain discrete values from 0 to 13 -> treat as categorical data.

In [25]:
# Refer to the column by name rather than by its position in `numerical`, so
# this cell always summarises the column that the notes below describe.
descriptive_summary_n(data_copy, "MORTDUE")

Data is mostly distributed between 46276 and 91488, with a mean of 73760. Some cases are outliers with values close to 400000 => clip outliers.

In [26]:
# Refer to the column by name rather than by its position in `numerical`, so
# this cell always summarises the column that the notes below describe.
descriptive_summary_n(data_copy, "YOJ")

Data is mostly distributed in the range from 3 to 13. Few data points have values close to 40.

In [27]:
# Refer to the column by name rather than by its position in `numerical`.
# NOTE(review): LOAN is inferred as the intended column for this cell -- confirm
# against the notes below.
descriptive_summary_n(data_copy, "LOAN")

all value mostly distributed between 15 and 26. close to normal distribution

In [28]:
# Refer to the column by name rather than by its position in `numerical`, so
# this cell always summarises the column that the notes below describe.
descriptive_summary_n(data_copy, "DEROG")

In [62]:
# Frequency of each distinct DEROG value.
data_copy["DEROG"].value_counts()

DEROG takes only a few discrete values, ranging from 0 to 10, so it can be treated as categorical data.

In [29]:
# Refer to the column by name rather than by its position in `numerical`, so
# this cell always summarises the column that the notes below describe.
descriptive_summary_n(data_copy, "CLAGE")

Data is mostly around 115-231; a few values are > 600 -> clip.

In [30]:
# Refer to the column by name rather than by its position in `numerical`, so
# this cell always summarises the column that the notes below describe.
descriptive_summary_n(data_copy, "NINQ")

In [64]:
# Frequency of each distinct NINQ value.
data_copy["NINQ"].value_counts()

Discrete values from 0 to 17 -> treat as categorical data.

In [32]:
# Refer to the column by name rather than by its position in `numerical`, so
# this cell always summarises the column that the notes below describe.
descriptive_summary_n(data_copy, "VALUE")

Data is mostly distributed between 66075 and 119824; a few values are > 400000 -> clip.

# Summary of categorical data

In [33]:
data_c = data.copy()
# Replace the 0/1 codes with readable labels in a single step. Writing strings
# into the integer column through `.loc` is an incompatible-dtype assignment,
# which recent pandas versions warn about and newer ones reject.
data_c['BAD'] = data_c['BAD'].map({0: 'Not default', 1: 'Default'})

In [34]:
def descriptive_summary_c(data, column_name):
    """Show a frequency table, a bar chart and a pie chart for one categorical column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the column to summarise.
    column_name : str
        Name of the categorical column of ``data``.

    The frequency table lists each category with its count and its percentage
    of the non-missing values (rounded to 2 decimals). Returns ``None``.
    """
    print(f"\033[1mFrequency table of {column_name} variable: \033[0m")
    freq_table = data[column_name].value_counts().to_frame().reset_index()
    freq_table.columns = [column_name, 'Frequencies']
    # `count()` ignores missing values, so the percentages sum to ~100.
    freq_table['% Percentages'] = round(freq_table['Frequencies']/data[column_name].count()*100,2)
    display(freq_table)
    print()

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    sns.barplot(data=freq_table, x='Frequencies', y=column_name, palette='flare', ax=axs[0])

    # Draw the pie on the second axes explicitly instead of through `plt.pie`,
    # which targets whichever axes happens to be "current".
    colors = sns.color_palette('flare')
    shares = list(freq_table['% Percentages'])
    labels = list(freq_table[column_name])
    axs[1].pie(shares, labels=labels, colors=colors, autopct='%.0f%%')

    fig.suptitle(f"Bar Chart and Pie Chart of {column_name} variable")
    plt.show()

In [35]:
# categorical[0] is 'REASON'
descriptive_summary_c(data_c, "REASON")

In [36]:
# categorical[1] is 'JOB'
descriptive_summary_c(data_c, "JOB")

In [37]:
# categorical[2] is 'BAD'
descriptive_summary_c(data_c, "BAD")

target class is a bit imbalanced.

# Correlation among categorical data

**Using Cramér's V as the measure of association**

In [38]:
# Integer-encode every categorical column; the single encoder is refit per column.
label = preprocessing.LabelEncoder()
data_encoded = pd.DataFrame(
    {column: label.fit_transform(data_copy[column]) for column in categorical}
)

In [39]:
def cramers_V(var1, var2):
    """Return Cramér's V, a 0-1 measure of association between two categorical variables.

    V = sqrt(chi2 / (n * min(r - 1, c - 1))), where chi2 is Pearson's
    chi-squared statistic of the r x c contingency table and n is the number of
    observations in the table.

    Parameters
    ----------
    var1, var2 : array-like
        The two categorical variables, of equal length.

    Returns
    -------
    float
        Cramér's V, or ``nan`` when the table has fewer than two rows or
        columns (the statistic is undefined in that case).
    """
    crosstab = np.array(pd.crosstab(var1, var2, rownames=None, colnames=None))  # contingency table
    obs = np.sum(crosstab)  # total number of observations
    mini = min(crosstab.shape) - 1  # min(rows - 1, columns - 1)
    if mini < 1 or obs == 0:
        return np.nan
    # Cramér's V is defined on the uncorrected chi-squared statistic; Yates'
    # continuity correction (scipy's default for 2x2 tables) would make the
    # association of a binary variable with itself come out below 1.
    stat = chi2_contingency(crosstab, correction=False)[0]
    # The square root was missing before, so the function returned V squared.
    return np.sqrt(stat / (obs * mini))

In [40]:
# Pairwise `cramers_V` scores between all encoded columns, rounded to 2 decimals.
rows = [
    [round(cramers_V(data_encoded[var1], data_encoded[var2]), 2) for var2 in data_encoded]
    for var1 in data_encoded
]
cramers_results = np.array(rows)
df = pd.DataFrame(cramers_results, columns=data_encoded.columns, index=data_encoded.columns)
df

In [41]:
fig, ax = plt.subplots(figsize=(6,6))
# Hide the upper triangle (diagonal included): the matrix is symmetric.
# Uses the builtin `bool`; the `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones(df.shape, dtype=bool))
with sns.axes_style("white"):
    ax = sns.heatmap(df, mask=mask, square=True, ax=ax)
plt.show()

# Correlation among numerical data

using pearson measure

In [43]:
# Pearson correlation between the numerical columns and the target BAD.
data_copy[numerical+["BAD"]].corr(method='pearson')

In [44]:
# Compute the correlation matrix once and reuse it for the mask and the plot.
corr_matrix = data_copy[numerical+["BAD"]].corr(method='pearson')

fig, ax = plt.subplots(figsize=(6,6))
# Hide the upper triangle (diagonal included): the matrix is symmetric.
# Uses the builtin `bool`; the `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))
with sns.axes_style("white"):
    ax = sns.heatmap(corr_matrix, mask=mask, square=True, ax=ax)
plt.show()

* MORTDUE-CLNO, CLAGE-CLNO, VALUE-CLNO, DELINQ-DEROG, CLAGE-YOJ, LOAN-VALUE, LOAN-MORTDUE are slightly positively correlated
* MORTDUE-VALUE are strongly positively correlated
* Correlation with the target variable: BAD-DEROG and BAD-DELINQ are slightly positively correlated

# Weight of evidence - IV

**impute missing data from numerical, categorical data**

In [54]:
data_cop = data.copy()

# Numeric columns: fill each gap with that column's mean.
columns_num = data_cop.select_dtypes(['float', 'int']).columns
data_cop[columns_num] = data_cop[columns_num].fillna(data_cop[columns_num].mean())

# Object (text) columns: fill gaps with an explicit 'Missing' category.
columns_obj = data_cop.select_dtypes(['object']).columns
data_cop[columns_obj] = data_cop[columns_obj].fillna('Missing')


In [115]:
def calculate_woe_iv(data, col, target, bins=10):
    """Build the Weight-of-Evidence / Information-Value table of one feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the feature and the target.
    col : str
        Feature column. A numeric column with more than 10 distinct values is
        cut into quantile bins; any other column is grouped by its raw values.
    target : str
        Binary target column, where 1 marks an event.
    bins : int, default 10
        Number of quantile bins requested for numeric features (duplicate bin
        edges are dropped, so fewer bins may be produced).

    Returns
    -------
    pandas.DataFrame
        One row per bin with the columns 'col name', 'bin range',
        'num of data', 'Events', '% of Events', 'Non-Events',
        '% of Non-Events', 'WoE' and 'IV'. The feature's total IV is the sum of
        the 'IV' column.
    """
    feature = data[col]
    if feature.dtype.kind in 'bifc' and feature.nunique(dropna=False) > 10:
        grouping = pd.qcut(feature, bins, duplicates='drop')
    else:
        grouping = feature
    d0 = pd.DataFrame({'bin range': grouping, 'y': data[target]})

    # observed=True keeps only bins that actually contain rows. The qcut result
    # is categorical, and an unobserved category would otherwise show up as an
    # empty bin whose 0.5 floor below adds a spurious IV contribution.
    d = d0.groupby("bin range", as_index=False, observed=True).agg({"y": ["count", "sum"]})
    d.columns = ['bin range', 'num of data', 'Events']

    # A count of 0 is floored at 0.5 so the log ratio below stays finite.
    d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()

    d['Non-Events'] = d['num of data'] - d['Events']

    d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()

    d['WoE'] = np.log(d['% of Events']/d['% of Non-Events'])
    d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
    d.insert(loc=0, column='col name', value=col)

    return d

# Example: WoE/IV table of the LOAN feature against the BAD target.
calculate_woe_iv(data_cop, col="LOAN", target="BAD")

In [116]:
def rank_iv(iv):
    """Translate an Information Value into a predictive-power label.

    The label is taken from the first upper bound (inclusive) that ``iv`` does
    not exceed: 0.02 'Useless', 0.1 'Weak', 0.3 'Medium', 0.5 'Strong'.
    Anything that matches none of the bounds is labelled 'suspicious'.
    """
    upper_bounds = (
        (0.02, 'Useless'),
        (0.1, 'Weak'),
        (0.3, 'Medium'),
        (0.5, 'Strong'),
    )
    for bound, rank_name in upper_bounds:
        if iv <= bound:
            return rank_name
    return 'suspicious'
# `target` has to be assigned before it is used to build the feature list;
# in the reverse order this cell raises NameError on a fresh kernel.
target = "BAD"
cols_expt_target = list(data.columns.drop(target))

# Total Information Value of each feature = sum of its per-bin IV values.
IV = []
for col in cols_expt_target:
    IV.append(calculate_woe_iv(data_cop, col, target)["IV"].sum())

res = pd.DataFrame(list(zip(cols_expt_target, IV)), columns=["col name", "IV"])
res["rank"] = res["IV"].apply(rank_iv)
res.sort_values("IV")