In [1]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
from scipy.stats import kurtosis
import scipy.stats as stats
from sklearn import preprocessing
from itertools import combinations
import warnings
import pandas as pd
import missingno as msno
warnings.filterwarnings("ignore")


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [2]:
#importing dataset
data = pd.read_csv('/kaggle/input/credit-risk-dataset/credit_risk_dataset.csv')
data_copy = data.copy()

In [3]:
data_copy.head()

In [4]:
print("data.shape : {}".format(data_copy.shape))

# Meaning of all data fields
* loan_status: the target variable; binary, 0 = no default, 1 = default -> categorical
* person_age: customer's age -> numerical
* person_income: annual income -> numerical
* person_home_ownership: home ownership status (e.g. rent, mortgage) -> categorical
* person_emp_length: employment length in years -> numerical
* loan_intent: purpose of the loan -> categorical
* loan_amnt: loan amount -> numerical
* loan_grade: grade of the loan -> categorical
* loan_int_rate: interest rate -> numerical
* loan_percent_income: ratio of loan amount to annual income -> numerical
* cb_person_default_on_file: whether the person has a default on file (Y/N) -> categorical
* cb_person_cred_hist_length: length of credit history in years -> numerical

# Check duplicate

In [5]:
data_copy.duplicated(keep=False).sum()

# Check missing value

In [6]:
msno.bar(data_copy)

In [7]:
(data_copy.isnull().sum()/len(data_copy))*100

* Columns with missing values: 'person_emp_length', 'loan_int_rate'
* In both columns, less than 30% of the values are missing

### Types of Data

**target variable is loan_status column**

In [8]:
data_copy["loan_status"].head()

In [9]:
data_copy["loan_status"].value_counts()

In [13]:
# Class balance of the target. The column is passed by keyword because recent
# seaborn releases interpret the first positional argument as `data`, not `x`.
sns.countplot(x="loan_status", data=data_copy)

In [14]:
data_copy.info()

In [15]:
data_copy.columns

In [16]:
numerical = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt','loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']
categorical = ['person_home_ownership', 'loan_intent','loan_grade','loan_status','cb_person_default_on_file']

# Summary of numerical data

In [17]:
data_copy.describe()

* person_emp_length and loan_int_rate have count < 32581 -> they contain missing values.
* person_age, person_income and person_emp_length have maximum values far above their 75th percentile -> the distributions appear to be right-skewed.

In [18]:
data[numerical].hist(figsize=(15,15), bins=50)

**Examining each variable**

In [19]:
def descriptive_summary_n(data, column_name):
    """Plot and print a descriptive summary of one numerical column.

    Draws a histogram (with mean, median, 25th and 75th percentile markers)
    next to a boxplot (with a mean marker), then prints the ``describe()``
    table, the skewness and the kurtosis of the column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``column_name``.
    column_name : str
        Name of the numerical column to summarise.

    Returns
    -------
    None
        The function only produces a figure and printed output.
    """
    values = data[column_name]
    # Missing values are dropped once so the histogram and the moment
    # statistics are all computed on the same observed values.
    observed = values.dropna()
    mean = values.mean()
    median = values.median()
    # Label-based quantiles instead of positional lookups into describe():
    # integer positions on a label-indexed Series are deprecated in pandas.
    q25 = values.quantile(0.25)
    q75 = values.quantile(0.75)

    fig, axs = plt.subplots(1, 2, figsize=(19, 6))
    axs[0].hist(observed, bins=50)
    axs[0].axvline(mean, color='red', label=f"Mean: {round(mean,2)}")
    axs[0].axvline(median, color='green', label=f"Median: {round(median,2)}")
    axs[0].axvline(x=q25, color='r', linestyle='--', label=f"25th percentile: {q25}")
    axs[0].axvline(x=q75, color='r', linestyle='--', label=f"75th percentile: {q75}")
    axs[0].legend()
    sns.boxplot(data=data, x=column_name, ax=axs[1])
    axs[1].axvline(mean, color='red')  # mark the mean on the boxplot as well
    fig.suptitle(f"Histogram and Boxplot of {column_name}")
    plt.show()

    print(f"{values.describe().round(3)}\n")
    print(f"Skewness: {skew(observed, bias=True)}")
    print(f"Kurtosis: {kurtosis(observed, bias=True)}")

In [20]:
descriptive_summary_n(data_copy, numerical[0])

Age is mostly distributed between 23 and 30. The distribution is right-skewed, and a few values are greater than 100; these may be outliers. -> clip

In [21]:
descriptive_summary_n(data_copy, numerical[1])

"person_income" is mostly distributed between 38500 and 79200. The distribution is heavily right-skewed, with many outliers. -> clip

In [22]:
descriptive_summary_n(data_copy, numerical[2])

"person_emp_length" is right-skewed and long-tailed, with potential outliers. It is mostly distributed between 2 and 7. The maximum is 123; an employment length of 123 years is implausible, so this observation is probably an error. -> clip

In [32]:
descriptive_summary_n(data_copy, numerical[3])

"loan_amnt" variable is slightly right skewed.

In [23]:
descriptive_summary_n(data_copy, numerical[4])

"loan_int_rate" variable's skewness is 0.20, very close to normal distribution. Data is lighter tailed than normal distribution. Mean and Median very close to each other. 

In [24]:
descriptive_summary_n(data_copy, numerical[5])

"loan_percent_income" variable is right skewed and lightly tailed. 

In [25]:
descriptive_summary_n(data_copy, numerical[6])

"cb_person_cred_hist_length" is mostly distributed between 3 and 8 and is slightly right-skewed.

# Summary of categorical data

In [26]:
# Copy of the raw data with a human-readable target, used by the categorical summaries.
data_c = data.copy()
# Replace the whole column at once: writing strings into an integer column
# through .loc is an incompatible-dtype assignment that pandas has deprecated.
data_c['loan_status'] = data_c['loan_status'].replace({0: 'Not default', 1: 'Default'})

In [27]:
def descriptive_summary_c(data, column_name):
    """Display a frequency table, bar chart and pie chart of one categorical column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``column_name``.
    column_name : str
        Name of the categorical column to summarise.

    Returns
    -------
    None
        The function only produces displayed/printed output and a figure.
    """
    print(f"\033[1mFrequency table of {column_name} variable: \033[0m")
    freq_table = data[column_name].value_counts().to_frame().reset_index()
    freq_table.columns = [column_name, 'Frequencies']
    # Percentages are relative to the non-missing observations of the column.
    freq_table['% Percentages'] = round(freq_table['Frequencies']/data[column_name].count()*100,2)
    display(freq_table)
    print()

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    sns.barplot(data=freq_table, x='Frequencies', y=column_name, palette='flare', ax=axs[0])

    labels = list(freq_table[column_name])
    shares = list(freq_table['% Percentages'])
    # One colour per category: the default palette length is shorter than some
    # variables' level counts, which would make the colours wrap around.
    colors = sns.color_palette('flare', n_colors=len(labels))
    # Draw on the right-hand axes explicitly instead of relying on pyplot's
    # notion of the "current" axes.
    axs[1].pie(shares, labels=labels, colors=colors, autopct='%.0f%%')
    fig.suptitle(f"Bar Chart and Pie Chart of {column_name} variable")
    plt.show()

In [28]:
descriptive_summary_c(data_c, categorical[0])

The largest proportions of the person_home_ownership variable are Rent and Mortgage.

In [29]:
descriptive_summary_c(data_c, categorical[1])

data is distributed in approximately the same proportion. 

In [30]:
descriptive_summary_c(data_c, categorical[2])

Most of the "loan_grade" data falls in the "A", "B", "C", and "D" categories.

In [31]:
descriptive_summary_c(data_c, categorical[3])

Great percentage of loan_status variable is "Not Default" category.

In [32]:
descriptive_summary_c(data_c, categorical[4])

"cb_person_default_on_file" is 82% "N" category. 

### Correlation among categorical data
using Cramér's V as the measure of association

In [33]:
# Integer-encode every categorical column so the contingency tables below can
# be built from plain label codes. The encoder is re-fitted for each column.
label = preprocessing.LabelEncoder()
data_encoded = pd.DataFrame(
    {column: label.fit_transform(data_copy[column]) for column in categorical}
)

In [34]:
def cramers_V(var1,var2) :
    """Return Cramér's V, a 0-1 measure of association between two categorical variables.

    V = sqrt(chi2 / (n * min(r - 1, c - 1))), where chi2 is the Pearson
    chi-squared statistic of the r x c contingency table and n is the number
    of observations.

    Parameters
    ----------
    var1, var2 : array-like
        Categorical observations of equal length (e.g. two DataFrame columns).

    Returns
    -------
    float
        Cramér's V, or NaN when it is undefined (empty table, or one of the
        variables takes a single value).
    """
    # Imported locally: no import of chi2_contingency is visible at the top of the notebook.
    from scipy.stats import chi2_contingency

    crosstab = np.array(pd.crosstab(var1, var2, rownames=None, colnames=None))  # contingency table
    obs = np.sum(crosstab)  # number of observations
    mini = min(crosstab.shape) - 1  # min(rows, columns) - 1
    if obs == 0 or mini == 0:
        # The statistic is undefined; avoid dividing by zero.
        return np.nan
    # correction=False: the Yates continuity correction (scipy's default for
    # 2x2 tables) would keep a variable from scoring exactly 1 against itself.
    stat = chi2_contingency(crosstab, correction=False)[0]
    # The square root turns chi2 / (n * min) into V rather than V squared.
    return np.sqrt(stat / (obs * mini))

In [35]:
# Pairwise Cramér's V between every pair of encoded categorical columns,
# rounded to two decimals and arranged as a square, labelled table.
rows = [
    [round(cramers_V(data_encoded[var1], data_encoded[var2]), 2) for var2 in data_encoded]
    for var1 in data_encoded
]
cramers_results = np.array(rows)
df = pd.DataFrame(cramers_results, index=data_encoded.columns, columns=data_encoded.columns)
df

In [36]:
# Heatmap of the Cramér's V table; the upper triangle (diagonal included) is
# hidden because the matrix is symmetric.
fig, ax = plt.subplots(figsize=(6,6))
# Builtin bool: the np.bool alias was removed from NumPy and raises AttributeError.
mask = np.zeros_like(df, dtype=bool)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    ax = sns.heatmap(df, mask=mask, square=True, ax=ax)
plt.show()

There is some association between loan_status and loan_grade, and between loan_grade and cb_person_default_on_file.

# Correlation among numerical data

**using pearson measure**

In [37]:
data_copy[numerical+["loan_status"]].corr(method='pearson')

In [38]:
# Heatmap of the Pearson correlations between the numerical features and the
# target; the upper triangle (diagonal included) is hidden because the matrix
# is symmetric.
corr_matrix = data_copy[numerical+["loan_status"]].corr(method='pearson')  # computed once, reused below
fig, ax = plt.subplots(figsize=(6,6))
# Builtin bool: the np.bool alias was removed from NumPy and raises AttributeError.
mask = np.zeros_like(corr_matrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    ax = sns.heatmap(corr_matrix, mask=mask, square=True, ax=ax)
plt.show()

* person_age and cb_person_cred_hist_length are strongly positively correlated.
* loan_percent_income is correlated with both loan_amnt and person_income.
* person_income is positively correlated with loan_amnt.

# Weight of evidence - IV

**drop duplicate, impute missing data**

In [53]:
# Work on a de-duplicated copy of the raw data with a fresh 0..n-1 index.
data_cop = data.drop_duplicates().reset_index(drop=True)

# Numerical columns: fill gaps with each column's own median.
columns_num = data_cop.select_dtypes(['float', 'int']).columns
data_cop[columns_num] = data_cop[columns_num].fillna(data_cop[columns_num].median())

# Object (string) columns: fill gaps with an explicit 'Missing' level.
columns_obj = data_cop.select_dtypes(['object']).columns
data_cop[columns_obj] = data_cop[columns_obj].fillna('Missing')



In [57]:
def calculate_woe_iv(data, col, target, bins=10):
    """Build the Weight of Evidence (WoE) / Information Value (IV) table of one feature.

    Numeric columns with more than 10 distinct values are cut into quantile
    bins with ``pd.qcut``; every other column is grouped on its raw values.
    For each group the event and non-event shares are computed and combined as
    ``WoE = ln(% of Events / % of Non-Events)`` and
    ``IV = WoE * (% of Events - % of Non-Events)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``col`` and ``target``.
    col : str
        Feature column to evaluate.
    target : str
        Target column; its per-group sum is used as the event count, so it is
        expected to be coded 0/1.
    bins : int, default 10
        Number of quantile bins requested for high-cardinality numeric columns
        (duplicate bin edges are dropped, so fewer bins may be returned).

    Returns
    -------
    pandas.DataFrame
        One row per bin/category with the columns 'col name', 'bin range',
        'num of data', 'Events', '% of Events', 'Non-Events',
        '% of Non-Events', 'WoE' and 'IV'. Summing 'IV' gives the feature's
        information value.
    """
    is_numeric = data[col].dtype.kind in 'bifc'
    # nunique(dropna=False) counts all missing values as a single level, which
    # np.unique does not do consistently across NumPy versions.
    if is_numeric and data[col].nunique(dropna=False) > 10:
        bin_df = pd.qcut(data[col], bins, duplicates='drop')
        d0 = pd.DataFrame({'bin range': bin_df, 'y': data[target]})
    else:
        d0 = pd.DataFrame({'bin range': data[col], 'y': data[target]})

    # observed=True keeps only the bins that actually occur, so an empty
    # category cannot pick up a spurious smoothed WoE/IV contribution.
    d = d0.groupby("bin range", as_index=False, observed=True).agg({"y": ["count", "sum"]})
    d.columns = ['bin range', 'num of data', 'Events']

    # A floor of 0.5 on the counts keeps the log ratio finite for groups with
    # no events or no non-events.
    d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()

    d['Non-Events'] = d['num of data'] - d['Events']

    d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()

    d['WoE'] = np.log(d['% of Events']/d['% of Non-Events'])
    d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
    d.insert(loc=0, column='col name', value=col)

    return d

calculate_woe_iv(data_cop, "loan_int_rate", "loan_status")

In [55]:
def rank_iv(iv):
    """Map an information value to a predictive-power label.

    The upper bounds are inclusive: up to 0.02 is 'Useless', up to 0.1 is
    'Weak', up to 0.3 is 'Medium', up to 0.5 is 'Strong', and anything that
    matches no bound is 'suspicious'.
    """
    upper_bounds = (
        (0.02, 'Useless'),
        (0.1, 'Weak'),
        (0.3, 'Medium'),
        (0.5, 'Strong'),
    )
    for bound, verdict in upper_bounds:
        if iv <= bound:
            return verdict
    return 'suspicious'
# Total information value of every feature against the target, labelled with
# its predictive-power rank and displayed from weakest to strongest.
target = "loan_status"
cols_expt_target = list(data.columns.drop(target))
IV = [calculate_woe_iv(data_cop, col, target)["IV"].sum() for col in cols_expt_target]

res = pd.DataFrame({"col name": cols_expt_target, "IV": IV})
res["rank"] = res["IV"].map(rank_iv)
res.sort_values("IV")