### Importing modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import os

### Reading dataset

In [6]:
os.listdir('dataset')

['dataset_']

In [7]:
DATASET_DIR='dataset/'
DATASET_FILENAME='dataset_'

#### Function to read dataset

In [28]:
def read_dataset(file_path):
    """
    This function is used to read data from file and produce dataframe with file contents.
    
    Input: file_path
    
    Output: Dataframe with file contents
    
    """
    data_flag=False
    data=[]
    features=[]
    with open(file_path) as f:
        all_lines=f.readlines()
        for line in all_lines:
            if '@ATTRIBUTE' in line:
                _,feature_name,_=line.split(' ')
                features.append(feature_name)
            if data_flag:
                line=line.replace("\n",'')
                data.append(line.split(','))
            if '@DATA' in line:
                data_flag=True   
    df_data=pd.DataFrame(data=data,columns=features)
    return df_data

In [29]:
df_data=read_dataset(DATASET_DIR+DATASET_FILENAME)

In [30]:
df_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [25]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   person_age                  32581 non-null  object
 1   person_income               32581 non-null  object
 2   person_home_ownership       32581 non-null  object
 3   person_emp_length           32581 non-null  object
 4   loan_intent                 32581 non-null  object
 5   loan_grade                  32581 non-null  object
 6   loan_amnt                   32581 non-null  object
 7   loan_int_rate               32581 non-null  object
 8   loan_status                 32581 non-null  object
 9   loan_percent_income         32581 non-null  object
 10  cb_person_default_on_file   32581 non-null  object
 11  cb_person_cred_hist_length  32581 non-null  object
dtypes: object(12)
memory usage: 3.0+ MB


In [26]:
df_data.describe()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
count,32581,32581,32581,32581.0,32581,32581,32581,32581,32581,32581.0,32581,32581
unique,58,4295,4,37.0,6,7,753,349,2,77.0,2,29
top,23,60000,RENT,0.0,EDUCATION,A,10000,?,0,0.1,N,2
freq,3889,1046,16446,4105.0,6453,10777,2664,3116,25473,1533.0,26836,5965


#### Writing dataset back to excel
    - For future use.

In [27]:
df_data.to_excel(DATASET_DIR+'Credit_Risk_Prediction_Dataset.xlsx')

### EDA

The dataset is a combination of numerical and categorical data.

To Do on Numerical Data:
- Visualizing distributions of data
- Eliminating outliers by checking how much data points are deviated away from mean/median.
- Identifying missing data columns,and filling them with Mean/Median
- Visualizing Correlation plots among features
- Performing Standardization/Normalization

To do on Categorical Data:
- Filling missing values with Mode
- Checking for any random text with in features,and discarding them
- Checking the value counts of each categorical feature
- Encoding features using One-Hot-Encoder/Label-Encoder/Multi Label Binarizer

To do on Target label Data:
- Identifying total discre classes
- Analyzing and visualizing the distritbution of data by class
- If imbalance is identified,upsampling or downsampling is implemented

To do Model Building:
- Create train test splits (Stratified sampling)
- Building model pipeline
- Training the data
- Validaing the data
- Parameter tuning
- Saving model
- Deployment using flask
    

In [31]:
df_data

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.1,N,26


#### Checking missing values in each column

In [55]:
def check_missing_columns(df_in):
    """
    
    This function takes in a dataframe and prints the missing values in each column.
        -Some columns have "?" as data,considering them as missing values.
    
    
    input: dataframe
    
    """
    for col in df_in.columns:
        missing_data_count=df_in[col].isna().sum()
        missing_data_count=missing_data_count+len(df_in[df_in[col]=='?'])
        if missing_data_count>0:
            print("Missing data in "+col+":"+str(missing_data_count))
            print("#"*100)

In [56]:
check_missing_columns(df_data)

Missing data in person_emp_length:895
####################################################################################################
Missing data in loan_int_rate:3116
####################################################################################################
