# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import warnings as ws
ws.filterwarnings('ignore')

# Data Sourcing
The data source is private: it belongs to a specific organization, and the notebook reads it from a local `loan.csv` file.

In [2]:
#Load the raw loan records from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='loan.csv')

In [3]:
#Dimensions of the raw frame as (rows, columns)
df.shape

(39717, 111)

In [4]:
#Null fraction per column, keeping only the columns that are 100% null
df.isnull().mean().loc[lambda share: 100 * share == 100]

mths_since_last_major_derog       1.0
annual_inc_joint                  1.0
dti_joint                         1.0
verification_status_joint         1.0
tot_coll_amt                      1.0
tot_cur_bal                       1.0
open_acc_6m                       1.0
open_il_6m                        1.0
open_il_12m                       1.0
open_il_24m                       1.0
mths_since_rcnt_il                1.0
total_bal_il                      1.0
il_util                           1.0
open_rv_12m                       1.0
open_rv_24m                       1.0
max_bal_bc                        1.0
all_util                          1.0
total_rev_hi_lim                  1.0
inq_fi                            1.0
total_cu_tl                       1.0
inq_last_12m                      1.0
acc_open_past_24mths              1.0
avg_cur_bal                       1.0
bc_open_to_buy                    1.0
bc_util                           1.0
mo_sin_old_il_acct                1.0
mo_sin_old_r

In [5]:
#Collect the labels of the columns in which every single value is missing
empty_column = (
    df.isnull()
    .mean()
    .loc[lambda share: 100 * share == 100]
    .index
)
empty_column

Index(['mths_since_last_major_derog', 'annual_inc_joint', 'dti_joint',
       'verification_status_joint', 'tot_coll_amt', 'tot_cur_bal',
       'open_acc_6m', 'open_il_6m', 'open_il_12m', 'open_il_24m',
       'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m',
       'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi',
       'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal',
       'bc_open_to_buy', 'bc_util', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_bc_dlq',
       'mths_since_recent_inq', 'mths_since_recent_revol_delinq',
       'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl',
       'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
       'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m',
       'num_tl_30dpd', 'num_tl_90g_dpd_24m', 'num_tl_op_past_12m',
       'pct_tl_nvr_dl

In [6]:
#Build the working frame: the raw data minus the completely empty columns
df1 = df.drop(labels=empty_column, axis='columns')

In [7]:
#(rows, columns) of df1 once the all-null columns have been removed
df1.shape

(39717, 57)

In [8]:
#Percentage of missing values per remaining column, worst first
df1.isnull().mean().sort_values(ascending=False).mul(100)

next_pymnt_d                  97.129693
mths_since_last_record        92.985372
mths_since_last_delinq        64.662487
desc                          32.580507
emp_title                      6.191303
emp_length                     2.706650
pub_rec_bankruptcies           1.754916
last_pymnt_d                   0.178765
chargeoff_within_12_mths       0.140998
collections_12_mths_ex_med     0.140998
revol_util                     0.125891
tax_liens                      0.098195
title                          0.027696
last_credit_pull_d             0.005036
home_ownership                 0.000000
int_rate                       0.000000
out_prncp_inv                  0.000000
total_pymnt                    0.000000
total_pymnt_inv                0.000000
total_rec_prncp                0.000000
total_rec_int                  0.000000
total_rec_late_fee             0.000000
recoveries                     0.000000
collection_recovery_fee        0.000000
term                           0.000000


In [None]:
#next_pymnt_d and mths_since_last_record are more than 90% null, so they carry
#almost no information and are removed here (the original passed an empty list,
#which dropped nothing).
#errors='ignore' keeps the cell re-runnable and safe if a column is already gone.
df1 = df1.drop(columns=['next_pymnt_d', 'mths_since_last_record'], errors='ignore')

In [None]:
#Percentage of missing cells per row, highest first; a value of 100 would mean
#that the row is completely empty
df1.isnull().mean(axis='columns').sort_values(ascending=False).mul(100)

In [None]:
#Shape after keeping only the first occurrence of each record; if it equals
#df1.shape there are no duplicated rows
df1.loc[~df1.duplicated()].shape

In [None]:
#Split the column dtypes into numeric (int64/float64) and everything else.
#Both results are Series of dtypes indexed by column name.
num_col = df1.dtypes.loc[lambda kinds: (kinds == 'int64') | (kinds == 'float64')]
str_col = df1.dtypes.loc[lambda kinds: (kinds != 'int64') & (kinds != 'float64')]

In [None]:
#int_rate and revol_util carry a '%' sign. Strip it and store the values as
#floats so they can be used as measures (the original left them as text).
#astype(str) makes the cell re-runnable after the columns are already numeric;
#missing values become the text 'nan', which to_numeric reads back as NaN.
for pct_col in ['int_rate', 'revol_util']:
    df1[pct_col] = pd.to_numeric(
        df1[pct_col].astype(str).str.replace('%', '', regex=False).str.strip()
    )

#Remove the masking 'xx' from zip_code and convert the remaining prefix to a number
df1['zip_code'] = pd.to_numeric(
    df1['zip_code'].astype(str).str.replace('xx', '', regex=False)
)

#Keep only the last character of sub_grade, i.e. drop the grade part
#(the grade itself lives in its own column)
df1['sub_grade'] = df1['sub_grade'].str[-1]

In [None]:
#Names of the columns whose dtype is neither int64 nor float64
str_col.keys()

In [None]:
#Drop the columns that are not relevant for the analysis.
#The original list was missing a comma between 'next_pymnt_d' and
#'application_type'; Python fused them into one non-existent label and the
#drop raised a KeyError.
#NOTE(review): the original note says application_type holds only 0/NaN
#values - confirm which column that remark was really about.
irrelevant_cols = [
    'mths_since_last_record', 'next_pymnt_d',  # more than 90% null
    'application_type',
    'collections_12_mths_ex_med',
    'url', 'chargeoff_within_12_mths', 'tax_liens',
    'delinq_amnt', 'acc_now_delinq', 'policy_code',
    'pymnt_plan', 'initial_list_status',
]

#Validate the labels against the raw frame so that a typo still fails loudly ...
unknown_cols = [col for col in irrelevant_cols if col not in df.columns]
assert not unknown_cols, f"unknown column(s) in drop list: {unknown_cols}"

#... while errors='ignore' lets the cell be re-run after the columns are gone
df1 = df1.drop(columns=irrelevant_cols, errors='ignore')

In [None]:
#(rows, columns) of df1 at this point in the notebook
df1.shape

In [None]:
#Preview the first rows of the cleaned frame
df1.head()

In [None]:
#Segregation of the columns into analysis buckets

#Categorical variables with a natural order (grades, dates, counts)
ordered_cat = [
    'term', 'grade', 'sub_grade', 'emp_length', 'issue_d', 'delinq_2yrs',
    'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
    'last_pymnt_d', 'last_credit_pull_d', 'pub_rec_bankruptcies',
]

#Categorical variables without an order
unordered_cat = [
    'home_ownership', 'verification_status', 'loan_status', 'purpose',
    'zip_code', 'addr_state',
]

#Identifiers and free text
other = ['id', 'member_id', 'emp_title', 'desc', 'title']

#Numeric measures.
#NOTE(review): the original contained the fused label 'total_accout_prncp'
#(a missing comma). It is split into 'total_acc' and 'out_prncp' here, and
#'out_prncp_inv' is added because it exists in the frame but was in no
#bucket - confirm this matches the intended grouping.
measure_col = [
    'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'int_rate',
    'installment', 'annual_inc', 'dti', 'open_acc', 'pub_rec', 'revol_bal',
    'revol_util', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
    'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee',
    'recoveries', 'collection_recovery_fee', 'last_pymnt_amnt',
]
#Distinct values present in pub_rec_bankruptcies (missing values included)
df1['pub_rec_bankruptcies'].unique()

# Unordered Categorical Variables - Univariate Analysis

In [None]:
#Total number of column names across the four buckets.
#NOTE(review): presumably meant to be compared with df1.shape[1] - confirm
len(ordered_cat)+len(unordered_cat)+len(other)+len(measure_col)

In [None]:
#Make sure initial_list_status is gone. An earlier cell already lists it among
#the dropped columns, so a plain drop raises KeyError on a top-to-bottom run;
#errors='ignore' turns this into a safe no-op in that case.
df1 = df1.drop(columns='initial_list_status', errors='ignore')


In [None]:
#Narrowed list of ordered variables used from here on (replaces the earlier list)
ordered_cat = [
    'grade',
    'sub_grade',
    'loan_amnt',
    'funded_amnt',
    'term',
    'int_rate',
]
#Columns whose bucket has not been decided yet
dont_know = ['funded_amnt_inv']

In [None]:
#Split the loans by outcome: df1_d holds the charged-off (defaulted) loans,
#df1_f the fully paid ones
df1_d = df1.loc[df1['loan_status'].eq('Charged Off')]
df1_f = df1.loc[df1['loan_status'].eq('Fully Paid')]

In [None]:
#Annual-income distribution of the charged-off loans in 20,000-wide bands.
#Incomes above the last bin edge (100,000) fall outside the bins and are not drawn.
income_bins = list(range(0, 100001, 20000))
ax = sns.histplot(df1_d['annual_inc'], bins=income_bins)
ax.set(title='Annual income - Charged Off loans',
       xlabel='Annual income', ylabel='Number of loans');

In [None]:

#Annual-income distribution of the fully paid loans in 20,000-wide bands.
#Incomes above the last bin edge (100,000) fall outside the bins and are not drawn.
income_bins = list(range(0, 100001, 20000))
ax = sns.histplot(df1_f['annual_inc'], bins=income_bins)
ax.set(title='Annual income - Fully Paid loans',
       xlabel='Annual income', ylabel='Number of loans');

In [None]:
#Number of charged-off loans per borrower state
ax = sns.histplot(df1_d['addr_state'])
ax.set(title='Charged Off loans by state',
       xlabel='State (addr_state)', ylabel='Number of loans')
#Rotate the state codes so the tick labels do not overlap
ax.tick_params(axis='x', rotation=90);

In [None]:
#NOTE(review): stray numeric literal (82,300) that nothing consumes - looks
#like leftover scratch work; confirm and delete this cell
8.230000e+04

In [None]:
#Number of charged-off loans per issue_d value, most frequent first
df1_d['issue_d'].value_counts()

In [None]:
#Number of fully paid loans per issue_d value, most frequent first
df1_f['issue_d'].value_counts()