In [87]:
import pandas as pd
import numpy as np
import altair as alt
# Lift Altair's default cap on the number of rows a chart may embed; the loan
# data charted below has 9,577 rows, which presumably exceeds that default.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

<h3>First, let's take a look at the data</h3>

In [88]:
# Load the loan dataset from the CSV file next to this notebook.
df = pd.read_csv("loan_data.csv")
# Bare last expression: render the frame as the cell's output.
df

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.10,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.000000,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.000000,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.10,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.000000,4740,39.5,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,0,all_other,0.1461,344.76,12.180755,10.39,672,10474.000000,215372,82.1,2,0,0,1
9574,0,all_other,0.1253,257.70,11.141862,0.21,722,4380.000000,184,1.1,5,0,0,1
9575,0,debt_consolidation,0.1071,97.81,10.596635,13.09,687,3450.041667,10036,82.9,8,0,0,1
9576,0,home_improvement,0.1600,351.58,10.819778,19.18,692,1800.000000,0,3.2,5,0,0,1


In [89]:
# Replace every "." in the column names with "_" (e.g. "int.rate" -> "int_rate").
# A single rename with a callable mapper handles all columns at once, leaves
# names without a dot untouched, and is safe to re-run (already-renamed
# columns contain no dots, so the mapper is a no-op on them).
df = df.rename(columns=lambda name: name.replace(".", "_"))
df

Unnamed: 0,credit_policy,purpose,int_rate,installment,log_annual_inc,dti,fico,days_with_cr_line,revol_bal,revol_util,inq_last_6mths,delinq_2yrs,pub_rec,not_fully_paid
0,1,debt_consolidation,0.1189,829.10,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.000000,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.000000,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.10,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.000000,4740,39.5,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,0,all_other,0.1461,344.76,12.180755,10.39,672,10474.000000,215372,82.1,2,0,0,1
9574,0,all_other,0.1253,257.70,11.141862,0.21,722,4380.000000,184,1.1,5,0,0,1
9575,0,debt_consolidation,0.1071,97.81,10.596635,13.09,687,3450.041667,10036,82.9,8,0,0,1
9576,0,home_improvement,0.1600,351.58,10.819778,19.18,692,1800.000000,0,3.2,5,0,0,1


<h3>Here is an explanation of the columns</h3>

<i>credit.policy</i>: 1 if the customer meets the credit underwriting criteria of LendingClub.com, and 0 otherwise.<br>
<i>purpose</i>: The purpose of the loan (takes values "credit_card", "debt_consolidation", "educational", "major_purchase", "small_business", "home_improvement", and "all_other").<br>
<i>int.rate</i>: The interest rate of the loan, as a proportion (a rate of 11% would be stored as 0.11). Borrowers judged by LendingClub.com to be more risky are assigned higher interest rates.<br>
<i>installment</i>: The monthly installments owed by the borrower if the loan is funded.<br>
<i>log.annual.inc</i>: The natural log of the self-reported annual income of the borrower.<br>
<i>dti</i>: The debt-to-income ratio of the borrower (amount of debt divided by annual income).<br>
<i>fico</i>: The FICO credit score of the borrower.<br>
<i>days.with.cr.line</i>: The number of days the borrower has had a credit line.<br>
<i>revol.bal</i>: The borrower's revolving balance (amount unpaid at the end of the credit card billing cycle).<br>
<i>revol.util</i>: The borrower's revolving line utilization rate (the amount of the credit line used relative to total credit available).<br>
<i>inq.last.6mths</i>: The borrower's number of inquiries by creditors in the last 6 months.<br>
<i>delinq.2yrs</i>: The number of times the borrower had been 30+ days past due on a payment in the past 2 years.<br>
<i>pub.rec</i>: The borrower's number of derogatory public records (bankruptcy filings, tax liens, or judgments).<br>

<h3>Next we'll check the column we're supposed to predict</h3>

In [57]:
def Hchart(df, x_axis, y_axis, color_axis):
    """Draw a bar chart of ``df`` and show it in the notebook output.

    Parameters
    ----------
    df : the data handed to ``alt.Chart``.
    x_axis : encoding for the x channel (e.g. a column name).
    y_axis : encoding for the y channel (e.g. ``"count()"``).
    color_axis : encoding for the color channel.

    The chart is displayed as a side effect; nothing is returned.
    """
    # Collect the three channel encodings and apply them in one call.
    channels = {"x": x_axis, "y": y_axis, "color": color_axis}
    bars = alt.Chart(df).mark_bar().encode(**channels)
    display(bars)

In [90]:
# ":N" marks the 0/1 target as nominal so it is drawn as two category bars
# instead of positions on a continuous numeric axis.
Hchart(df, x_axis="not_fully_paid:N", y_axis="count()", color_axis="purpose")

There is a clear class imbalance here: the number of customers who defaulted on their loan is nearly one-fourth of the number of customers who paid their loans off.

<h3>Next we'll check the interest rate of the loans</h3>

In [92]:
# ":N" makes the 0/1 target a nominal color so the legend shows two discrete
# classes rather than a continuous gradient.
Hchart(df, x_axis="int_rate", y_axis="count()", color_axis="not_fully_paid:N")

<p>It seems that there is some correlation between the interest rate and defaulting on the loan. Those with an interest rate above 10% seem more likely to default.</p><br>

In [93]:
# ":N" makes the 0/1 target a nominal color so the legend shows two discrete
# classes rather than a continuous gradient.
Hchart(df, x_axis="purpose", y_axis="count()", color_axis="not_fully_paid:N")

Debt consolidation seems to be the most common purpose for a loan, followed by credit card and home improvement.