<!-- ### Importing all the required libraries -->

### Importing all the required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from plotly.subplots import make_subplots
# Configure the seaborn theme in a single call. set_theme() applies a style of its
# own (default 'darkgrid'), so a separate set_style('white') issued before it is
# silently overridden; passing style= here makes the white background stick.
sns.set_theme(style='white', palette='tab10')
# color_palette() only returns the palette; as the last expression of the cell it is
# rendered as a swatch preview and does not change the active palette.
sns.color_palette("rocket")



#### Reading a CSV file

In [2]:
# Lift the column display cap so every column of a wide frame is rendered.
pd.options.display.max_columns = None

# Load the loan data (path is relative to the notebook's working directory).
df = pd.read_csv('loan/loan.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'loan/loan.csv'

Checking the dimensions of the CSV file

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

We have 39717 rows and 111 features

In [None]:
# With this many columns a bare df.info() collapses to a short summary (pandas
# truncates once `display.max_info_columns` is exceeded). Ask explicitly for the
# full per-column listing together with the non-null counts.
df.info(verbose=True, show_counts=True)

#### Counting the number of null values in each column of the dataset

In [None]:
# Number of missing values per column, indexed by column name (same order as df.columns).
null_counts = df.isna().sum()

# Distinct loop names: the original reused `x`/`y` as loop targets, overwriting the
# very Series/Index it was iterating and leaving scalars behind in the namespace.
for position, (column, n_null) in enumerate(null_counts.items()):
    print('{}: null values {} in column {}'.format(position, n_null, column))

In [None]:
# Show every column name as a plain Python list.
print(df.columns.tolist())

#### Dropping all columns with null values greater than 50%

In [None]:
# Columns removed before the analysis. NOTE(review): besides the mostly-null columns
# this list also carries 'id', 'member_id' and 'desc' -- confirm they are meant to
# be dropped under the same rule.
columns_to_drop = [
    'id', 'member_id', 'desc', 'mths_since_last_delinq', 'mths_since_last_record',
    'next_pymnt_d', 'annual_inc_joint', 'dti_joint', 'verification_status_joint',
    'tot_cur_bal', 'open_acc_6m', 'open_il_6m', 'open_il_12m', 'open_il_24m',
    'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m',
    'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl',
    'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy',
    'bc_util', 'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op',
    'mo_sin_rcnt_tl', 'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_bc_dlq',
    'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd',
    'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl',
    'num_op_rev_tl', 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats',
    'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m', 'num_tl_op_past_12m',
    'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'tot_hi_cred_lim', 'total_bal_ex_mort',
    'total_bc_limit', 'total_il_high_credit_limit', 'tot_coll_amt',
    'mths_since_last_major_derog',
]
df = df.drop(columns=columns_to_drop)

#### Now checking the info of remaining columns

In [None]:
# Column dtypes and non-null counts of the frame after the drop above.
df.info()

#### Finding the sum of null values present in dataframe

In [None]:
# Missing-value count for each remaining column.
df.isna().sum()

In [None]:
# Preview the first rows of the reduced frame.
df.head()

#### Filtering the dataframe to keep only the loans whose `loan_status` is 'Fully Paid' or 'Charged Off'

In [None]:
# Keep only loans with a final outcome. Take an explicit copy: later cells add new
# columns to this frame, and assigning into a filtered slice of the original would
# otherwise trigger pandas' SettingWithCopy behaviour.
df = df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])].copy()

In [None]:
# Preview the first rows after filtering on loan_status.
df.head()

In [None]:
# Display only the float64 / int64 columns of the frame.
df.select_dtypes(include=['float64','int64'])

There are 29 Columns with dtype of float and int

#### Now plotting distribution plots

In [None]:
# Payment- and credit-related columns, one histogram panel each.
payment_columns = [
    "total_pymnt", "total_pymnt_inv",
    "total_rec_prncp", "total_rec_int",
    "total_rec_late_fee", "inq_last_6mths",
    "recoveries", "last_pymnt_amnt",
    "revol_bal", "total_acc",
]

# 5 x 2 grid of panels; ravel() walks it row by row, so the columns above land in
# the same panels as when each axes[row, col] is addressed by hand.
fig, axes = plt.subplots(5, 2, figsize=(30, 60))
for panel, column in zip(axes.ravel(), payment_columns):
    # kde=True overlays the smoothed distribution line on each histogram.
    sns.histplot(df, x=column, hue="loan_status", element="step", ax=panel, kde=True)
plt.show()

## Inference

In [None]:
# Loan- and borrower-level columns, one histogram panel each.
loan_columns = [
    "loan_amnt", "funded_amnt",
    "funded_amnt_inv", "installment",
    "annual_inc", "dti",
    "delinq_2yrs", "inq_last_6mths",
    "open_acc", "pub_rec",
    "revol_bal", "total_acc",
]

# 6 x 2 grid of panels, filled row by row in the order listed above.
fig, axes = plt.subplots(6, 2, figsize=(30, 60))
for panel, column in zip(axes.ravel(), loan_columns):
    sns.histplot(df, x=column, hue="loan_status", element="step", ax=panel, kde=True)
plt.show()

## Inference

### Pie plots for determining the percentage distribution of categorical columns

In [None]:
# Category counts for the two donuts.
home = df.home_ownership.value_counts()
loanst = df.loan_status.value_counts()
home_sum = home.sum()
loanst_sum = loanst.sum()

# Percentage share of each category, in the same order as the value_counts index.
homel = [round((count / home_sum) * 100, 2) for count in home]
loanl = [round((count / loanst_sum) * 100, 2) for count in loanst]

# Values AND labels both come from the data, so they can never drift apart or fall
# out of order (the earlier version computed the shares and then plotted pasted-in
# numbers with hand-written labels). Trace names now describe what each donut shows.
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(values=homel, labels=list(home.index), name="Home ownership",
                     textinfo='label+percent'),
              1, 1)
fig.add_trace(go.Pie(values=loanl, labels=list(loanst.index), name="Loan status",
                     textinfo='label+percent'),
              1, 2)

# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.7, hoverinfo="label+percent+name")

fig.update_layout(
    title_text="Home Ownership and Status of Loan",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='ownership of home', x=0.10, y=0.5, font_size=20, showarrow=False),
                 dict(text='status of loan', x=0.87, y=0.5, font_size=20, showarrow=False)])
fig.show()

## Inference
1. 47.9% of borrowers rent their home, 44.1% have a mortgage and 7.71% own their home; for the remaining 0.25% (OTHER) and 0.01% (NONE) we do not have home-ownership information.
2. 14.6% of the loans are charged off and 85.4% of the loans are fully paid.

In [None]:
# Number of loans per stated purpose and the matching percentage shares.
sum_coun = df.purpose.value_counts()
sum_pur = sum_coun.sum()
list_sum = list(sum_coun.index)
pur_sum = [round((count / sum_pur) * 100, 2) for count in sum_coun]

# The trace name is shown on hover, so it must describe this chart (it previously
# read "Grade of Loan", copied from another cell).
fig = go.Figure(data=[go.Pie(values=pur_sum, name="Purpose of Loan",
                             textinfo='label+percent', labels=list_sum, hole=0.5)])
fig.update_layout(
    title_text="Purpose of taking a loan")
fig.show()

## Inference
1. A significant share of borrowers (46.8%) take a loan for debt consolidation, 13% have a credit loan, and 10% have not described any reason.
2. Other borrowers have taken loans for major purposes such as home renovation, major purchases, small business, car, wedding, medical expenses, etc.

In [None]:
# Loan counts per grade and per sub-grade.
grade = df.grade.value_counts()
sgrade = df.sub_grade.value_counts()
sum_grade = grade.sum()
sum_sgrade = sgrade.sum()

# Percentage share per grade (left unrounded) and per sub-grade (two decimals).
per_grade = [(count / sum_grade) * 100 for count in grade]
per_sgrade = [round((count / sum_sgrade) * 100, 2) for count in sgrade]

In [None]:
# Take both the slice sizes and their labels from the data instead of pasted-in
# numbers, so the chart stays correct if the upstream filtering changes.
grade_counts = df.grade.value_counts()
label = list(grade_counts.index)
fig = go.Figure(data=[go.Pie(values=grade_counts.tolist(), name="Grade of Loan",
                             textinfo='label+percent', labels=label, hole=0.5)])
fig.update_layout(
    title_text="Grade of loan")
fig.show()

## Inference
1. Loan grades rank by share of borrowers as B > A > C > D > E > F > G; grade B loans have the highest percentage.

In [None]:
# Show the most frequent sub-grades and fold the rest into a single 'Other' slice.
# The pie renormalises whatever it is given, so plotting only a subset of the
# percentages (as before) inflated every displayed share; including the remainder
# keeps the percentages true to the whole dataset.
top_n = 23
sgrade_counts = df.sub_grade.value_counts()
top_sgrades = sgrade_counts.head(top_n)
other_count = int(sgrade_counts.iloc[top_n:].sum())

label1 = list(top_sgrades.index)
sgrade_values = top_sgrades.tolist()
if other_count > 0:
    label1.append('Other')
    sgrade_values.append(other_count)

# Trace name fixed: it previously read "employement length", copied from another cell.
fig = go.Figure(data=[go.Pie(values=sgrade_values,
                             name="Sub Grade of Loan", labels=label1, textinfo='label+percent', hole=.5)])
fig.update_layout(
    title_text="Sub Grade of loan")
fig.show()

## Inference
1. A4, B3, A5, B5, B4, C1, B2 and C2 are the most common sub-grades across all loan grades.

In [None]:
# Employment-length counts and their percentage shares.
years = df.emp_length.value_counts()
add = years.sum()
per = [(count / add) * 100 for count in years]

# Loan-term counts taken from the data rather than typed-in totals.
term_counts = df.term.value_counts()
total = term_counts.sum()

# Create subplots: use 'domain' type for Pie subplot
# Labels come from the value_counts index so they always line up with the values;
# strip() removes any stray whitespace around the term strings.
label = [str(term).strip() for term in term_counts.index]
label1 = list(years.index)
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(values=term_counts.tolist(), name="term of loan", textinfo='label+percent', labels=label),
              1, 1)
fig.add_trace(go.Pie(values=per, name="employement length", labels=label1, textinfo='label+percent'),
              1, 2)

# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.7, hoverinfo="label+percent+name")

fig.update_layout(
    title_text="Loan Duration and Length of an employement",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='Duration of loan', x=0.12, y=0.5, font_size=20, showarrow=False),
                 dict(text='Length of an employement', x=0.93, y=0.5, font_size=17, showarrow=False)])
fig.show()

## Inference
1. Borrowers prefer a loan duration of 36 months over 60 months.
2. Borrowers with an employment length of 10+ years make up a significant percentage, while borrowers with an employment length of < 1, 2 or 3 years account for the next largest shares of loans.

In [None]:
# One violin figure per numeric column, grouped on the x axis by loan status.
df2 = df.select_dtypes(include=['float64', 'int64'])
for col in df2.columns:
    print(col)
    violin = go.Violin(
        y=df2[col],
        x=df['loan_status'],
        x0=col,
        box_visible=True,
        meanline_visible=True,
        line_color='black',
        fillcolor='lightseagreen',
        opacity=0.6,
    )
    fig = go.Figure(data=violin)
    fig.update_layout(title_text=col, yaxis_zeroline=False)
    fig.show()

## Inference
1. The median values (9.6k, 10k) of fully paid and charged-off loans are almost the same.
2. Most loans are taken in the range of 0.2k to 16.5k.
3. Most funded amounts are in the range of 5.5k to 16k.
4. Out of the total funding, most invested amounts are in the range of 5k to 15k.
5. Borrowers who have fully paid their loans are more likely to repay in fewer installments than borrowers whose loans are charged off.
6. Borrowers with a higher annual income are more likely to repay their loan.
7. The dti of fully paid borrowers is higher than the dti of charged-off borrowers.
8. Borrowers with a good credit history have opened a slightly higher number of accounts than charged-off borrowers.
9. The revolving credit balance is higher for fully paid loans.
10. Total payments received are higher for fully paid loans than for charged-off loans.
11. Payments received to date for the portion of the total amount funded by investors are higher for fully paid loans than for charged-off loans.
12. More principal is recovered on fully paid loans.
13. Late fees apply to both fully paid and charged-off loans.
14. Recoveries and collection recovery fees occur on charged-off loans.


## Behavioural Analytics

In [None]:
# Subset of loans that were repaid in full.
fuli_p = df.loc[df['loan_status'] == 'Fully Paid']

In [None]:
# Display the fully-paid subset.
fuli_p

In [None]:
# Per-loan gap between the funded amount and the part of it funded by investors.
df['funded_Uninvested'] = df['funded_amnt'].sub(df['funded_amnt_inv'])

funded_gap_total = round(df['funded_Uninvested'].sum(), 2)
print('{} has surplus ammount of uninvested money'.format(funded_gap_total))

In [None]:
# Per-loan gap between total payments received and the payments received on the
# investor-funded portion.
df['Uninvested'] = df['total_pymnt'].sub(df['total_pymnt_inv'])

payment_gap_total = round(df['Uninvested'].sum(), 2)
print('{} has surplus ammount of uninvested money'.format(payment_gap_total))

In [None]:
# Category counts within the fully-paid subset.
home = fuli_p.home_ownership.value_counts()
loanst = fuli_p.loan_status.value_counts()
home_sum = home.sum()
loanst_sum = loanst.sum()

# Percentage share of each category, in the same order as the value_counts index.
homel = [round((count / home_sum) * 100, 2) for count in home]
loanl = [round((count / loanst_sum) * 100, 2) for count in loanst]

# Labels are read from the same index the values were computed from. Hand-written
# label lists can disagree with the data in order or in length -- e.g. the status
# list here has one entry per status actually present in the subset, not two.
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(values=homel, labels=list(home.index), name="Home ownership",
                     textinfo='label+percent'),
              1, 1)
fig.add_trace(go.Pie(values=loanl, labels=list(loanst.index), name="Loan status",
                     textinfo='label+percent'),
              1, 2)

# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.7, hoverinfo="label+percent+name")

fig.update_layout(
    title_text="Home Ownership and Status of Loan (Fully Paid)",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='ownership of home', x=0.10, y=0.5, font_size=20, showarrow=False),
                 dict(text='status of loan', x=0.87, y=0.5, font_size=20, showarrow=False)])
fig.show()

In [None]:
# Number of fully-paid loans per stated purpose and the matching percentage shares.
sum_coun = fuli_p.purpose.value_counts()
sum_pur = sum_coun.sum()
list_sum = list(sum_coun.index)
pur_sum = [round((count / sum_pur) * 100, 2) for count in sum_coun]

# The trace name is shown on hover, so it must describe this chart (it previously
# read "Grade of Loan", copied from another cell).
fig = go.Figure(data=[go.Pie(values=pur_sum, name="Purpose of Loan",
                             textinfo='label+percent', labels=list_sum, hole=0.5)])
fig.update_layout(
    title_text="Purpose of taking a loan (Fully Paid)")
fig.show()

In [None]:
# Fully-paid loan counts per grade and per sub-grade.
grade = fuli_p.grade.value_counts()
sgrade = fuli_p.sub_grade.value_counts()
sum_grade = grade.sum()
sum_sgrade = sgrade.sum()

# Percentage share of every grade / sub-grade, rounded to two decimals.
per_grade = [round((count / sum_grade) * 100, 2) for count in grade]
per_sgrade = [round((count / sum_sgrade) * 100, 2) for count in sgrade]

In [None]:
# Take both the slice sizes and their labels from the fully-paid subset instead of
# pasted-in numbers, so values and labels always correspond.
grade_counts = fuli_p.grade.value_counts()
label = list(grade_counts.index)
fig = go.Figure(data=[go.Pie(values=grade_counts.tolist(), name="Grade of Loan",
                             textinfo='label+percent', labels=label, hole=0.5)])
fig.update_layout(
    title_text="Grade of loan (Fully Paid)")
fig.show()

In [None]:
# Values and labels are both read from the fully-paid subset. The earlier version
# paired percentages ordered by this subset's frequencies with a hand-written label
# list ordered for the whole dataset, so slices could carry the wrong sub-grade.
top_n = 23
sgrade_counts = fuli_p.sub_grade.value_counts()
top_sgrades = sgrade_counts.head(top_n)
other_count = int(sgrade_counts.iloc[top_n:].sum())

label1 = list(top_sgrades.index)
sgrade_values = top_sgrades.tolist()
# Fold the less frequent sub-grades into one slice so displayed shares stay true
# to the whole subset.
if other_count > 0:
    label1.append('Other')
    sgrade_values.append(other_count)

# Trace name fixed: it previously read "employement length", copied from another cell.
fig = go.Figure(data=[go.Pie(values=sgrade_values,
                             name="Sub Grade of Loan", labels=label1, textinfo='label+percent', hole=.5)])
fig.update_layout(
    title_text="Sub Grade of loan (Fully Paid)")
fig.show()

In [None]:
# Employment-length counts within the fully-paid subset and their percentage shares.
years = fuli_p.emp_length.value_counts()
add = years.sum()
per = [round((count / add) * 100, 2) for count in years]

# Loan-term counts taken from the data rather than typed-in totals.
term_counts = fuli_p.term.value_counts()
total = term_counts.sum()

# Create subplots: use 'domain' type for Pie subplot
# Labels come from the value_counts index so they always line up with the values;
# strip() removes any stray whitespace around the term strings.
label = [str(term).strip() for term in term_counts.index]
label1 = list(years.index)
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(values=term_counts.tolist(), name="term of loan", textinfo='label+percent', labels=label),
              1, 1)
fig.add_trace(go.Pie(values=per,
                     name="employement length", labels=label1, textinfo='label+percent'),
              1, 2)

# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.7, hoverinfo="label+percent+name")

fig.update_layout(
    title_text="Loan Duration and Length of an employement (Fully Paid)",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='Duration of loan', x=0.12, y=0.5, font_size=20, showarrow=False),
                 dict(text='Length of an employement', x=0.93, y=0.5, font_size=17, showarrow=False)])
fig.show()