## Imports

In [1]:
import pandas as pd
import numpy as np

# Graphing
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Bank data
bank_df = pd.read_csv("Data/bank.csv")

In [5]:
bank_df.shape

(100514, 19)

In [6]:
bank_df.head()

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,14dd8831-6af5-400b-83ec-68e61888a048,981165ec-3274-42f5-a3b4-d104041a9ca9,Fully Paid,445412.0,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,,6.0,1.0,228190.0,416746.0,1.0,0.0
1,4771cc26-131a-45db-b5aa-537ea4ba5342,2de017a3-2e01-49cb-a581-08169e83be29,Fully Paid,262328.0,Short Term,,,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35.0,0.0,229976.0,850784.0,0.0,0.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,Fully Paid,99999999.0,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,0.0,0.0
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,e777faab-98ae-45af-9a86-7ce5b33b1011,Fully Paid,347666.0,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.9,12.0,,9.0,0.0,256329.0,386958.0,0.0,0.0
4,d4062e70-befa-4995-8643-a0de73938182,81536ad9-5ccf-4eb8-befb-47a4d608658e,Fully Paid,176220.0,Short Term,,,5 years,Rent,Debt Consolidation,20639.7,6.1,,15.0,0.0,253460.0,427174.0,0.0,0.0


In [10]:
bank_df.columns, bank_df.columns.value_counts().sum() # 19 columns

(Index(['Loan ID', 'Customer ID', 'Loan Status', 'Current Loan Amount', 'Term',
        'Credit Score', 'Annual Income', 'Years in current job',
        'Home Ownership', 'Purpose', 'Monthly Debt', 'Years of Credit History',
        'Months since last delinquent', 'Number of Open Accounts',
        'Number of Credit Problems', 'Current Credit Balance',
        'Maximum Open Credit', 'Bankruptcies', 'Tax Liens'],
       dtype='object'), 19)

In [16]:
bank_df.nunique(axis=0)

Loan ID                         81999
Customer ID                     81999
Loan Status                         2
Current Loan Amount             22004
Term                                2
Credit Score                      324
Annual Income                   36174
Years in current job               11
Home Ownership                      4
Purpose                            16
Monthly Debt                    65765
Years of Credit History           506
Months since last delinquent      116
Number of Open Accounts            51
Number of Credit Problems          14
Current Credit Balance          32730
Maximum Open Credit             44596
Bankruptcies                        8
Tax Liens                          12
dtype: int64

In [21]:
bank_df.describe().apply(lambda s: s.apply(lambda x: format(x, 'f'))).T  # Supressing scientific notation

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Current Loan Amount,100000.0,11760447.38946,31783942.546075,10802.0,179652.0,312246.0,524942.0,99999999.0
Credit Score,80846.0,1076.456089,1475.403791,585.0,705.0,724.0,741.0,7510.0
Annual Income,80846.0,1378276.559842,1081360.195662,76627.0,848844.0,1174162.0,1650663.0,165557393.0
Monthly Debt,100000.0,18472.412336,12174.992609,0.0,10214.1625,16220.3,24012.0575,435843.28
Years of Credit History,100000.0,18.199141,7.015324,3.6,13.5,16.9,21.7,70.5
Months since last delinquent,46859.0,34.901321,21.997829,0.0,16.0,32.0,51.0,176.0
Number of Open Accounts,100000.0,11.12853,5.00987,0.0,8.0,10.0,14.0,76.0
Number of Credit Problems,100000.0,0.16831,0.482705,0.0,0.0,0.0,0.0,15.0
Current Credit Balance,100000.0,294637.38235,376170.934666,0.0,112670.0,209817.0,367958.75,32878968.0
Maximum Open Credit,99998.0,760798.381748,8384503.472368,0.0,273438.0,467874.0,782958.0,1539737892.0


### Conclusion

When performing the .describe() function there are many outliers that are shown throughout many of the columns

## Looking at categorical variables

In [27]:
# Separating categorical from numeric variables
cat_variables = bank_df.select_dtypes(include="object")

In [28]:
cat_variables.head()

Unnamed: 0,Loan ID,Customer ID,Loan Status,Term,Years in current job,Home Ownership,Purpose
0,14dd8831-6af5-400b-83ec-68e61888a048,981165ec-3274-42f5-a3b4-d104041a9ca9,Fully Paid,Short Term,8 years,Home Mortgage,Home Improvements
1,4771cc26-131a-45db-b5aa-537ea4ba5342,2de017a3-2e01-49cb-a581-08169e83be29,Fully Paid,Short Term,10+ years,Home Mortgage,Debt Consolidation
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,Fully Paid,Short Term,8 years,Own Home,Debt Consolidation
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,e777faab-98ae-45af-9a86-7ce5b33b1011,Fully Paid,Long Term,3 years,Own Home,Debt Consolidation
4,d4062e70-befa-4995-8643-a0de73938182,81536ad9-5ccf-4eb8-befb-47a4d608658e,Fully Paid,Short Term,5 years,Rent,Debt Consolidation


In [29]:
cat_variables["Loan Status"].unique()

array(['Fully Paid', 'Charged Off', nan], dtype=object)

In [31]:
cat_variables["Term"].unique()

array(['Short Term', 'Long Term', nan], dtype=object)

In [33]:
cat_variables["Home Ownership"].unique()

array(['Home Mortgage', 'Own Home', 'Rent', 'HaveMortgage', nan],
      dtype=object)

In [34]:
cat_variables["Purpose"].unique()

array(['Home Improvements', 'Debt Consolidation', 'Buy House', 'other',
       'Business Loan', 'Buy a Car', 'major_purchase', 'Take a Trip',
       'Other', 'small_business', 'Medical Bills', 'wedding', 'vacation',
       'Educational Expenses', 'moving', 'renewable_energy', nan],
      dtype=object)

In [44]:
def unique_vars(*columns):
    a = [x.unique() for x in cat_variables]
    return a

In [45]:
unique_vars("Loan Status")

TypeError: string indices must be integers