# import

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

from yellowbrick.datasets import load_concrete
from yellowbrick.regressor import ResidualsPlot
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Load Data

In [3]:
_  = os.path.join('data', 'credit_train.csv')
bank_df = pd.read_csv(_)

In [7]:
bank_df.shape

(100514, 19)

# Methods/Constants/Globals

In [57]:
%run -i "scripts//methods_globals_constants.py"

<Figure size 432x288 with 0 Axes>

# Clean Data

In [32]:
clean_df = bank_df.copy()

In [33]:
# Remove all rows where all the columns are nan/null. Removes ~ 500 observations
clean_df.dropna(axis=0, how='all', inplace=True)

In [34]:
# Remove all observation with credit scores higher than 900. These an enigma borrower and only account for ~4% of the data
_ = clean_df[clean_df['Credit Score'] <= 900].index
clean_df.drop(_, inplace=True) 

In [35]:
# There were too many missing values in these columns. We removed both to perserve as much data as possible.
clean_df.drop(['Credit Score', 'Annual Income'], axis=1, inplace=True) 

In [37]:
# Dropiing to nan value of year in current job. There are ~ 1056 which is ~ 4.45% of the remaining
_ = clean_df[clean_df['Years in current job'].isna()].index
clean_df.drop(_, inplace=True) 

In [38]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22649 entries, 1 to 99992
Data columns (total 17 columns):
Loan ID                         22649 non-null object
Customer ID                     22649 non-null object
Loan Status                     22649 non-null object
Current Loan Amount             22649 non-null float64
Term                            22649 non-null object
Years in current job            22649 non-null object
Home Ownership                  22649 non-null object
Purpose                         22649 non-null object
Monthly Debt                    22649 non-null float64
Years of Credit History         22649 non-null float64
Months since last delinquent    10260 non-null float64
Number of Open Accounts         22649 non-null float64
Number of Credit Problems       22649 non-null float64
Current Credit Balance          22649 non-null float64
Maximum Open Credit             22647 non-null float64
Bankruptcies                    22597 non-null float64
Tax Liens         

In [49]:
clean_df.groupby('Purpose').mean()['Months since last delinquent']

Purpose
Business Loan           35.423077
Buy House               34.333333
Buy a Car               35.375000
Debt Consolidation      35.096572
Educational Expenses    35.500000
Home Improvements       34.258647
Medical Bills           33.760684
Other                   33.430976
Take a Trip             36.692308
major_purchase          29.757576
moving                  41.583333
other                   33.248188
renewable_energy        25.000000
small_business          32.636364
vacation                36.882353
wedding                 39.000000
Name: Months since last delinquent, dtype: float64

fill in the avg for all month since last delinquent rows but do it for the distribution related to the purpose

In [58]:
_ = clean_df.groupby('Purpose').mean()['Months since last delinquent'] # A temporary dictionary like object. also called a pandas series

clean_df.apply(lambda x : fillna_average_by_target_column(row=x, 
                                                          avg_dict=_, 
                                                          target_col=7, 
                                                          effected_col=10),axis=1)

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
1,4771cc26-131a-45db-b5aa-537ea4ba5342,2de017a3-2e01-49cb-a581-08169e83be29,Fully Paid,262328.0,Short Term,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.00,35.0,0.0,229976.0,850784.0,0.0,0.0
4,d4062e70-befa-4995-8643-a0de73938182,81536ad9-5ccf-4eb8-befb-47a4d608658e,Fully Paid,176220.0,Short Term,5 years,Rent,Debt Consolidation,20639.70,6.1,35.10,15.0,0.0,253460.0,427174.0,0.0,0.0
5,89d8cb0c-e5c2-4f54-b056-48a645c543dd,4ffe99d3-7f2a-44db-afc1-40943f1f9750,Charged Off,206602.0,Short Term,10+ years,Home Mortgage,Debt Consolidation,16367.74,17.3,35.10,6.0,0.0,215308.0,272448.0,0.0,0.0
7,db0dc6e1-77ee-4826-acca-772f9039e1c7,018973c9-e316-4956-b363-67e134fb0931,Charged Off,648714.0,Long Term,< 1 year,Home Mortgage,Buy House,14806.13,8.2,8.00,15.0,0.0,193306.0,864204.0,0.0,0.0
11,fa096848-6143-4907-b2cf-852a0b06171c,aa0a6a22-a95e-48e0-ba4f-b83456d424e4,Fully Paid,541970.0,Short Term,10+ years,Home Mortgage,Home Improvements,23568.55,23.2,34.26,23.0,0.0,60705.0,1634468.0,0.0,0.0
15,7cbaa3fa-16fd-4343-9bcb-e90b34a1072f,3ec886e7-f15d-4c35-83d0-bdec4817ae4b,Fully Paid,449020.0,Long Term,9 years,Own Home,Debt Consolidation,18904.81,19.4,35.10,8.0,0.0,334533.0,428956.0,0.0,0.0
16,c9a16a9d-8801-4430-b445-dbf9cf845e31,abb4c446-08ea-49ff-aeb8-5e1e9da673e7,Charged Off,653004.0,Long Term,7 years,Home Mortgage,Debt Consolidation,14537.09,20.5,35.10,9.0,0.0,302309.0,413754.0,0.0,0.0
18,c6be21f0-80b1-46b3-8019-16646fd2137d,c67b2cb5-9f91-4bcb-9a03-03d1589c6c1a,Fully Paid,66396.0,Short Term,10+ years,Rent,Debt Consolidation,9898.81,27.1,35.10,23.0,1.0,9728.0,402380.0,1.0,0.0
34,83ad0662-ef2d-4732-99ff-e9cdab4eb276,2d53b50a-30a2-488e-a287-3780b26e62ba,Fully Paid,109318.0,Long Term,10+ years,Home Mortgage,Buy a Car,15524.90,22.7,35.38,9.0,0.0,77121.0,920524.0,0.0,0.0
38,03e3a77a-fa93-489c-b7b3-c06196ba3bce,dedbd71d-dabd-4c64-a38f-bb5886e7f8b6,Charged Off,259842.0,Short Term,8 years,Home Mortgage,Debt Consolidation,11792.73,20.6,34.00,9.0,0.0,401584.0,708818.0,0.0,0.0
