# import

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

from yellowbrick.datasets import load_concrete
from yellowbrick.regressor import ResidualsPlot
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Load Data

In [3]:
_  = os.path.join('data', 'credit_train.csv')
bank_df = pd.read_csv(_)

In [7]:
bank_df.shape

(100514, 19)

# Methods/Constants/Globals

In [57]:
%run -i "scripts//methods_globals_constants.py"

<Figure size 432x288 with 0 Axes>

# Clean Data

In [59]:
clean_df = bank_df.copy()

In [60]:
# Remove all rows where all the columns are nan/null. Removes ~ 500 observations
clean_df.dropna(axis=0, how='all', inplace=True)

In [61]:
# Remove all observation with credit scores higher than 900. These an enigma borrower and only account for ~4% of the data
_ = clean_df[clean_df['Credit Score'] <= 900].index
clean_df.drop(_, inplace=True) 

In [62]:
# There were too many missing values in these columns. We removed both to perserve as much data as possible.
clean_df.drop(['Credit Score', 'Annual Income'], axis=1, inplace=True) 

In [63]:
# Dropiing to nan value of year in current job. There are ~ 1056 which is ~ 4.45% of the remaining
_ = clean_df[clean_df['Years in current job'].isna()].index
clean_df.drop(_, inplace=True) 

fill in the avg for all month since last delinquent rows but do it for the distribution related to the purpose

In [64]:
# Fill in all nan values for "months since last delinquent" with the mean of the catagorical column Purpose
_ = clean_df.groupby('Purpose').mean()['Months since last delinquent'] # A temporary dictionary like object. also called a pandas series

clean_df = clean_df.apply(lambda x : fillna_average_by_target_column(row=x, 
                                                          avg_dict=_, 
                                                          target_col=7, 
                                                          effected_col=10),axis=1)

In [67]:
# Drop the remaining nan values in Maximun open credit, bankruptcies, and tax liens
clean_df.dropna( subset=['Maximum Open Credit', 'Bankruptcies', 'Tax Liens'], axis=0, inplace=True)

In [71]:
#Drop all loans amount with 9999999999. This is too extreme 
_ = clean_df.loc[clean_df['Current Loan Amount'] == 99999999.0].index
clean_df.drop(index=_, inplace=True)

In [72]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22595 entries, 1 to 99992
Data columns (total 17 columns):
Loan ID                         22595 non-null object
Customer ID                     22595 non-null object
Loan Status                     22595 non-null object
Current Loan Amount             22595 non-null float64
Term                            22595 non-null object
Years in current job            22595 non-null object
Home Ownership                  22595 non-null object
Purpose                         22595 non-null object
Monthly Debt                    22595 non-null float64
Years of Credit History         22595 non-null float64
Months since last delinquent    22595 non-null float64
Number of Open Accounts         22595 non-null float64
Number of Credit Problems       22595 non-null float64
Current Credit Balance          22595 non-null float64
Maximum Open Credit             22595 non-null float64
Bankruptcies                    22595 non-null float64
Tax Liens         