In [None]:
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# No warnings about setting value on copy of slice
pd.options.mode.chained_assignment = None

# Display up to 60 columns of a dataframe
pd.set_option('display.max_columns', 60)

# Matplotlib visualization
import matplotlib.pyplot as plt
%matplotlib inline

# Set default font size
plt.rcParams['font.size'] = 24

# Internal ipython tool for setting figure size
from IPython.core.pylabtools import figsize

# Seaborn for visualization
import seaborn as sns
sns.set(font_scale = 2)

# Splitting, Preprocessing, and Cross-validating data into training and testing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# XGBoost for machine learning
from xgboost import XGBClassifier

# For Explainable AI
import shap

In [None]:
# Load the credit dataset from a CSV file in the working directory
credit = pd.read_csv('credit_data.csv')
# Preview the first rows of the freshly loaded frame
credit.head()

In [None]:
# Verify the column data types and non-null counts
# (a count below the number of rows means the column has missing values)
credit.info()

In [None]:
# Remove the two identifier columns before any further processing
credit = credit.drop(columns=['Loan ID', 'Customer ID'])

In [None]:
# Preprocessing for Missing Values
# Function to calculate missing values by column for verifying data preprocessing results

def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Prints how many columns the frame has and how many of them contain
    missing values, then returns a table restricted to those columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with 'Missing Values' (number of nulls) and
        '% of Total Values' (nulls as a percentage of the row count),
        rounded to one decimal and sorted by percentage, largest first.
    """
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / len(df)

    # Side-by-side table; `keys` names the two columns directly
    summary = pd.concat([null_counts, null_share], axis=1,
                        keys=['Missing Values', '% of Total Values'])

    # Keep only the columns that actually have gaps, worst offenders first
    has_missing = summary['% of Total Values'] != 0
    summary = (summary[has_missing]
               .sort_values('% of Total Values', ascending=False)
               .round(1))

    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(summary.shape[0]) +
          " columns that have missing values.")

    return summary

In [None]:
# Drop the columns with > 50% missing
credit = credit.drop(columns='Months since last delinquent')

# Rows without 'Years of Credit History' are the empty observations at the
# end of the file (514 of them when this was written). Dropping them by
# condition rather than by a hard-coded count keeps the cell correct if the
# file changes and makes it safe to re-run.
credit = credit.dropna(subset=['Years of Credit History'])

# Very few rows lack 'Maximum Open Credit', 'Tax Liens' or 'Bankruptcies',
# so those rows are dropped, in one vectorised call instead of row by row.
credit = credit.dropna(subset=['Maximum Open Credit', 'Tax Liens', 'Bankruptcies'])

# Impute the remaining numeric gaps with the column mean. numeric_only=True
# restricts the means to numeric columns; without it pandas >= 2.0 raises
# on the text columns.
credit = credit.fillna(credit.mean(numeric_only=True))

# Distribution of job tenure, most frequent category first. The Series is
# passed as `x=` because newer seaborn treats the first positional argument
# as `data`.
plt.figure(figsize=(20, 8))
sns.countplot(x=credit['Years in current job'],
              order=credit['Years in current job'].value_counts().index)

# Fill missing job tenure with '10+ years' (presumably the most frequent
# category in the plot above -- confirm). Only this column is filled, so a
# gap in any other text column stays visible in the table below instead of
# silently becoming '10+ years'.
credit['Years in current job'] = credit['Years in current job'].fillna('10+ years')
missing_values_table(credit)


In [None]:
# # # Feature Engineering and Selection

# List the column names currently in the frame
credit.columns

In [None]:
# # Encoding categorical data & Feature Scaling

# Columns holding categories rather than measurements
categorical_columns = ['Term', 'Years in current job', 'Home Ownership', 'Purpose']

# One hot encode just those columns
categorical_subset = pd.get_dummies(credit[categorical_columns])

# Replace the original categorical columns with their encoded counterparts:
# drop the old ones, then bind the new ones column-wise (axis = 1)
credit = pd.concat([credit.drop(columns=categorical_columns), categorical_subset],
                   axis=1)

In [None]:
# #  Remove Collinear Features

def remove_collinear_features(x, threshold, target='Loan Status'):
    """Drop one feature from every pair of highly correlated features.

    For each pair of feature columns whose absolute Pearson correlation is
    greater than or equal to ``threshold``, the column that appears later
    in ``x`` is removed. The target column never takes part in the
    comparison and is re-attached as the last column of the result.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame holding the features and the target column.
    threshold : float
        Absolute correlation at or above which the later column of a pair
        is dropped.
    target : str, default 'Loan Status'
        Name of the column to keep out of the correlation check.

    Returns
    -------
    pandas.DataFrame
        A new frame without the collinear columns; ``x`` is not modified.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``x``.
    """
    y = x[target]
    predictors = x.drop(columns=[target])

    # Only numeric/boolean columns can be correlated; any other column is
    # left in place untouched.
    corr_matrix = predictors.select_dtypes(include=['number', 'bool']).corr().abs()

    # Strict upper triangle: every (earlier, later) pair exactly once,
    # including directly adjacent columns, and never a column with itself.
    pair_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(pair_mask)

    # A column goes when it is too correlated with any column before it.
    # NaN correlations (e.g. constant columns) compare False and are kept.
    drop_cols = [col for col in upper.columns if (upper[col] >= threshold).any()]

    result = predictors.drop(columns=drop_cols)

    # Add the target back in to the data
    result[target] = y

    return result

In [None]:
# Prune features whose absolute pairwise correlation reaches 0.6
credit = remove_collinear_features(credit, threshold=0.6)

In [None]:
# Dimensions (rows, columns) of the frame at this point in the notebook
credit.shape

In [None]:
# Preview the first rows of the frame at this point in the notebook
credit.head()

In [None]:
# # # Split Into Training and Testing Sets

# The label column is the target; every other column is a feature
features = credit.drop(columns='Loan Status')
targets = credit[['Loan Status']]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=42)

# Report the shape of each of the four pieces
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

In [None]:
# # Feature Scaling
# Fit the scaler on the training features only, then apply the same
# transformation to the test features.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Encoding the Dependent Variable
# Learn the label -> integer mapping once, from the training labels.
# np.ravel flattens the single-column target into the 1-D shape that
# LabelEncoder expects.
labelencoder_y_train = LabelEncoder()
y_train = labelencoder_y_train.fit_transform(np.ravel(y_train))

# Reuse the fitted encoder for the test labels so both sets share one
# mapping; a label that never occurred in training now raises instead of
# silently receiving a different code. The old name is kept as an alias.
labelencoder_y_test = labelencoder_y_train
y_test = labelencoder_y_test.transform(np.ravel(y_test))

In [None]:
def cross_val(X_train, y_train, model):
    """Return the mean 5-fold cross-validation score of ``model``.

    Parameters
    ----------
    X_train : array-like
        Feature matrix handed to ``cross_val_score``.
    y_train : array-like
        Labels matching ``X_train``.
    model : estimator
        Estimator to score; the metric is whatever ``cross_val_score``
        uses by default for it.

    Returns
    -------
    float
        Mean of the five fold scores.
    """
    fold_scores = cross_val_score(model, X_train, y_train, cv=5)
    return fold_scores.mean()

# Takes in a model, trains it, and returns its cross-validation score
def fit_and_evaluate(model):
    """Fit ``model`` on the training data and return its mean CV score.

    Reads the notebook-level ``X_train`` and ``y_train``. The test set is
    not used: the returned number comes from ``cross_val`` on the training
    data only.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``; it is fitted in place.

    Returns
    -------
    float
        Mean cross-validation score as computed by ``cross_val``.
    """
    # Train the model so the caller is left with a fitted estimator
    model.fit(X_train, y_train)

    # Score with cross-validation on the training data
    return cross_val(X_train, y_train, model)

In [None]:
# # XGBoost Classification
# Baseline: XGBoost with its default hyperparameters
gb = XGBClassifier()
gb_cross = fit_and_evaluate(gb)

# fit_and_evaluate returns the mean cross-validation score computed on the
# training data, so the message must not call it a test-set result.
print('XGBoost Classification Performance: Training-set Cross Validation Score = %0.4f' % gb_cross)

In [None]:
# Tuned model. Only hyperparameters that XGBClassifier supports are passed:
# loss, min_samples_leaf, min_samples_split and max_features belong to
# scikit-learn's gradient boosting estimators and have no effect in XGBoost
# beyond an "unused parameter" warning.
model = XGBClassifier(max_depth=5,
                      n_estimators=500,
                      random_state=42)

In [None]:
# Train the model on the training features and labels
model.fit(X_train, y_train)

In [None]:
# Predict a label for every row of the test features
final_pred = model.predict(X_test)

# Function to calculate mean squared error
def mse(y_true, y_pred):
    """Return the mean squared error between ``y_true`` and ``y_pred``.

    Both inputs are converted to float arrays first, so plain lists and
    boolean arrays are accepted and values are compared by position.

    Parameters
    ----------
    y_true : array-like
        Reference values.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    float
        Mean of the squared element-wise differences.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean((y_true - y_pred) ** 2)

# Squared difference between encoded and predicted labels; if 'Loan Status'
# has only two classes this equals the misclassification rate
# (TODO confirm the number of classes).
print('Model performance on the test set:   MSE = %0.4f.' % mse(y_test, final_pred))

# The final model outperforms the baseline model by less than 1%, at the cost of a significantly longer running time (about 7 times slower on my machine).

In [None]:
# # # Feature Importances

# Pair every feature name with its importance, most important first
feature_results = (
    pd.DataFrame({'Feature': list(features.columns),
                  'Importance': model.feature_importances_})
    .sort_values('Importance', ascending=False)
    .reset_index(drop=True)
)

# Number the features 1..n in that order
feature_results['Rank'] = np.arange(1, len(feature_results) + 1)

# Same table indexed by rank, for display
feature_results_rank = feature_results.set_index('Rank')[['Feature', 'Importance']]

# Show the top 10 most important
feature_results_rank.head(10)

In [None]:
# # Use Feature Importances for Feature Selection

# Names of the ten highest-ranked features
most_important_features = feature_results['Feature'][:10]

# Translate each name into its column position in the feature matrix
column_names = list(features.columns)
indices = [column_names.index(name) for name in most_important_features]

# Slice the same columns out of the training and testing arrays
X_train_reduced, X_test_reduced = X_train[:, indices], X_test[:, indices]

print('Most important training features shape: ', X_train_reduced.shape)
print('Most important testing  features shape: ', X_test_reduced.shape)

In [None]:
# Create the model with the same effective hyperparameters as the
# full-feature model. loss, max_features, min_samples_leaf and
# min_samples_split are scikit-learn gradient boosting options that XGBoost
# does not use, so they are not passed.
model_reduced = XGBClassifier(max_depth=5, n_estimators=500, random_state=42)

# Fit and test on the reduced set of features
model_reduced.fit(X_train_reduced, y_train)
model_reduced_pred = model_reduced.predict(X_test_reduced)

print('XGBoost Reduced Results: MSE = %0.4f' % mse(y_test, model_reduced_pred))

In [None]:
# Rebuild a labelled DataFrame from the training features for calculating
# and plotting Shapley values; one column gets a name without '<'
X_train = (
    pd.DataFrame(np.asarray(X_train), columns=features.columns)
    .rename(columns={'Years in current job_< 1 year': 'Years in current job less than a year'})
)
X_train.head()

In [None]:
# Row position of the single observation whose prediction is explained below
idx = 14

In [None]:
# Load JS visualization code to notebook
shap.initjs()

# Explain the model's predictions using SHAP values
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)

# Visualize the explanation of the prediction at row `idx`
# (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, 
                shap_values[idx,:], 
                X_train.iloc[idx,:])

In [None]:
# Summarize the effects of all the features across the rows of X_train
shap.summary_plot(shap_values, X_train)

In [None]:
# Summarize the importance of the features as a bar chart
shap.summary_plot(shap_values, X_train, plot_type="bar")