In [22]:
import pandas as pd
import time
from sklearn.ensemble import RandomForestClassifier

In [23]:
df = pd.read_csv("credit_train.csv")

In [24]:
df.info()
# Without clear instructions, Python will classify non-numeric columns as objects, limiting

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100514 entries, 0 to 100513
Data columns (total 19 columns):
Loan ID                         100000 non-null object
Customer ID                     100000 non-null object
Loan Status                     100000 non-null object
Current Loan Amount             100000 non-null float64
Term                            100000 non-null object
Credit Score                    80846 non-null float64
Annual Income                   80846 non-null float64
Years in current job            95778 non-null object
Home Ownership                  100000 non-null object
Purpose                         100000 non-null object
Monthly Debt                    100000 non-null float64
Years of Credit History         100000 non-null float64
Months since last delinquent    46859 non-null float64
Number of Open Accounts         100000 non-null float64
Number of Credit Problems       100000 non-null float64
Current Credit Balance          100000 non-null float64
Maxi

In [25]:
df.head(3)

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,14dd8831-6af5-400b-83ec-68e61888a048,981165ec-3274-42f5-a3b4-d104041a9ca9,Fully Paid,445412.0,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,,6.0,1.0,228190.0,416746.0,1.0,0.0
1,4771cc26-131a-45db-b5aa-537ea4ba5342,2de017a3-2e01-49cb-a581-08169e83be29,Fully Paid,262328.0,Short Term,,,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35.0,0.0,229976.0,850784.0,0.0,0.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,Fully Paid,99999999.0,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,0.0,0.0


In [26]:
# Remove NaN values
df = df.dropna(how='any')

In [27]:
# Exclude IDs, for they are not important to the model
df = df.drop(['Customer ID','Loan ID'], axis=1)

In [28]:
# Use get_dummies to code categorical variables into 0s and 1s.
df_new = pd.get_dummies(df)

In [29]:
# df_new = pd.concat([numeric_features,dummies_cat_features], axis=1)

In [30]:
df_new.head(1)

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,...,Purpose_Medical Bills,Purpose_Other,Purpose_Take a Trip,Purpose_major_purchase,Purpose_moving,Purpose_other,Purpose_renewable_energy,Purpose_small_business,Purpose_vacation,Purpose_wedding
2,99999999.0,741.0,2231892.0,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Delete "Loan Status Charged Off" here to prevent it from skewing the model
df_new = df_new.drop('Loan Status_Charged Off', axis = 1)

In [32]:
# Use numpy to convert to arrays
import numpy as np

# Labels are the values we want to predict (Labels are the dependent var)
# We will predict Loan Status with this model
labels = np.array(df_new['Loan Status_Fully Paid'])

# Remove the labels from the set to create features
features = df_new.drop('Loan Status_Fully Paid', axis = 1)

# Saving feature names for later use (for visualization perhaps)
feature_list = list(features.columns)

# Convert to numpy array
features = np.array(features)

In [33]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# The split function produces 4 arrays, in the below order. -> Create 4 variables to store them.
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [34]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (27317, 45)
Training Labels Shape: (27317,)
Testing Features Shape: (9106, 45)
Testing Labels Shape: (9106,)


In [35]:
# Instantiate model with 200 decision trees
rf = RandomForestClassifier(n_estimators = 200, random_state = 42)

## Training time in Python - 3:1

In [36]:
# t = time.time()

# Train the model on training data
rf.fit(train_features, train_labels)

# t_now = time.time()

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [37]:
# elapsed = t_now - t
# print("It took:")
# print(round(elapsed,5),"seconds to train the Random Forest")

In [38]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 5), 'degrees.')

Mean Absolute Error: 0.46244 degrees.


In [39]:
predictions

array([1, 1, 1, ..., 1, 1, 1], dtype=uint8)

In [40]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / sum(test_labels))

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 5), '%.')

Accuracy: 99.99359 %.


This is too good to be true. Maybe my small sample size has led to overfitting.

## In Python, one has to dig these out

In [41]:
# Get numerical feature importances
importances = list(rf.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20}             Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Credit Score                     Importance: 0.26
Variable: Current Loan Amount              Importance: 0.1
Variable: Annual Income                    Importance: 0.08
Variable: Monthly Debt                     Importance: 0.08
Variable: Years of Credit History             Importance: 0.07
Variable: Months since last delinquent             Importance: 0.07
Variable: Current Credit Balance             Importance: 0.07
Variable: Maximum Open Credit              Importance: 0.07
Variable: Number of Open Accounts             Importance: 0.05
Variable: Number of Credit Problems             Importance: 0.01
Variable: Bankruptcies                     Importance: 0.01
Variable: Term_Long Term                   Importance: 0.01
Variable: Term_Short Term                  Importance: 0.01
Variable: Years in current job_1 year             Importance: 0.01
Variable: Years in current job_10+ years             Importance: 0.01
Variable: Years in current job_2 years             Importance: 

Do all that again but with a smaller depth, so that we can show you the chart

In [42]:
# import os
# os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
# # Use dot file to create a graph
# (graph, ) = pydot.graph_from_dot_file('tree.dot')

# # BIG NOTE: Configuring pydot and graphviz to work is tricky\
# # For pydot and graphviz to work, one may have to install 
# # graphviz locally and added it to the local environment

# # Write graph to a png file
# graph.write_png('tree.png')
# Import tools needed for visualization
# from sklearn.tree import export_graphviz
# import pydot
# # 
# # Pull out one tree from the forest. Just random got tree no.6 from the all the trees we had
# tree = rf.estimators_[5]

# # Export the image to a dot file
# export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)

# # Limit depth of tree to 5 levels
# rf_small = RandomForestRegressor(n_estimators=10, max_depth = 5)
# rf_small.fit(train_features, train_labels)

# # Extract the small tree
# tree_small = rf_small.estimators_[5]

# # Save the tree as a png image
# export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names = feature_list, rounded = True, precision = 1)

# (graph, ) = pydot.graph_from_dot_file('small_tree.dot')

# graph.write_png('small_tree.png');

# This one is for Markdown. Try extracting one tree to show, but deem it unneccesary in the end.
# ![title](https://i.ibb.co/v1PN8NX/small-tree.png)