# Predicting Student Loan Defaults Using Decision Trees

**_Author: Christos Anagnostopoulos_**

In [43]:
import numpy as np
import pandas as pd
from sklearn import tree, ensemble

In [44]:
# Load the student-loan dataset from the Excel workbook into a DataFrame.
df = pd.read_excel("loandata.xlsx")

In [45]:
# Preview the first ten rows of the raw dataset.
df.head(n=10)

Unnamed: 0,field,graduationYear,loanAmount,selectiveCollege,sex,Default
0,STEM,2006,23159.580541,0,Male,No
1,HUMANITIES,2010,47498.061207,0,Male,Yes
2,HUMANITIES,2012,29637.519526,0,Female,No
3,STEM,2008,25369.577159,1,Female,No
4,BUSINESS,2013,42398.554574,0,Male,Yes
5,HUMANITIES,2012,39253.384259,1,Female,Yes
6,STEM,2005,48903.966851,1,Male,No
7,STEM,2007,30687.019114,1,Male,No
8,STEM,2005,31999.816866,0,Male,No
9,HUMANITIES,2005,45120.419948,0,Female,Yes


Exploratory Data Analysis:
- field: the field of study of each student
- graduationYear: the year in which each student graduated
- loanAmount: the amount each student owes
- selectiveCollege: binary-valued column: 1 for students who attended a selective college, 0 for students who did not
- sex: sex of the student
- Default: target column: "Yes" if the student defaulted on the loan, "No" otherwise


In [46]:
# Encode the categorical columns as one-hot indicator features.
# dtype=int pins the indicators to 0/1 integers: recent pandas versions default
# to boolean dummies, and a frame mixing bool and float columns converts to an
# object-dtype NumPy array rather than a numeric one.
df = pd.get_dummies(df, columns=["field", "sex", "Default"], dtype=int)

In [47]:
# Keep only Default_Yes as the target column; Default_No is dropped because
# we are only interested in predicting whether a student defaults.
df = df.drop(columns=["Default_No"])

In [49]:
df # display the encoded dataframe (one-hot columns added, Default_No removed)

Unnamed: 0,graduationYear,loanAmount,selectiveCollege,field_BUSINESS,field_HUMANITIES,field_STEM,sex_Female,sex_Male,Default_Yes
0,2006,23159.580541,0,0,0,1,0,1,0
1,2010,47498.061207,0,0,1,0,0,1,1
2,2012,29637.519526,0,0,1,0,1,0,0
3,2008,25369.577159,1,0,0,1,1,0,0
4,2013,42398.554574,0,1,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...
1995,2006,34593.557949,0,1,0,0,1,0,0
1996,2009,35146.672010,0,0,1,0,1,0,1
1997,2012,47883.543619,0,0,0,1,0,1,0
1998,2006,42817.102001,1,1,0,0,1,0,0


In [15]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

graduationYear      0
loanAmount          0
selectiveCollege    0
field_BUSINESS      0
field_HUMANITIES    0
field_STEM          0
sex_Female          0
sex_Male            0
Default_Yes         0
dtype: int64

In [50]:
# Materialise the dataframe as a standalone NumPy array. The copy is explicit
# because the array is shuffled in place later and df must stay untouched.
Xy = np.array(df, copy=True)


In [51]:
# Fix the global NumPy seed so the shuffle (and therefore the split) is reproducible.
# np.random.seed() returns None, so keep the actual seed value in `seed`
# instead of binding the call's return value.
seed = 2
np.random.seed(seed)
np.random.shuffle(Xy)   # shuffles the rows of Xy in place

In [52]:
# Predictor features: every column except the last one.
X = Xy[:, :-1]

In [53]:
# Output value: the last column (Default_Yes).
y = Xy[:, -1]

In [54]:
# Sanity check: features and target must have the same number of rows.
print(len(X), len(y), sep="\n")

2000
2000


In [55]:
# Train / validation / test split for the predictor features.
# NOTE: despite its name, trainplusvalsize is used here as the number of
# validation rows: the validation slice ends at trainsize + trainplusvalsize.
trainsize = 1000
trainplusvalsize = 500
X_train, X_val, X_test = np.split(X, [trainsize, trainsize + trainplusvalsize])

In [56]:
# Train / validation / test split for the output value, cut at the same
# row boundaries as the predictor features.
y_train, y_val, y_test = np.split(y, [trainsize, trainsize + trainplusvalsize])

In [57]:
# Naive benchmark: always predict "no default". Its accuracy is the share of
# non-defaulters, i.e. one minus the mean of the 0/1 default indicator.
acc_train = 1 - np.mean(y_train)  # training set
acc_val = 1 - np.mean(y_val)      # validation set

In [58]:
# Report the naive-benchmark accuracies computed above.
print('Naïve guess train and validation', acc_train, acc_val)

Naïve guess train and validation 0.778 0.75


In [59]:
from sklearn.tree import DecisionTreeClassifier

# Baseline model: a decision tree with default settings, fitted on the training split.
clf = DecisionTreeClassifier()
clf.fit(X=X_train, y=y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [60]:
# Accuracy of the default tree on the validation split.
print('Full tree validation Score ', clf.score(X_val, y_val))

Full tree validation Score  0.816


One way we can optimise the decision tree algorithm is by limiting the maximum depth of the tree (pre-pruning).
We begin by defining the variables bestdepth and bestscore, assuming the *worst case scenario*.

In [61]:
# Worst-case starting point for the depth search: no valid depth yet and a
# score that any fitted tree can beat.
bestdepth, bestscore = -1, 0
# Largest tree depth to be tried.
max_depth = 15

Next, we will write a for loop to progressively compute the new training/validation scores for different depths.

Here is the pseudocode for the for loop you will need to implement:

```python

for i in range(max_depth):
    # create a new classifier clf with max_depth = i+1
    # fit the X and y training sets with the new classifier
    # compute the updated trainscore using .score() on the training set 
    # compute the updated valscore using .score() on the validation set
    # print the scores
    print ( 'Depth:', i+1, 'Training Score:', trainscore, 'Validation Score:', valscore)
     
    # if valscore is better than bestscore:
        # update the value of bestscore
        # set bestdepth to the current depth (i+1)
    
```

In [63]:
# Pre-pruning search: fit one tree per candidate depth 1..max_depth and keep the
# depth with the highest validation accuracy. Only a strictly better score
# replaces the current best, so ties go to the shallowest tree.
for depth in range(1, max_depth + 1):  # use max_depth instead of a hard-coded 15
    clf = DecisionTreeClassifier(max_depth=depth)
    clf.fit(X_train, y_train)
    trainscore = clf.score(X_train, y_train)  # accuracy on the data the tree was fitted on
    valscore = clf.score(X_val, y_val)        # accuracy on the validation split
    print( 'Depth:', depth, 'Train Score:', trainscore, 'Validation Score:', valscore)
    if valscore > bestscore:
        bestscore = valscore
        bestdepth = depth
print("The best depth for the decision tree is : ",bestdepth)

Depth: 1 Train Score: 0.839 Validation Score: 0.81
Depth: 2 Train Score: 0.891 Validation Score: 0.872
Depth: 3 Train Score: 0.891 Validation Score: 0.872
Depth: 4 Train Score: 0.891 Validation Score: 0.872
Depth: 5 Train Score: 0.896 Validation Score: 0.86
Depth: 6 Train Score: 0.914 Validation Score: 0.84
Depth: 7 Train Score: 0.923 Validation Score: 0.842
Depth: 8 Train Score: 0.933 Validation Score: 0.838
Depth: 9 Train Score: 0.95 Validation Score: 0.814
Depth: 10 Train Score: 0.968 Validation Score: 0.816
Depth: 11 Train Score: 0.978 Validation Score: 0.816
Depth: 12 Train Score: 0.987 Validation Score: 0.818
Depth: 13 Train Score: 0.993 Validation Score: 0.82
Depth: 14 Train Score: 0.996 Validation Score: 0.814
Depth: 15 Train Score: 0.998 Validation Score: 0.818
The best depth for the decision tree is :  2


In [64]:
X_trainval=X[:trainplusvalsize,:]

In [65]:
y_trainval = y[:trainplusvalsize]

In [66]:
# Fresh tree limited to the depth selected on the validation split,
# fitted on X_trainval / y_trainval.
clf = DecisionTreeClassifier(max_depth=bestdepth)
clf.fit(X=X_trainval, y=y_trainval)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=2, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [67]:
# Accuracy of the final tree on the held-out test split.
test_score = clf.score(X=X_test, y=y_test)

In [68]:
# Report the test-set accuracy.
print('testing set score', test_score, sep=' ')

testing set score 0.882
