In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from qgrid import show_grid as Grid

In [2]:
# NOTE(review): the warning fragment saved with this cell looks like pandas'
# mixed-dtype warning from chunked type inference -- confirm on a fresh run.
# low_memory=False makes read_csv infer each column's dtype from the whole file.
loans=pd.read_csv('lending-club-data.csv', low_memory=False)
loans.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,sub_grade_num,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,final_d,last_delinq_none,last_record_none,last_major_derog_none
0,1077501,1296599,5000,5000,4975,36 months,10.65,162.87,B,B2,...,0.4,1.0,1.0,1.0,0,8.1435,20141201T000000,1,1,1
1,1077430,1314167,2500,2500,2500,60 months,15.27,59.83,C,C4,...,0.8,1.0,1.0,1.0,1,2.3932,20161201T000000,1,1,1
2,1077175,1313524,2400,2400,2400,36 months,15.96,84.33,C,C5,...,1.0,1.0,1.0,1.0,0,8.25955,20141201T000000,1,1,1
3,1076863,1277178,10000,10000,10000,36 months,13.49,339.31,C,C1,...,0.2,1.0,1.0,1.0,0,8.27585,20141201T000000,0,1,1
4,1075269,1311441,5000,5000,5000,36 months,7.9,156.46,A,A4,...,0.8,1.0,1.0,1.0,0,5.21533,20141201T000000,1,1,1


In [3]:
# Re-encode the label: bad_loans == 0 becomes +1 (safe), anything else -1 (risky).
# The guard makes the cell safe to re-run: once 'bad_loans' has been replaced
# there is nothing left to do, instead of failing with a KeyError.
if 'bad_loans' in loans.columns:
    # Vectorised equivalent of applying a +1/-1 lambda to every row.
    loans['safe_loans'] = np.where(loans['bad_loans'] == 0, 1, -1)
    loans = loans.drop('bad_loans', axis=1)

In [4]:
# Name of the label column, followed by the categorical input columns.
target = 'safe_loans'
features = [
    'grade',           # grade of the loan
    'term',            # the term of the loan
    'home_ownership',  # home_ownership status: own, mortgage or rent
    'emp_length',      # number of years of employment
]

In [5]:
# Keep only the label column followed by the selected feature columns.
loans = loans[ [target] +features ]

In [6]:
# One-hot encode the features with pandas.get_dummies; the result replaces
# `loans`, so the original categorical columns are no longer available afterwards.
loans=pd.get_dummies(loans)

In [7]:
# Inspect the column labels of the encoded frame.
loans.columns

Index(['safe_loans', 'grade_A', 'grade_B', 'grade_C', 'grade_D', 'grade_E',
       'grade_F', 'grade_G', 'term_ 36 months', 'term_ 60 months',
       'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN',
       'home_ownership_RENT', 'emp_length_1 year', 'emp_length_10+ years',
       'emp_length_2 years', 'emp_length_3 years', 'emp_length_4 years',
       'emp_length_5 years', 'emp_length_6 years', 'emp_length_7 years',
       'emp_length_8 years', 'emp_length_9 years', 'emp_length_< 1 year',
       'emp_length_n/a'],
      dtype='object')

In [8]:
# Read the row indices stored in the two JSON files (each loaded as a Series)
# and use them as positional row selectors (.iloc) to build the two subsets.
train_index=pd.read_json('module-5-assignment-2-train-idx.json', typ='series')
test_idx= pd.read_json('module-5-assignment-2-test-idx.json', typ='series')
train=loans.iloc[train_index]
test=loans.iloc[test_idx]

Write the function `intermediate_node_num_mistakes`, which computes the number of misclassified examples of an intermediate node given the set of labels (y values) of the data points contained in the node.

In [9]:
def intermediate_node_num_mistakes(labels_in_node):
    """Count the points a majority-class prediction would get wrong in a node.

    Parameters
    ----------
    labels_in_node : array-like of +1 / -1
        Labels of the data points in the node. Any sequence is accepted
        (list, tuple, numpy array, pandas Series).

    Returns
    -------
    int
        Size of the minority class, i.e. min(#positive, #negative).
        Returns 0 for an empty node.
    """
    # Convert once so that plain Python sequences support the element-wise
    # comparisons below (``[1, -1] == 1`` would otherwise be a single bool).
    labels = np.asarray(labels_in_node)

    # An empty node has nothing to misclassify.
    if labels.size == 0:
        return 0

    pos_count = int((labels == 1).sum())
    neg_count = int((labels == -1).sum())

    # Predicting the majority class misclassifies exactly the minority class.
    return min(pos_count, neg_count)

In [11]:
def best_splittin_feature(data, features, target):
    """Pick the binary feature whose split misclassifies the fewest points.

    Parameters
    ----------
    data : pandas.DataFrame
        Data points of the current node; every candidate feature column is
        compared against 0 (left branch) and 1 (right branch).
    features : iterable
        Candidate column labels to try.
    target : str
        Label of the column holding the +1 / -1 class labels.

    Returns
    -------
    The first feature that reaches the lowest number of mistakes, or None
    when `features` is empty.
    """
    best_feature = None            # best feature seen so far
    best_mistakes = float('inf')   # mistakes made by that feature's split

    for feature in features:
        left_split  = data[data[feature]==0]
        right_split = data[data[feature]==1]

        left_mistakes  = intermediate_node_num_mistakes(left_split[target])
        right_mistakes = intermediate_node_num_mistakes(right_split[target])

        # The classification error is mistakes / len(data). The divisor is the
        # same for every feature, so comparing raw counts selects the same
        # feature and avoids a ZeroDivisionError when `data` is empty.
        mistakes = left_mistakes + right_mistakes

        if mistakes < best_mistakes:
            best_feature  = feature
            best_mistakes = mistakes

    return best_feature

# Building the Tree

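Each node in the decision tree is represented as a dictionary which contains the following keys and possible values:

- `is_leaf`: `True` for a leaf node, `False` for an intermediate node
- `prediction`: the predicted label (+1 or -1) at a leaf, `None` at an intermediate node
- `splitting_feature`: the feature the node splits on, `None` at a leaf
- `left` / `right`: the subtrees for the rows where the splitting feature is 0 / 1, `None` at a leaf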

Write a function that creates a leaf node given a set of target values.

In [12]:
def create_leaf(target_values):
    """Build a leaf node that predicts the majority class of `target_values`.

    `target_values` holds the +1 / -1 labels of the points that reached this
    node. The leaf predicts +1 only when positives strictly outnumber
    negatives; ties (and an empty node) predict -1.
    """
    # Count each class with a boolean mask.
    positives = (target_values == 1).sum()
    negatives = (target_values == -1).sum()

    majority_label = 1 if positives > negatives else -1

    return {'splitting_feature': None,
            'left': None,
            'right': None,
            'is_leaf': True,
            'prediction': majority_label}

Write a function that learns the decision tree recursively and implements three stopping conditions.

In [13]:
def decision_tree_create(data, features, target, current_depth=0, max_depth=10):
    """Recursively learn a binary decision tree on 0/1 features.

    Parameters
    ----------
    data : pandas.DataFrame
        Data points reaching the current node.
    features : iterable of column labels
        Candidate features. A list, a pandas Index or a DataFrame is accepted;
        iterating a DataFrame yields its column labels. The argument itself is
        never modified.
    target : str
        Label of the column holding the +1 / -1 class labels.
    current_depth : int
        Depth of the node being built (0 for the root).
    max_depth : int
        Nodes at this depth are always turned into leaves.

    Returns
    -------
    dict
        A node with the keys 'is_leaf', 'prediction', 'splitting_feature',
        'left' and 'right'; the children are nodes of the same shape.
    """
    # Work on a private list of feature labels. This makes len() count
    # features (len() of a DataFrame counts rows, so stopping condition 2 could
    # never fire) and removes the in-place drop on a DataFrame slice that
    # triggered SettingWithCopyWarning.
    remaining_features = list(features)
    target_values=data[target]

    print('---------------------------------------------------------')
    print('Subtree, dept : {} ({} data points)'.format(current_depth, len(target_values)))

    #stopping condition 1 'data point are from the same class'
    if intermediate_node_num_mistakes(target_values)==0:
        print('Stopping condtion 1 reached')
        #If not mistakes at current node, make current node a leaf node
        return create_leaf(target_values)

    #stopping condition 2 'no more features to split on'
    if len(remaining_features)  == 0:
        print(' stopping condition 2 reached')
        return create_leaf(target_values)

    #stopping condition 3 'depth limit reached'
    if current_depth >= max_depth:
        print(' reached maximum depth. stopping for now')
        return create_leaf(target_values)

    # find best feature
    splitting_feature=best_splittin_feature(data, remaining_features, target)

    #split on the best feature
    left_split  = data[data[splitting_feature]==0]
    right_split = data[data[splitting_feature]==1]
    # The chosen feature must not be offered to the subtrees again.
    remaining_features.remove(splitting_feature)
    print('split on feature {}. {}, {}'.format(splitting_feature, len(left_split), len(right_split)))

    #create leaf node if one side received all of the data (nothing was separated)
    if len(left_split) == len(data):
        print('creating leaf node')
        return create_leaf(left_split[target])
    if len(right_split) == len(data):
        print('Creating lead node')
        return create_leaf(right_split[target])

    #repeat on left and right
    left_tree  = decision_tree_create(left_split,  remaining_features, target, current_depth +1, max_depth)
    right_tree = decision_tree_create(right_split, remaining_features, target, current_depth +1, max_depth)

    return {'is_leaf'          : False,
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'left'             : left_tree,
            'right'            : right_tree}

Train the tree on the training data.

In [16]:
# Candidate features: every column of `loans` except the first one.
# NOTE(review): this is a DataFrame rather than a list of names; the code that
# loops over it (`for feature in features`) receives its column labels.
features_bin=loans.iloc[:,1:]
target= 'safe_loans'
# Learn the tree on the training subset, limited to depth 6.
my_tree=decision_tree_create(train,features_bin, target, current_depth=0, max_depth=6)

---------------------------------------------------------
Subtree, dept : 0 (37224 data points)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


split on feature term_ 36 months. 9223, 28001
---------------------------------------------------------
Subtree, dept : 1 (9223 data points)
split on feature grade_A. 9122, 101
---------------------------------------------------------
Subtree, dept : 2 (9122 data points)
split on feature grade_B. 8074, 1048
---------------------------------------------------------
Subtree, dept : 3 (8074 data points)
split on feature grade_C. 5884, 2190
---------------------------------------------------------
Subtree, dept : 4 (5884 data points)
split on feature grade_D. 3826, 2058
---------------------------------------------------------
Subtree, dept : 5 (3826 data points)
split on feature grade_E. 1693, 2133
---------------------------------------------------------
Subtree, dept : 6 (1693 data points)
 reached maximum depth. stopping for now
---------------------------------------------------------
Subtree, dept : 6 (2133 data points)
 reached maximum depth. stopping for now
-----------------------

In [17]:
# Display the learned tree as nested dictionaries.
my_tree

{'is_leaf': False,
 'left': {'is_leaf': False,
  'left': {'is_leaf': False,
   'left': {'is_leaf': False,
    'left': {'is_leaf': False,
     'left': {'is_leaf': False,
      'left': {'is_leaf': True,
       'left': None,
       'prediction': -1,
       'right': None,
       'splitting_feature': None},
      'prediction': None,
      'right': {'is_leaf': True,
       'left': None,
       'prediction': -1,
       'right': None,
       'splitting_feature': None},
      'splitting_feature': 'grade_E'},
     'prediction': None,
     'right': {'is_leaf': True,
      'left': None,
      'prediction': -1,
      'right': None,
      'splitting_feature': None},
     'splitting_feature': 'grade_D'},
    'prediction': None,
    'right': {'is_leaf': True,
     'left': None,
     'prediction': -1,
     'right': None,
     'splitting_feature': None},
    'splitting_feature': 'grade_C'},
   'prediction': None,
   'right': {'is_leaf': False,
    'left': {'is_leaf': True,
     'left': None,
     'predi

# make prediction with my_tree

Write a function called `classify`, which takes in a learned tree and a test point `x` to classify. Include an option `annotate` that describes the prediction path when set to `True`.

In [43]:
def classify(tree, x, annotate= False):
    """Predict the label of one data point, or of every row of a DataFrame.

    Parameters
    ----------
    tree : dict
        Node dictionary with the keys 'is_leaf', 'prediction',
        'splitting_feature', 'left' and 'right'.
    x : mapping / pandas.Series / pandas.DataFrame
        A single data point indexable by feature name (e.g. ``df.iloc[i]``),
        or a whole DataFrame, which is classified row by row.
    annotate : bool
        When True, print the path followed through the tree.

    Returns
    -------
    The leaf's prediction for a single data point, or a Series of predictions
    (one per row) for a DataFrame.
    """
    # A DataFrame is handled one row at a time so that every row follows its
    # own path through the tree.
    if isinstance(x, pd.DataFrame):
        return x.apply(lambda row: classify(tree, row, annotate), axis=1)

    if tree['is_leaf']:
        if annotate:
            print('At leaf, predicting {}'.format(tree['prediction']))
        return tree['prediction']

    #split on feature
    split_feature_value = x[tree['splitting_feature']]
    if annotate:
        print('Split on {} = {}'.format(tree['splitting_feature'],split_feature_value))

    # Branch on the value, not on the Python type: values read from a pandas
    # row are numpy scalars, for which ``type(v) == int`` is False, so a type
    # check would send every row to the right subtree.
    if split_feature_value == 0:
        return classify(tree['left'], x, annotate)
    return classify(tree['right'], x, annotate)

In [72]:
def classify(tree, x, annotate= False):
    """Walk the tree for one data point and return the leaf's prediction.

    Works only on a single entry, like ``x.iloc[0]``: `x` must return one
    value per feature name. At each intermediate node the value of the
    splitting feature selects the branch (0 goes left, anything else goes
    right). With `annotate` set, every split and the final leaf are printed.
    """
    node = tree
    # Descend until a leaf is reached.
    while not node['is_leaf']:
        feature = node['splitting_feature']
        value = x[feature]
        if annotate:
            print('Split on {} = {}'.format(feature, value))
        node = node['left'] if value == 0 else node['right']

    if annotate:
        print('At leaf, predicting {}'.format(node['prediction']))
    return node['prediction']

In [78]:
# Classify the first row of the test subset with the learned tree.
print('Predicted class: {}'.format(classify(my_tree, test.iloc[0])))

Predicted class: -1


# Evaluation

Write a function to evaluate a decision tree by computing the classification error of the tree on the given dataset.

In [74]:
def evaluate_clf_error(tree, data, target='safe_loans'):
    """Return the classification error of `tree` on `data`.

    Parameters
    ----------
    tree : dict
        Learned decision tree (node dictionary).
    data : pandas.DataFrame
        Rows to classify; must contain the feature columns used by the tree
        and the label column.
    target : str
        Label column to compare the predictions against. It is an explicit
        parameter so the function no longer depends on a notebook-level
        global; the default is the label name used throughout this notebook.

    Returns
    -------
    float
        Fraction of rows whose prediction differs from the label.
    """
    # Each row is classified on its own, because classify expects one data point.
    prediction = data.apply(lambda row: classify(tree, row), axis=1)
    # The mean of the boolean mismatch mask is mistakes / number of rows.
    return (prediction != data[target]).mean()

In [75]:
# Classification error of the learned tree on the test subset.
evaluate_clf_error(my_tree, test)

0.3837785437311504

In [67]:
# Show a bounded preview instead of rendering the whole test frame.
test.head(10)

Unnamed: 0,safe_loans,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,term_ 36 months,term_ 60 months,...,emp_length_2 years,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_6 years,emp_length_7 years,emp_length_8 years,emp_length_9 years,emp_length_< 1 year,emp_length_n/a
24,-1,0,0,0,1,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
41,-1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
60,-1,0,0,0,0,0,1,0,0,1,...,0,0,1,0,0,0,0,0,0,0
93,-1,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
132,-1,0,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
160,-1,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
162,-1,0,1,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
175,-1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
184,-1,0,0,0,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
270,-1,0,0,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0


# print decision stump

In [104]:
def print_stump(tree, name = 'root'):
    """Print an ASCII sketch of one tree node and its two children.

    A leaf (a node whose 'splitting_feature' is None) is printed as a single
    "(leaf, label: ...)" line. Any other node is drawn with `name` on top, the
    two branch conditions below it, and each child shown either as a leaf
    label or as "subtree". Always returns None.
    """
    feature = tree['splitting_feature']  # something like 'term_ 36 months'
    if feature is None:
        print( "(leaf, label: {})".format(tree['prediction']))
        return None

    def describe(child):
        # Text shown under a branch: the child's label if it is a leaf.
        if child['is_leaf']:
            return 'leaf, label: ' + str(child['prediction'])
        return 'subtree'

    connector = '         |                                |'

    print('                        {}'.format(name))
    print('         |---------------|----------------|')
    for _ in range(3):
        print(connector)
    print('  [{} == 0]               [{} == 1]    '.format(feature, feature))
    for _ in range(3):
        print(connector)
    print('    ({})                         ({})'.format(describe(tree['left']),
                                                        describe(tree['right'])))
        

In [105]:
# Draw the root node of the learned tree.
print_stump(my_tree)

                        root
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [term_ 36 months == 0]               [term_ 36 months == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (subtree)


In [106]:
# Draw the root's left child, titled with the feature the root split on.
print_stump(my_tree['left'],my_tree['splitting_feature'])

                        term_ 36 months
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [grade_A == 0]               [grade_A == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (subtree)


In [107]:
# Go one level deeper on the left, titled with the parent's splitting feature.
print_stump(my_tree['left']['left'],my_tree['left']['splitting_feature'])

                        grade_A
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [grade_B == 0]               [grade_B == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (subtree)


In [126]:
#RIGHT PATH
# print_stump prints its diagram itself and returns None, so it is called
# directly; wrapping it in print('{}'.format(...)) printed a stray "None"
# after each diagram.
print_stump(my_tree)
print("___________________>")
print_stump(my_tree['right'],my_tree['splitting_feature'])
print_stump(my_tree['right']['right'],my_tree['right']['splitting_feature'])

                        root
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [term_ 36 months == 0]               [term_ 36 months == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (subtree)
___________________>
                        term_ 36 months
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [grade_D == 0]               [grade_D == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (leaf, label: -1)
None
(leaf, label: -1)
None
