In [1]:
# Mount Google Drive at /content/drive; the data-file paths used in later
# cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
import pandas as pd
import numpy as np

In [9]:
# Load the raw LendingClub table.
# NOTE(review): the truncated warning saved under this cell looks like pandas'
# mixed-dtype notice from chunked type inference -- confirm. low_memory=False
# makes pandas infer each column's dtype from the whole file in one pass.
loans = pd.read_csv('/content/drive/My Drive/ML Specialization/lending-club-data.csv',
                    low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
import json

# JSON files holding the row positions of the training and validation
# examples (they are passed to loans.iloc further down).
train_idx_path = '/content/drive/My Drive/ML Specialization/module-6-assignment-train-idx.json'
valid_idx_path = '/content/drive/My Drive/ML Specialization/module-6-assignment-validation-idx.json'

with open(train_idx_path, 'r') as train_idx_file:
    train_indices = json.load(train_idx_file)

with open(valid_idx_path, 'r') as valid_idx_file:
    valid_indices = json.load(valid_idx_file)

In [12]:
# Recode the target: bad_loans == 0 becomes +1 (safe), anything else becomes -1 (risky).
# A vectorized comparison replaces the per-row Python lambda.
loans['safe_loans'] = np.where(loans['bad_loans'] == 0, +1, -1)

# Drop the original column by rebinding instead of mutating in place.
loans = loans.drop(columns=['bad_loans'])

In [13]:
features = ['grade',              # grade of the loan
            'term',               # the term of the loan
            'home_ownership',     # home_ownership status: own, mortgage or rent
            'emp_length',         # number of years of employment
           ]
target = 'safe_loans'

# Extract the feature columns and target column.
# Take an explicit copy: the one-hot encoding cell below adds and drops columns
# on `loans`, and that should act on an independent frame rather than on a
# selection taken from the raw table.
loans = loans[features + [target]].copy()

In [14]:
# Preview the first few rows of the reduced table (selected features plus target).
loans.head()

Unnamed: 0,grade,term,home_ownership,emp_length,safe_loans
0,B,36 months,RENT,10+ years,1
1,C,60 months,RENT,< 1 year,-1
2,C,36 months,RENT,10+ years,1
3,C,36 months,RENT,10+ years,1
4,A,36 months,RENT,3 years,1


In [15]:
# Columns stored with object dtype (strings here) are treated as categorical.
categorical_variables = [feat_name
                         for feat_name, feat_type in loans.dtypes.items()
                         if feat_type == object]

# One-hot encode every categorical column in a single call.
# - Each column is replaced by indicator columns named "<column>_<value>".
# - dummy_na=True adds a "<column>_nan" indicator for missing values.
# - Non-encoded columns stay first and the indicator groups follow in column
#   order, which is the layout the manual add-then-drop loop produced.
loans = pd.get_dummies(loans, columns=categorical_variables, dummy_na=True)

In [16]:
# Show the encoded column labels, then how many columns there are.
print(loans.columns)
print(loans.shape[1])

Index(['safe_loans', 'grade_A', 'grade_B', 'grade_C', 'grade_D', 'grade_E',
       'grade_F', 'grade_G', 'grade_nan', 'term_ 36 months', 'term_ 60 months',
       'term_nan', 'home_ownership_MORTGAGE', 'home_ownership_OTHER',
       'home_ownership_OWN', 'home_ownership_RENT', 'home_ownership_nan',
       'emp_length_1 year', 'emp_length_10+ years', 'emp_length_2 years',
       'emp_length_3 years', 'emp_length_4 years', 'emp_length_5 years',
       'emp_length_6 years', 'emp_length_7 years', 'emp_length_8 years',
       'emp_length_9 years', 'emp_length_< 1 year', 'emp_length_nan'],
      dtype='object')
29


In [17]:
# Positional (iloc) selection of the rows listed in the two index files.
train_data, valid_data = (loans.iloc[row_positions]
                          for row_positions in (train_indices, valid_indices))

In [19]:
def reached_minimum_node_size(data, min_node_size):
    """Tell whether a node is small enough to stop splitting.

    Args:
        data: Any sized collection holding the node's data points.
        min_node_size: Size threshold for the node.

    Returns:
        True when ``len(data)`` does not exceed ``min_node_size``.
    """
    node_size = len(data)
    return min_node_size >= node_size

def error_reduction(error_before_split, error_after_split):
    """Compute how much a split lowers the classification error.

    Args:
        error_before_split: Error of the node before splitting.
        error_after_split: Error after splitting.

    Returns:
        ``error_before_split - error_after_split`` (positive when the split helps).
    """
    improvement = error_before_split - error_after_split
    return improvement

def intermediate_node_num_mistakes(labels_in_node):
    """Count the mistakes a majority-class prediction makes on a node.

    Predicting the majority class misclassifies every minority-class point,
    so the mistake count is the size of the smaller class.

    Args:
        labels_in_node: Labels (+1 / -1) of the points in the node. Accepts a
            pandas Series, a NumPy array, or a plain list/tuple.

    Returns:
        The smaller of the +1 count and the -1 count; 0 for an empty node.
    """
    # Coerce to an array so the comparisons below are element-wise.
    # With a plain list, `labels == 1` is the scalar False and every count
    # would silently come out as 0.
    labels = np.asarray(labels_in_node)

    # An empty node contains nothing to misclassify.
    if labels.size == 0:
        return 0

    safe_num = np.sum(labels == 1)
    risky_num = np.sum(labels == -1)
    return min(safe_num, risky_num)

In [20]:
# Sanity checks for intermediate_node_num_mistakes: in every case below the
# minority class has exactly two members, so two mistakes are expected.
test_label_sets = [
    [-1, -1, 1, 1, 1],              # Test case 1
    [-1, -1, 1, 1, 1, 1, 1],        # Test case 2
    [-1, -1, -1, -1, -1, 1, 1],     # Test case 3
]

for test_number, label_values in enumerate(test_label_sets, start=1):
    example_labels = pd.Series(label_values)
    if intermediate_node_num_mistakes(example_labels) == 2:
        print('Test passed!')
    else:
        # Report the number of the case that actually failed
        # (case 2 used to be reported as "Test 3").
        print('Test %d failed... try again!' % test_number)

Test passed!
Test passed!
Test passed!


In [21]:
def best_splitting_feature(data, features, target):
    """Pick the binary feature whose split gives the lowest classification error.

    For each candidate feature the node is divided into rows where the feature
    equals 0 and rows where it equals 1. The split error is the number of
    majority-vote mistakes in both parts divided by the number of rows in
    ``data``.

    Args:
        data: DataFrame holding the feature columns and the target column.
        features: Iterable of candidate feature column names.
        target: Name of the label column.

    Returns:
        The feature with the smallest split error; ties go to the feature seen
        first. ``None`` when ``features`` is empty.
    """
    labels = data[target]
    total_points = float(len(data))   # float so the ratio below is a true division

    # A classification error never exceeds 1, so 10 acts as "nothing found yet".
    lowest_error = 10
    chosen_feature = None

    for candidate in features:
        column = data[candidate]

        # Majority-vote mistakes among the rows on each side of the split.
        mistakes_when_zero = intermediate_node_num_mistakes(labels[column == 0])
        mistakes_when_one = intermediate_node_num_mistakes(labels[column == 1])

        split_error = (mistakes_when_zero + mistakes_when_one) / total_points

        # Strict comparison: an equally good later feature does not replace an earlier one.
        if split_error < lowest_error:
            lowest_error, chosen_feature = split_error, candidate

    return chosen_feature

In [22]:
def create_leaf(target_values):
    """Build a leaf node that predicts the majority class of ``target_values``.

    Args:
        target_values: Labels (+1 / -1) of the data points in the node.

    Returns:
        A dict with keys 'splitting_feature', 'left', 'right' (all None),
        'is_leaf' (True) and 'prediction'. The prediction is 1 only when +1
        labels strictly outnumber -1 labels; ties and empty nodes predict -1.
    """
    positive_count = len(target_values[target_values == +1])
    negative_count = len(target_values[target_values == -1])

    majority_class = 1 if positive_count > negative_count else -1

    return {'splitting_feature' : None,
            'left' : None,
            'right' : None,
            'is_leaf': True,
            'prediction': majority_class}

In [23]:
def decision_tree_create(data, features, target, current_depth = 0, 
                         max_depth = 10, min_node_size=1, 
                         min_error_reduction=0.0):
    """Recursively grow a binary decision tree with early stopping.

    Args:
        data: DataFrame with the 0/1 feature columns and the target column.
        features: List of feature names still available for splitting.
        target: Name of the label column (+1 / -1).
        current_depth: Depth of the node being built (0 at the root).
        max_depth: A node at this depth or deeper becomes a leaf.
        min_node_size: A node with this many points or fewer becomes a leaf.
        min_error_reduction: A node becomes a leaf when its best split lowers
            the error by this amount or less.

    Returns:
        A nested dict. Internal nodes have 'is_leaf' False, 'prediction' None,
        the 'splitting_feature', and 'left' / 'right' subtrees (feature == 0 /
        feature == 1). Leaves are whatever ``create_leaf`` returns.

    Progress is printed for every node visited.
    """
    remaining_features = features[:]   # copy, so the caller's list is not modified

    node_labels = data[target]
    print("--------------------------------------------------------------------")
    print("Subtree, depth = %s (%s data points)." % (current_depth, len(node_labels)))

    node_mistakes = intermediate_node_num_mistakes(node_labels)

    # Stopping rules are checked in a fixed order; the first one that applies
    # turns this node into a leaf.
    stop_message = None
    if node_mistakes == 0:
        stop_message = "Stopping condition 1 reached. All data points have the same target value."
    elif remaining_features == []:
        stop_message = "Stopping condition 2 reached. No remaining features."
    elif current_depth >= max_depth:
        stop_message = "Early stopping condition 1 reached. Reached maximum depth."
    elif reached_minimum_node_size(data, min_node_size):
        stop_message = "Early stopping condition 2 reached. Reached minimum node size."

    if stop_message is not None:
        print(stop_message)
        return create_leaf(node_labels)

    # Choose the split and partition the node on the chosen 0/1 feature.
    splitting_feature = best_splitting_feature(data, features, target)
    split_column = data[splitting_feature]
    left_split = data[split_column == 0]
    right_split = data[split_column == 1]

    # Early stopping condition 3: compare the error rate before and after the
    # split, both measured against the size of this node.
    node_size = float(len(data))
    error_before_split = node_mistakes / node_size
    mistakes_after_split = (intermediate_node_num_mistakes(left_split[target])
                            + intermediate_node_num_mistakes(right_split[target]))
    error_after_split = mistakes_after_split / node_size

    if error_reduction(error_before_split, error_after_split) <= min_error_reduction:
        print("Early stopping condition 3 reached. Minimum error reduction.")
        return create_leaf(node_labels)

    # The chosen feature is not offered again further down this branch.
    remaining_features.remove(splitting_feature)
    print("Split on feature %s. (%s, %s)" % (
        splitting_feature, len(left_split), len(right_split)))

    # Grow the children, left (feature == 0) before right (feature == 1).
    left_tree, right_tree = [
        decision_tree_create(child_data, remaining_features, target,
                             current_depth + 1, max_depth, min_node_size,
                             min_error_reduction)
        for child_data in (left_split, right_split)
    ]

    return {'is_leaf'          : False, 
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'left'             : left_tree, 
            'right'            : right_tree}

In [24]:
# Every encoded column except the label is a candidate splitting feature.
new_features = loans.columns.tolist()
new_features.remove('safe_loans')
print(new_features)

['grade_A', 'grade_B', 'grade_C', 'grade_D', 'grade_E', 'grade_F', 'grade_G', 'grade_nan', 'term_ 36 months', 'term_ 60 months', 'term_nan', 'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT', 'home_ownership_nan', 'emp_length_1 year', 'emp_length_10+ years', 'emp_length_2 years', 'emp_length_3 years', 'emp_length_4 years', 'emp_length_5 years', 'emp_length_6 years', 'emp_length_7 years', 'emp_length_8 years', 'emp_length_9 years', 'emp_length_< 1 year', 'emp_length_nan']


In [25]:
# Tree with all three early-stopping rules active: depth capped at 6, nodes
# with 100 or fewer points become leaves, and a split is kept only if it
# lowers the error by more than 0.
my_decision_tree_new = decision_tree_create(train_data, new_features, 'safe_loans', max_depth = 6, 
                                min_node_size = 100, min_error_reduction=0.0)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Early stopping condition 3 reached. Minimum error reduction.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Split on feature emp_length_nan. (96, 5)
--------------------------------------------------------------------
Subtree, depth = 3 (96 data points).
Early stopping condition 2 reached. Reached minimum node size.
--------------------------------------------------------------------
Subtree, depth = 3 (5 data points).
Early stopping condition 2 reached. Reached minimum node size.
-------------------------------------------

In [26]:
# Baseline tree: min_node_size = 0 and min_error_reduction = -1 effectively
# switch off the node-size and error-reduction rules, leaving the depth cap
# (plus the pure-node / no-features conditions) as the only stopping criteria.
my_decision_tree_old = decision_tree_create(train_data, new_features, 'safe_loans',
                                            max_depth = 6, min_node_size = 0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
E

In [27]:
def classify(tree, x, annotate = False):
    """Predict the class of one data point by walking the tree to a leaf.

    Args:
        tree: Root node dict with keys 'is_leaf', 'prediction',
            'splitting_feature', 'left' and 'right'.
        x: Mapping from feature name to value (e.g. a DataFrame row).
        annotate: When True, print each split taken and the final prediction.

    Returns:
        The 'prediction' stored in the leaf that is reached.
    """
    node = tree

    # Descend until a leaf: value 0 goes left, anything else goes right.
    while not node['is_leaf']:
        split_feature_value = x[node['splitting_feature']]
        if annotate:
             print("Split on %s = %s" % (node['splitting_feature'], split_feature_value))
        node = node['left'] if split_feature_value == 0 else node['right']

    if annotate:
         print("At leaf, predicting %s" % node['prediction'])
    return node['prediction']

In [29]:
# Show the first validation row and the class the early-stopping tree assigns to it.
first_validation_row = valid_data.iloc[0]
print(first_validation_row)
print('Predicted class: %s ' % classify(my_decision_tree_new, first_validation_row))

safe_loans                -1
grade_A                    0
grade_B                    0
grade_C                    0
grade_D                    1
grade_E                    0
grade_F                    0
grade_G                    0
grade_nan                  0
term_ 36 months            0
term_ 60 months            1
term_nan                   0
home_ownership_MORTGAGE    0
home_ownership_OTHER       0
home_ownership_OWN         0
home_ownership_RENT        1
home_ownership_nan         0
emp_length_1 year          0
emp_length_10+ years       0
emp_length_2 years         1
emp_length_3 years         0
emp_length_4 years         0
emp_length_5 years         0
emp_length_6 years         0
emp_length_7 years         0
emp_length_8 years         0
emp_length_9 years         0
emp_length_< 1 year        0
emp_length_nan             0
Name: 24, dtype: int64
Predicted class: -1 


In [30]:
# Trace the path the first validation row takes through the early-stopping tree.
classify(my_decision_tree_new, valid_data.iloc[0], annotate = True)

Split on term_ 36 months = 0
Split on grade_A = 0
At leaf, predicting -1


-1

In [31]:
# Same row traced through the baseline tree, for comparison with the trace above.
classify(my_decision_tree_old, valid_data.iloc[0], annotate = True)

Split on term_ 36 months = 0
Split on grade_A = 0
Split on grade_B = 0
Split on grade_C = 0
Split on grade_D = 1
Split on grade_E = 0
At leaf, predicting -1


-1

In [32]:
# Validation accuracy of both trees: the fraction of rows whose predicted
# label equals safe_loans.
predictions_old, predictions_new = [
    valid_data.apply(lambda row, tree=tree: classify(tree, row), axis=1)
    for tree in (my_decision_tree_old, my_decision_tree_new)
]
for label, predicted in (('Acc_old: ', predictions_old), ('Acc_new: ', predictions_new)):
    print(label, np.sum(predicted == valid_data['safe_loans']) / len(valid_data))

Acc_old:  0.6162214562688496
Acc_new:  0.6163291684618699


In [33]:
# Three trees that differ only in max_depth (2, 6, 14); the node-size and
# error-reduction rules are effectively off (min_node_size = 0, min_error_reduction = -1).
model1, model2, model3 = [
    decision_tree_create(train_data, new_features, 'safe_loans', max_depth = depth,
                         min_node_size = 0, min_error_reduction=-1)
    for depth in (2, 6, 14)
]

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 1 (28001 data points).
Split on feature grade_D. (23300, 4701)
--------------------------------------------------------------------
Subtree, depth = 2 (23300 data points).
Early stopping condition 1 reached. Reached maximum depth.
-----------------------------------------------

In [35]:
# Validation accuracy of the three max_depth variants.
predictions1, predictions2, predictions3 = [
    valid_data.apply(lambda row, tree=tree: classify(tree, row), axis=1)
    for tree in (model1, model2, model3)
]
for label, predicted in (('Acc1: ', predictions1),
                         ('Acc2: ', predictions2),
                         ('Acc3: ', predictions3)):
    print(label, np.sum(predicted == valid_data['safe_loans']) / len(valid_data))

Acc1:  0.6018957345971564
Acc2:  0.6162214562688496
Acc3:  0.6207453683757002


In [36]:
def count_leaves(tree):
    """Count the leaf nodes of a tree.

    Args:
        tree: Root node dict with keys 'is_leaf', 'left' and 'right'.

    Returns:
        Number of nodes reachable from ``tree`` whose 'is_leaf' is truthy.
    """
    leaf_total = 0
    pending = [tree]          # nodes still to be inspected

    while pending:
        node = pending.pop()
        if node['is_leaf']:
            leaf_total += 1
        else:
            pending.extend((node['left'], node['right']))

    return leaf_total

In [37]:
# Tree size (number of leaves) for each max_depth variant.
for fitted_tree in (model1, model2, model3):
    print(count_leaves(fitted_tree))

4
41
259


In [38]:
# Three trees that differ only in min_error_reduction (-1, 0, 5); depth is
# capped at 6 and min_node_size = 0.
model4, model5, model6 = [
    decision_tree_create(train_data, new_features, 'safe_loans', max_depth = 6,
                         min_node_size = 0, min_error_reduction=reduction)
    for reduction in (-1, 0, 5)
]

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
E

In [39]:
# Validation accuracy of the three min_error_reduction variants.
predictions4, predictions5, predictions6 = [
    valid_data.apply(lambda row, tree=tree: classify(tree, row), axis=1)
    for tree in (model4, model5, model6)
]
for label, predicted in (('Acc4: ', predictions4),
                         ('Acc5: ', predictions5),
                         ('Acc6: ', predictions6)):
    print(label, np.sum(predicted == valid_data['safe_loans']) / len(valid_data))

Acc4:  0.6162214562688496
Acc5:  0.6162214562688496
Acc6:  0.496553209823352


In [40]:
# Tree size (number of leaves) for each min_error_reduction variant.
for fitted_tree in (model4, model5, model6):
    print(count_leaves(fitted_tree))

41
13
1


In [41]:
# Three trees that differ only in min_node_size (0, 2000, 50000); depth is
# capped at 6 and min_error_reduction = -1.
model7, model8, model9 = [
    decision_tree_create(train_data, new_features, 'safe_loans', max_depth = 6,
                         min_node_size = node_size, min_error_reduction=-1)
    for node_size in (0, 2000, 50000)
]

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
E

In [42]:
# Validation accuracy of the three min_node_size variants.
predictions7, predictions8, predictions9 = [
    valid_data.apply(lambda row, tree=tree: classify(tree, row), axis=1)
    for tree in (model7, model8, model9)
]
for label, predicted in (('Acc7: ', predictions7),
                         ('Acc8: ', predictions8),
                         ('Acc9: ', predictions9)):
    print(label, np.sum(predicted == valid_data['safe_loans']) / len(valid_data))

Acc7:  0.6162214562688496
Acc8:  0.6154674709177079
Acc9:  0.496553209823352


In [43]:
# Tree size (number of leaves) for each min_node_size variant.
for fitted_tree in (model7, model8, model9):
    print(count_leaves(fitted_tree))

41
19
1
