Boosting a decision stump
In this homework you will implement your own boosting module.

Brace yourselves! This is going to be a fun and challenging assignment.

Use SFrames to do some feature engineering.
Train a boosted ensemble of decision-trees (gradient boosted trees) on the lending club dataset.
Predict whether a loan will default along with prediction probabilities (on a validation set).
Evaluate the trained model and compare it with a baseline.
Find the most positive and negative loans using the learned model.
Explore how the number of trees influences classification performance.

In [1]:
import numpy as np
import pandas as pd

# prepare data


In [2]:
features = ['grade',              # grade of the loan
            'term',               # the term of the loan
            'home_ownership',     # home ownership status: own, mortgage or rent
            'emp_length'        # number of years of employment
           ]
target= 'safe_loans'

In [3]:
loans=pd.read_csv('lending-club-data.csv')

#modify target column

loans['safe_loans']=loans['bad_loans'].apply(lambda x: 1 if x==0 else -1)
loans.drop('bad_loans', axis=1, inplace=True)
loans=loans[[target]+ features]
loans=pd.get_dummies(loans)

features_dummy=list(loans.columns[1:])

train_idx=pd.read_json('module-8-assignment-2-train-idx.json', typ='series')
test_idx=pd.read_json('module-8-assignment-2-test-idx.json', typ='series')

train=loans.iloc[train_idx].reset_index(drop=True)
test=loans.iloc[test_idx].reset_index(drop=True)

X_train=train.iloc[:,1:]
y_train=train.iloc[:,0]

X_test=test.iloc[:,1:]
y_test=test.iloc[:,0]

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
features_dummy

['grade_A',
 'grade_B',
 'grade_C',
 'grade_D',
 'grade_E',
 'grade_F',
 'grade_G',
 'term_ 36 months',
 'term_ 60 months',
 'home_ownership_MORTGAGE',
 'home_ownership_OTHER',
 'home_ownership_OWN',
 'home_ownership_RENT',
 'emp_length_1 year',
 'emp_length_10+ years',
 'emp_length_2 years',
 'emp_length_3 years',
 'emp_length_4 years',
 'emp_length_5 years',
 'emp_length_6 years',
 'emp_length_7 years',
 'emp_length_8 years',
 'emp_length_9 years',
 'emp_length_< 1 year',
 'emp_length_n/a']

# Weighted decision trees


In [5]:
def intermediate_node_weighted_mistakes(labels_in_node, data_weights):
    #labels_in_node.reset_index(drop=True, inplace=True)

    total_weight_positive = sum(data_weights[labels_in_node == +1])
    weighted_mistakes_all_negative = total_weight_positive
    #print('weighted mistakes all negative  {}'.format(weighted_mistakes_all_negative))
    
    total_weight_negative= sum(data_weights[labels_in_node == -1])
    weighted_mistakes_all_positive = total_weight_negative
    #print('weighted mistakes all  positive {}'.format(weighted_mistakes_all_positive))
    
    #if weighted_mistakes_all_positive <= weighted_mistakes_all_negative:
     #   return (weighted_mistakes_all_positive,+1)
    #else:
     #   return ( weighted_mistakes_all_negative, -1)
    if weighted_mistakes_all_negative <= weighted_mistakes_all_positive:
        return (weighted_mistakes_all_negative, -1)
    else:
        return ( weighted_mistakes_all_positive, +1)

In [6]:
ex_labels=pd.Series([-1,-1,1,1,1])
ex_weights= pd.Series([1.,2.,.5,1.,1.])
if intermediate_node_weighted_mistakes(ex_labels,ex_weights)==(2.5,-1):
    print('test passed')
else:
    print('failsed')

test passed


In [7]:
ex_labels1=pd.Series([1,1,1,1,1])
ex_weights1=pd.Series([1.,2.,.5,1.,1.])
intermediate_node_weighted_mistakes(ex_labels1,ex_weights1)

(0, 1)

In [8]:
def best_splitting_feature(data, features, target, data_weights):
    
    target_values = data[target]
    best_feature  = None #keep track of best feature
    best_error    = float('+inf') #keep track best error
    
    num_data_points=float(len(data))
    
    for feature in features:
        left_split  = data[data[feature]==0]
        right_split = data[data[feature]==1]
        #left_split  = data[feature]==0
        left_data_weights = data_weights[left_split.index]
        right_data_weights = data_weights[right_split.index]
        #left_data_weights = data_weights[data[feature]==0]
        #right_data_weights = data_weights[data[feature]==1]
        #left_data_weights= left_split[data_weights]
        #left_data_weights = data_weights[left_split]
        #right_data_weights = data_weights[right_split]
        
        
        left_weighted_mistakes, left_class  = intermediate_node_weighted_mistakes(left_split[target], left_data_weights)
        right_weighted_mistakes, right_class = intermediate_node_weighted_mistakes(right_split[target], right_data_weights)
        
        #error = (left_weighted_mistakes + right_weighted_mistakes)/(sum(left_data_weights) + sum(right_data_weights))
        error = (left_weighted_mistakes + right_weighted_mistakes)/sum(data_weights)

        if error < best_error:
            best_feature = feature
            best_error   = error
   
    return best_feature

In [9]:
ex_weights2=pd.Series(len(train)*[1.5])
if best_splitting_feature(train, features_dummy, target, ex_weights2) == 'term_ 36 months':
    print('test passed')
else:
    print("fail")

test passed


# Building the tree

In [10]:
def create_leaf(target_values, data_weights):
    
    leaf= {'splitting feature': None,
          'is_leaf':True}
    
    weighted_error, best_class = intermediate_node_weighted_mistakes(target_values, data_weights)
    leaf['prediction'] = best_class
    
    return leaf

In [11]:
def weighted_decision_tree_create(data, features, target, data_weights, current_depth = 1, max_depth = 10):
    remaining_features = features[:] # Make a copy of the features.
    target_values = data[target]
    #target_values
    print("--------------------------------------------------------------------")
    print("Subtree, depth = {} ({} data points).".format(current_depth, len(target_values)))
    
    # Stopping condition 1. Error is 0.
    if intermediate_node_weighted_mistakes(target_values, data_weights)[0] <= 1e-15:
        print("Stopping condition 1 reached.")                
        return create_leaf(target_values, data_weights)
    
    # Stopping condition 2. No more features.
    if remaining_features == []:
        print("Stopping condition 2 reached.")                
        return create_leaf(target_values, data_weights)    
    
    # Additional stopping condition (limit tree depth)
    if current_depth > max_depth:
        print("Reached maximum depth. Stopping for now.")
        return create_leaf(target_values, data_weights)
    
    # If all the datapoints are the same, splitting_feature will be None. Create a leaf
    splitting_feature = best_splitting_feature(data, features, target, data_weights)
    remaining_features.remove(splitting_feature)
        
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]
    
    left_data_weights = data_weights[data[splitting_feature] == 0]
    right_data_weights = data_weights[data[splitting_feature] == 1]
    
    print("Split on feature {}. (left: {}, right: {})".format(splitting_feature, len(left_split), len(right_split)))
    
    # Create a leaf node if the split is "perfect"
    if len(left_split) == len(data):
        print("Creating leaf node.")
        return create_leaf(left_split[target], data_weights)
    if len(right_split) == len(data):
        print("Creating leaf node.")
        return create_leaf(right_split[target], data_weights)
    
    # Repeat (recurse) on left and right subtrees
    left_tree = weighted_decision_tree_create(
        left_split, remaining_features, target, left_data_weights, current_depth + 1, max_depth)
    right_tree = weighted_decision_tree_create(
        right_split, remaining_features, target, right_data_weights, current_depth + 1, max_depth)
    
    return {'is_leaf'          : False, 
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'left'             : left_tree, 
            'right'            : right_tree}

In [12]:
def count_nodes(tree):
    if tree['is_leaf']:
        return 1
    return 1 + count_nodes(tree['left']) + count_nodes(tree['right'])

In [13]:
ex1= pd.Series([1.0 for i in range(len(train))])
small_tree= weighted_decision_tree_create(train, features_dummy,target, ex1, max_depth=2)

if count_nodes(small_tree)==7:
    print('test passed')
else:
    print('test failed')
    print('number of nodes found; {}'.format(count_nodes(small_tree)))

--------------------------------------------------------------------
Subtree, depth = 1 (37224 data points).
Split on feature term_ 36 months. (left: 9223, right: 28001)
--------------------------------------------------------------------
Subtree, depth = 2 (9223 data points).
Split on feature grade_A. (left: 9122, right: 101)
--------------------------------------------------------------------
Subtree, depth = 3 (9122 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 3 (101 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (28001 data points).
Split on feature grade_D. (left: 23300, right: 4701)
--------------------------------------------------------------------
Subtree, depth = 3 (23300 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------

In [14]:
def classify(tree, x, annotate=False):
    if tree['is_leaf']:
        if annotate:
            print('At leaft, predicting {}'.format(tree['prediction']))
        return tree['prediction']
    else:
        #split on feature
        split_feature_value = x[tree['splitting_feature']]
        if annotate:
            print('split on {} = {}'.format(tree['splitting_feature'], split_feature_value))
        if split_feature_value == 0:
            return classify(tree['left'], x, annotate)
        else:
            return classify(tree['right'], x, annotate)
        

# evaluating tree

In [15]:
def evaluate_classificaton_error(tree, data):
    #target='safe_loans'
    prediction = data.apply(lambda x: classify(tree, x), axis=1)
    mistakes = (prediction != data[target]).sum() / float(len(data))
    return mistakes

In [16]:
evaluate_classificaton_error(small_tree, test)

0.3981042654028436

# example
weight
- 1 to the last 10 items
- 1 to the first 10 items

- and 0 to the rest.

In [17]:
example_weights= pd.Series([1.]*10 + [0.]*(len(train) -20) + [1.]*10)
example_weights = np.array(example_weights)
#len(example_weights), len(train[target])
#example_weights= pd.DataFrame(data=[1]*10 + [0]*(len(train) -20) + [1]*10)
ex=train.safe_loans
ex[:11]=1
ex[10:-10]=0

example_data_weights = np.ones(10*1).tolist() + [0.]*(len(train) - 20) + np.ones(1*10).tolist()
example_data_weights = np.array(example_data_weights)

ex2=pd.Series(len(train)* [1.5], index= train.index)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [18]:
small_tree_subset_20 = weighted_decision_tree_create(test, features_dummy, target,
                                                    ex2, max_depth=2)

--------------------------------------------------------------------
Subtree, depth = 1 (9284 data points).


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match

### small_tree_subset_20

### train[features_dummy]

In [None]:
evaluate_classificaton_error(small_tree_subset_20, train)

### adaboost

In [None]:
from math import log, exp

In [None]:
def adaboost_with_tree_stumps(data, features, target, num_tree_stumps):
    alpha = pd.Series(np.ones(len(data)), index=data.index)
    weights= []
    tree_stumps = []
    target_values = data[target]
    
    for t in range(num_tree_stumps):
        print(" ====================================")
        print(' Adaboost Iteration {}'.format(t))
        print('======================================')
        
        # learn a decision tree stump, depth 1
        tree_stump = weighted_decision_tree_create(data, features, target, data_weights=alpha, max_depth=1)
        tree_stumps.append(tree_stump)
        
        predictions = data.apply(lambda x: classify(tree_stump, x), axis=1)
        
        is_correct = (predictions == target_values)
        is_wrong   = (predictions != target_values)
        
        #compute weighted error
        #weighted_error= sum(alpha[is_wrong])/sum(alpha)
        #weighted_error = is_wrong.sum()/ float(len(alpha))
        weighted_error = round(float(sum(a for i,a in enumerate(alpha) if is_wrong[i]))/sum(alpha), 5)
        #compute model coef uing wei error
        weight= 0.5 * log((1- weighted_error)/weighted_error)
        weights.append(weight)
        
        #adjust weights on data point
        adjustment = is_correct.apply(lambda is_correct: exp(-weight) if is_correct else exp(weight))
        
        #scale alpha by multip by adjustements
        alpha = (alpha* adjustment) / float(sum(alpha))
        #print(alpha)
        
    return weights, tree_stumps

In [None]:
AdaB=adaboost_with_tree_stumps(train, features_dummy, target, num_tree_stumps=10)