In [1]:
import numpy as np
import turicreate as tc
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import math
import string
import json

In [2]:
loans = pd.read_csv('Week_3/lending-club-data.csv')

    

In [3]:
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x == 0 else -1)
loans.drop(['bad_loans'], axis = 1, inplace = True)

In [4]:
# safe_loans =  1 => safe
# safe_loans = -1 => risky
loans['safe_loans']

0         1
1        -1
2         1
3         1
4         1
         ..
122602   -1
122603    1
122604   -1
122605   -1
122606    1
Name: safe_loans, Length: 122607, dtype: int64

In [5]:
#percentage of safe loans
percentage = len(loans[loans['safe_loans'] == +1]) / loans['safe_loans'].count()
print(f'The percentage of safe loans is {percentage}')

The percentage of safe loans is 0.8111853319957262


In [6]:
features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
           ]

target = 'safe_loans'                    # prediction target (y) (+1 means safe, -1 is risky)

# Extract the feature columns and target column
loans_extracted = loans[features + [target]]
loans_extracted

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
0,B,B2,0,11,RENT,27.65,credit_card,36 months,1,1,83.7,0.00,1
1,C,C4,1,1,RENT,1.00,car,60 months,1,1,9.4,0.00,-1
2,C,C5,0,11,RENT,8.72,small_business,36 months,1,1,98.5,0.00,1
3,C,C1,0,11,RENT,20.00,other,36 months,0,1,21.0,16.97,1
4,A,A4,0,4,RENT,11.20,wedding,36 months,1,1,28.3,0.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
122602,E,E5,1,0,MORTGAGE,1.50,medical,60 months,0,0,14.6,0.00,-1
122603,D,D2,0,11,MORTGAGE,11.26,debt_consolidation,36 months,0,0,15.2,0.00,1
122604,D,D3,0,6,MORTGAGE,12.28,medical,60 months,0,0,10.7,0.00,-1
122605,D,D5,0,11,MORTGAGE,18.45,debt_consolidation,60 months,1,1,46.3,0.00,-1


In [7]:
with open('Week_3/module-5-assignment-1-validation-idx.json') as f:
    valid_idx = json.load(f)
with open('Week_3/module-5-assignment-1-train-idx.json') as f:
    train_idx = json.load(f)
    
train_data = loans_extracted.iloc[train_idx]
validation_data = loans_extracted.iloc[valid_idx]

In [8]:
safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]
print ("Number of safe loans  : %s" % len(safe_loans_raw))
print ("Number of risky loans : %s" % len(risky_loans_raw))

Number of safe loans  : 99457
Number of risky loans : 23150


In [9]:
print(len(train_data[train_data['safe_loans'] == +1]))
print(len(train_data[train_data['safe_loans'] == -1]))
print(train_data.shape)

18748
18476
(37224, 13)


In [10]:
pd.set_option('display.max_columns',13)
train_data.head(5)

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
1,C,C4,1,1,RENT,1.0,car,60 months,1,1,9.4,0.0,-1
6,F,F2,0,5,OWN,5.55,small_business,60 months,1,1,32.6,0.0,-1
7,B,B5,1,1,RENT,18.08,other,60 months,1,1,36.5,0.0,-1
10,C,C1,1,1,RENT,10.08,debt_consolidation,36 months,1,1,91.7,0.0,-1
12,B,B2,0,4,RENT,7.06,other,36 months,1,1,55.5,0.0,-1


In [116]:
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer   

In [112]:
mct = make_column_transformer((OneHotEncoder(), ['grade', 'sub_grade', 'home_ownership', 'purpose', 'term']), remainder='passthrough')

In [166]:
one_hot = OneHotEncoder()
categorical_features = ['grade', 'sub_grade', 'home_ownership', 'purpose', 'term']
transformer = ColumnTransformer([("one_hot", 
                                  one_hot, 
                                  categorical_features)],
                                  remainder = "passthrough")

transformed_train_data = transformer.fit_transform(train_data.drop(['safe_loans'], axis='columns')).toarray()
Y = train_data['safe_loans'].to_numpy().reshape(-1,1)
                                 


In [169]:
from sklearn.tree import DecisionTreeClassifier

decision_tree_model = DecisionTreeClassifier(max_depth = 6)
small_model = DecisionTreeClassifier(max_depth = 2)
decision_tree_model.fit(transformed_train_data, Y)
small_model.fit(transformed_train_data, Y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=2, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [170]:
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]

sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]

sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
sample_validation_data

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
19,B,B3,0,11,OWN,11.18,credit_card,36 months,1,1,82.4,0.0,1
79,D,D1,0,10,RENT,16.85,debt_consolidation,36 months,1,1,96.4,0.0,1
24,D,D2,0,3,RENT,13.97,other,60 months,0,1,59.5,0.0,-1
41,A,A5,0,11,MORTGAGE,16.33,debt_consolidation,36 months,1,1,62.1,0.0,-1


In [171]:
sample_X = transformer.transform(sample_validation_data.drop(['safe_loans'], axis='columns')).toarray()
sample_Y = sample_validation_data['safe_loans'].to_numpy().reshape(-1,1)
sample_results = decision_tree_model.predict(sample_X)
sample_results


array([ 1, -1, -1,  1])

In [175]:
decision_tree_model.classes_

array([-1,  1])

In [177]:
pd.DataFrame(decision_tree_model.predict_proba(sample_X)).sort_values(by=1, ascending=False)

Unnamed: 0,0,1
3,0.207895,0.792105
0,0.341565,0.658435
1,0.536306,0.463694
2,0.64751,0.35249


In [178]:
s_sample_results = small_model.predict(sample_X)
s_sample_results

array([ 1, -1, -1,  1])

In [181]:
transformed_valid_data = transformer.transform(validation_data.drop(['safe_loans'], axis='columns')).toarray()
valid_Y = validation_data['safe_loans'].to_numpy().reshape(-1,1)

In [182]:
print(decision_tree_model.score(transformed_valid_data, valid_Y))
print(small_model.score(transformed_valid_data, valid_Y))

0.6361482119775959
0.6193451098664369


In [184]:
big_model = DecisionTreeClassifier(max_depth = 10)
big_model.fit(transformed_train_data, Y)
big_model.score(transformed_valid_data, valid_Y)

0.6259155536406721

In [None]:
##Part 2

In [235]:
features = ['grade',              # grade of the loan
            'term',               # the term of the loan
            'home_ownership',     # home_ownership status: own, mortgage or rent
            'emp_length',         # number of years of employment
           ]
target = 'safe_loans'

In [236]:
with open('Week_3/module-5-assignment-2-test-idx.json') as f:
    test_idx = json.load(f)

In [237]:
train_data = loans.iloc[train_idx]
test_data = loans.iloc[test_idx]

In [261]:

contains_na = pd.DataFrame([x for x in train_data[train_data[features[-1]].isna()] if x == True])
contains_na
    

In [251]:
one_hot_2 = OneHotEncoder()

transformer_2 = ColumnTransformer([("one_hot_2", 
                                  one_hot_2, 
                                  features)],
                                  remainder = "passthrough")

transformed_train_data = transformer_2.fit_transform(train_data.drop(['safe_loans'], axis='columns'))
train_output = train_data['safe_loans'].to_numpy().reshape(-1,1)
transformed_test_data = transformer.transform(test_data.drop(['safe_loans'], axis='columns'))
test_output = test_data['safe_loans'].to_numpy().reshape(-1,1)

ValueError: Input contains NaN

In [234]:
transformed_train_data.shape

(37224, 77)

In [227]:
def intermediate_node_num_mistakes(labels_in_node):
    # Corner case: If labels_in_node is empty, return 0
    if len(labels_in_node) == 0:
        return 0    
    
    
    labels_in_node = np.array(labels_in_node)
    unique, counts = np.unique(labels_in_node, return_counts=True)
    counter = dict(zip(unique, counts))
    # Count the number of 1's (safe loans)
    one_is_larger = (counter[1] > counter[-1])
    
    # Count the number of -1's (risky loans)
    ## YOUR CODE HERE                
    # Return the number of mistakes that the majority classifier makes.
    if one_is_larger:
        return counter[-1]
    else:
        return counter[1]



In [228]:
# Test case 1
example_labels = [-1, -1, 1, 1, 1]
check = intermediate_node_num_mistakes(example_labels)
if check == 2:
    print ('Test passed!')
else:
    print ('Test 1 failed... try again!')

# Test case 2
example_labels = [-1, -1, 1, 1, 1, 1, 1]
check = intermediate_node_num_mistakes(example_labels)
if check == 2:
    print ('Test passed!')
else:
    print ('Test 2 failed... try again!')
    
# Test case 3
example_labels = [-1, -1, -1, -1, -1, 1, 1]
check = intermediate_node_num_mistakes(example_labels)
if check == 2:
    print ('Test passed!')
else:
    print ('Test 3 failed... try again!')

Test passed!
Test passed!
Test passed!


In [218]:
def best_splitting_feature(data, features, target):
    
    target_values = data[target]
    best_feature = None # Keep track of the best feature 
    best_error = 10     # Keep track of the best error so far 
    # Note: Since error is always <= 1, we should intialize it with something larger than 1.

    # Convert to float to make sure error gets computed correctly.
    num_data_points = float(len(data))  
    
    # Loop through each feature to consider splitting on that feature
    for feature in features:
        
        # The left split will have all data points where the feature value is 0
        left_split = data[data[feature] == 0]
        
        # The right split will have all data points where the feature value is 1
        ## YOUR CODE HERE
        right_split = data[data[feature] == 1]
            
        # Calculate the number of misclassified examples in the left split.
        # Remember that we implemented a function for this! (It was called intermediate_node_num_mistakes)
        # YOUR CODE HERE
        left_mistakes = intermediate_node_num_mistakes(left_split[target])             

        # Calculate the number of misclassified examples in the right split.
        ## YOUR CODE HERE
        right_mistakes = intermediate_node_num_mistakes(right_split[target])    
            
        # Compute the classification error of this split.
        # Error = (# of mistakes (left) + # of mistakes (right)) / (# of data points)
        ## YOUR CODE HERE
        error = (left_mistakes + right_mistakes) / num_data_points

        # If this is the best error we have found so far, store the feature as best_feature and the error as best_error
        ## YOUR CODE HERE
        if error < best_error:
            best_error = error
            best_feature = feature
    
    return best_feature # Return the best feature we found

In [230]:
if best_splitting_feature(train_data, features, 'safe_loans') == 'term':
    print( 'Test passed!')
else:
    print ('Test failed... try again!')

Test failed... try again!


In [217]:
{ 
   'is_leaf'            : True/False.
   'prediction'         : Prediction at the leaf node.
   'left'               : (dictionary corresponding to the left tree).
   'right'              : (dictionary corresponding to the right tree).
   'splitting_feature'  : The feature that this node splits on
}


SyntaxError: invalid syntax (<ipython-input-217-3656160c7d76>, line 3)

In [None]:
def create_leaf(target_values):    
    # Create a leaf node
    leaf = {'splitting_feature' : None,
            'left' : None,
            'right' : None,
            'is_leaf': True
           }   
    
    # Count the number of data points that are +1 and -1 in this node.
    num_ones = len(target_values[target_values == +1])
    num_minus_ones = len(target_values[target_values == -1])
    
    # For the leaf node, set the prediction to be the majority class.
    # Store the predicted class (1 or -1) in leaf['prediction']
    if num_ones > num_minus_ones:
        leaf['prediction'] =  1        
    else:
        leaf['prediction'] = -1       
        
    # Return the leaf node        
    return leaf

In [263]:
def decision_tree_create(data, features, target, current_depth = 0, max_depth = 10):
    remaining_features = features[:] # Make a copy of the features.
    
    target_values = data[target]
    print ("--------------------------------------------------------------------")
    print ("Subtree, depth = %s (%s data points)." % (current_depth, len(target_values)))
    

    # Stopping condition 1
    # (Check if there are mistakes at current node.
    cond1 = intermediate_node_num_mistakes(target_values)
    # Recall you wrote a function intermediate_node_num_mistakes to compute this.)
    if cond1 == 0:  ## YOUR CODE HERE
        print ("Stopping condition 1 reached.")     
        # If not mistakes at current node, make current node a leaf node
        return create_leaf(target_values)
    
    # Stopping condition 2 (check if there are remaining features to consider splitting on)
    if remaining_features == []:   ## YOUR CODE HERE
        print ("Stopping condition 2 reached.")    
        # If there are no remaining features to consider, make current node a leaf node
        return create_leaf(target_values)    
    
    # Additional stopping condition (limit tree depth)
    if current_depth >= max_depth:  ## YOUR CODE HERE
        print ("Reached maximum depth. Stopping for now.")
        # If the max tree depth has been reached, make current node a leaf node
        return create_leaf(target_values)

    # Find the best splitting feature (recall the function best_splitting_feature implemented above)
    ## YOUR CODE HERE
    splitting_feature = best_splitting_feature(data, remaining_features, target)
    
    # Split on the best feature that we found. 
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]
    remaining_features.remove(splitting_feature)
    print ("Split on feature %s. (%s, %s)" % (splitting_feature, len(left_split), len(right_split)))
    
    # Create a leaf node if the split is "perfect"
    if len(left_split) == len(data):
        print ("Creating leaf node.")
        return create_leaf(left_split[target])
    if len(right_split) == len(data):
        print ("Creating leaf node.")
        return create_leaf(right_split[target])

        
    # Repeat (recurse) on left and right subtrees
    left_tree = decision_tree_create(left_split, remaining_features, target, current_depth + 1, max_depth)        
    ## YOUR CODE HERE
    right_tree = decision_tree_create(right_split, remaining_features, target, current_depth + 1, max_depth)

    return {'is_leaf'          : False, 
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'left'             : left_tree, 
            'right'            : right_tree}

In [266]:
def classify(tree, x, annotate = False):
       # if the node is a leaf node.
    if tree['is_leaf']:
        if annotate:
             print ("At leaf, predicting %s" % tree['prediction'])
        return tree['prediction']
    else:
        # split on feature.
        split_feature_value = x[tree['splitting_feature']]
        if annotate:
             print ("Split on %s = %s" % (tree['splitting_feature'], split_feature_value))
        if split_feature_value == 0:
            return classify(tree['left'], x, annotate)
        else:
            return classify(tree['right'], x, annotate)

In [267]:
def evaluate_classification_error(tree, data):
    # Apply the classify(tree, x) to each row in your data
    prediction = data.apply(lambda x: classify(tree, x))

In [268]:
def print_stump(tree, name = 'root'):
    split_name = tree['splitting_feature'] # split_name is something like 'term. 36 months'
    if split_name is None:
        print "(leaf, label: %s)" % tree['prediction']
        return None
    split_feature, split_value = split_name.split('.')
    print '                       %s' % name
    print '         |---------------|----------------|'
    print '         |                                |'
    print '         |                                |'
    print '         |                                |'
    print '  [{0} == 0]               [{0} == 1]    '.format(split_name)
    print '         |                                |'
    print '         |                                |'
    print '         |                                |'
    print '    (%s)                         (%s)' \
        % (('leaf, label: ' + str(tree['left']['prediction']) if tree['left']['is_leaf'] else 'subtree'),
           ('leaf, label: ' + str(tree['right']['prediction']) if tree['right']['is_leaf'] else 'subtree'))

SyntaxError: invalid syntax (<ipython-input-268-e804781c764a>, line 4)