# Loan analysis with decision tree (manual model)

In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from typing import Literal
import graphviz
from sklearn.tree import export_graphviz

### Read data

In [52]:
# Load the raw loan records, then preview the first rows and the column schema.
loan_csv_path = './data/loan_sub/loan_sub.csv'
data = pd.read_csv(loan_csv_path)
print(data.head(2))
data.info()

        id  member_id  loan_amnt  funded_amnt  funded_amnt_inv        term  \
0  1077501    1296599       5000         5000             4975   36 months   
1  1077430    1314167       2500         2500             2500   60 months   

   int_rate  installment grade sub_grade  ... sub_grade_num delinq_2yrs_zero  \
0     10.65       162.87     B        B2  ...           0.4              1.0   
1     15.27        59.83     C        C4  ...           0.8              1.0   

  pub_rec_zero  collections_12_mths_zero short_emp payment_inc_ratio  \
0          1.0                       1.0         0            8.1435   
1          1.0                       1.0         1            2.3932   

           final_d last_delinq_none last_record_none last_major_derog_none  
0  20141201T000000                1                1                     1  
1  20161201T000000                1                1                     1  

[2 rows x 68 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122

  data = pd.read_csv('./data/loan_sub/loan_sub.csv')


### Analyze data and balance data for training

In [53]:

# Label encoding used from here on: +1 = safe loan, -1 = bad loan.
# np.where is the vectorized equivalent of the per-row lambda
# (anything that is not exactly 1 in `bad_loans` maps to +1).
data['safe_loans'] = np.where(data['bad_loans'] == 1, -1, 1)
good_bad_loan_counts = data['safe_loans'].value_counts()
print(good_bad_loan_counts)

# Ratio of bad to good loans. The variable keeps its historical name so that
# anything reading `percentage` still works. `.loc` makes it explicit that
# -1 and 1 are index *labels* here, not positions.
percentage = good_bad_loan_counts.loc[-1] / good_bad_loan_counts.loc[1]
print(f'percentage: {percentage}')

# Undersample the good loans to exactly the number of bad loans.
# Sampling by `n` (instead of a float `frac`) guarantees a 1:1 balance rather
# than depending on how frac * len(...) happens to round.
bad_loans = data[data['safe_loans'] == -1]
good_loans = data[data['safe_loans'] == 1].sample(n=len(bad_loans), random_state=0)
dataset = pd.concat([good_loans, bad_loans], axis=0)
print(dataset['safe_loans'].value_counts())
dataset.info()

safe_loans
 1    99457
-1    23150
Name: count, dtype: int64
percentage: 0.2327639080205516
safe_loans
 1    23150
-1    23150
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 46300 entries, 81182 to 122605
Data columns (total 69 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           46300 non-null  int64  
 1   member_id                    46300 non-null  int64  
 2   loan_amnt                    46300 non-null  int64  
 3   funded_amnt                  46300 non-null  int64  
 4   funded_amnt_inv              46300 non-null  int64  
 5   term                         46300 non-null  object 
 6   int_rate                     46300 non-null  float64
 7   installment                  46300 non-null  float64
 8   grade                        46300 non-null  object 
 9   sub_grade                    46300 non-null  object 
 10  emp_title                    43324 no

### Process data


In [57]:
def generate_dummy_columns(data: pd.DataFrame, columns : list) -> pd.DataFrame :
    """One-hot encode ``columns`` and return a new frame without the originals.

    For every column in ``columns`` a set of ``<column>_<value>`` indicator
    columns is produced with ``pd.get_dummies``; the indicator columns are
    ordered by first appearance of each value in the data. The result holds
    the untouched columns of ``data`` first, followed by the indicator
    columns of each encoded column in the order given. ``data`` itself is
    not modified.

    Missing values do not get an indicator column: a row whose value is
    missing has all indicators of that column set to False. (Previously a
    missing value produced a ``<column>_nan`` name that ``get_dummies`` never
    creates, which raised ``KeyError``.)

    Args:
        data: Frame containing the columns to encode.
        columns: Names of the columns to replace by indicator columns.

    Returns:
        The encoded frame, or ``data`` unchanged when ``columns`` is empty.

    Raises:
        KeyError: If a name in ``columns`` is not a column of ``data``.
    """
    dummy_frames = []
    for col in columns:
        # Drop missing values before building the expected names, because
        # get_dummies (with its default settings) emits no column for them.
        levels = data[col].dropna().map(str).unique()
        new_cols = [col + '_' + level for level in levels]
        # Re-index by new_cols to get "first appearance" column order.
        dummy_frames.append(pd.get_dummies(data[col], prefix=col)[new_cols])

    if not dummy_frames:
        return data

    # Single concat instead of growing the frame once per encoded column.
    return pd.concat([data.drop(columns=list(columns))] + dummy_frames, axis=1)

In [84]:
# Keep the four categorical predictors plus the label, drop rows with missing
# values, one-hot encode the predictors and hold out 20% for validation.
cols = ['grade', 'term', 'home_ownership', 'emp_length']
dataset1 = dataset[[*cols, 'safe_loans']]
dataset = dataset1.dropna()

Y = dataset['safe_loans']
X = generate_dummy_columns(dataset[cols], cols)
X.info()

x_train, x_valid, y_train, y_valid = train_test_split(
    X, Y, test_size=0.2, random_state=0
)
x_train.columns


<class 'pandas.core.frame.DataFrame'>
Index: 44492 entries, 81182 to 122605
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   grade_C                  44492 non-null  bool 
 1   grade_B                  44492 non-null  bool 
 2   grade_E                  44492 non-null  bool 
 3   grade_D                  44492 non-null  bool 
 4   grade_A                  44492 non-null  bool 
 5   grade_F                  44492 non-null  bool 
 6   grade_G                  44492 non-null  bool 
 7   term_ 36 months          44492 non-null  bool 
 8   term_ 60 months          44492 non-null  bool 
 9   home_ownership_RENT      44492 non-null  bool 
 10  home_ownership_MORTGAGE  44492 non-null  bool 
 11  home_ownership_OWN       44492 non-null  bool 
 12  home_ownership_OTHER     44492 non-null  bool 
 13  emp_length_2 years       44492 non-null  bool 
 14  emp_length_10+ years     44492 non-null  bool 
 15  em

Index(['grade_C', 'grade_B', 'grade_E', 'grade_D', 'grade_A', 'grade_F',
       'grade_G', 'term_ 36 months', 'term_ 60 months', 'home_ownership_RENT',
       'home_ownership_MORTGAGE', 'home_ownership_OWN', 'home_ownership_OTHER',
       'emp_length_2 years', 'emp_length_10+ years', 'emp_length_7 years',
       'emp_length_6 years', 'emp_length_< 1 year', 'emp_length_5 years',
       'emp_length_1 year', 'emp_length_3 years', 'emp_length_8 years',
       'emp_length_4 years', 'emp_length_9 years'],
      dtype='object')

### Build Model

In [87]:

# _get_best_features
def get_best_feature(remaining_features: list, data: pd.DataFrame, label_column: str, algo : Literal['error_count','entropy']) -> str:
    """Pick the feature to split on, using the requested criterion.

    Args:
        remaining_features: Candidate feature (column) names.
        data: Frame holding the candidate features and the label column.
        label_column: Name of the label column in ``data``.
        algo: ``'error_count'`` delegates to
            ``get_best_feature_by_error_count``; ``'entropy'`` delegates to
            ``get_best_feature_by_entropy``.

    Returns:
        The feature name chosen by the selected criterion.

    Raises:
        ValueError: If ``algo`` is not one of the two supported names
            (previously this case silently returned ``None``).
    """
    if algo == 'error_count':
        return get_best_feature_by_error_count(remaining_features, data, label_column)
    if algo == 'entropy':
        return get_best_feature_by_entropy(remaining_features, data, label_column)
    raise ValueError(f"unknown algo {algo!r}; expected 'error_count' or 'entropy'")
    
def count_error(data: pd.DataFrame, label_column: str) -> int :
    """Return the size of the smaller of the two label groups in ``data``.

    The groups are "label equals 1" and "everything else", so the result is
    the number of rows that would be wrong if every row were given the
    majority label.
    """
    positives = (data[label_column] == 1).sum()
    others = len(data) - positives
    return min(positives, others)

def get_best_feature_by_error_count(remaining_features: list, data: pd.DataFrame, label_column: str) -> str:
    """Return the feature whose 0/1 split yields the lowest error rate.

    For each candidate the rows are divided into "feature == 0" and
    "feature == 1"; ``count_error`` gives the number of minority-label rows
    on each side, and their sum divided by ``len(data)`` is the error rate.
    The first feature with the strictly lowest rate wins. An empty string is
    returned when no candidate achieves a rate below 1 (for example when
    ``remaining_features`` is empty).
    """
    n_rows = float(len(data))
    winner = ''
    lowest_rate = 1
    for candidate in remaining_features:
        column = data[candidate]
        # Rows that disagree with the majority label on each side of the split.
        errors_when_off = count_error(data[column == 0], label_column)
        errors_when_on = count_error(data[column == 1], label_column)
        rate = (errors_when_off + errors_when_on) * 1.0 / n_rows
        if rate < lowest_rate:
            winner, lowest_rate = candidate, rate
    return winner

def get_best_feature_by_entropy(remaining_features: list, data: pd.DataFrame, label_column: str) -> str:
    """Return the feature whose 0/1 split gives the largest entropy reduction.

    The gain of a candidate is the label entropy of ``data`` minus the
    size-weighted entropies of the "feature == 0" and "feature == 1" subsets.
    The first feature with the strictly largest gain wins; an empty string is
    returned when ``remaining_features`` is empty.
    """
    parent_entropy = entropy(data[label_column])
    n_rows = len(data)
    winner = ''
    top_gain = float('-inf')

    for candidate in remaining_features:
        column = data[candidate]
        off_rows = data[column == 0]
        on_rows = data[column == 1]

        # Entropy after the split: each side weighted by its share of the rows.
        weighted_entropy = (
            entropy(off_rows[label_column]) * len(off_rows) / n_rows
            + entropy(on_rows[label_column]) * len(on_rows) / n_rows
        )
        gain = parent_entropy - weighted_entropy

        if gain > top_gain:
            winner, top_gain = candidate, gain

    return winner

def entropy(Y: pd.Series):
    """Binary entropy (in bits) of the "equals 1" vs "anything else" split of ``Y``.

    Returns the integer 0 when ``Y`` is empty or all values fall in one group;
    otherwise returns ``-p1*log2(p1) - p0*log2(p0)`` where ``p1`` is the
    fraction of entries equal to 1.
    """
    total = len(Y)
    positives = (Y == 1).sum()
    # Pure (or empty) input: no uncertainty, and log2(0) must be avoided.
    if positives == total or positives == 0:
        return 0

    frac_pos = positives / total
    frac_neg = 1 - frac_pos
    return -frac_pos * np.log2(frac_pos) - frac_neg * np.log2(frac_neg)

In [90]:
# fit()
# predict()
    # dfs visit from the root feature and use the leaf node for prediction 
# _create_tree()
    # termination: max_depth, no more split or good enough, reaching leaf
    # find the best feature
    # split and build left/right tree

class TreeNode:
    """A single node of the hand-built decision tree.

    Attributes are plain fields filled in by whoever builds the tree:
    ``feature_name`` is the feature tested at this node, ``left`` / ``right``
    are the child nodes, and ``prediction`` is the label returned when
    ``is_leaf`` is True.
    """

    def __init__(self, feature_name: str) -> None:
        self.feature_name = feature_name
        # A fresh node has no children and is not a leaf until marked as one.
        self.left = self.right = None
        self.prediction, self.is_leaf = 0, False

class LoanDecisionTree:
    def __init__(self, min_error_rate =0.3, max_depth = 4) -> None:
        self.min_error_rate = min_error_rate
        self.max_depth = max_depth
        self.label_column = ""
    
    def fit(self, X : pd.DataFrame, Y : pd.Series):
        data = pd.concat([X, Y], axis=1)
        self.label_column = Y.name
        self.root = self._create_tree(data, X.columns, 0)
        
    
    def predict(self, X: pd.DataFrame):
        return X.apply(lambda d: self._predict_single(d, self.root), axis=1)

    def _predict_single(self, row_data, node=None):
        if node == None:
            node = self.root
        if node.is_leaf:
            return node.prediction
        
        if row_data[node.feature_name] == 0:
            return self._predict_single(row_data, node.left)
        else:
            return self._predict_single(row_data, node.right)
    
    def _create_tree(self, data : pd.DataFrame, remaining_features: list, depth: int) -> TreeNode:
        if depth == self.max_depth or len(remaining_features) == 0:
            print(f'terminated at depth {depth}')
            return self._create_leaf(data[self.label_column], '')        
        
        # current node
        best_feature = get_best_feature(remaining_features, data, self.label_column, 'entropy')
        node = TreeNode(best_feature)
                
        # build left/right node
        left_split_data = data[data[best_feature] == 0]
        right_split_data = data[data[best_feature] == 1]
        print(f'split on feature : {best_feature}, {len(left_split_data)}, {len(right_split_data)}'  )

        if(len(left_split_data) == len(data)):
            print('perfect left split')
            return self._create_leaf(left_split_data[self.label_column] , best_feature)
        elif(len(right_split_data) == len(data)):
            print('perfect right split')
            return self._create_leaf(right_split_data[self.label_column] , best_feature)
        
        new_remaining_features = remaining_features.drop(best_feature)
        left_node = self._create_tree(left_split_data, new_remaining_features, depth + 1)
        right_node = self._create_tree(right_split_data, new_remaining_features, depth + 1)
        
        node.left = left_node
        node.right = right_node
        
        return node        
        
    def _create_leaf(self, Y: pd.Series, feature_name: str = ''):
        positive_count = (Y == 1).sum()
        prediction =  1 if positive_count/len(Y) >= 0.5 else -1
        node = TreeNode(feature_name)
        node.prediction = prediction
        node.is_leaf = True         
        return node
    
    def count_leaves(self):
        return self.count_leaves_helper(self.root)
    
    def count_leaves_helper(self, tree):
        if tree.is_leaf:
            return 1
        return self.count_leaves_helper(tree.left) + self.count_leaves_helper(tree.right)

    def score(self, test_x, test_y):
        pred = self.predict(test_x)
        return accuracy_score(test_y, pred)

# Fit a depth-10 tree, then report validation accuracy and number of leaves.
model = LoanDecisionTree(min_error_rate=1e-8, max_depth=10)
model.fit(x_train, y_train)
validation_accuracy = model.score(x_valid, y_valid)
print(validation_accuracy)
print(model.count_leaves())


split on feature : grade_A, 30630, 4963
split on feature : grade_B, 20834, 9796
split on feature : grade_C, 11814, 9020
split on feature : term_ 36 months, 5757, 6057
split on feature : grade_D, 3757, 2000
split on feature : home_ownership_RENT, 2205, 1552
split on feature : grade_E, 909, 1296
split on feature : emp_length_9 years, 873, 36
split on feature : emp_length_3 years, 818, 55
split on feature : emp_length_5 years, 742, 76
terminated at depth 10
terminated at depth 10
split on feature : grade_F, 12, 43
terminated at depth 10
terminated at depth 10
split on feature : grade_F, 8, 28
split on feature : home_ownership_MORTGAGE, 1, 7
terminated at depth 10
terminated at depth 10
split on feature : home_ownership_MORTGAGE, 5, 23
terminated at depth 10
terminated at depth 10
split on feature : emp_length_10+ years, 817, 479
split on feature : home_ownership_MORTGAGE, 105, 712
split on feature : emp_length_5 years, 92, 13
terminated at depth 10
terminated at depth 10
split on feature 

In [None]:
# Fit a shallower depth-4 tree and report the same two numbers for comparison.
model_2 = LoanDecisionTree(min_error_rate=1e-8, max_depth=4)
model_2.fit(x_train, y_train)
validation_accuracy_2 = model_2.score(x_valid, y_valid)
print(validation_accuracy_2)
print(model_2.count_leaves())

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 14)

### Train model

AttributeError: 'LoanDecisionTree' object has no attribute 'label_column'

### Evaluate model

### Predict on test dataset


### Output

In [None]:

# df = pd.DataFrame()
# df.to_csv('submission.csv',index = False, header=True)