# Title

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

### Read data

In [7]:
# Read only the first 100 rows of the loan file so the notebook stays fast
# while the pipeline is being developed.
loan_csv_path = './data/loan_sub/loan_sub.csv'
data = pd.read_csv(loan_csv_path, nrows=100)

# Quick look at the first two records, then the column/dtype summary.
# Rows with missing values are intentionally NOT dropped at this stage.
preview = data.head(2)
print(preview)
data.info()

        id  member_id  loan_amnt  funded_amnt  funded_amnt_inv        term  \
0  1077501    1296599       5000         5000             4975   36 months   
1  1077430    1314167       2500         2500             2500   60 months   

   int_rate  installment grade sub_grade  ... sub_grade_num delinq_2yrs_zero  \
0     10.65       162.87     B        B2  ...           0.4                1   
1     15.27        59.83     C        C4  ...           0.8                1   

  pub_rec_zero  collections_12_mths_zero short_emp payment_inc_ratio  \
0            1                         1         0            8.1435   
1            1                         1         1            2.3932   

           final_d last_delinq_none last_record_none last_major_derog_none  
0  20141201T000000                1                1                     1  
1  20161201T000000                1                1                     1  

[2 rows x 68 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100

### Analyze data and balance data for training

In [12]:

# Encode the target as +1 (safe loan) / -1 (bad loan). The mapping is
# vectorized and matches the original rule "-1 if bad_loans == 1 else +1".
data['safe_loans'] = data['bad_loans'].eq(1).map({True: -1, False: 1})
good_bad_loan_counts = data['safe_loans'].value_counts()
print(good_bad_loan_counts)

all_good_loans = data[data['safe_loans'] == 1]
all_bad_loans = data[data['safe_loans'] == -1]

# Ratio of bad to good loans. Computed from the row counts so that a missing
# class yields 0 / nan instead of a KeyError on the value_counts lookup.
percentage = len(all_bad_loans) / len(all_good_loans) if len(all_good_loans) else float('nan')
print(f'percentage: {percentage}')

# Balance the classes by down-sampling whichever class is larger to the size
# of the smaller one. Sampling by an explicit row count (n=) avoids the
# rounding of frac * len and also works when bad loans outnumber good loans
# (frac > 1 would raise without replacement).
n_per_class = min(len(all_good_loans), len(all_bad_loans))
good_loans = all_good_loans.sample(n=n_per_class, random_state=0)
if len(all_bad_loans) > n_per_class:
    bad_loans = all_bad_loans.sample(n=n_per_class, random_state=0)
else:
    bad_loans = all_bad_loans
dataset = pd.concat([good_loans, bad_loans], axis=0)

# Show the balanced class counts explicitly: a bare expression in the middle
# of a cell is not displayed by Jupyter.
print(dataset['safe_loans'].value_counts())
dataset.info()

safe_loans
 1    81
-1    19
Name: count, dtype: int64
percentage: 0.2345679012345679
<class 'pandas.core.frame.DataFrame'>
Index: 38 entries, 31 to 93
Data columns (total 69 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           38 non-null     int64  
 1   member_id                    38 non-null     int64  
 2   loan_amnt                    38 non-null     int64  
 3   funded_amnt                  38 non-null     int64  
 4   funded_amnt_inv              38 non-null     int64  
 5   term                         38 non-null     object 
 6   int_rate                     38 non-null     float64
 7   installment                  38 non-null     float64
 8   grade                        38 non-null     object 
 9   sub_grade                    38 non-null     object 
 10  emp_title                    37 non-null     object 
 11  emp_length                   38 non-null     object 
 12

### Process data


In [13]:
def generate_dummy_columns(data: pd.DataFrame, columns : list) -> pd.DataFrame :
    """Replace each listed column with one indicator column per distinct value.

    Rows containing a missing value in any column are dropped first. For every
    name in ``columns`` the indicator columns are called ``<name>_<value>`` and
    are appended in order of first appearance of each value; the source column
    itself is removed. The input frame is not modified.

    Args:
        data: Frame holding the columns to encode.
        columns: Names of the columns to one-hot encode.

    Returns:
        A new frame with the encoded columns appended after the untouched ones.
    """
    encoded = data.dropna()
    for name in columns:
        # Indicator names in order of first appearance rather than the sorted
        # order that get_dummies would produce on its own.
        ordered_names = [f"{name}_{value}" for value in encoded[name].map(str).unique()]
        indicators = pd.get_dummies(encoded[name], prefix=name)[ordered_names]
        encoded = pd.concat([encoded.drop(columns=name), indicators], axis=1)
    return encoded

In [17]:
# Keep only the categorical columns used for modelling (plus the label) and
# turn each of them into one-hot indicator columns.
# NOTE(review): 'bad_loans' is the label, yet it is one-hot encoded here too
# (producing bad_loans_0 / bad_loans_1) - confirm this is intended before
# these columns are used as model features.
cols = ['grade', 'term', 'home_ownership', 'emp_length', 'bad_loans']
dataset = dataset.loc[:, cols]
dataset_2 = generate_dummy_columns(data=dataset, columns=cols)
dataset_2.info()


<class 'pandas.core.frame.DataFrame'>
Index: 38 entries, 31 to 93
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   grade_B                  38 non-null     bool 
 1   grade_A                  38 non-null     bool 
 2   grade_E                  38 non-null     bool 
 3   grade_F                  38 non-null     bool 
 4   grade_C                  38 non-null     bool 
 5   grade_D                  38 non-null     bool 
 6   term_ 36 months          38 non-null     bool 
 7   term_ 60 months          38 non-null     bool 
 8   home_ownership_RENT      38 non-null     bool 
 9   home_ownership_OWN       38 non-null     bool 
 10  home_ownership_MORTGAGE  38 non-null     bool 
 11  emp_length_10+ years     38 non-null     bool 
 12  emp_length_2 years       38 non-null     bool 
 13  emp_length_7 years       38 non-null     bool 
 14  emp_length_< 1 year      38 non-null     bool 
 15  emp_length_4

Index(['grade_B', 'grade_A', 'grade_E', 'grade_F', 'grade_C', 'grade_D',
       'term_ 36 months', 'term_ 60 months', 'home_ownership_RENT',
       'home_ownership_OWN', 'home_ownership_MORTGAGE', 'emp_length_10+ years',
       'emp_length_2 years', 'emp_length_7 years', 'emp_length_< 1 year',
       'emp_length_4 years', 'emp_length_1 year', 'emp_length_6 years',
       'emp_length_3 years', 'emp_length_9 years', 'emp_length_8 years',
       'emp_length_5 years', 'bad_loans_0', 'bad_loans_1'],
      dtype='object')

### Build Model

In [18]:
# fit()
# predict()
    # dfs visit from the root feature and use the leaf node for prediction 
# _create_tree()
    # termination: max_depth reached, no more splits possible, or error is good enough -> create a leaf
    # find the best feature
    # split and build left/right tree
# _get_best_features

class TreeNode:
    """One node of a binary decision tree built on 0/1 indicator features.

    An internal node stores the feature it splits on and two children
    (``left`` for rows where the feature is 0, ``right`` for rows where it is
    1). A leaf has no children and stores the label it predicts.
    """

    def __init__(self, feature_name: str, prediction=None) -> None:
        """Create a node.

        Args:
            feature_name: Column the node splits on (``None`` for a leaf).
            prediction: Label returned by a leaf (``None`` for internal nodes).
        """
        self.feature_name = feature_name
        self.prediction = prediction
        self.left = None
        self.right = None

    @property
    def is_leaf(self) -> bool:
        """True when the node has no children."""
        return self.left is None and self.right is None


class LoanDecisionTree:
    """Greedy binary decision tree classifier for 0/1 (one-hot) features.

    At every node the feature whose split yields the fewest majority-vote
    mistakes is chosen. Growth stops when ``max_depth`` is reached, no
    features remain, the node already makes at most ``min_error`` mistakes,
    or the best split would leave one side empty.
    """

    def __init__(self, min_error = 2, max_depth = 3) -> None:
        """Store the stopping criteria.

        Args:
            min_error: A node whose majority-vote prediction misclassifies at
                most this many training rows is turned into a leaf.
            max_depth: Maximum number of splits on any root-to-leaf path.
        """
        self.min_error = min_error
        self.max_depth = max_depth
        self.root = None
        self.label_column = None

    def fit(self, X : pd.DataFrame, Y : pd.Series):
        """Build the tree from features ``X`` and labels ``Y``.

        Args:
            X: Frame of 0/1 (or boolean) feature columns.
            Y: Labels aligned with ``X`` by index. A single-column DataFrame
                is also accepted; its first column is used.

        Raises:
            ValueError: If the label name collides with a feature column.
        """
        if isinstance(Y, pd.DataFrame):
            Y = Y.iloc[:, 0]
        label = Y.name if Y.name is not None else 'label'
        if label in X.columns:
            raise ValueError(f"label column {label!r} is also a feature column")
        # The label name must be known before the tree is built, because the
        # recursive construction reads the labels out of the combined frame.
        self.label_column = label
        data = pd.concat([X, Y.rename(label)], axis=1)
        self.root = self._create_tree(data, list(X.columns), 0)

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Returns:
            A Series of predicted labels indexed like ``X``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.root is None:
            raise RuntimeError('fit() must be called before predict()')
        predictions = [self._predict_row(row) for _, row in X.iterrows()]
        return pd.Series(predictions, index=X.index, name=self.label_column)

    def _predict_row(self, row):
        """Walk from the root to a leaf for one row and return its label."""
        node = self.root
        while not node.is_leaf:
            # Same convention as training: value 1 goes right, anything else left.
            node = node.right if row[node.feature_name] == 1 else node.left
        return node.prediction

    def _create_tree(self, data : pd.DataFrame, remaining_features: list, depth: int) -> TreeNode:
        """Recursively build the subtree for ``data``.

        Args:
            data: Rows reaching this node, including the label column.
            remaining_features: Features not yet used on the path to this node.
            depth: Number of splits already made above this node.
        """
        # Accept any iterable (list or pandas Index) of feature names.
        remaining = list(remaining_features)
        labels = data[self.label_column]
        if (depth >= self.max_depth
                or len(remaining) == 0
                or self._count_errors(labels) <= self.min_error):
            return self._create_leaf(data)

        # current node
        best_feature = self._get_best_feature(data, remaining)

        # build left/right node
        left_split_data = data[data[best_feature] == 0]
        right_split_data = data[data[best_feature] == 1]
        if left_split_data.empty or right_split_data.empty:
            # The best available split does not separate the rows at all.
            return self._create_leaf(data)

        node = TreeNode(best_feature)
        new_remaining_features = [name for name in remaining if name != best_feature]
        node.left = self._create_tree(left_split_data, new_remaining_features, depth + 1)
        node.right = self._create_tree(right_split_data, new_remaining_features, depth + 1)
        return node

    def _create_leaf(self, data: pd.DataFrame) -> TreeNode:
        """Create a leaf predicting the most frequent label in ``data``.

        Ties are resolved in favour of the smallest label; an empty ``data``
        produces a leaf whose prediction is ``None``.
        """
        most_common = data[self.label_column].mode()
        prediction = most_common.iloc[0] if not most_common.empty else None
        return TreeNode(None, prediction=prediction)

    def _get_best_feature(self, data: pd.DataFrame, remaining_features: list) -> str:
        """Return the feature whose 0/1 split makes the fewest mistakes.

        The mistakes of a split are the majority-vote mistakes of its two
        sides added together. Ties go to the feature listed first.
        """
        labels = data[self.label_column]
        best_feature = None
        fewest_errors = None
        for feature in remaining_features:
            errors = (self._count_errors(labels[data[feature] == 0])
                      + self._count_errors(labels[data[feature] == 1]))
            if fewest_errors is None or errors < fewest_errors:
                best_feature, fewest_errors = feature, errors
        return best_feature

    @staticmethod
    def _count_errors(labels: pd.Series) -> int:
        """Number of rows a majority-vote prediction on ``labels`` gets wrong."""
        if labels.empty:
            return 0
        return len(labels) - int(labels.value_counts(dropna=False).max())

NameError: name 'TreeNode' is not defined

### Train model

### Evaluate model

### Predict on test dataset


### Output

In [None]:

# df = pd.DataFrame()
# df.to_csv('submission.csv',index = False, header=True)