### Simple Decision Tree Classifier
# Steps:
- First, encode the categorical columns using `LabelEncoder` or one-hot encoding (`pd.get_dummies`); the choice makes no difference for this dataset.
- Split the data with `train_test_split`, using `stratify`.
- Create an instance of `DecisionTreeClassifier`.
- Set up the parameter grid (for the hyperparameters).
- Before training, drop unnecessary columns such as `LoanID`.
- Run a randomized search over `param_grid` to find the best parameters (this trains the model).
- Use the best parameters to predict labels for the test data.

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score

# Load the training data and the test data, then print the column index of
# each (separated by a blank line) so the two schemas can be compared.
df, test_df = pd.read_csv('./data/train.csv'), pd.read_csv('./data/test.csv')
print(df.columns, '', test_df.columns, sep='\n')

# Collect the columns that need categorical encoding: every column other than
# the LoanID identifier whose dtype is not one of the plain numeric dtypes.
# Object (string) columns are not numeric, so they are selected by the same
# test.
to_encode = [
    name
    for name in df.columns
    if name != 'LoanID'
    and df[name].dtype not in ['float64', 'int64', 'float', 'int']
]

# for column in to_encode:
#     le = LabelEncoder() #for now let's use labelEncoder
#     df[column] = le.fit_transform(df[column])
#     test_df[column] = le.transform(test_df[column])
    # later use le.inverse_transform if needed

# One-hot encode the categorical columns.
#
# get_dummies derives the dummy columns from the values present in the frame
# it is given. Encoding the training and test frames independently therefore
# produces different columns whenever a category is missing from one of them,
# and with drop_first=True it can even drop a different baseline category.
# Pinning both frames to the category list observed in the training data makes
# get_dummies emit the same columns, in the same order, for both frames.
for column in to_encode:
    train_categories = df[column].astype('category').cat.categories
    df[column] = pd.Categorical(df[column], categories=train_categories)
    # Test values never seen in training become NaN here and are encoded as
    # all zeros, i.e. identically to the dropped baseline category.
    test_df[column] = pd.Categorical(test_df[column], categories=train_categories)

df = pd.get_dummies(df, columns=to_encode, drop_first=True, dtype=int)
test_df = pd.get_dummies(test_df, columns=to_encode, drop_first=True, dtype=int)

print('done that')


# The categorical columns are encoded at this point; drop the LoanID column
# so that it is not used as a feature.
df = df.drop('LoanID', axis=1)

# Hold out 10% of the rows for validation, stratified on the 'Default' target.
train_df, validation_df = train_test_split(
    df,
    test_size=0.1,
    stratify=df['Default'],
    random_state=17,
)

# Separate the feature columns from the 'Default' target in each part.
x_train_df, y_train_df = train_df.drop(columns=['Default']), train_df['Default']
x_validation_df, y_validation_df = (
    validation_df.drop(columns=['Default']),
    validation_df['Default'],
)


# Base estimator for the search. It is seeded with the same value as the
# train/validation split and the search itself: an unseeded decision tree
# permutes the candidate features randomly at each split, so tied splits could
# otherwise be resolved differently from one run to the next.
tree_clf = tree.DecisionTreeClassifier(random_state=17)

# Candidate hyperparameter values sampled by the randomized search.
param_grid = {
    'criterion': ['gini', 'entropy'],  # Split quality measure
    'max_depth': [4, 6, 8, 10, 20],    # Max depth of the tree
    'min_samples_split': [3, 5, 8],    # Min number of samples req to split a node
    'min_samples_leaf': [2, 3, 10],    # Min number of samples req to be a leaf node
    'max_leaf_nodes': [40, 50, 60],    # Max number of leaf nodes
}

# Randomized search with the library defaults for everything not given here
# (5-fold cross validation per the scikit-learn documentation).
rs = RandomizedSearchCV(
    estimator=tree_clf,
    param_distributions=param_grid,
    random_state=17,
)
rs.fit(x_train_df, y_train_df)
print("Best Parameters:", rs.best_params_)


# Evaluate the fitted search object (not the bare tree_clf) on both splits.
# y_pred holds the validation predictions.
y_pred = rs.predict(x_validation_df)

valid_acc = accuracy_score(y_validation_df, y_pred)
train_acc = accuracy_score(y_train_df, rs.predict(x_train_df))

print(
    f'Training accuracy {train_acc}',
    f'Validation accuracy {valid_acc}',
    sep='\n',
)




# Build the submission frame: keep the LoanID column of the test data and add
# a 'Default' column with the predictions made from all remaining columns,
# then write it out without the index.
newdf = test_df[['LoanID']].assign(
    Default=rs.predict(X=test_df.drop(columns=['LoanID']))
)
newdf.to_csv('./csv_submissions/dtree_out.csv', index=False)

Index(['LoanID', 'Age', 'Income', 'LoanAmount', 'CreditScore',
       'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm',
       'DTIRatio', 'Education', 'EmploymentType', 'MaritalStatus',
       'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner',
       'Default'],
      dtype='object')

Index(['LoanID', 'Age', 'Income', 'LoanAmount', 'CreditScore',
       'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm',
       'DTIRatio', 'Education', 'EmploymentType', 'MaritalStatus',
       'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner'],
      dtype='object')
done that
Best Parameters: {'min_samples_split': 5, 'min_samples_leaf': 2, 'max_leaf_nodes': 40, 'max_depth': 10, 'criterion': 'gini'}
Training accuracy 0.8852808554846641
Validation accuracy 0.8856471509692578
