# Decision Trees

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

In [2]:
# Load the raw table, separate the label column from the features, and make
# a seeded train/test split so every later cell works on the same rows.
df = pd.read_csv('data/UCI_Credit_Card.csv')
X = df.drop(columns='default.payment.next.month')
y = df['default.payment.next.month']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [3]:
def get_metrics(model):
    """Score ``model`` on the notebook's current train and test splits.

    The splits are read from the module-level ``X_train``, ``X_test``,
    ``y_train`` and ``y_test`` at call time, so the numbers reflect whatever
    preprocessing has been applied to those globals so far.

    Parameters
    ----------
    model
        A fitted estimator exposing ``predict``.

    Returns
    -------
    dict
        Maps ``'<split>_<metric>'`` to a score, for split in ``train``/``test``
        and metric in ``accuracy``/``f1``/``precision``/``recall``.
    """
    # Predict once per split and keep the matching ground truth next to it.
    predictions = {
        'train': (y_train, model.predict(X_train)),
        'test': (y_test, model.predict(X_test)),
    }
    scorers = {
        'accuracy': accuracy_score,
        'f1': f1_score,
        'precision': precision_score,
        'recall': recall_score,
    }
    # Metric is the outer loop so keys come out as train_x, test_x per metric.
    return {
        f'{split}_{name}': scorer(y_true, y_pred)
        for name, scorer in scorers.items()
        for split, (y_true, y_pred) in predictions.items()
    }

## Baseline Model

In [4]:
# Baseline: a default decision tree (fixed seed) fit on the split exactly as
# it came out of train_test_split, then scored on both splits.
clf0 = DecisionTreeClassifier(random_state=10).fit(X_train, y_train)
get_metrics(clf0)

{'train_accuracy': 1.0,
 'test_accuracy': 0.7276,
 'train_f1': 1.0,
 'test_f1': 0.3896026292201972,
 'train_precision': 1.0,
 'test_precision': 0.3801749271137026,
 'train_recall': 1.0,
 'test_recall': 0.39950980392156865}

## Data Cleaning

In [5]:
# Column groups for the credit-card data set.
to_drop = ['ID']
categoricals = ['SEX', 'EDUCATION', 'MARRIAGE']
# Note the PAY_* series goes PAY_0, PAY_2, ..., PAY_6 (there is no PAY_1).
continuous = (
    ['LIMIT_BAL', 'AGE']
    + ['PAY_0'] + [f'PAY_{month}' for month in range(2, 7)]
    + [f'BILL_AMT{month}' for month in range(1, 7)]
    + [f'PAY_AMT{month}' for month in range(1, 7)]
)
target = ['default.payment.next.month']

In [6]:
# Remove the columns listed in `to_drop`. Reassigning (rather than
# inplace=True) avoids mutating the frames handed back by train_test_split,
# and errors='ignore' lets this cell be re-run without a KeyError once the
# columns are already gone.
X_train = X_train.drop(columns=to_drop, errors='ignore')
X_test = X_test.drop(columns=to_drop, errors='ignore')

# Collapse every non-positive value in the PAY_* columns to 0 and leave
# positive values untouched. clip(lower=0) is the vectorised equivalent of
# mapping `0 if x <= 0 else x` over each element.
pay_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
X_train[pay_cols] = X_train[pay_cols].clip(lower=0)
X_test[pay_cols] = X_test[pay_cols].clip(lower=0)

In [7]:
# Refit the same tree configuration on the current X_train / y_train so the
# scores can be compared against the previous model.
clf1 = DecisionTreeClassifier(random_state=10).fit(X_train, y_train)
get_metrics(clf1)

{'train_accuracy': 0.9996888888888888,
 'test_accuracy': 0.7276,
 'train_f1': 0.999300209937019,
 'test_f1': 0.3989408649602824,
 'train_precision': 0.9997999599919984,
 'test_precision': 0.3837011884550085,
 'train_recall': 0.9988009592326139,
 'test_recall': 0.41544117647058826}

## Creating Dummy Variables

In [8]:
# Cast the categorical codes to strings so get_dummies treats them as labels
# rather than numbers. astype with a dict returns a new frame, so no column
# is assigned in place.
X_train = X_train.astype({col: str for col in categoricals})
X_test = X_test.astype({col: str for col in categoricals})

# The dummy layout is learned from the training split only.
X_train = pd.get_dummies(X_train, drop_first=True, dtype=int)

# Encode the test split with *all* of its levels, then align it to the
# training columns. This drops the same reference level that was dropped in
# training, discards levels never seen in training, and adds an all-zero
# column for any training level absent from the test split. Encoding the two
# splits independently with drop_first=True can otherwise produce different
# (or differently-meaning) columns whenever a level is missing from one split.
X_test = (
    pd.get_dummies(X_test, dtype=int)
    .reindex(columns=X_train.columns, fill_value=0)
)

In [9]:
# Refit the same tree configuration on the current X_train / y_train so the
# scores can be compared against the previous model.
clf2 = DecisionTreeClassifier(random_state=10).fit(X_train, y_train)
get_metrics(clf2)

{'train_accuracy': 0.9996888888888888,
 'test_accuracy': 0.7294666666666667,
 'train_f1': 0.999300209937019,
 'test_f1': 0.3945091017606685,
 'train_precision': 0.9997999599919984,
 'test_precision': 0.3845258871436882,
 'train_recall': 0.9988009592326139,
 'test_recall': 0.4050245098039216}

## Transforming Continuous Variables

In [10]:
# Fit the min-max scaler on the training split only and apply the same
# transform to the test split. The scaler returns bare arrays, so the
# original column names and row index are passed back in explicitly;
# otherwise the features would be renamed 0..n-1 and the rows would no longer
# line up by label with y_train / y_test.
scaler = MinMaxScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [11]:
# Refit the same tree configuration on the current X_train / y_train so the
# scores can be compared against the previous model.
clf3 = DecisionTreeClassifier(random_state=10).fit(X_train, y_train)
get_metrics(clf3)

{'train_accuracy': 0.9996888888888888,
 'test_accuracy': 0.7298666666666667,
 'train_f1': 0.999300209937019,
 'test_f1': 0.39630512514898686,
 'train_precision': 0.9997999599919984,
 'test_precision': 0.3857308584686775,
 'train_recall': 0.9988009592326139,
 'test_recall': 0.4074754901960784}

In [12]:
# Apply log(1 + x) to every feature. np.log1p operates on the whole frame at
# once (keeping its labels), replaces the deprecated element-wise
# DataFrame.applymap call, and is more accurate than log(x + 1) for values
# close to zero.
X_train = np.log1p(X_train)
X_test = np.log1p(X_test)

In [13]:
# Refit the same tree configuration on the current X_train / y_train so the
# scores can be compared against the previous model.
clf4 = DecisionTreeClassifier(random_state=10).fit(X_train, y_train)
get_metrics(clf4)

{'train_accuracy': 0.9996888888888888,
 'test_accuracy': 0.7294666666666667,
 'train_f1': 0.999300209937019,
 'test_f1': 0.39559130175752155,
 'train_precision': 0.9997999599919984,
 'test_precision': 0.3849275362318841,
 'train_recall': 0.9988009592326139,
 'test_recall': 0.4068627450980392}

## Addressing Class Imbalance

In [14]:
# Label counts for the full data set, the training split and the test split.
tuple(labels.value_counts() for labels in (df[target[0]], y_train, y_test))

(0    23364
 1     6636
 Name: default.payment.next.month, dtype: int64,
 0    17496
 1     5004
 Name: default.payment.next.month, dtype: int64,
 0    5868
 1    1632
 Name: default.payment.next.month, dtype: int64)

In [15]:
# Oversample the training split only (the test split is left untouched).
# SMOTE draws synthetic samples at random, so it is seeded here; without a
# random_state the resampled data, and every metric computed after this
# cell, changes from run to run.
X_train, y_train = SMOTE(random_state=0).fit_resample(X_train, y_train)

In [16]:
# Refit the same tree configuration on the current X_train / y_train so the
# scores can be compared against the previous model.
clf5 = DecisionTreeClassifier(random_state=10).fit(X_train, y_train)
get_metrics(clf5)

{'train_accuracy': 0.9997999542752629,
 'test_accuracy': 0.6929333333333333,
 'train_f1': 0.9997999256866836,
 'test_f1': 0.41473951715374846,
 'train_precision': 0.9999428277399806,
 'test_precision': 0.3543204515848893,
 'train_recall': 0.9996570644718793,
 'test_recall': 0.5}