In [21]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier, export_text


In [26]:
# Load the wine dataset
#data = load_wine()
data = pd.read_csv('loan_data.csv')
#data = pd.read_csv('loan_data.csv', delimiter=',', na_values=['NA', 'NULL'])

# Convert to DataFrame for easier handling
# df = pd.DataFrame(data.data, columns=data.feature_names)
# df['target'] = data.target


df = data.copy() # Create a copy to avoid modifying the original DataFrame
df.rename(columns={'loan_status': 'target'}, inplace=True)  # Rename the target column
feature_names = [col for col in df.columns if col != 'target']


# Split the dataset into features and target variable
X = df[feature_names].values
y = df['target'].values

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)





In [27]:
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,target
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [28]:
def entropy(y):
    unique, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return -np.sum(probabilities * np.log2(probabilities))

def conditional_entropy(y, y_left, y_right):
    p_left = len(y_left) / len(y)
    p_right = len(y_right) / len(y)
    return p_left * entropy(y_left) + p_right * entropy(y_right)

def gini(y):
    unique, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return 1 - np.sum(probabilities ** 2)

def gini_split(y, y_left, y_right):
    p_left = len(y_left) / len(y)
    p_right = len(y_right) / len(y)
    return p_left * gini(y_left) + p_right * gini(y_right)

In [29]:

class DecisionTree:
    def __init__(self, max_depth=None, criterion='entropy', min_samples_leaf=1):
        self.max_depth = max_depth
        self.criterion = criterion
        self.min_samples_leaf = min_samples_leaf
        self.tree = None
        self.feature_names = None

    def fit(self, X, y,feature_names=None):
        self.feature_names = data.feature_names
        self.tree = self._build_tree(X, y, 0)

    def _build_tree(self, X, y, depth):
        if len(np.unique(y)) == 1 or (self.max_depth and depth >= self.max_depth) or len(y) < self.min_samples_leaf:
            return np.argmax(np.bincount(y))

        best_split = self._find_best_split(X, y)
        if not best_split:
            return np.argmax(np.bincount(y))

        left_indices, right_indices = best_split['indices']
        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return {'feature': best_split['feature'], 'threshold': best_split['threshold'], 'left': left_subtree, 'right': right_subtree}

    def print_tree(self, node=None, depth=0):
        # Define print_tree function locally
        def print_tree(node, depth=0):
            if isinstance(node, dict):
                feature_name = self.feature_names[node['feature']] if self.feature_names else node['feature']
                print("  " * depth + f"[{feature_name} <= {node['threshold']}]")
                print_tree(node['left'], depth + 1)
                print_tree(node['right'], depth + 1)
            else:
                print("  " * depth + f"[Class: {node}]")
        # Call the local print_tree function to print the actual tree
        if node is None:
            node = self.tree
        print_tree(node, depth)



    def _find_best_split(self, X, y):
        if self.criterion == 'gini':
            return self._best_split_gini(X, y)
        elif self.criterion == 'entropy':
            return self._best_split_entropy(X, y)
        else:
            raise ValueError("Criterion not recognized.")

    def _best_split_gini(self, X, y):
        best_gini = float('inf')
        best_split = None

        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_indices = np.where(X[:, feature] <= threshold)[0]
                right_indices = np.where(X[:, feature] > threshold)[0]

                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                gini = self._gini_index(y[left_indices], y[right_indices])

                if gini < best_gini:
                    best_gini = gini
                    best_split = {'feature': feature, 'threshold': threshold, 'indices': (left_indices, right_indices)}

        return best_split

    def _gini_index(self, left_y, right_y):
        total = len(left_y) + len(right_y)
        if total == 0:
            return 0

        p_left = len(left_y) / total
        p_right = len(right_y) / total

        gini_left = 1 - sum((np.bincount(left_y) / len(left_y)) ** 2) if len(left_y) > 0 else 0
        gini_right = 1 - sum((np.bincount(right_y) / len(right_y)) ** 2) if len(right_y) > 0 else 0

        return p_left * gini_left + p_right * gini_right

    def _best_split_entropy(self, X, y):
        best_entropy = float('inf')
        best_split = None

        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_indices = np.where(X[:, feature] <= threshold)[0]
                right_indices = np.where(X[:, feature] > threshold)[0]

                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                entropy = self._entropy_index(y[left_indices], y[right_indices])

                if entropy < best_entropy:
                    best_entropy = entropy
                    best_split = {'feature': feature, 'threshold': threshold, 'indices': (left_indices, right_indices)}

        return best_split

    def _entropy_index(self, left_y, right_y):
        total = len(left_y) + len(right_y)
        if total == 0:
            return 0

        p_left = len(left_y) / total
        p_right = len(right_y) / total

        entropy_left = -sum((np.bincount(left_y) / len(left_y)) * np.log2(np.bincount(left_y) / len(left_y) + 1e-9)) if len(left_y) > 0 else 0
        entropy_right = -sum((np.bincount(right_y) / len(right_y)) * np.log2(np.bincount(right_y) / len(right_y) + 1e-9)) if len(right_y) > 0 else 0

        return p_left * entropy_left + p_right * entropy_right

    def predict(self, X):
        return np.array([self._predict_sample(sample, self.tree) for sample in X])

    def _predict_sample(self, sample, tree):
        if not isinstance(tree, dict):
            return tree

        feature = tree['feature']
        threshold = tree['threshold']

        if sample[feature] <= threshold:
            return self._predict_sample(sample, tree['left'])
        else:
            return self._predict_sample(sample, tree['right'])

In [31]:
# Train and evaluate the custom decision tree with entropy

tree_entropy = DecisionTree(max_depth=2, criterion='entropy',min_samples_leaf=5)
tree_entropy.fit(X_train, y_train)
# Predict on training and testing data
y_pred_train_entropy = tree_entropy.predict(X_train)
y_pred_test_entropy = tree_entropy.predict(X_test)
# Calculate training and testing accuracy
accuracy_train_entropy = accuracy_score(y_train, y_pred_train_entropy)
accuracy_test_entropy = accuracy_score(y_test, y_pred_test_entropy)
# Print training and testing accuracy
print(f"\nCustom Decision Tree (Entropy) - Training Accuracy: {accuracy_train_entropy:.4f}")
print(f"Custom Decision Tree (Entropy) - Testing Accuracy: {accuracy_test_entropy:.4f}")

# Train and evaluate the custom decision tree with gini
tree_gini = DecisionTree(max_depth=2, criterion='gini',min_samples_leaf=5)
tree_gini.fit(X_train, y_train)
# Predict on training and testing data
y_pred_train_gini = tree_gini.predict(X_train)
y_pred_test_gini = tree_gini.predict(X_test)
# Calculate training and testing accuracy
accuracy_train_gini = accuracy_score(y_train, y_pred_train_gini)
accuracy_test_gini = accuracy_score(y_test, y_pred_test_gini)
# Print training and testing accuracy
print(f"\nCustom Decision Tree (Gini) - Training Accuracy: {accuracy_train_gini:.4f}")
print(f"Custom Decision Tree (Gini) - Testing Accuracy: {accuracy_test_gini:.4f}")



# Example usage
# Assuming X_train and y_train are your training data
dt = DecisionTree(max_depth=3)
dt.fit(X_train, y_train, feature_names=feature_names)  # Pass feature names
dt.print_tree()

AttributeError: 'DataFrame' object has no attribute 'feature_names'

In [32]:

# Create a dictionary to store the results
results = {
    # 'Model': ['Custom (Entropy)', 'Custom (Gini)', 'sklearn (Entropy)', 'sklearn (Gini)'],
    'Model': ['Custom (Entropy)', 'Custom (Gini)'],
    'Training Accuracy': [accuracy_train_entropy, accuracy_train_gini],
    'Testing Accuracy': [accuracy_test_entropy, accuracy_test_gini]
}

# Create a pandas DataFrame from the results dictionary
results_df = pd.DataFrame(results)

# Display the table
display(results_df)

NameError: name 'accuracy_train_entropy' is not defined