<h1 style='font-family: serif; text-align: center'><b>Lab12: Decision Tree</b></h1>

<h3 style='font-family: serif'>Initial Processing Steps</h3>

In [9]:
# Import numpy, pandas, math
import numpy as np
import pandas as pd
import math

In [15]:
# Load the dataset from dataset.csv (path is relative to the working directory)
dataset = pd.read_csv('dataset.csv')
# Placeholders for the module-level values that information_gain() reads;
# both are reassigned once the label counts of the dataset have been computed.
TOTAL_SAMPLES = 0
DATASET_ENTROPY = 0
# Bare expression as the last line: the loaded frame is shown as the cell output
dataset

Unnamed: 0,AGE,HAS_JOB,OWNS_HOUSE,CREDIT_RATING,LOAN_APPROVAL
0,Young,False,False,Fair,No
1,Young,False,False,Good,No
2,Young,True,False,Good,Yes
3,Young,True,True,Fair,Yes
4,Young,False,False,Fair,No
5,Middle,False,False,Fair,No
6,Middle,False,False,Good,No
7,Middle,True,True,Good,Yes
8,Middle,False,True,Excellent,Yes
9,Middle,False,True,Excellent,Yes


In [16]:
# Create a count of all YES and NO against every unique key
def count_labels(df, target='LOAN_APPROVAL', positive='Yes', negative='No'):
    """Tally positive and negative labels for every unique value of every column.

    Args:
        df: DataFrame holding the attribute columns and the label column.
        target: Name of the label column. Defaults to 'LOAN_APPROVAL'.
        positive: Value in ``target`` that is counted under the 'Yes' key.
        negative: Value in ``target`` that is counted under the 'No' key.

    Returns:
        A nested dict ``{column: {value: {'Yes': int, 'No': int}}}``. The
        label column itself is included, exactly like every other column.
        The inner keys are always the literal strings 'Yes' and 'No',
        whatever ``positive`` and ``negative`` are, because the other helpers
        in this notebook look the counts up under those keys.
    """
    # The label masks do not depend on the column being inspected, so they
    # are built once instead of once per (column, value) pair.
    is_positive = df[target] == positive
    is_negative = df[target] == negative

    results = {}
    for col in df.columns:
        col_results = {}
        for val in df[col].unique():
            matches = df[col] == val
            # Summing a boolean mask counts its True entries; int() keeps the
            # counts as plain Python ints rather than numpy scalars.
            col_results[val] = {
                'Yes': int((matches & is_positive).sum()),
                'No': int((matches & is_negative).sum()),
            }
        results[col] = col_results
    return results

# Calculates entropy based on the formula
def entropy(v1, v2):
    """Return the base-2 entropy of a two-class split given its class counts.

    Args:
        v1: Number of samples in the first class.
        v2: Number of samples in the second class.

    Returns:
        The entropy in bits (between 0 and 1 for non-negative counts).
        Returns 0 when either class is absent, and also when there are no
        samples at all.
    """
    total = v1 + v2
    # An empty or single-class split carries no uncertainty. Returning early
    # also avoids dividing by a zero total and taking log(0).
    if total == 0 or v1 == 0 or v2 == 0:
        return 0
    p1 = v1 / total
    p2 = v2 / total
    return -p1 * math.log2(p1) - p2 * math.log2(p2)

# Calculate information gain = entropy of dataset - entropy of the current attribute
def information_gain(attribute, total_samples=None, dataset_entropy=None):
    """Return the information gain obtained by splitting on one attribute.

    Args:
        attribute: Dict mapping each value of the attribute to its label
            counts, ``{value: {'Yes': int, 'No': int}}``.
        total_samples: Number of samples in the dataset being split. When
            omitted, the module-level ``TOTAL_SAMPLES`` is used.
        dataset_entropy: Entropy of the dataset being split. When omitted,
            the module-level ``DATASET_ENTROPY`` is used.

    Returns:
        ``dataset_entropy`` minus the sum, over the attribute's values, of
        each value's entropy weighted by its share of ``total_samples``.
    """
    # Fall back to the notebook-wide values so existing one-argument calls
    # keep working; passing them explicitly avoids relying on whichever
    # iteration last reassigned the globals.
    if total_samples is None:
        total_samples = TOTAL_SAMPLES
    if dataset_entropy is None:
        dataset_entropy = DATASET_ENTROPY

    weighted_entropy = 0
    for counts in attribute.values():
        p = counts.get('Yes')
        n = counts.get('No')
        ent = entropy(p, n)
        weighted_entropy += ent * ((p + n) / total_samples)
    return dataset_entropy - weighted_entropy

# Which node has the highest information gain
def extract_node(table):
    """Return the attribute name with the largest information gain.

    ``table`` maps each attribute name to its per-value label counts. Every
    attribute is scored with information_gain(); the list of
    ``[gain, name]`` pairs is printed, and the name from the largest pair is
    returned. Pairs are compared as lists, so equal gains are ordered by
    name.
    """
    scored = [[information_gain(counts), name] for name, counts in table.items()]
    # max() runs before the print so an empty table fails without printing
    best_gain, best_name = max(scored)
    print(f'Gains of the nodes: {scored}')
    return best_name

# Extract which value of the node the dataset should be split
def splitting_node_on_value(table,node):
    """Return the first value of ``node`` whose counts contain a single label.

    A value qualifies when its 'Yes' count or its 'No' count is 0. On the
    first such value the whole ``node`` entry is removed from ``table``
    (the caller's dict is mutated) and the value is returned. When no value
    qualifies, ``table`` is left untouched and None is returned.
    """
    for value, counts in table[node].items():
        single_label = counts.get('Yes') == 0 or counts.get('No') == 0
        if not single_label:
            continue
        # Side effect relied on by callers: the chosen node is dropped from the table
        table.pop(node)
        return value
    return None

# Split the dataset based on the node and value
def split_dataset(df, node, value):
    """Partition ``df`` by comparing column ``node`` with ``value``.

    Returns a 2-tuple: the rows where the column equals ``value``, followed
    by the rows where it does not.
    """
    column = df[node]
    matching_rows = df.loc[column == value]
    remaining_rows = df.loc[column != value]
    return matching_rows, remaining_rows

<h3 style='font-family: serif'>Iteration#01: On the Entire Dataset</h3>

In [17]:
# Call count_labels() to tally the 'Yes' and 'No' labels for every unique value of every column
count_data = count_labels(dataset)

# Display the full table of counts
print(count_data)

{'AGE': {'Young': {'Yes': 2, 'No': 3}, 'Middle': {'Yes': 3, 'No': 2}, 'Old': {'Yes': 4, 'No': 1}}, 'HAS_JOB': {False: {'Yes': 4, 'No': 6}, True: {'Yes': 5, 'No': 0}}, 'OWNS_HOUSE': {False: {'Yes': 3, 'No': 6}, True: {'Yes': 6, 'No': 0}}, 'CREDIT_RATING': {'Fair': {'Yes': 1, 'No': 4}, 'Good': {'Yes': 4, 'No': 2}, 'Excellent': {'Yes': 4, 'No': 0}}, 'LOAN_APPROVAL': {'No': {'Yes': 0, 'No': 6}, 'Yes': {'Yes': 9, 'No': 0}}}


In [18]:
# Extract the base variables: positives, negatives, total samples and dataset entropy.
# The LOAN_APPROVAL entry is popped so that count_data holds only candidate attributes.
count_loan = count_data.pop('LOAN_APPROVAL')
# A label that never occurs in the data has no entry in count_loan, so fall
# back to a count of 0 instead of calling .get() on None.
positive = count_loan.get('Yes', {}).get('Yes', 0)
negative = count_loan.get('No', {}).get('No', 0)
TOTAL_SAMPLES = positive + negative
DATASET_ENTROPY = entropy(positive,negative)

# Display the base variables
print(f"Positive: {positive}")
print(f"Negative: {negative}")
print(f"total: {TOTAL_SAMPLES}")
print(f"Dataset Entropy: {DATASET_ENTROPY}")

{'AGE': {'Young': {'Yes': 2, 'No': 3}, 'Middle': {'Yes': 3, 'No': 2}, 'Old': {'Yes': 4, 'No': 1}}, 'HAS_JOB': {False: {'Yes': 4, 'No': 6}, True: {'Yes': 5, 'No': 0}}, 'OWNS_HOUSE': {False: {'Yes': 3, 'No': 6}, True: {'Yes': 6, 'No': 0}}, 'CREDIT_RATING': {'Fair': {'Yes': 1, 'No': 4}, 'Good': {'Yes': 4, 'No': 2}, 'Excellent': {'Yes': 4, 'No': 0}}}
Positive: 9
Negative: 6
total: 15
Dataset Entropy: 0.9709505944546686


In [77]:
# Root node of the tree: the attribute with the highest information gain.
# extract_node() also prints the gain computed for every attribute.
root_node = extract_node(count_data)


# Decided child of the root node: splitting_node_on_value() returns the first value of the
# node whose counts contain only one label, and removes the node from count_data as a side effect.
decided_child = splitting_node_on_value(count_data,root_node) 

# Display the root node and the decided child node of the root node
print(f"\nRoot Node: {root_node}")
print(f"Decided Child: {decided_child}")

Gains of the nodes: [[0.08300749985576883, 'AGE'], [0.32365019815155627, 'HAS_JOB'], [0.4199730940219749, 'OWNS_HOUSE'], [0.36298956253708536, 'CREDIT_RATING']]

Root Node: OWNS_HOUSE
Decided Child: True


In [78]:
# Split the dataset on the root node: element 0 holds the rows where root_node
# equals decided_child, element 1 holds all remaining rows
split_data = split_dataset(dataset,root_node,decided_child)
print(split_data)

(       AGE  HAS_JOB  OWNS_HOUSE CREDIT_RATING LOAN_APPROVAL
3    Young     True        True          Fair           Yes
7   Middle     True        True          Good           Yes
8   Middle    False        True     Excellent           Yes
9   Middle    False        True     Excellent           Yes
10     Old    False        True     Excellent           Yes
11     Old    False        True          Good           Yes,        AGE  HAS_JOB  OWNS_HOUSE CREDIT_RATING LOAN_APPROVAL
0    Young    False       False          Fair            No
1    Young    False       False          Good            No
2    Young     True       False          Good           Yes
4    Young    False       False          Fair            No
5   Middle    False       False          Fair            No
6   Middle    False       False          Good            No
12     Old     True       False          Good           Yes
13     Old     True       False     Excellent           Yes
14     Old    False       False       

In [79]:
# Keep the left sub-dataset (rows matching the decided child) as the first leaf
leaf_1 = split_data[0]
# Display the left sub-dataset: only the last expression of a cell is rendered,
# so the bare name has to come after the assignment
leaf_1

In [80]:
# Display the right sub-dataset (rows where the root node differs from the decided child);
# the next iteration takes this frame as its input
split_data[1]

Unnamed: 0,AGE,HAS_JOB,OWNS_HOUSE,CREDIT_RATING,LOAN_APPROVAL
0,Young,False,False,Fair,No
1,Young,False,False,Good,No
2,Young,True,False,Good,Yes
4,Young,False,False,Fair,No
5,Middle,False,False,Fair,No
6,Middle,False,False,Good,No
12,Old,True,False,Good,Yes
13,Old,True,False,Excellent,Yes
14,Old,False,False,Fair,No


<h3 style='font-family: serif'>Iteration#02: On the Right Sub-Split-Dataset</h3>
<p>Repeat the same process until the dataset is empty.</p>

In [69]:
# NOTE: this cell rebinds both dataset and split_data, so running it a second time
# without re-running the Iteration#01 split descends one level further down the tree.
dataset = split_data[1]
# Call count_labels() to tally the 'Yes' and 'No' labels for every unique value of every column
count_data = count_labels(dataset)

# Display the full table of counts
print(count_data)

# Extract the base variables: positives, negatives, total samples and dataset entropy.
# The LOAN_APPROVAL entry is popped so that count_data holds only candidate attributes.
count_loan = count_data.pop('LOAN_APPROVAL')
# A sub-dataset may contain only one label; the missing label then has no entry
# in count_loan, so fall back to a count of 0 instead of calling .get() on None.
positive = count_loan.get('Yes', {}).get('Yes', 0)
negative = count_loan.get('No', {}).get('No', 0)
TOTAL_SAMPLES = positive + negative
DATASET_ENTROPY = entropy(positive,negative)

# Display the base variables
print(f"Positive: {positive}")
print(f"Negative: {negative}")
print(f"total: {TOTAL_SAMPLES}")
print(f"Dataset Entropy: {DATASET_ENTROPY}")

# Next node of the tree: the attribute with the highest information gain on this sub-dataset
root_node = extract_node(count_data)


# Decided child of that node: the first value whose counts contain only one label
decided_child = splitting_node_on_value(count_data,root_node)

# Display the chosen node and its decided child
print(f"\nRoot Node: {root_node}")
print(f"Decided Child: {decided_child}")

# Split the sub-dataset on the chosen node and its decided child
split_data = split_dataset(dataset,root_node,decided_child)
print(split_data)

{'AGE': {'Young': {'Yes': 1, 'No': 3}, 'Middle': {'Yes': 0, 'No': 2}, 'Old': {'Yes': 2, 'No': 1}}, 'HAS_JOB': {False: {'Yes': 0, 'No': 6}, True: {'Yes': 3, 'No': 0}}, 'OWNS_HOUSE': {False: {'Yes': 3, 'No': 6}}, 'CREDIT_RATING': {'Fair': {'Yes': 0, 'No': 4}, 'Good': {'Yes': 2, 'No': 2}, 'Excellent': {'Yes': 1, 'No': 0}}, 'LOAN_APPROVAL': {'No': {'Yes': 0, 'No': 6}, 'Yes': {'Yes': 3, 'No': 0}}}
Positive: 3
Negative: 6
total: 9
Dataset Entropy: 0.9182958340544896
Gains of the nodes: [[0.2516291673878229, 'AGE'], [0.9182958340544896, 'HAS_JOB'], [0.0, 'OWNS_HOUSE'], [0.47385138961004514, 'CREDIT_RATING']]
[False, True]

Root Node: HAS_JOB
Decided Child: False
(       AGE  HAS_JOB  OWNS_HOUSE CREDIT_RATING LOAN_APPROVAL
0    Young    False       False          Fair            No
1    Young    False       False          Good            No
4    Young    False       False          Fair            No
5   Middle    False       False          Fair            No
6   Middle    False       False    

In [81]:
# TODO: store the root node and the decided child node in a tree dictionary,
# e.g. tree = {'node': value, 'node': value}. For now only the leaf sub-datasets are kept.

# OWNS_HOUSE: True --> ?(Y/N)
leaf_node_1 = leaf_1

# HAS_JOB: False --> ?(Y/N)
# split_data is whatever split ran most recently; this expects the Iteration#02 split
leaf_node_2 = split_data[0]

In [82]:
# Print the rows of the first leaf
print(leaf_node_1)

       AGE  HAS_JOB  OWNS_HOUSE CREDIT_RATING LOAN_APPROVAL
3    Young     True        True          Fair           Yes
7   Middle     True        True          Good           Yes
8   Middle    False        True     Excellent           Yes
9   Middle    False        True     Excellent           Yes
10     Old    False        True     Excellent           Yes
11     Old    False        True          Good           Yes


In [83]:
# Print the rows of the second leaf.
# NOTE(review): the saved output below is identical to the one printed for leaf_node_1,
# and the execution counts are out of order (In [69] precedes In [77]); this looks like
# stale kernel state - restart the kernel and run all cells to confirm.
print(leaf_node_2)

       AGE  HAS_JOB  OWNS_HOUSE CREDIT_RATING LOAN_APPROVAL
3    Young     True        True          Fair           Yes
7   Middle     True        True          Good           Yes
8   Middle    False        True     Excellent           Yes
9   Middle    False        True     Excellent           Yes
10     Old    False        True     Excellent           Yes
11     Old    False        True          Good           Yes
