In [1]:
import math
import pandas as pd

In [2]:
#STEP 1: Load the Dataset from CSV
# Read the raw table from disk.
data = pd.read_csv("buy.csv")

# Serial-number columns and pandas' auto-generated "Unnamed" columns are
# removed in a single drop call so they never reach the tree builder.
index_like_cols = [
    name
    for name in data.columns
    if any(marker in name.lower() for marker in ("s.no", "unnamed"))
]
if index_like_cols:
    data = data.drop(columns=index_like_cols)

# Plain {column name: list of values} mapping consumed by the functions below.
data_dict = {name: list(data[name]) for name in data.columns}


In [3]:
#STEP 2: Entropy Function
def entropy(col):
    """Return the Shannon entropy, in bits, of the values in ``col``.

    Args:
        col: A sized iterable of hashable values (it is iterated once and
            passed to ``len``).

    Returns:
        The entropy computed with base-2 logarithms. An empty ``col``
        yields the integer ``0`` because no value contributes a term.
    """
    # Tally how many times each distinct value occurs.
    frequency = {}
    for item in col:
        if item in frequency:
            frequency[item] += 1
        else:
            frequency[item] = 1

    n_items = len(col)
    result = 0
    for occurrences in frequency.values():
        p = occurrences / n_items
        # Accumulate -p * log2(p) one term at a time.
        result = result - p * math.log2(p)
    return result

In [4]:
#STEP 3: Gain Ratio Calculation
def calc_gain_ratio(data, attr, target="Buy"):
    """Compute the gain ratio obtained by splitting ``data`` on ``attr``.

    Args:
        data: Mapping from column name to an indexable sequence of values;
            ``data[attr]`` and ``data[target]`` are read by position.
        attr: Name of the candidate splitting column.
        target: Name of the class-label column.

    Returns:
        Information gain divided by split information, or ``0`` when the
        split information is zero (for example when ``attr`` has a single
        distinct value).
    """
    base_entropy = entropy(data[target])
    n_rows = len(data[attr])

    # Collect the target labels that fall under each distinct attribute value.
    partitions = {}
    for row in range(n_rows):
        attr_value = data[attr][row]
        label = data[target][row]
        partitions.setdefault(attr_value, []).append(label)

    conditional_entropy = 0
    intrinsic_value = 0
    for labels in partitions.values():
        weight = len(labels) / n_rows
        conditional_entropy += weight * entropy(labels)
        intrinsic_value -= weight * math.log2(weight)

    # Guard against division by zero when the split carries no information.
    if intrinsic_value == 0:
        return 0
    return (base_entropy - conditional_entropy) / intrinsic_value

In [5]:
#STEP 4: Recursive C4.5 Algorithm
def _majority_label(labels):
    """Return the most frequent label; ties go to the label seen first."""
    # dict.fromkeys keeps first-seen order, unlike a set of strings whose
    # iteration order can change between interpreter runs.
    return max(dict.fromkeys(labels), key=labels.count)


def c45(data, features, target="Buy"):
    """Recursively build a decision tree, splitting on the best gain ratio.

    Args:
        data: Mapping from column name to a list of values, all read by
            position; must contain ``target`` and every name in ``features``.
        features: List of column names still available for splitting.
        target: Name of the class-label column.

    Returns:
        A bare class label when every row shares one label or when no
        features remain (majority label, ties resolved in favour of the
        label that appears first). Otherwise a nested dict of the form
        ``{feature: {value: subtree_or_label, ...}}`` whose branches are
        ordered by first appearance of each value in the data.
    """
    labels = data[target]

    # Base case: the node is pure.
    if len(set(labels)) == 1:
        return labels[0]
    # Base case: nothing left to split on.
    if not features:
        return _majority_label(labels)

    best_feature = max(features, key=lambda f: calc_gain_ratio(data, f, target))
    # Identical for every branch of this node, so compute it once.
    remaining = [f for f in features if f != best_feature]
    tree = {best_feature: {}}

    # First-seen order makes the branch order reproducible across runs.
    for value in dict.fromkeys(data[best_feature]):
        rows = [i for i in range(len(labels)) if data[best_feature][i] == value]

        if not rows:
            # Reachable for values that never compare equal to themselves
            # (e.g. NaN): no row matches, so fall back to the majority label.
            tree[best_feature][value] = _majority_label(labels)
        else:
            subset = {key: [data[key][i] for i in rows] for key in data.keys()}
            tree[best_feature][value] = c45(subset, remaining, target)

    return tree

In [6]:
#STEP 5: Pretty Print Tree (Well-Structured)
def print_tree(tree, indent="", last=True):
    """Print a decision tree using box-drawing connectors.

    Args:
        tree: Either a nested dict of the form
            ``{attribute: {value: subtree_or_label, ...}}`` or a bare leaf
            label (anything that is not a dict).
        indent: Prefix printed before this node's connector.
        last: Whether this node is the last child of its parent; selects
            the connector glyph and the indent used for its children.
    """
    prefix = "└── " if last else "├── "

    # A tree can degenerate to a single label; print it instead of failing
    # on the missing .items() method.
    if not isinstance(tree, dict):
        print(f"{indent}{prefix}{tree}")
        return

    child_indent = indent + ("    " if last else "│   ")
    for attr, branches in tree.items():
        print(f"{indent}{prefix}[{attr}]")

        branch_count = len(branches)
        for i, (val, child) in enumerate(branches.items()):
            is_last = i == branch_count - 1
            branch_prefix = "└── " if is_last else "├── "
            if isinstance(child, dict):
                print(f"{child_indent}{branch_prefix}{val} → ")
                # The subtree is the only child of this branch line, so it
                # is always drawn as a last child; the indent alone carries
                # the parent's continuation bar.
                print_tree(child, child_indent + ("    " if is_last else "│   "), True)
            else:
                print(f"{child_indent}{branch_prefix}{val} → {child}")

In [7]:
#STEP 6: Run C4.5 Algorithm
# Every column except the class label is a candidate splitting feature.
features = [name for name in data_dict if name != "Buy"]

# Build the tree from the full dataset, then render it.
final_tree = c45(data_dict, features)

print("Final Decision Tree (Graph Format):\n")
print_tree(final_tree)

Final Decision Tree (Graph Format):

└── [Age]
    ├── >40 → 
    │   ├── [Credit]
    │   │   ├── Fair → Yes
    │   │   └── Excellent → No
    ├── <30 → 
    │   ├── [Student]
    │   │   ├── Yes → Yes
    │   │   └── No → No
    └── 31-40 → Yes


'C:\\Users\\hp\\M.Tech Project\\Machine learning'