In [2]:
import pandas as pd
import math
from collections import Counter

# 1) Create the dataset

# Column order of the toy accident dataset; the last column is the label.
_COLUMNS = (
    "Weather Condition",
    "Road Condition",
    "Traffic Condition",
    "Engine Problem",
    "Accident",
)

# One tuple per observation, values aligned with _COLUMNS.
_ROWS = (
    ("Rain",  "Bad",     "High",   "No",  "Yes"),
    ("Snow",  "Average", "Normal", "Yes", "Yes"),
    ("Clear", "Bad",     "Light",  "No",  "No"),
    ("Clear", "Good",    "Light",  "Yes", "Yes"),
    ("Snow",  "Good",    "Normal", "No",  "No"),
    ("Rain",  "Average", "Light",  "No",  "No"),
    ("Rain",  "Good",    "Normal", "No",  "No"),
    ("Snow",  "Bad",     "High",   "No",  "Yes"),
    ("Clear", "Good",    "High",   "Yes", "No"),
    ("Clear", "Bad",     "High",   "Yes", "Yes"),
)

# Expand each row tuple into a {column: value} record.
data = [dict(zip(_COLUMNS, values)) for values in _ROWS]
df = pd.DataFrame(data)

# Label column, and every remaining column treated as a candidate feature.
TARGET = "Accident"
FEATURES = df.columns.drop(TARGET).tolist()


# 2) Entropy + Information Gain (ID3)
def entropy(labels):
    """Return the Shannon entropy (base 2) of a collection of class labels.

    Args:
        labels: Any iterable of hashable labels. Sized containers and
            one-shot iterators/generators are both accepted, because the
            total is derived from the tallied counts rather than ``len()``.

    Returns:
        The entropy in bits as a float; ``0.0`` for an empty input or when
        every label is identical.
    """
    counts = Counter(labels)
    # Derive the total from the tally so the input is consumed only once.
    total = sum(counts.values())
    ent = 0.0
    for count in counts.values():
        # Counter only stores labels that occurred, so p > 0 and log2 is safe.
        p = count / total
        ent -= p * math.log2(p)
    return ent

def conditional_entropy(df, feature, target=TARGET):
    """Return the entropy of ``target`` after partitioning ``df`` by ``feature``.

    Each partition's label entropy is weighted by the fraction of the rows
    of ``df`` that fall into that partition, and the weighted values are
    summed.
    """
    n_rows = len(df)
    labels_by_value = df.groupby(feature)[target]

    weighted_sum = 0.0
    for _, subset_labels in labels_by_value:
        share = len(subset_labels) / n_rows
        weighted_sum += share * entropy(subset_labels.tolist())
    return weighted_sum

def information_gain(df, feature, target=TARGET):
    """Return how much splitting ``df`` on ``feature`` reduces label entropy.

    Computed as the entropy of the ``target`` column minus the conditional
    entropy of ``target`` given ``feature``.
    """
    entropy_before = entropy(df[target].tolist())
    entropy_after = conditional_entropy(df, feature, target)
    return entropy_before - entropy_after


# 3) Gini + Gain (for comparison)
def gini(labels):
    """Return the Gini impurity of a collection of class labels.

    Args:
        labels: Any iterable of hashable labels. Sized containers and
            one-shot iterators/generators are both accepted, because the
            total is derived from the tallied counts rather than ``len()``.

    Returns:
        ``1 - sum(p_i ** 2)`` over the observed label proportions, as a
        float. An empty input yields ``0.0`` (nothing to be impure about),
        which keeps this function consistent with ``entropy`` on empty input
        instead of reporting maximal impurity.
    """
    counts = Counter(labels)
    # Derive the total from the tally so the input is consumed only once.
    total = sum(counts.values())
    if total == 0:
        # No labels at all: report zero impurity rather than the untouched
        # starting value of 1.0.
        return 0.0
    g = 1.0
    for count in counts.values():
        p = count / total
        g -= p ** 2
    return g

def weighted_gini(df, feature, target=TARGET):
    """Return the size-weighted Gini impurity of ``target`` split by ``feature``.

    Each partition's Gini impurity is weighted by the fraction of the rows
    of ``df`` that fall into that partition, and the weighted values are
    summed.
    """
    n_rows = len(df)
    labels_by_value = df.groupby(feature)[target]

    weighted_sum = 0.0
    for _, subset_labels in labels_by_value:
        share = len(subset_labels) / n_rows
        weighted_sum += share * gini(subset_labels.tolist())
    return weighted_sum


# 4) Print Results
# Entropy of the label column before any split.
base_entropy = entropy(df[TARGET].tolist())
print("Base Entropy (Dataset):", round(base_entropy, 4))

# Score every candidate feature by information gain (higher is better).
print("\n--- Information Gain (Entropy/ID3) ---")
ig_results = {name: information_gain(df, name) for name in FEATURES}
for name, gain in ig_results.items():
    print(f"{name:18s} IG = {gain:.6f}")

# The first feature with the largest gain wins ties, following FEATURES order.
best_feature = max(ig_results, key=lambda name: ig_results[name])
print("\n Best Root Node (Highest IG):", best_feature, "=", round(ig_results[best_feature], 6))

# Score the same features by weighted Gini impurity (lower is better).
print("\n--- Weighted Gini (Lower is better) ---")
gini_results = {name: weighted_gini(df, name) for name in FEATURES}
for name, impurity in gini_results.items():
    print(f"{name:18s} Weighted Gini = {impurity:.6f}")

# The first feature with the smallest impurity wins ties.
best_gini_feature = min(gini_results, key=lambda name: gini_results[name])
print("\n Best Root by Gini (Lowest):", best_gini_feature, "=", round(gini_results[best_gini_feature], 6))


# 5) (Optional) Build a simple ID3 tree (1–2 levels) like your explanation
#     Root = Road Condition, and if "Average", split by Engine Problem
def predict_simple(row):
    """Predict "Yes"/"No" for one record using a hand-built two-level tree.

    ``row`` is any mapping-like record indexable by column name. The root
    test is "Road Condition": "Bad" predicts "Yes" and "Good" predicts "No".
    Any other road condition (the "Average" branch) is decided by
    "Engine Problem", which is only read on that branch.
    """
    road = row["Road Condition"]
    if road == "Bad":
        outcome = "Yes"
    elif road == "Good":
        outcome = "No"
    else:
        # Second level: an engine problem predicts an accident.
        outcome = "Yes" if row["Engine Problem"] == "Yes" else "No"
    return outcome

# Run the manual tree on every row and store its output next to the labels.
df["Predicted"] = df.apply(predict_simple, axis=1)

# Accuracy is the fraction of rows where the prediction equals the label.
matches = df["Predicted"].eq(df[TARGET])
accuracy = matches.mean()

print("\n--- Simple Manual Tree Accuracy ---")
print("Accuracy =", accuracy)

# Show features, true label and prediction side by side.
print(df[FEATURES + [TARGET, "Predicted"]])


Base Entropy (Dataset): 1.0

--- Information Gain (Entropy/ID3) ---
Weather Condition  IG = 0.049022
Road Condition     IG = 0.150978
Traffic Condition  IG = 0.124511
Engine Problem     IG = 0.124511

 Best Root Node (Highest IG): Road Condition = 0.150978

--- Weighted Gini (Lower is better) ---
Weather Condition  Weighted Gini = 0.466667
Road Condition     Weighted Gini = 0.400000
Traffic Condition  Weighted Gini = 0.416667
Engine Problem     Weighted Gini = 0.416667

 Best Root by Gini (Lowest): Road Condition = 0.4

--- Simple Manual Tree Accuracy ---
Accuracy = 0.8
  Weather Condition Road Condition Traffic Condition Engine Problem Accident  \
0              Rain            Bad              High             No      Yes   
1              Snow        Average            Normal            Yes      Yes   
2             Clear            Bad             Light             No       No   
3             Clear           Good             Light            Yes      Yes   
4              Snow    