In [None]:
#  What is Entropy in Decision Trees?

# Entropy measures the impurity (uncertainty) in a dataset.

# A decision tree uses entropy to decide where to split the data.

# Goal: split the data in such a way that each branch
# becomes more pure (less entropy).

#   purity vs impurity

# 🔹 Purity

# A node (or dataset) is called pure if all the samples
# inside it belong to the same class.

# Example:

# If a node contains 100 animals and all are Dogs, then the node is pure.

# In this case, there is no uncertainty.

# Entropy = 0, Gini = 0.

# Pure means perfectly classified (no mix of classes).

#  Impurity

# A node is impure if it contains a mix of different classes.

# Example:

# If a node has 50 Dogs and 50 Cats → maximum impurity (most uncertain).

# If a node has 80 Dogs and 20 Cats → still impure,
#  but less than the 50–50 case.

#  Impure means uncertain / mixed classes.

In [None]:
# How Decision Trees Use This


# Decision trees split data so that impurity decreases after each split.

# Measures of impurity:

# Entropy → 0 (pure) to 1 (most impure in binary case).

# Gini Index → 0 (pure) to 0.5 (most impure in binary case).

In [None]:
# gini

# 🔹 Properties

# Pure node → only one class present → Gini = 0

# Most impure node (equal distribution of classes) → Gini is maximum
# (1 - 1/c for c classes, i.e. 0.5 in the binary case)

In [None]:
# observation

#  1. The greater the uncertainty, the higher the entropy.

# define

#  Entropy and Uncertainty

# Entropy is a measure of uncertainty (or impurity).

# If the outcome is certain → entropy is low (0).

# If the outcome is uncertain / random → entropy is high.


# example

#  Binary Classification Example

# 100% Dog (no Cat)

# No uncertainty, you already know the class.

# Entropy = 0 (pure).

# 50% Dog, 50% Cat

# Maximum uncertainty → you can’t predict better than random guess.

# Entropy = 1 (maximum).

# 80% Dog, 20% Cat

# Some uncertainty (but less than 50–50).

# Entropy ≈ 0.72.

# Conclusion:
# Decision trees work by splitting the data in a way that
# reduces uncertainty (entropy) step by step.

#  2. For a binary (two-class) problem the minimum entropy is 0 and the
#    maximum is 1 (when using log base 2).

# Entropy in classification problems:

# Minimum Entropy = 0
# → This happens when the node is pure (all samples belong to the same class).
# Example: 100% Dog, 0% Cat → Entropy = 0

# Maximum Entropy = log₂(c), where c = number of classes
# → For binary classification (c = 2): max entropy = log₂(2) = 1
# This happens when the classes are evenly split (50% Dog, 50% Cat).

# → For 3 classes (c = 3): max entropy = log₂(3) ≈ 1.585
# This happens when all three classes are equally distributed (1/3 each).

# Summary:

# For binary classification → min entropy = 0, max entropy = 1



#  For multi-class classification (c classes) →
#   min entropy = 0, max entropy = log₂(c)

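# Either log₂ or the natural log (ln) can be used to calculate entropy.
# The base only changes the unit: log₂ gives bits, ln gives nats,
# so the binary maximum is 1 bit = ln(2) ≈ 0.693 nats.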



In [None]:
#  Entropy and KDE

# If the KDE curve is very flat (low peakedness) → data is spread out
#  → high entropy (more uncertainty).

# If the KDE curve is very sharp (high peakedness) →
#  data is concentrated in a small region → low entropy (less uncertainty).

#   Think of What Entropy Measures

# Entropy = average uncertainty / unpredictability.

# If the probability distribution is sharp (high peakedness) →
#  most of the probability mass is concentrated in a small region →
#  outcomes are more predictable → entropy is low.

# If the probability distribution is flat (low peakedness) →
#  probability mass is spread over a wide region →
#  outcomes can occur in many places → more uncertainty → entropy is high.

In [None]:
# entropy

#        Meaning of Entropy

# Entropy is a measure of uncertainty, impurity, or randomness in the data.

# When the classes are mixed, entropy is high.

# When the data is pure (all samples belong to one class),
#  entropy is low (zero).

#  Parent Entropy = 1 (What it Means)

# In your case, the parent node has:

# Yes = 4 (50%)

# No = 4 (50%)

# This is a perfectly balanced distribution (50–50).

#  It means we are completely uncertain about the next sample →
#  it could be Yes or No with equal probability.

# Therefore, entropy takes its maximum value = 1 bit
#  (for binary classification).




# Comparison Cases

# All Yes (8 Yes, 0 No):

# H = 0

# → No uncertainty (we are fully sure outcome = Yes).

# All No (0 Yes, 8 No):


# H = 0

# → No uncertainty (we are fully sure outcome = No).

# 50–50 split (4 Yes, 4 No):

# H = 1

# → Maximum uncertainty (highest confusion).


In [None]:
#  So, Parent Entropy = 1 means the data is maximally impure
#            (50% Yes, 50% No).
#  It’s the highest possible uncertainty for binary classification.

In [None]:
# Information Gain (IG)

# Definition: Information Gain is a metric used in Decision Trees
# to measure how much a feature reduces the uncertainty (entropy) in the data.

# It basically tells:

#  “How good is this feature at splitting the data into pure groups?”



#  In One Line

# Information Gain = Reduction in uncertainty (entropy)
# after splitting on a feature.

In [None]:
# Example: Information Gain Calculation


# We have 8 samples in the parent node:

# Yes = 4

# No = 4

# Step 1: Parent Entropy
# H(Parent) = -(0.5 * log2(0.5) + 0.5 * log2(0.5)) = 1

# Step 2: Split on the feature "Outlook"
# After splitting, the data becomes:

# Sunny group (4 samples): 3 Yes, 1 No

# Rainy group (4 samples): 1 Yes, 3 No

# Step 3: Child Entropies


# Sunny group (3 Yes, 1 No):
# H(Sunny) = -(3/4 * log2(3/4) + 1/4 * log2(1/4)) = 0.811

# Rainy group (1 Yes, 3 No):
# H(Rainy) = -(1/4 * log2(1/4) + 3/4 * log2(3/4)) = 0.811

# Step 4: Weighted Average Entropy of Children
# H(Children) = (4/8 * 0.811) + (4/8 * 0.811) = 0.811

# Step 5: Information Gain
# IG = H(Parent) - H(Children)
# IG = 1 - 0.811 = 0.189

# Final Result:
# The Information Gain for splitting on "Outlook" = 0.189 bits.

# Interpretation:

# This feature reduces uncertainty a little, but not perfectly.
# If one branch had all Yes and the other had all No,
# then IG would be 1 (maximum).

In [None]:
# Parent Entropy = entropy of the node's data before the split
# (for the root node, this is the entire dataset).

# Examples:

# All samples same (Yes, Yes, Yes, …) → Parent Entropy = 0

# Half Yes, half No → Parent Entropy = 1 (maximum)

# 6 Yes, 2 No → Parent Entropy = 0.811 (some uncertainty)



In [None]:
# Gini Impurity:

# Gini Impurity measures the probability of incorrectly classifying
#  a randomly chosen element if it was labeled
#  according to the distribution of classes in the dataset.

# Formula: Gini = 1 - Σ (pi²)
# (where pi is the probability of class i)

# If Gini = 0 → dataset is pure (all samples belong to one class).

# Higher Gini → more mixed classes, more impurity.

In [None]:
# gini vs entropy

# Difference between Gini Impurity and Entropy

# Definition

# Entropy measures the amount of information (or uncertainty) in the dataset.

# Gini Impurity measures the probability of misclassifying
# a randomly chosen sample.

# Formula

# Entropy = – Σ (pi * log₂ pi)

# Gini = 1 – Σ (pi²)

# Range

# Entropy: 0 to 1 (for binary classification).

# Gini: 0 to 0.5 (for binary classification).

# Interpretation

# Entropy is based on information theory (information gain).

# Gini is based on probability of misclassification.

# Speed

# Entropy is slower to compute (because of log).

# Gini is faster (no log).

# Tree Splitting

# Both often give similar splits.

# But Gini tends to isolate the most frequent class,
#  while Entropy is more sensitive to class distribution.


In [None]:
# Relation between Information Gain and Impurity


# When we split a dataset, the impurity (Entropy or Gini) decreases.

# The greater the decrease in impurity, the higher the Information Gain.

# Formula:
# Information Gain = Parent Impurity – Weighted Average of Child Impurities

# So:

# Higher Information Gain → Better split (less impurity in child nodes)

# Lower Information Gain → Poor split (impurity is still high)

# Conclusion:
# More Information Gain = less impurity remaining in the child nodes
# (i.e. a larger drop from the parent's impurity).