In [1]:
import pandas as pd
import numpy as np
import itertools

In [2]:
dataset = pd.read_csv('/content/advertising.csv')

In [3]:
def gini_index(y):
    m = len(y)
    if m <= 1:
        return 0

    counts = y.value_counts()
    gini = 1 - sum((count / m) ** 2 for count in counts)
    return gini

In [4]:
def entropy(y):
    a = y.value_counts() / y.shape[0]
    return np.sum(-a * np.log2(a + 1e-9))

In [5]:
def gini_information_gain(y, mask):
    total_size = len(y)
    left_size = sum(mask)
    right_size = total_size - left_size

    if left_size == 0 or right_size == 0:
        return 0

    left_gini = gini_index(y[mask])
    right_gini = gini_index(y[~mask])

    gain = gini_index(y) - (left_size / total_size * left_gini + right_size / total_size * right_gini)
    return gain

In [6]:
def entropy_information_gain(y, mask):
    total_size = len(y)
    left_size = sum(mask)
    right_size = total_size - left_size

    if left_size == 0 or right_size == 0:
        return 0

    left_entropy = entropy(y[mask])
    right_entropy = entropy(y[~mask])

    gain = entropy(y) - (left_size / total_size * left_entropy + right_size / total_size * right_entropy)
    return gain

In [7]:
def best_split(dataset, y, func=entropy):
    best_gain = -float('inf')
    best_split_value = None
    best_split_variable = None
    best_mask = None

    for column in dataset.columns:
        values = dataset[column].unique()
        for value in values:
            if dataset[column].dtype != 'O':
                mask = dataset[column] < value
            else:
                mask = dataset[column].isin([value])

            if func == entropy:
                gain = entropy_information_gain(y, mask)
            else:
                gain = gini_information_gain(y, mask)

            if gain > best_gain:
                best_gain = gain
                best_split_value = value
                best_split_variable = column
                best_mask = mask

    return best_split_variable, best_split_value, best_mask, best_gain

In [8]:
def train_tree(dataset, y, max_depth=None, min_samples_split=2, min_information_gain=1e-5, depth=0, func=entropy):
    if len(y.unique()) == 1:
        return y.iloc[0]

    if depth == max_depth or len(dataset) < min_samples_split:
        return y.mode()[0]

    best_variable, best_value, best_mask, best_gain = best_split(dataset, y, func)

    if best_gain < min_information_gain:
        return y.mode()[0]

    left_dataset = dataset[best_mask]
    right_dataset = dataset[~best_mask]
    left_y = y[best_mask]
    right_y = y[~best_mask]

    left_tree = train_tree(left_dataset, left_y, max_depth, min_samples_split, min_information_gain, depth+1, func)
    right_tree = train_tree(right_dataset, right_y, max_depth, min_samples_split, min_information_gain, depth+1, func)

    tree = {f'{best_variable} <= {best_value}': [left_tree, right_tree]}
    return tree

In [9]:
def classify(observation, tree):
    question = list(tree.keys())[0]
    feature, value = question.split(' <= ')
    feature_value = observation[feature]

    if isinstance(tree[question][0], dict):
        if feature_value <= float(value):
            return classify(observation, tree[question][0])
        else:
            return classify(observation, tree[question][1])
    else:
        return tree[question][0]

In [10]:
def print_tree(tree, level=0):
    if isinstance(tree, dict):
        for question, subtrees in tree.items():
            print(f"{'|  ' * level}Question: {question}")
            for subtree in subtrees:
                print_tree(subtree, level + 1)
    else:
        print(f"{'|  ' * level}Predict: {tree}")

In [11]:
tree_gini = train_tree(dataset.drop('Clicked on Ad', axis=1), dataset['Clicked on Ad'], max_depth=5, func=gini_index)
print("Decision Tree (Gini Index):")
print_tree(tree_gini)

Decision Tree (Gini Index):
Question: Daily Internet Usage <= 177.55
|  Question: Daily Time Spent on Site <= 71.4
|  |  Question: Area Income <= 76984.21
|  |  |  Question: Ad Topic Line <= Polarized analyzing concept
|  |  |  |  Predict: 0
|  |  |  |  Question: Ad Topic Line <= Up-sized tertiary contingency
|  |  |  |  |  Predict: 0
|  |  |  |  |  Predict: 1
|  |  |  Predict: 0
|  |  Question: Daily Internet Usage <= 161.42
|  |  |  Question: Daily Time Spent on Site <= 87.35
|  |  |  |  Question: Ad Topic Line <= Progressive clear-thinking open architecture
|  |  |  |  |  Predict: 0
|  |  |  |  |  Predict: 1
|  |  |  |  Question: Daily Time Spent on Site <= 88.97
|  |  |  |  |  Predict: 0
|  |  |  |  |  Predict: 1
|  |  |  Question: Age <= 51
|  |  |  |  Question: Area Income <= 42760.22
|  |  |  |  |  Predict: 1
|  |  |  |  |  Predict: 0
|  |  |  |  Predict: 1
|  Question: Daily Time Spent on Site <= 56.39
|  |  Question: Daily Time Spent on Site <= 48.22
|  |  |  Predict: 1
|  |  

In [12]:
tree_entropy = train_tree(dataset.drop('Clicked on Ad', axis=1), dataset['Clicked on Ad'], max_depth=5, func=entropy)
print("\nDecision Tree (Entropy):")
print_tree(tree_entropy)


Decision Tree (Entropy):
Question: Daily Internet Usage <= 177.55
|  Question: Daily Time Spent on Site <= 71.4
|  |  Question: Area Income <= 76984.21
|  |  |  Question: Ad Topic Line <= Polarized analyzing concept
|  |  |  |  Predict: 0
|  |  |  |  Question: Ad Topic Line <= Up-sized tertiary contingency
|  |  |  |  |  Predict: 0
|  |  |  |  |  Predict: 1
|  |  |  Predict: 0
|  |  Question: Daily Internet Usage <= 146.19
|  |  |  Predict: 1
|  |  |  Question: Area Income <= 48761.14
|  |  |  |  Question: Daily Time Spent on Site <= 89.37
|  |  |  |  |  Predict: 1
|  |  |  |  |  Predict: 0
|  |  |  |  Question: Age <= 45
|  |  |  |  |  Predict: 0
|  |  |  |  |  Predict: 1
|  Question: Daily Time Spent on Site <= 59.52
|  |  Question: Daily Time Spent on Site <= 48.22
|  |  |  Predict: 1
|  |  |  Question: Area Income <= 38817.4
|  |  |  |  Predict: 1
|  |  |  |  Question: Age <= 41
|  |  |  |  |  Predict: 0
|  |  |  |  |  Predict: 1
|  |  Question: Area Income <= 33239.2
|  |  |  Pre

In [13]:
observation = dataset.iloc[0]
prediction_gini = classify(observation, tree_gini)
prediction_entropy = classify(observation, tree_entropy)

In [14]:
print(f"\nPrediction using Gini Index: {prediction_gini}")
print(f"Prediction using Entropy: {prediction_entropy}")


Prediction using Gini Index: 1
Prediction using Entropy: 1
