In [24]:
import pandas as pd
import numpy as np
import pprint
# eps is used to avoid division by zero exception and log(0) invalid expression
eps = np.finfo(float).eps

# find entropy for the dataset
def find_entropy(df):
    """Return the Shannon entropy (base 2) of the target column of ``df``.

    The target is assumed to be the last column of the frame. The entropy is
    ``sum(-p_i * log2(p_i))`` over the distinct target values, where ``p_i`` is
    the fraction of rows carrying value ``i``.

    Parameters:
    - df: DataFrame whose last column holds the class labels

    Returns:
    - entropy: 0 for an empty frame, otherwise a float in bits
    """
    # assuming the last attribute of dataset to be the target
    labels = df[df.keys()[-1]]
    total = len(labels)
    # Count every class once up front; the counts do not change inside the loop.
    class_counts = labels.value_counts()

    entropy = 0
    # Iterate in order of first appearance so the summation order is stable.
    for value in labels.unique():
        fraction = class_counts[value] / total
        entropy += -fraction * np.log2(fraction)
    return entropy

# find individual attribute's entropy (used in information gain calculation)
def find_entropy_attribute(df, attribute):
    """Return the weighted target entropy after splitting ``df`` on ``attribute``.

    For every distinct value ``v`` of ``attribute`` the entropy of the target
    column (assumed to be the last column) is computed over the rows where
    ``attribute == v`` and weighted by the share of rows having that value.
    Subtracting the result from the dataset entropy gives the information gain.

    Classes that do not occur in a subset are skipped (the ``0 * log2(0)`` term
    is taken as 0), so no epsilon correction is needed and a perfectly pure
    split yields exactly 0.

    Parameters:
    - df: DataFrame whose last column holds the class labels
    - attribute: name of the column to split on

    Returns:
    - entropy_attribute: non-negative float (0.0 for an empty frame)
    """
    # assuming the last attribute of dataset to be the target
    target = df.keys()[-1]
    total_rows = len(df)

    entropy_attribute = 0.0
    for value in df[attribute].unique():
        # Target labels of the rows that carry this attribute value.
        subset_targets = df.loc[df[attribute] == value, target]
        subset_size = len(subset_targets)
        if subset_size == 0:
            # e.g. a NaN attribute value never compares equal to itself
            continue

        class_counts = subset_targets.value_counts()
        # Drop zero counts (possible with categorical targets) before the log.
        class_counts = class_counts[class_counts > 0]
        probabilities = class_counts / subset_size
        subset_entropy = -(probabilities * np.log2(probabilities)).sum()

        # Weight by the fraction of all rows that fall into this subset.
        entropy_attribute += (subset_size / total_rows) * subset_entropy

    return entropy_attribute

def find_decider(df):
    """Return the name of the attribute with the highest information gain.

    Candidate attributes are the columns ``df.keys()[1:-1]``: the last column is
    the target, and the first column is skipped (presumably a row identifier
    such as ``day`` -- confirm before using a dataset without one). When several
    attributes tie, the first one in column order wins (``np.argmax``).

    Parameters:
    - df: DataFrame laid out as [first column, attributes..., target]

    Returns:
    - the column label of the best attribute

    Raises:
    - ValueError: if there is no candidate attribute column
    """
    candidates = df.keys()[1:-1]
    if len(candidates) == 0:
        raise ValueError("find_decider needs at least one attribute column between the first and last columns")

    # The dataset entropy is the same for every attribute, so compute it once.
    dataset_entropy = find_entropy(df)
    info_gain = [dataset_entropy - find_entropy_attribute(df, key) for key in candidates]
    return candidates[np.argmax(info_gain)]

def get_subtable(df, node, value):
    """Return the rows of ``df`` where column ``node`` equals ``value``.

    The result keeps every column (including ``node`` itself) and is given a
    fresh 0..n-1 index.
    """
    matching_rows = df[node] == value
    subtable = df.loc[matching_rows]
    return subtable.reset_index(drop=True)


def buildTree(df, tree=None):
    """Recursively build a decision tree as nested dictionaries.

    The tree has the shape ``{attribute: {value: subtree_or_label}}``. At each
    level the attribute chosen by ``find_decider`` becomes the node, and every
    distinct value of it becomes a branch. A branch ends in a leaf (a target
    label) when its rows all share one target value; otherwise the branch is
    expanded recursively on the matching rows.

    If a branch would contain exactly the same rows as ``df`` (the chosen
    attribute has a single value here, so splitting makes no progress) while
    the targets are still mixed, the most frequent target value is used as the
    leaf instead of recursing forever.

    Parameters:
    - df: DataFrame whose last column is the target
    - tree: optional dict to populate; a new one is created when omitted

    Returns:
    - tree: the populated dictionary
    """
    target = df.keys()[-1]
    # root is the node with highest gain
    node = find_decider(df)
    # different values of the root
    values = np.unique(df[node])

    if tree is None:
        tree = {}
    # Ensure the branch container exists even when the caller supplied a dict.
    tree.setdefault(node, {})

    for value in values:
        subtable = get_subtable(df, node, value)
        target_value, counts = np.unique(subtable[target], return_counts=True)

        if len(counts) == 1:
            # single target value observed, leaf is added with the value
            tree[node][value] = target_value[0]
        elif len(subtable) == len(df):
            # The split did not shrink the data, so recursing would repeat this
            # exact call; fall back to the majority class.
            tree[node][value] = target_value[np.argmax(counts)]
        else:
            # still mixed and strictly smaller: grow the subtree recursively
            tree[node][value] = buildTree(subtable)
    return tree

if __name__ == "__main__":
    # Load the training examples; buildTree treats the last column as the target.
    df = pd.read_csv('tennisDT.csv')
    # Echo the raw training table for reference.
    print(df)
    # Build the decision tree; `t` is a module-level name holding the nested-dict tree.
    t = buildTree(df)
    # pprint (pretty print) is used for better output interpretability
    print("\nDecision Tree:")
    pprint.pprint(t)


## OUTPUT
# {'outlook': {'Overcast': 'Yes',
#              'Rain': {'wind': {'Strong': 'No', 'Weak': 'Yes'}},
#              'Sunny': {'humidity': {'High': 'No', 'Normal': 'Yes'}}}}

    day   outlook temperature humidity    wind play-tennis
0    D1     Sunny         Hot     High    Weak          No
1    D2     Sunny         Hot     High  Strong          No
2    D3  Overcast         Hot     High    Weak         Yes
3    D4      Rain        Mild     High    Weak         Yes
4    D5      Rain        Cool   Normal    Weak         Yes
5    D6      Rain        Cool   Normal  Strong          No
6    D7  Overcast        Cool   Normal  Strong         Yes
7    D8     Sunny        Mild     High    Weak          No
8    D9     Sunny        Cool   Normal    Weak         Yes
9   D10      Rain        Mild   Normal    Weak         Yes
10  D11     Sunny        Mild   Normal  Strong         Yes
11  D12  Overcast        Mild     High  Strong         Yes
12  D13  Overcast         Hot   Normal    Weak         Yes
13  D14      Rain        Mild     High  Strong          No

Decision Tree:
{'outlook': {'Overcast': 'Yes',
             'Rain': {'wind': {'Strong': 'No', 'Weak': 'Yes'}},
   

In [25]:
import pandas as pd

# def classify_sample(sample, tree):
#     # Check if the tree is a leaf node
#     if not isinstance(tree, dict):
#         return tree

#     # Get the attribute in the sample
#     attribute = next(iter(tree))

#     # Check if the attribute is present in the sample
#     if attribute in sample:
#         # Get the value of the attribute in the sample
#         value = sample[attribute]

#         # Check if the value is present in the current node of the tree
#         if value in tree[attribute]:
#             # Continue traversal
#             return classify_sample(sample, tree[attribute][value])

#     # If attribute or value is not present, return the current node as the classification
#     return tree

def classify_sample(sample, tree):
    """Classify ``sample`` by walking the nested-dict decision ``tree``.

    ``tree`` is either a leaf (any non-dict value, returned as the label) or a
    dict whose first key is the attribute to test and whose value maps attribute
    values to subtrees. ``sample`` maps attribute names to values.

    Returns the leaf label reached, or the string "Unknown" when the sample
    lacks the tested attribute or carries a value the tree has no branch for.
    """
    current = tree
    # Descend until a leaf (non-dict) is reached.
    while isinstance(current, dict):
        attribute = next(iter(current))

        # The sample does not provide the attribute this node tests.
        if attribute not in sample:
            return "Unknown"

        observed = sample[attribute]
        branches = current[attribute]

        # The tree never saw this attribute value during training.
        if observed not in branches:
            return "Unknown"

        current = branches[observed]

    return current

if __name__ == "__main__":
    # Read the samples to classify from the prediction CSV file
    df_pred = pd.read_csv('tennisPred.csv')

    # Classify every row: each row is converted to a {column: value} dict and walked
    # through the tree `t`, which must already be defined by the tree-building cell.
    df_pred['classification'] = df_pred.apply(lambda row: classify_sample(row.to_dict(), t), axis=1)

    # Show the attributes next to the actual label ('play') and the predicted one
    print(df_pred[['outlook', 'temperature', 'humidity', 'wind', 'play', 'classification']])


     outlook temperature humidity    wind play classification
0   Overcast         Hot   Normal    Weak  Yes            Yes
1      Sunny        Mild     High    Weak   No             No
2      Sunny        Cool     High    Weak  Yes             No
3       Rain        Cool     High  Strong   No             No
4      Sunny         Hot   Normal  Strong  Yes            Yes
5   Overcast        Mild   Normal    Weak  Yes            Yes
6       Rain        Mild     High    Weak  Yes            Yes
7      Sunny        Cool   Normal  Strong   No            Yes
8      Sunny         Hot     High    Weak   No             No
9       Rain        Cool   Normal    Weak  Yes            Yes
10  Overcast        Mild   Normal  Strong  Yes            Yes
11     Sunny        Cool     High  Strong   No             No
12     Sunny        Mild   Normal    Weak  Yes            Yes
13  Overcast         Hot     High    Weak  Yes            Yes
14      Rain        Mild   Normal  Strong   No             No


In [26]:

def classification_error(df, target_column, predicted_column):
    """
    Compute the share of rows whose prediction differs from the actual label.

    Parameters:
    - df: DataFrame, the dataset
    - target_column: str, name of the column with the actual classifications
    - predicted_column: str, name of the column with the predicted classifications

    Returns:
    - error_rate: float, number of mismatching rows divided by the number of rows
    """
    # Row-wise flag: True where the prediction disagrees with the actual label.
    mismatch_mask = df[target_column] != df[predicted_column]
    wrong_count = int(mismatch_mask.sum())
    total_count = len(df)
    return wrong_count / total_count

# Compare the actual 'play' labels with the tree's 'classification' column in df_pred
# (df_pred must already carry both columns) and report the result as a percentage
# with two decimals.
error_rate = classification_error(df_pred, 'play', 'classification')
print(f"Classification Error Rate: {error_rate * 100:.2f}%")

Classification Error Rate: 13.33%


In [8]:
def classify_sample(sample, tree):
    """Classify ``sample`` by walking the nested-dict decision ``tree``.

    Parameters:
    - sample: dict mapping attribute names to the sample's values
    - tree: a leaf label (any non-dict value), or a dict whose first key is the
      attribute to test and whose value maps attribute values to subtrees

    Returns:
    - the leaf label reached, or "Unknown" when the sample lacks the tested
      attribute or has a value with no matching branch. Returning a fixed
      label rather than the unresolved subtree keeps the result a plain
      classification value that can be compared against actual labels.
    """
    # Check if the tree is a leaf node
    if not isinstance(tree, dict):
        return tree

    # The attribute tested at this node is the first key of the dict
    attribute = next(iter(tree))

    # Check if the attribute is present in the sample
    if attribute in sample:
        # Get the value of the attribute in the sample
        value = sample[attribute]

        # Check if the value is present in the current node of the tree
        if value in tree[attribute]:
            # Continue traversal
            return classify_sample(sample, tree[attribute][value])

    # If attribute or value is not present, return a default classification value
    return "Unknown"

if __name__ == "__main__":
    # Sample data for classification: one {attribute: value} mapping
    new_sample = {'outlook': 'Sunny', 'temperature': 'Cool', 'humidity': 'High', 'wind': 'Weak'}

    # Applying the decision tree on the new sample.
    # NOTE(review): `t` is not defined in this cell; the tree-building cell must run first.
    classification_result = classify_sample(new_sample, t)

    # Output the classification result
    print(f"The classification result for the new sample is: {classification_result}")


The classification result for the new sample is: No
