<a href="https://colab.research.google.com/github/JinxWycman/MACHINE-LEARNING/blob/main/py_code_for_Split_algorithm_using_IDT3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import math

# Toy "play tennis" dataset: each record maps the four weather features and
# the boolean class label ('play_tennis') to their observed values.
data = [
    dict(zip(('outlook', 'temperature', 'humidity', 'windy', 'play_tennis'), values))
    for values in [
        ('sunny', 'hot', 'high', False, False),
        ('sunny', 'hot', 'high', True, False),
        ('overcast', 'hot', 'high', False, True),
        ('rainy', 'mild', 'high', False, True),
        ('rainy', 'cool', 'normal', False, True),
        ('rainy', 'cool', 'normal', True, False),
        ('overcast', 'cool', 'normal', True, True),
        ('sunny', 'mild', 'high', False, False),
        ('sunny', 'cool', 'normal', False, True),
        ('rainy', 'mild', 'normal', False, True),
        ('sunny', 'mild', 'normal', True, True),
        ('overcast', 'mild', 'high', True, True),
        ('overcast', 'hot', 'normal', False, True),
        ('rainy', 'mild', 'high', True, False),
    ]
]

# Function to calculate entropy
def calculate_entropy(data, target='play_tennis'):
    """Return the Shannon entropy, in bits, of the class labels in *data*.

    Args:
        data: Sized collection of records (dict-like), each holding its
            class label under the key *target*.
        target: Key of the class label in every record. Defaults to
            'play_tennis', the label used by the dataset in this file.

    Returns:
        0.0 for empty data; otherwise ``-sum(p * log2(p))`` over the
        relative frequency ``p`` of each distinct label.

    Raises:
        KeyError: If a record has no *target* key.
    """
    num_data = len(data)
    if num_data == 0:
        return 0.0

    # Tally how many records carry each distinct label.
    label_counts = {}
    for record in data:
        label = record[target]
        label_counts[label] = label_counts.get(label, 0) + 1

    entropy = 0.0
    for count in label_counts.values():
        prob = count / num_data
        # log2 is the dedicated base-2 logarithm; it avoids the rounding
        # error of computing log(prob) / log(2) via math.log(prob, 2).
        entropy -= prob * math.log2(prob)
    return entropy

# Function to split the dataset based on a feature
def split_dataset(data, feature):
    """Group the records of *data* by their value for *feature*.

    Returns a dict mapping each distinct feature value to the list of
    records carrying that value. Keys appear in order of first occurrence,
    and records keep their original relative order within each list.
    """
    groups = {}
    for row in data:
        # setdefault creates the bucket the first time a value is seen.
        groups.setdefault(row[feature], []).append(row)
    return groups

# Function to select the best feature to split on
def select_best_feature(data, features):
    """Return the feature in *features* with the highest information gain.

    The gain of a feature is the entropy of *data* minus the size-weighted
    entropy of the subsets obtained by partitioning *data* on that feature.
    Only strictly positive gains count, and on ties the feature listed
    first wins. Returns None when no feature has a positive gain (including
    when *features* is empty).
    """
    total = len(data)
    parent_entropy = calculate_entropy(data)

    winner = None
    winning_gain = 0.0
    for candidate in features:
        weighted_entropy = 0.0
        for observed in set([row[candidate] for row in data]):
            matching = [row for row in data if row[candidate] == observed]
            weight = len(matching) / total
            weighted_entropy += weight * calculate_entropy(matching)

        gain = parent_entropy - weighted_entropy
        if gain > winning_gain:
            winner, winning_gain = candidate, gain
    return winner

# Function to build the decision tree
def build_tree(data, features):
    """Recursively build an ID3 decision tree for the 'play_tennis' label.

    Args:
        data: Collection of records (dicts) that each contain a
            'play_tennis' label and a value for every name in *features*.
        features: Feature names still available for splitting. The
            argument is not modified.

    Returns:
        - None when *data* is empty (there is nothing to predict);
        - a class label when the node is a leaf: all records agree, no
          features remain, or no remaining feature has positive
          information gain (the majority label is used in the last two
          cases);
        - otherwise a nested dict ``{feature: {value: subtree, ...}}`` whose
          branches are ordered by first occurrence of each value in *data*.
    """
    class_labels = [record['play_tennis'] for record in data]

    # Empty data must be handled before anything indexes class_labels[0].
    if not class_labels:
        return None

    # If all instances have the same class label, return that label
    if class_labels.count(class_labels[0]) == len(class_labels):
        return class_labels[0]

    # Select the best feature to split on (None if there is nothing to pick)
    best_feature = select_best_feature(data, features) if features else None

    # No features left, or none of them separates the labels any further:
    # stop here and predict the majority class instead of splitting on None.
    if best_feature is None:
        return max(set(class_labels), key=class_labels.count)

    # Every branch receives the same reduced feature list, so build it once.
    sub_features = [name for name in features if name != best_feature]

    # Partition the records by their value of the best feature in one pass;
    # dict insertion order keeps the branch order stable between runs.
    partitions = {}
    for record in data:
        partitions.setdefault(record[best_feature], []).append(record)

    # Recursively build the tree for each value of the best feature
    return {
        best_feature: {
            value: build_tree(subset, sub_features)
            for value, subset in partitions.items()
        }
    }

# Main function to build and print the decision tree
def main():
    """Build the decision tree for the module-level dataset and print it."""
    # Column names come from the first record; the class label itself is
    # not a candidate for splitting, so drop it from the list.
    feature_names = [*data[0].keys()]
    feature_names.remove('play_tennis')
    print(build_tree(data, feature_names))


if __name__ == '__main__':
    main()

{'outlook': {'sunny': {'humidity': {'normal': True, 'high': False}}, 'overcast': True, 'rainy': {'windy': {False: True, True: False}}}}


Figure 4.8: Output of code 4.5
Split algorithm based on Gini index
The Gini index is another common splitting criterion in decision tree algorithms. It measures the impurity of a dataset as the probability of misclassifying a randomly chosen element if that element were labeled at random according to the class distribution of the dataset. The goal at each node is to choose the split that minimizes the weighted Gini index of the resulting subsets. Code 4.6 is an example of implementing a binary decision tree split based on the Gini index in Python:
**bold text**