<a href="https://colab.research.google.com/github/JinxWycman/MACHINE-LEARNING/blob/main/Python_code_for_Split_algorithm_using_Gini_Index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import math

# Define a toy dataset (features and labels)
# Each tuple is (feature_1, feature_2, label); the comprehension expands it
# into the record dicts used by the rest of the module. Key order matters:
# main() derives the feature list from the keys of the first record.
data = [
    {'feature_1': first, 'feature_2': second, 'label': label}
    for first, second, label in (
        (1, 0, 'A'),
        (0, 1, 'B'),
        (1, 1, 'A'),
        (0, 0, 'B'),
        (1, 0, 'B'),
        (0, 1, 'A'),
        (1, 1, 'B'),
        (0, 0, 'A'),
    )
]

# Calculate Gini index for a dataset
def calculate_gini(data):
    """Return the Gini impurity of the labels found in ``data``.

    Args:
        data: A sized collection of records, each a mapping with a
            ``'label'`` key.

    Returns:
        ``1 - sum(p_i ** 2)`` over the label proportions ``p_i``, as a
        float. An empty collection yields ``0.0``.
    """
    n_records = len(data)
    if not n_records:
        # Nothing to measure; also avoids dividing by zero below.
        return 0.0

    # Tally how many records carry each label.
    tally = {}
    for row in data:
        key = row['label']
        tally[key] = tally.get(key, 0) + 1

    # Subtract each squared proportion from 1, in first-seen label order.
    impurity = 1.0
    for count in tally.values():
        impurity -= (count / n_records) ** 2
    return impurity

# Split the dataset based on a feature and value
def split_dataset(data, feature, value):
    """Partition ``data`` by whether ``record[feature]`` equals ``value``.

    Args:
        data: Iterable of records (mappings) that each contain ``feature``.
        feature: Key to look up in every record.
        value: Value compared against ``record[feature]``.

    Returns:
        A ``(left, right)`` tuple of lists: ``left`` holds the records whose
        feature equals ``value``, ``right`` holds the rest. Input order is
        preserved within each list.
    """
    matching, remainder = [], []
    for row in data:
        bucket = matching if row[feature] == value else remainder
        bucket.append(row)
    return matching, remainder

# Select the best split based on Gini index
def select_best_split(data, features):
    """Find the (feature, value) split with the lowest weighted Gini index.

    Every distinct value of every feature is tried as an equality split
    (``record[feature] == value`` versus everything else). Splits that leave
    one side empty are ignored.

    Args:
        data: Sized collection of records, each containing every name in
            ``features`` plus a ``'label'`` key.
        features: Iterable of feature names to consider.

    Returns:
        A tuple ``(feature, value, gini_left, gini_right)`` for the best
        split, or ``None`` when no split with two non-empty sides scores
        below 1.0. On ties the first candidate encountered wins.
    """
    total = len(data)
    lowest_score = 1.0
    winner = None

    for feature in features:
        candidates = set(row[feature] for row in data)
        for value in candidates:
            left, right = split_dataset(data, feature, value)

            # A split is only useful if both sides receive records.
            if not (left and right):
                continue

            left_impurity = calculate_gini(left)
            right_impurity = calculate_gini(right)

            # Weight each side's impurity by its share of the records.
            score = (len(left) / total) * left_impurity + (len(right) / total) * right_impurity

            # Strict comparison keeps the earliest candidate on ties.
            if score < lowest_score:
                lowest_score = score
                winner = (feature, value, left_impurity, right_impurity)

    return winner

# Recursive function to build the decision tree
def _majority_label(data):
    """Return the most frequent label in ``data``; ties go to the label seen first."""
    label_counts = {}
    for record in data:
        label = record['label']
        label_counts[label] = label_counts.get(label, 0) + 1
    return max(label_counts, key=label_counts.get)


def build_tree(data, features):
    """Recursively build a binary decision tree using Gini-based splits.

    Args:
        data: Non-empty sequence of records, each a mapping containing every
            name in ``features`` plus a ``'label'`` key.
        features: Sequence of feature names that may be used for splitting.
            It is not modified.

    Returns:
        Either a label (leaf) or a nested dict of the form
        ``{feature: {value: subtree, 'other': subtree}}``, where the
        ``value`` branch holds records with ``record[feature] == value`` and
        the ``'other'`` branch holds the rest.

    Raises:
        ValueError: If ``data`` is empty; no tree or label can be derived.
    """
    # An empty dataset previously surfaced as an opaque error from max() on an
    # empty dict; fail early with a message that names the real problem.
    if not data:
        raise ValueError("build_tree() requires at least one record")

    # Base case 1: all records share one label -> leaf with that label.
    if len(set(record['label'] for record in data)) == 1:
        return data[0]['label']

    # Base case 2: nothing left to split on -> majority vote.
    if len(features) == 0:
        return _majority_label(data)

    # Base case 3: no split with two non-empty sides exists -> majority vote.
    best_split = select_best_split(data, features)
    if not best_split:
        return _majority_label(data)

    feature, value, _, _ = best_split
    left, right = split_dataset(data, feature, value)

    # Every record in `left` has feature == value, so the feature is constant
    # there and can be dropped. The split is binary, however: `right` may still
    # contain several distinct values of the same feature, so it must remain
    # available on that side or those values could never be separated.
    # Recursion still terminates because both sides are strictly smaller than
    # `data`, and a feature that is constant within a subset is never selected.
    left_features = [name for name in features if name != feature]

    # NOTE: a feature value equal to the string 'other' would collide with the
    # key used for the right-hand branch.
    return {
        feature: {
            value: build_tree(left, left_features),
            'other': build_tree(right, features),
        }
    }

# Main function to build and print the decision tree
def main():
    """Build a decision tree from the module-level ``data`` and print it.

    Every key of the first record except ``'label'`` is treated as a feature.
    """
    feature_names = [*data[0]]
    feature_names.remove('label')
    tree = build_tree(data, feature_names)
    # One call, newline-separated: emits the heading line, then the tree.
    print("Decision Tree:", tree, sep="\n")


if __name__ == '__main__':
    main()

Decision Tree:
{'feature_1': {0: {'feature_2': {0: 'B', 'other': 'B'}}, 'other': {'feature_2': {0: 'A', 'other': 'A'}}}}


Figure 4.9: Output of code 4.6
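This example demonstrates how to implement a binary decision tree that uses the Gini index to choose its splits. At each node the algorithm evaluates every candidate (feature, value) pair, keeps the one with the lowest weighted Gini index, and recurses on the two resulting subsets until a node is pure or no further split is possible; the resulting tree can then be used for classification tasks. Note that in this toy dataset neither feature is informative (every split leaves a 50/50 mix of labels), so each leaf falls back to the majority label, with ties resolved in favour of the label encountered first. This is why both leaves under each branch of the output above carry the same label.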