In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
def gini_impurity(y):
    counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return 1.0 - np.sum(probabilities**2)

def best_split(X, y):
    num_samples, num_features = X.shape
    if num_samples <= 1:
        return None, None

    current_impurity = gini_impurity(y)

    best_feature = None
    best_threshold = None
    best_impurity_reduction = 0

    for feature_index in range(num_features):
        thresholds = np.unique(X[:, feature_index])

        for threshold in thresholds:
            left_mask = X[:, feature_index] <= threshold
            right_mask = ~left_mask

            if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
                continue

            left_impurity = gini_impurity(y[left_mask])
            right_impurity = gini_impurity(y[right_mask])
            impurity_reduction = current_impurity - (
                len(y[left_mask]) / len(y) * left_impurity +
                len(y[right_mask]) / len(y) * right_impurity
            )

            if impurity_reduction > best_impurity_reduction:
                best_feature = feature_index
                best_threshold = threshold
                best_impurity_reduction = impurity_reduction

    return best_feature, best_threshold

In [None]:
def grow_tree(X, y, depth, max_depth):
    num_samples, _ = X.shape
    unique_classes = np.unique(y)

    if len(unique_classes) == 1 or (max_depth is not None and depth == max_depth):
        return {'class': unique_classes[0]}

    best_feature, best_threshold = best_split(X, y)

    if best_feature is None:
        return {'class': np.bincount(y).argmax()}

    left_mask = X[:, best_feature] <= best_threshold
    right_mask = ~left_mask

    left_subtree = grow_tree(X[left_mask], y[left_mask], depth + 1, max_depth)
    right_subtree = grow_tree(X[right_mask], y[right_mask], depth + 1, max_depth)

    return {'feature_index': best_feature,
            'threshold': best_threshold,
            'left': left_subtree,
            'right': right_subtree}

def predict_sample(x, node):
    if 'class' in node:
        return node['class']
    if x[node['feature_index']] <= node['threshold']:
        return predict_sample(x, node['left'])
    else:
        return predict_sample(x, node['right'])

def predict(X, tree):
    return np.array([predict_sample(x, tree) for x in X])

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import random
from pprint import pprint

sns.set_style("darkgrid")
df = pd.read_csv("/Users/mukulhooda/Desktop/College/3rd Year/Machine Learning-1/Lab File/Programs/Iris - Iris.csv")

df = df.drop("Id", axis=1)
df = df.rename(columns={"species": "label"})
df.head()


def train_test_split(df, test_size):
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))
    indices = df.index.tolist()
    test_indices = random.sample(population=indices, k=test_size)
    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)
    return train_df, test_df

def check_purity(data):
    label_column = data[:, -1]
    unique_classes = np.unique(label_column)
    if len(unique_classes) == 1:
        return True
    else:
        return False


def classify_data(data):
    label_column = data[:, -1]
    unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)
    index = counts_unique_classes.argmax()
    classification = unique_classes[index]
    return classification


def get_potential_splits(data):
    potential_splits = {}
    _, n_columns = data.shape
    for column_index in range(n_columns - 1):
        potential_splits[column_index] = []
        values = data[:, column_index]
        unique_values = np.unique(values)
        for index in range(len(unique_values)):
            if index != 0:
                current_value = unique_values[index]
                previous_value = unique_values[index - 1]
                potential_split = (current_value + previous_value) / 2
                potential_splits[column_index].append(potential_split)
    return potential_splits


def split_data(data, split_column, split_value):
    split_column_values = data[:, split_column]
    data_below = data[split_column_values <= split_value]
    data_above = data[split_column_values > split_value]
    return data_below, data_above


def calculate_entropy(data):
    label_column = data[:, -1]
    _, counts = np.unique(label_column, return_counts=True)
    probabilities = counts / counts.sum()
    entropy = sum(probabilities * -np.log2(probabilities))
    return entropy


def calculate_overall_entropy(data_below, data_above):
    n = len(data_below) + len(data_above)
    p_data_below = len(data_below) / n
    p_data_above = len(data_above) / n
    overall_entropy = (p_data_below * calculate_entropy(data_below)
                       + p_data_above * calculate_entropy(data_above))
    return overall_entropy


def determine_best_split(data, potential_splits):
    overall_entropy = 9999
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            current_overall_entropy = calculate_overall_entropy(data_below, data_above)
            if current_overall_entropy <= overall_entropy:
                overall_entropy = current_overall_entropy
                best_split_column = column_index
                best_split_value = value

    return best_split_column, best_split_value


sub_tree = {"question": ["yes_answer",
                         "no_answer"]}
example_tree = {"petal_width <= 0.8": ["Iris-setosa",
                                       {"petal_width <= 1.65": [{"petal_length <= 4.9": ["Iris-versicolor",
                                                                                         "Iris-virginica"]},
                                                                "Iris-virginica"]}]}


def decision_tree_algorithm(df, counter=0, min_samples=2, max_depth=5):
    # data preparations
    if counter == 0:
        global COLUMN_HEADERS
        COLUMN_HEADERS = df.columns
        data = df.values
    else:
        data = df
        # base cases
    if (check_purity(data)) or (len(data) < min_samples) or (counter == max_depth):
        classification = classify_data(data)
        return classification
    # recursive part
    else:
        counter += 1
        # helper functions
        potential_splits = get_potential_splits(data)
        split_column, split_value = determine_best_split(data, potential_splits)
        data_below, data_above = split_data(data, split_column, split_value)

        # instantiate sub-tree
        feature_name = COLUMN_HEADERS[split_column]
        question = "{} <= {}".format(feature_name, split_value)
        sub_tree = {question: []}

        # find answers (recursion)
        yes_answer = decision_tree_algorithm(data_below, counter, min_samples, max_depth)
        no_answer = decision_tree_algorithm(data_above, counter, min_samples, max_depth)

        # If the answers are the same, then there is no point in asking the question.
        # This could happen when the data is classified even though it is not pure
        # yet (min_samples or max_depth base cases).
        if yes_answer == no_answer:
            sub_tree = yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)
        return sub_tree


def classify_example(example, tree):
    question = list(tree.keys())[0]
    feature_name, comparison_operator, value = question.split()

    # ask question
    if example[feature_name] <= float(value):
        answer = tree[question][0]
    else:
        answer = tree[question][1]

    # base case
    if not isinstance(answer, dict):
        return answer

    # recursive part
    else:
        residual_tree = answer
        return classify_example(example, residual_tree)


def calculate_accuracy(df, tree):
    df["classification"] = df.apply(classify_example, axis=1, args=(tree,))
    df["classification_correct"] = df["classification"] == df["label"]
    accuracy = df["classification_correct"].mean()
    return accuracy


train_df, test_df = train_test_split(df, test_size=20)
tree = decision_tree_algorithm(train_df, max_depth=3)
accuracy = calculate_accuracy(test_df, tree)
print()
pprint(tree)
print(accuracy)

In [None]:
import numpy as np
import math
from collections import Counter

def entropy(labels):
    label_counts = Counter(labels)
    entropy = 0
    total_samples = len(labels)
    for label in label_counts:
        probability = label_counts[label] / total_samples
        entropy -= probability * math.log2(probability)
    return entropy

def information_gain(data, labels, feature_index):
    total_entropy = entropy(labels)
    
    feature_values = data[:, feature_index]
    unique_values = np.unique(feature_values)
    feature_entropy = 0
    total_samples = len(labels)
    for value in unique_values:
        subset_indices = np.where(feature_values == value)[0]
        subset_labels = labels[subset_indices]
        subset_entropy = entropy(subset_labels)
        weight = len(subset_indices) / total_samples
        feature_entropy += weight * subset_entropy
    
    information_gain = total_entropy - feature_entropy
    return information_gain

def find_best_split(data, labels):
    num_features = data.shape[1]
    best_feature_index = None
    best_information_gain = -1
    for feature_index in range(num_features):
        ig = information_gain(data, labels, feature_index)
        if ig > best_information_gain:
            best_information_gain = ig
            best_feature_index = feature_index
    return best_feature_index

def build_tree(data, labels, feature_indices):
    unique_labels = np.unique(labels)

    # If there is only one class in labels, return a leaf node with that class
    if len(unique_labels) == 1:
        return unique_labels[0]

    # If there are no more features to split on, return the most common label
    if len(feature_indices) == 0:
        return Counter(labels).most_common(1)[0][0]

    # Find the best feature to split on
    best_feature_index = find_best_split(data, labels)

    # Create a new tree node with the best feature
    tree_node = {'feature_index': best_feature_index, 'children': {}}
    feature_indices.remove(best_feature_index)

    # Recursively build subtrees
    feature_values = np.unique(data[:, best_feature_index])
    for value in feature_values:
        subset_indices = np.where(data[:, best_feature_index] == value)[0]
        subset_data = data[subset_indices]
        subset_labels = labels[subset_indices]
        subtree = build_tree(subset_data, subset_labels, feature_indices[:])
        tree_node['children'][value] = subtree

    return tree_node

def fit(data, labels):
    num_features = data.shape[1]
    feature_indices = list(range(num_features))
    return build_tree(data, labels, feature_indices)

def predict(tree, sample):
    def traverse(node, sample):
        if isinstance(node, dict):
            feature_index = node['feature_index']
            value = sample[feature_index]
            if value in node['children']:
                return traverse(node['children'][value], sample)
            else:
                return None
        else:
            return node

    return traverse(tree, sample)

# Example usage:
# data = np.array([
#     [1, 1],
#     [1, 0],
#     [0, 1],
#     [0, 0],
#     [1, 1],
#     [1, 0],
#     [0, 1],
#     [0, 0]
# ])

# labels = np.array([1, 1, 1, 0, 1, 1, 0, 0])

data=pd.read_csv('/Users/mukulhooda/Documents/Workbooks /Assignment 2.csv')
# tree = fit(data, labels)

# # Predictions for new samples
# new_samples = np.array([
#     [1, 0],
#     [0, 1]
# ])
# for sample in new_samples:
#     prediction = predict(tree, sample)
#     print(f"Prediction for {sample}: {prediction}")


In [None]:
data

In [None]:
import numpy as np
import math
from collections import Counter
import csv

def entropy(labels):
    label_counts = Counter(labels)
    entropy = 0
    total_samples = len(labels)
    for label in label_counts:
        probability = label_counts[label] / total_samples
        entropy -= probability * math.log2(probability)
    return entropy

def information_gain(data, labels, feature_index):
    total_entropy = entropy(labels)
    
    feature_values = data[:, feature_index]
    unique_values = np.unique(feature_values)
    feature_entropy = 0
    total_samples = len(labels)
    for value in unique_values:
        subset_indices = np.where(feature_values == value)[0]
        subset_labels = labels[subset_indices]
        subset_entropy = entropy(subset_labels)
        weight = len(subset_indices) / total_samples
        feature_entropy += weight * subset_entropy
    
    information_gain = total_entropy - feature_entropy
    return information_gain

def find_best_split(data, labels):
    num_features = data.shape[1] - 1
    best_feature_index = None
    best_information_gain = -1
    for feature_index in range(num_features):
        ig = information_gain(data, labels, feature_index)
        if ig > best_information_gain:
            best_information_gain = ig
            best_feature_index = feature_index
    return best_feature_index

def build_tree(data, labels, feature_indices):
    unique_labels = np.unique(labels)

    # If there is only one class in labels, return a leaf node with that class
    if len(unique_labels) == 1:
        return unique_labels[0]

    # If there are no more features to split on, return the most common label
    if len(feature_indices) == 0:
        return Counter(labels).most_common(1)[0][0]

    # Find the best feature to split on
    best_feature_index = find_best_split(data, labels)

    # Create a new tree node with the best feature
    tree_node = {'feature_index': best_feature_index, 'children': {}}
    feature_indices.remove(best_feature_index)

    # Recursively build subtrees
    feature_values = np.unique(data[:, best_feature_index])
    for value in feature_values:
        subset_indices = np.where(data[:, best_feature_index] == value)[0]
        subset_data = data[subset_indices]
        subset_labels = labels[subset_indices]
        subtree = build_tree(subset_data, subset_labels, feature_indices[:])
        tree_node['children'][value] = subtree

    return tree_node

def fit(data, labels):
    num_features = data.shape[1] - 1
    feature_indices = list(range(num_features))
    return build_tree(data, labels, feature_indices)

def predict(tree, sample):
    def traverse(node, sample):
        if isinstance(node, dict):
            feature_index = node['feature_index']
            value = sample[feature_index]
            if value in node['children']:
                return traverse(node['children'][value], sample)
            else:
                return None
        else:
            return node

    return traverse(tree, sample)

def load_data_from_csv(csv_file):
    data = []
    labels = []
    with open(csv_file, 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            data.append([int(x) for x in row[:-1]])
            labels.append(int(row[-1]))
    return np.array(data), np.array(labels)

# Load data from CSV file
csv_file = '/Users/mukulhooda/Documents/Workbooks /Assignment 2.csv'  # Provide the path to your CSV file here
data, labels = load_data_from_csv(csv_file)

# Fit the decision tree
tree = fit(X, Y)

# Predictions for new samples
new_samples = np.array([
    [1, 0, 1, 0, 1],
    [0, 1, 0, 1, 0]
])
for sample in new_samples:
    prediction = predict(tree, sample)
    print(f"Prediction for {sample}: {prediction}")


In [None]:
data=pd.read_csv('/Users/mukulhooda/Documents/Workbooks /Assignment 2.csv')


In [None]:
data[['

In [None]:
# Step 3: Select X and Y Columns
# Replace 'feature_cols' and 'target_col' with your actual column names
feature_cols = ['Day','Outlook','Humidity','Wind','Play Tennis']  # List of column names for features (X)
target_col = 'Play Tennis'  # Name of the column for the target variable (Y)

# Step 4: Extract X and Y
X = data[feature_cols].values  # Extract features as numpy array
Y = data[target_col].values  # Extract target variable as numpy array

# You can now use X and Y for machine learning models

In [None]:
X

In [None]:
Y

In [None]:
import numpy as np
import math
from collections import Counter
import csv

def entropy(labels):
    label_counts = Counter(labels)
    entropy = 0
    total_samples = len(labels)
    for label in label_counts:
        probability = label_counts[label] / total_samples
        entropy -= probability * math.log2(probability)
    return entropy

def information_gain(data, labels, feature_index):
    total_entropy = entropy(labels)
    
    feature_values = data[:, feature_index]
    unique_values = np.unique(feature_values)
    feature_entropy = 0
    total_samples = len(labels)
    for value in unique_values:
        subset_indices = np.where(feature_values == value)[0]
        subset_labels = labels[subset_indices]
        subset_entropy = entropy(subset_labels)
        weight = len(subset_indices) / total_samples
        feature_entropy += weight * subset_entropy
    
    information_gain = total_entropy - feature_entropy
    return information_gain

def find_best_split(data, labels):
    num_features = data.shape[1] - 1
    best_feature_index = None
    best_information_gain = -1
    for feature_index in range(num_features):
        ig = information_gain(data, labels, feature_index)
        if ig > best_information_gain:
            best_information_gain = ig
            best_feature_index = feature_index
    return best_feature_index

def build_tree(data, labels, feature_indices):
    unique_labels = np.unique(labels)

    # If there is only one class in labels, return a leaf node with that class
    if len(unique_labels) == 1:
        return unique_labels[0]

    # If there are no more features to split on, return the most common label
    if len(feature_indices) == 0:
        return Counter(labels).most_common(1)[0][0]

    # Find the best feature to split on
    best_feature_index = find_best_split(data, labels)

    # Create a new tree node with the best feature
    tree_node = {'feature_index': best_feature_index, 'children': {}}
    feature_indices.remove(best_feature_index)

    # Recursively build subtrees
    feature_values = np.unique(data[:, best_feature_index])
    for value in feature_values:
        subset_indices = np.where(data[:, best_feature_index] == value)[0]
        subset_data = data[subset_indices]
        subset_labels = labels[subset_indices]
        subtree = build_tree(subset_data, subset_labels, feature_indices[:])
        tree_node['children'][value] = subtree

    return tree_node

def fit(data, labels):
    num_features = data.shape[1] - 1
    feature_indices = list(range(num_features))
    return build_tree(data, labels, feature_indices)

def predict(tree, sample):
    def traverse(node, sample):
        if isinstance(node, dict):
            feature_index = node['feature_index']
            value = sample[feature_index]
            if value in node['children']:
                return traverse(node['children'][value], sample)
            else:
                return None
        else:
            return node

    return traverse(tree, sample)

def load_data_from_csv(csv_file):
    data = []
    labels = []
    with open(csv_file, 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            data.append([int(x) for x in row[:-1]])
            labels.append(int(row[-1]))
    return np.array(data), np.array(labels)

# Load data from CSV file
# csv_file = 'data.csv'  # Provide the path to your CSV file here
# data, labels = load_data_from_csv(csv_file)

# Fit the decision tree
tree = fit(X, Y)

# Predictions for new samples
new_samples = np.array([
    [1, 0, 1, 0, 1],
    [0, 1, 0, 1, 0]
])
for sample in new_samples:
    prediction = predict(tree, sample)
    print(f"Prediction for {sample}: {prediction}")


In [None]:
import numpy as np
import math
from collections import Counter
import csv

def entropy(labels):
    label_counts = Counter(labels)
    entropy = 0
    total_samples = len(labels)
    for label in label_counts:
        probability = label_counts[label] / total_samples
        entropy -= probability * math.log2(probability)
    return entropy

def information_gain(data, labels, feature_index):
    total_entropy = entropy(labels)
    
    feature_values = data[:, feature_index]
    unique_values = np.unique(feature_values)
    feature_entropy = 0
    total_samples = len(labels)
    for value in unique_values:
        subset_indices = np.where(feature_values == value)[0]
        subset_labels = labels[subset_indices]
        subset_entropy = entropy(subset_labels)
        weight = len(subset_indices) / total_samples
        feature_entropy += weight * subset_entropy
    
    information_gain = total_entropy - feature_entropy
    return information_gain, feature_entropy

def find_best_split(data, labels):
    num_features = data.shape[1] - 1  # Exclude the last column which is the dependent variable
    best_feature_index = None
    best_information_gain = -1
    for feature_index in range(num_features):
        ig, impurity = information_gain(data, labels, feature_index)
        if ig > best_information_gain:
            best_information_gain = ig
            best_feature_index = feature_index
    return best_feature_index

def build_tree(data, labels, feature_indices):
    unique_labels = np.unique(labels)

    # If there is only one class in labels, return a leaf node with that class
    if len(unique_labels) == 1:
        return unique_labels[0]

    # If there are no more features to split on, return the most common label
    if len(feature_indices) == 0:
        return Counter(labels).most_common(1)[0][0]

    # Find the best feature to split on
    best_feature_index = find_best_split(data, labels)

    # Create a new tree node with the best feature
    tree_node = {'feature_index': best_feature_index, 'children': {}}
    feature_indices.remove(best_feature_index)

    # Recursively build subtrees
    feature_values = np.unique(data[:, best_feature_index])
    for value in feature_values:
        subset_indices = np.where(data[:, best_feature_index] == value)[0]
        subset_data = data[subset_indices]
        subset_labels = labels[subset_indices]
        subtree = build_tree(subset_data, subset_labels, feature_indices[:])
        tree_node['children'][value] = subtree

    return tree_node

def fit(data, labels):
    num_features = data.shape[1] - 1  # Exclude the last column which is the dependent variable
    feature_indices = list(range(num_features))
    return build_tree(data, labels, feature_indices)

def predict(tree, sample):
    def traverse(node, sample):
        if isinstance(node, dict):
            feature_index = node['feature_index']
            value = sample[feature_index]
            if value in node['children']:
                return traverse(node['children'][value], sample)
            else:
                return None
        else:
            return node

    return traverse(tree, sample)

def load_data_from_csv(csv_file):
    data = []
    labels = []
    with open(csv_file, 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            data.append([int(x) for x in row[:-1]])
            labels.append(int(row[-1]))
    return np.array(data), np.array(labels)

def print_information_gain(data, labels):
    num_features = data.shape[1] - 1  # Exclude the last column which is the dependent variable
    for feature_index in range(num_features):
        ig, impurity = information_gain(data, labels, feature_index)
        print(f"Information Gain for feature {feature_index}: {ig:.4f}")
        print(f"Mini Impurity for feature {feature_index}: {impurity:.4f}\n")

# Load data from CSV file
# csv_file = 'data.csv'  # Provide the path to your CSV file here
# data, labels = load_data_from_csv(csv_file)

print("Information Gain and Mini Impurity for each attribute:")
print_information_gain(X, Y)

# Fit the decision tree
tree = fit(X, Y)

# Predictions for new samples
new_samples = np.array([
    [1, 0, 1, 1, 0],  # Example new sample
    [0, 1, 0, 0, 1]   # Example new sample
])
for sample in new_samples:
    prediction = predict(tree, sample)
    print(f"Prediction for {sample}: {prediction}")


In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import math

def entropy(y):
    """
    Function to calculate entropy
    """
    class_counts = np.bincount(y)
    ps = class_counts / len(y)
    entropy = -np.sum([p * np.log2(p) for p in ps if p > 0])
    return entropy

def information_gain(parent, left_child, right_child):
    """
    Function to calculate information gain
    """
    num_left = len(left_child) / len(parent)
    num_right = len(right_child) / len(parent)
    ig = entropy(parent) - (num_left * entropy(left_child) + num_right * entropy(right_child))
    return ig

def split(X, y, split_attribute, split_value):
    """
    Function to split the dataset
    """
    left_child_indices = np.where(X.iloc[:, split_attribute] <= split_value)[0]
    right_child_indices = np.where(X.iloc[:, split_attribute] > split_value)[0]
    left_child = y[left_child_indices]
    right_child = y[right_child_indices]
    return left_child, right_child

def find_best_split(X, y):
    """
    Function to find the best attribute to split on
    """
    best_attribute = None
    best_value = None
    max_info_gain = -1

    for attribute in X.columns:
        attribute_values = np.unique(X[attribute])
        for value in attribute_values:
            left_child, right_child = split(X, y, X.columns.get_loc(attribute), value)
            ig = information_gain(y, left_child, right_child)
            if ig > max_info_gain:
                max_info_gain = ig
                best_attribute = attribute
                best_value = value

    return best_attribute, best_value

def fit(X, y):
    """
    Function to fit the decision tree
    """
    if len(set(y)) == 1:
        return Counter(y).most_common(1)[0][0]
    else:
        best_attribute, best_value = find_best_split(X, y)
        if best_attribute is None:
            return Counter(y).most_common(1)[0][0]
        left_child, right_child = split(X, y, X.columns.get_loc(best_attribute), best_value)
        tree = {
            'attribute': best_attribute,
            'value': best_value,
            'left': fit(X.iloc[left_child], y[left_child]),
            'right': fit(X.iloc[right_child], y[right_child])
        }
        return tree

def predict(tree, X):
    """
    Function to predict using the decision tree
    """
    if isinstance(tree, dict):
        if X[tree['attribute']] <= tree['value']:
            return predict(tree['left'], X)
        else:
            return predict(tree['right'], X)
    else:
        return tree

# Example usage:
data = {
    'feature1': [2, 5, 8, 1, 4],
    'feature2': [3, 1, 9, 8, 5],
    'target': [0, 1, 0, 1, 0]
}
df = pd.DataFrame(data)

X = df[['feature1', 'feature2']]
y = df['target']

tree = fit(X, y)
print(tree)


KeyError: '[2, 3] not in index'

In [6]:
import pandas as pd
import numpy as np
from collections import Counter
import math

def entropy(y):
    """
    Function to calculate entropy
    """
    class_counts = np.bincount(y)
    ps = class_counts / len(y)
    entropy = -np.sum([p * np.log2(p) for p in ps if p > 0])
    return entropy

def information_gain(parent, left_child, right_child):
    """
    Function to calculate information gain
    """
    num_left = len(left_child) / len(parent)
    num_right = len(right_child) / len(parent)
    ig = entropy(parent) - (num_left * entropy(left_child) + num_right * entropy(right_child))
    return ig

def split(X, y, split_attribute, split_value):
    """
    Function to split the dataset
    """
    left_child_indices = np.where(X[:, split_attribute] <= split_value)[0]
    right_child_indices = np.where(X[:, split_attribute] > split_value)[0]
    left_child = y[left_child_indices]
    right_child = y[right_child_indices]
    return left_child, right_child

def find_best_split(X, y):
    """
    Function to find the best attribute to split on
    """
    best_attribute = None
    best_value = None
    max_info_gain = -1

    for attribute in range(X.shape[1]):
        attribute_values = np.unique(X[:, attribute])
        for value in attribute_values:
            left_child, right_child = split(X, y, attribute, value)
            ig = information_gain(y, left_child, right_child)
            if ig > max_info_gain:
                max_info_gain = ig
                best_attribute = attribute
                best_value = value

    return best_attribute, best_value

def fit(X, y):
    """
    Function to fit the decision tree
    """
    if len(set(y)) == 1:
        return Counter(y).most_common(1)[0][0]
    else:
        best_attribute, best_value = find_best_split(X, y)
        if best_attribute is None:
            return Counter(y).most_common(1)[0][0]
        left_child, right_child = split(X, y, best_attribute, best_value)
        tree = {
            'attribute': best_attribute,
            'value': best_value,
            'left': fit(X[left_child], y[left_child]),
            'right': fit(X[right_child], y[right_child])
        }
        return tree

def predict(tree, X):
    """
    Function to predict using the decision tree
    """
    if isinstance(tree, dict):
        if X[tree['attribute']] <= tree['value']:
            return predict(tree['left'], X)
        else:
            return predict(tree['right'], X)
    else:
        return tree

# Example usage:
data = {
    'feature1': [2, 5, 8, 1, 4],
    'feature2': [3, 1, 9, 8, 5],
    'target': [0, 1, 0, 1, 0]
}
df = pd.read_csv('/Users/mukulhooda/Documents/Workbooks /Assignment 2.csv')

feature_cols = ['Day','Outlook','Humidity','Wind','Play Tennis']  # List of column names for features (X)
target_col = 'Play Tennis'  # Name of the column for the target variable (Y)

# Step 4: Extract X and Y
X = df[feature_cols].values  # Extract features as numpy array
Y = df[target_col].values  # Extract target variable as numpy array


tree = fit(X, Y)
print(tree)


TypeError: Cannot cast array data from dtype('O') to dtype('int64') according to the rule 'safe'