**KHAN MOHD OWAIS RAZA**

**20BCD7138**

*CSE4005 (Data Warehousing and Data Mining) Lab-8*

In [9]:
import pandas as pd
import numpy as np

In [10]:
def calculate_entropy(data):
    class_labels = data.iloc[:, -1]
    unique_labels, counts = np.unique(class_labels, return_counts=True)
    probabilities = counts / len(class_labels)
    entropy = -np.sum(probabilities * np.log2(probabilities))
    return entropy

In [11]:
def information_gain(data, attribute):
    total_entropy = calculate_entropy(data)
    values, counts = np.unique(data[attribute], return_counts=True)
    weighted_entropy = 0
    for value, count in zip(values, counts):
        subset = data[data[attribute] == value]
        subset_entropy = calculate_entropy(subset)
        weighted_entropy += (count / len(data)) * subset_entropy
    info_gain = total_entropy - weighted_entropy
    return info_gain

In [12]:
def calculate_gini_index(data):
    class_labels = data.iloc[:, -1]
    unique_labels, counts = np.unique(class_labels, return_counts=True)
    probabilities = counts / len(class_labels)
    gini_index = 1 - np.sum(probabilities ** 2)
    return gini_index

In [14]:
def gini_index(data, attribute):
    values, counts = np.unique(data[attribute], return_counts=True)
    weighted_gini = 0
    for value, count in zip(values, counts):
        subset = data[data[attribute] == value]
        subset_gini = calculate_gini_index(subset)
        weighted_gini += (count / len(data)) * subset_gini
    return weighted_gini

In [15]:
def find_best_split(data, measure):
    attributes = data.columns[:-1]
    best_split = None
    best_value = None
    for attribute in attributes:
        values = data[attribute].unique()
        for value in values:
            subset = data[data[attribute] == value]
            remaining_data = data[data[attribute] != value]
            if measure == 'info_gain':
                current_measure = information_gain(data, attribute)
            elif measure == 'gini_index':
                current_measure = gini_index(data, attribute)
            else:
                raise ValueError("Invalid measure. Use 'info_gain' or 'gini_index'.")
            if best_split is None or current_measure > best_split:
                best_split = current_measure
                best_value = value
    return best_split, best_value

In [17]:
tennis_data = pd.read_csv('PlayTennis.csv')
print("Information Gain:")
info_gain_value, split_value = find_best_split(tennis_data, 'info_gain')
print(f"Best Splitting Attribute: {split_value}, Information Gain: {info_gain_value}")

Information Gain:
Best Splitting Attribute: Sunny, Information Gain: 0.24674981977443933


In [18]:
print("\nGini Index:")
gini_value, split_value = find_best_split(tennis_data, 'gini_index')
print(f"Best Splitting Attribute: {split_value}, Gini Index: {gini_value}")


Gini Index:
Best Splitting Attribute: Hot, Gini Index: 0.44047619047619047


In [19]:
iris_data = pd.read_csv('iris.csv')
print("\nInformation Gain:")
info_gain_value, split_value = find_best_split(iris_data, 'info_gain')
print(f"Best Splitting Attribute: {split_value}, Information Gain: {info_gain_value}")


Information Gain:
Best Splitting Attribute: 1.4, Information Gain: 1.4463165236458


In [20]:
print("\nGini Index:")
gini_value, split_value = find_best_split(iris_data, 'gini_index')
print(f"Best Splitting Attribute: {split_value}, Gini Index: {gini_value}")


Gini Index:
Best Splitting Attribute: 3.5, Gini Index: 0.46767491767491765
