In [8]:
import pandas as pd
import math

# Importing data from the CSV file into a DataFrame

In [9]:
# Source data: path to the CSV file, relative to the notebook's working directory
file_path = 'play.csv'

# Read the whole file into a dataframe.
# NOTE(review): the rendered table shows an 'Unnamed: 0' column, which suggests
# the CSV was written with its index included — confirm whether it is needed.
df = pd.read_csv(filepath_or_buffer=file_path)

# Bare expression so the notebook renders the dataframe as the cell output
df

Unnamed: 0,Day,Outlook,Temperature,Humidity,Wind,Play Tennis
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes
5,D6,Rain,Cool,Normal,Strong,No
6,D7,Overcast,Cool,Normal,Strong,Yes
7,D8,Sunny,Mild,High,Weak,No
8,D9,Sunny,Cool,Normal,Weak,Yes
9,D10,Rain,Mild,Normal,Weak,Yes


# Function to calculate entropy

In [10]:
def calculate_entropy(labels):
    """Return the Shannon entropy (in bits) of a collection of class labels.

    Parameters
    ----------
    labels : pandas.Series or iterable
        Observed class labels. Anything that is not already a Series is
        converted to one, so plain lists and tuples are accepted as well.
        Missing values (NaN/None) are counted as a class of their own.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed label frequencies, or ``0.0``
        when ``labels`` is empty.
    """
    if not isinstance(labels, pd.Series):
        labels = pd.Series(list(labels), dtype=object)

    total_samples = len(labels)
    if total_samples == 0:
        return 0.0

    # Tally all labels once instead of re-counting the series for every label.
    # dropna=False keeps missing values in the tally, so the counts always add
    # up to total_samples and a NaN label no longer triggers a KeyError.
    label_counts = labels.value_counts(dropna=False)

    entropy = 0.0
    for label_count in label_counts:
        if label_count == 0:
            # Categorical data lists unused categories with a zero count;
            # log2(0) is undefined and such categories contribute nothing.
            continue
        probability = label_count / total_samples
        entropy -= probability * math.log2(probability)

    return entropy

In [11]:
# Function to calculate information gain

In [12]:
def calculate_information_gain(data, feature, target):
    """Return the information gain from splitting ``data`` on ``feature``.

    The gain is the entropy of the ``target`` column over all rows minus the
    size-weighted average entropy of ``target`` within each group of rows
    sharing the same ``feature`` value.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both the ``feature`` and ``target`` columns.
    feature : str
        Name of the column to split on.
    target : str
        Name of the column holding the class labels.

    Returns
    -------
    float
        Parent entropy minus the weighted entropy of the partitions.
    """
    parent_entropy = calculate_entropy(data[target])
    n_rows = len(data)

    def _weighted_partition_entropy(value):
        # Rows whose feature equals this value, weighted by their share of the data
        partition = data[data[feature] == value]
        partition_entropy = calculate_entropy(partition[target])
        return (len(partition) / n_rows) * partition_entropy

    children_entropy = sum(
        _weighted_partition_entropy(value) for value in set(data[feature])
    )
    return parent_entropy - children_entropy

# Function to determine the best attribute to split on

In [13]:
def choose_best_attribute(data, features, target):
    """Pick the feature with the largest information gain.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the candidate feature columns and the target column.
    features : iterable of str
        Names of the candidate columns.
    target : str
        Name of the column holding the class labels.

    Returns
    -------
    tuple
        ``(best_attribute, max_information_gain)``. A feature is only selected
        when its gain is strictly greater than the best seen so far, starting
        from 0, so the result is ``(None, 0)`` when no feature has positive
        gain; on ties the earliest feature wins.
    """
    # Score every candidate first, then scan for the strict maximum
    scored = [
        (calculate_information_gain(data, candidate, target), candidate)
        for candidate in features
    ]

    best_attribute, best_gain = None, 0
    for gain, candidate in scored:
        if gain > best_gain:
            best_attribute, best_gain = candidate, gain

    return best_attribute, best_gain

# Calculating Entropy and Information Gain of each attribute

In [14]:
# Candidate attributes to evaluate; the class label column is 'Play Tennis'
attributes = ['Outlook', 'Temperature', 'Humidity', 'Wind']

# Entropy of each attribute column's own value distribution.
# NOTE(review): the printed label says "target variable" although the value
# describes the attribute column itself — confirm the intended wording.
for attribute in attributes:
    target_entropy = calculate_entropy(df[attribute])
    print(f"Entropy for the target variable {attribute}: {target_entropy:.4f}")
print()

# Information gain with respect to 'Play Tennis' when splitting on each attribute
for attribute in attributes:
    gain = calculate_information_gain(df, attribute, 'Play Tennis')
    print(f"Information Gain for {attribute}: {gain:.4f}")

# Attribute with the highest information gain
best_attribute, max_gain = choose_best_attribute(df, attributes, 'Play Tennis')
print(f"\nThe best attribute to split on is '{best_attribute}' with Information Gain: {max_gain:.4f}")

Entropy for the target variable Outlook: 1.5774
Entropy for the target variable Temperature: 1.5567
Entropy for the target variable Humidity: 1.0000
Entropy for the target variable Wind: 0.9852

Information Gain for Outlook: 0.2467
Information Gain for Temperature: 0.0292
Information Gain for Humidity: 0.1518
Information Gain for Wind: 0.0481

The best attribute to split on is 'Outlook' with Information Gain: 0.2467
