In [None]:
import pandas as pd
import math


In [None]:
# Training examples for the decision tree: one row per day, with the
# 'Play Game?' column holding the label to be learned.
DATASET_PATH = 'dt_dataset.csv'
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,Day,Temperature,Outlook,Humidity,Windy,Rainfall,Play Game?
0,5-Jul,hot,sunny,high,False,120,no
1,6-Jul,hot,sunny,high,True,200,no
2,7-Jul,hot,overcast,high,False,180,yes
3,9-Jul,cool,rain,normal,False,150,yes
4,10-Jul,cool,overcast,normal,True,140,yes


In [None]:
def convert_rainfall_to_category(value, low_max=140, medium_max=180):
    """Bucket a numeric rainfall reading into 'low', 'medium' or 'high'.

    The bands are contiguous, so every number lands in the band it belongs to:
    readings up to ``low_max`` are 'low', readings above that and up to
    ``medium_max`` are 'medium', and anything larger is 'high'. With the
    default thresholds the integer ranges 100-140 and 141-180 keep the labels
    they always had; fractional readings such as 140.5 and readings below 100
    no longer fall through to 'high'.

    Args:
        value: Numeric rainfall amount.
        low_max: Largest reading still labelled 'low'.
        medium_max: Largest reading still labelled 'medium'.

    Returns:
        One of the strings 'low', 'medium' or 'high'.
    """
    if value <= low_max:
        return 'low'
    if value <= medium_max:
        return 'medium'
    return 'high'

# Replace the numeric rainfall readings with their category labels.
# The dtype guard keeps this cell safe to re-run: once the column already
# holds labels, pushing those strings through the numeric comparison again
# would raise a TypeError.
if pd.api.types.is_numeric_dtype(df['Rainfall']):
    df['Rainfall'] = df['Rainfall'].apply(convert_rainfall_to_category)
df.head()

Unnamed: 0,Day,Temperature,Outlook,Humidity,Windy,Rainfall,Play Game?
0,5-Jul,hot,sunny,high,False,low,no
1,6-Jul,hot,sunny,high,True,high,no
2,7-Jul,hot,overcast,high,False,medium,yes
3,9-Jul,cool,rain,normal,False,medium,yes
4,10-Jul,cool,overcast,normal,True,low,yes


In [None]:
def calculate_entropy(data):
    """Compute the Shannon entropy (base 2) of the 'Play Game?' column.

    Rows whose label is missing are ignored. Without that, ``unique()`` would
    yield the missing value as a label, the equality test below would match no
    rows, and ``math.log2(0)`` would raise ``ValueError``.

    Args:
        data: DataFrame containing a 'Play Game?' column.

    Returns:
        The entropy in bits; 0 when there are no labelled rows or only a
        single distinct label.
    """
    target_labels = data['Play Game?'].dropna()
    total_instances = len(target_labels)
    entropy_val = 0

    # With no labelled rows unique() is empty, so the division is never reached.
    for label in target_labels.unique():
        matches = int((target_labels == label).sum())
        probability = matches / total_instances
        entropy_val -= probability * math.log2(probability)

    return entropy_val


In [None]:
def calculate_information_gain(data, attribute):
    """Return how much splitting ``data`` on ``attribute`` reduces entropy.

    The gain is the entropy of the whole frame minus the sum of each
    partition's entropy weighted by the fraction of rows in that partition.

    Args:
        data: DataFrame holding the attribute column and the label column
            read by ``calculate_entropy``.
        attribute: Name of the column to evaluate as a split.

    Returns:
        The information gain as a number.
    """
    row_count = len(data)
    column = data[attribute]

    def weighted_entropy(value):
        # Entropy of the rows taking this value, scaled by their share of rows.
        partition = data[column == value]
        return calculate_entropy(partition) * len(partition) / row_count

    remainder = sum(weighted_entropy(value) for value in column.unique())
    return calculate_entropy(data) - remainder


In [None]:
def build_decision_tree(data, candidate_attributes):
    """Recursively grow a decision tree for the 'Play Game?' label.

    Args:
        data: DataFrame with the candidate attribute columns and a
            'Play Game?' label column.
        candidate_attributes: Column names still available for splitting.

    Returns:
        Either a single label (leaf) or a nested dict of the form
        ``{attribute: {value: subtree_or_label, ...}}``.
    """
    labels = data['Play Game?']

    # Leaf: every remaining row carries the same label.
    if len(labels.unique()) == 1:
        return labels.iloc[0]

    # Leaf: nothing left to split on, so settle for the most frequent label.
    if len(candidate_attributes) == 0:
        return labels.value_counts().idxmax()

    # max() keeps the first attribute among equal gains, matching a scan that
    # only replaces the current best on a strictly greater gain.
    best_attribute = max(
        candidate_attributes,
        key=lambda name: calculate_information_gain(data, name),
    )
    remaining_attributes = [
        name for name in candidate_attributes if name != best_attribute
    ]

    split_column = data[best_attribute]
    branches = {
        value: build_decision_tree(data[split_column == value], remaining_attributes)
        for value in split_column.unique()
    }
    return {best_attribute: branches}

# Feature columns the tree is allowed to split on. The label column
# 'Play Game?' is left out on purpose.
candidate_attributes = [
    'Temperature',
    'Outlook',
    'Humidity',
    'Windy',
    'Rainfall',
]


In [None]:
# Learn the tree from the full training frame and show its nested-dict form.
decision_tree = build_decision_tree(
    data=df,
    candidate_attributes=candidate_attributes,
)

print(decision_tree)

{'Outlook': {'sunny': {'Humidity': {'high': 'no', 'normal': 'yes'}}, 'overcast': 'yes', 'rain': {'Windy': {False: 'yes', True: 'no'}}}}


In [None]:
# NOTE(review): this absolute path only resolves when the workbook sits in
# /content; adjust it when running elsewhere.
train_data = pd.read_excel('/content/Train_data.xlsx')

# Work on an explicit copy. pd.DataFrame(train_data) does not copy a DataFrame
# input by default, so the column assignment below could otherwise leak back
# into train_data.
testing_data = train_data.copy()
testing_data['Rainfall'] = testing_data['Rainfall'].apply(convert_rainfall_to_category)

# NOTE(review): in the recorded output the row shows 'sunny' under Temperature
# and 'low' under Outlook, which do not look like values the learned tree
# branches on for Outlook -- verify the column order of the workbook.
testing_data.head()

Unnamed: 0,Day,Temperature,Outlook,Humidity,Windy,Rainfall
0,Today,sunny,low,normal,False,low


In [None]:
def _collect_leaf_labels(tree):
    """Return every leaf label beneath ``tree``, in branch order."""
    if not isinstance(tree, dict):
        return [tree]
    labels = []
    for branches in tree.values():
        for subtree in branches.values():
            labels.extend(_collect_leaf_labels(subtree))
    return labels


def _majority_leaf_label(tree):
    """Return the most common leaf label beneath ``tree`` (first seen wins ties).

    Returns None when the tree has no leaves at all.
    """
    counts = {}
    for label in _collect_leaf_labels(tree):
        counts[label] = counts.get(label, 0) + 1
    return max(counts, key=counts.get, default=None)


def predict_instance(instance, tree):
    """Classify one instance by walking the decision tree.

    Args:
        instance: Mapping-like row (dict or pandas Series) indexed by
            attribute name.
        tree: Either a bare class label or a nested dict of the form
            ``{attribute: {value: subtree_or_label, ...}}``.

    Returns:
        A class label, never a subtree. When the instance carries a value the
        current node has no branch for, the most common leaf label beneath
        that node is returned. The tree stores no training counts, so this is
        a majority over leaves rather than over training rows.

    Raises:
        KeyError: If the instance lacks an attribute the tree tests.
    """
    # A bare label is a valid tree: build_decision_tree returns one when all
    # rows share the same label.
    if not isinstance(tree, dict):
        return tree

    attribute = next(iter(tree))
    branches = tree[attribute]
    value = instance[attribute]

    if value in branches:
        return predict_instance(instance, branches[value])

    # Unseen value: fall back to a label, not to the first branch, which may
    # itself be a whole subtree.
    return _majority_leaf_label(tree)

# Run each row of the unseen data through the learned tree: axis=1 hands
# predict_instance one row at a time, with the tree as its second argument.
predictions = testing_data.apply(predict_instance, axis=1, args=(decision_tree,))

testing_data['Play Game?'] = predictions
print(testing_data[['Play Game?']])



                                      Play Game?
0  {'Humidity': {'high': 'no', 'normal': 'yes'}}
