In [None]:
import pandas as pd
# Load the raw dataset into a DataFrame.
# NOTE(review): the path is an absolute "/content/..." location (looks like a
# Colab upload directory) — confirm the file is present before running.
data=pd.read_csv("/content/dataset 1.csv")

In [None]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Day          14 non-null     object
 1   Outlook      14 non-null     object
 2   Temperature  14 non-null     object
 3   Humidity     14 non-null     object
 4   Wind         14 non-null     object
 5   Tennis       14 non-null     object
dtypes: object(6)
memory usage: 800.0+ bytes


In [None]:
# Preview the first five rows to check the categorical values in each column.
data.head()

Unnamed: 0,Day,Outlook,Temperature,Humidity,Wind,Tennis
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Medium,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes


In [None]:
# Distinct class labels of the target column.  `unique` is a method and must
# be called; the bare attribute only displays the bound-method object.
data.Tennis.unique()

In [None]:
# 'Day' is a per-row identifier, not a predictive feature, so remove it before
# training.  Rebinding (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it no longer raises KeyError.
data = data.drop(columns=['Day'], errors='ignore')

In [None]:
import numpy as np

def entropy(col):
    """Return the Shannon entropy, in bits, of the values in ``col``.

    Parameters
    ----------
    col : array-like
        Class labels (anything ``np.unique`` accepts).

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value; zero when only one distinct value is present.
    """
    _, class_counts = np.unique(col, return_counts=True)
    class_probs = class_counts / class_counts.sum()
    return -np.sum(class_probs * np.log2(class_probs))

# Entropy of the target column over the whole dataset (the baseline that
# information gain is measured against).
result = entropy(data['Tennis'])
print(result)


0.9402859586706311


In [None]:
def weighted_entropy(data, split_attribute_name, target_name="Tennis"):
    """Return the target entropy remaining after splitting on one attribute.

    For every distinct value of ``split_attribute_name`` the entropy of the
    target labels in that partition is computed and weighted by the
    partition's share of the rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both the split attribute and the target column.
    split_attribute_name : str
        Column whose distinct values define the partitions.
    target_name : str, default "Tennis"
        Column holding the class labels.

    Returns
    -------
    float
        Sum over partitions of ``weight * entropy(partition labels)``.
    """
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    total = counts.sum()
    # Select each partition with a boolean mask.  The previous
    # ``data.where(mask).dropna()`` also discarded matching rows that had a
    # missing value in any *other* column, so the entropy was computed on
    # fewer rows than the weight accounted for.
    return np.sum([
        (count / total)
        * entropy(data.loc[data[split_attribute_name] == val, target_name])
        for val, count in zip(vals, counts)
    ])


In [None]:
def InfoGain(data, split_attr_name, target_name="Tennis"):
    """Return the information gain obtained by splitting on one attribute.

    The gain is the entropy of the target column before the split minus the
    weighted entropy of the partitions produced by ``split_attr_name``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the split attribute and the target column.
    split_attr_name : str
        Candidate column to split on.
    target_name : str, default "Tennis"
        Column holding the class labels.
    """
    entropy_before = entropy(data[target_name])
    entropy_after = weighted_entropy(data, split_attr_name, target_name)
    return entropy_before - entropy_after

# Information gain from splitting the full dataset on 'Outlook'.
print(InfoGain(data=data,split_attr_name='Outlook'))

0.24674981977443933


In [None]:
def find_best_split(data, target_name="Tennis"):
    """Return the feature column with the highest information gain.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the candidate features plus the target column.
    target_name : str, default "Tennis"
        Column holding the class labels; it is excluded from the candidates.
        Previously hard-coded, now a parameter so other datasets can be used.

    Returns
    -------
    object or None
        Name of the best feature, or ``None`` when ``data`` has no feature
        columns. Ties are resolved in favour of the first column encountered.
    """
    best_gain = -1
    best_feature = None
    for feature in data.columns.drop(target_name):
        gain = InfoGain(data, feature, target_name)
        # Strict comparison keeps the earliest column on ties.
        if gain > best_gain:
            best_gain, best_feature = gain, feature
    return best_feature


In [None]:
def build_tree(data):
    """Recursively build a decision tree as nested dictionaries.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows; must contain the 'Tennis' target column.

    Returns
    -------
    dict or label
        A leaf is a plain class label. An internal node has the form
        ``{feature: {value: subtree, ...}}``.
    """
    labels = np.unique(data['Tennis'])

    # Pure node: every remaining row carries the same label.
    if len(labels) == 1:
        return labels[0]

    # Only the target column is left: fall back to the majority label.
    if len(data.columns) == 1:
        return data['Tennis'].value_counts().idxmax()

    split_on = find_best_split(data)
    branches = {}
    for level in np.unique(data[split_on]):
        # The chosen feature is removed so it cannot be reused further down.
        remaining = data[data[split_on] == level].drop(columns=[split_on])
        branches[level] = build_tree(remaining)
    return {split_on: branches}

In [None]:
def fit(data):
    """Build the decision tree for ``data`` and make it available to callers.

    The tree used to be bound to a local variable and thrown away, so nothing
    was returned and the module-level ``tree`` read by ``predict`` never
    existed. The tree is now both returned and stored in the module-level
    name ``tree``.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows passed straight to ``build_tree``.

    Returns
    -------
    dict or label
        The tree produced by ``build_tree``.
    """
    global tree  # `predict` looks the fitted tree up under this name
    tree = build_tree(data)
    return tree

In [None]:
def predict_instance(instance, tree):
    """Classify one row by walking the nested-dict decision tree.

    Parameters
    ----------
    instance : mapping or pandas.Series
        Feature values of the row, indexable by feature name.
    tree : dict or label
        Tree in ``{feature: {value: subtree}}`` form, or a bare leaf label.

    Returns
    -------
    label or None
        The leaf reached, or ``None`` when the row holds a feature value for
        which the current node has no branch.
    """
    node = tree
    # Descend until a non-dict (leaf label) is reached.
    while isinstance(node, dict):
        split_feature = list(node)[0]
        branches = node[split_feature]
        observed = instance[split_feature]
        if observed not in branches:
            return None
        node = branches[observed]
    return node

In [None]:
def predict(instances, tree=None):
    """Predict a label for every row of ``instances``.

    Fixes relative to the earlier version: the full list of predictions is
    returned (previously only the last row's prediction was), and an empty
    frame yields ``[]`` instead of failing on an unbound variable.

    Parameters
    ----------
    instances : pandas.DataFrame
        Rows to classify; iterated with ``iterrows``.
    tree : dict or label, optional
        Decision tree to use. When omitted, the module-level name ``tree``
        is used, as before.

    Returns
    -------
    list
        One entry per row, in row order, as returned by ``predict_instance``.

    Raises
    ------
    NameError
        If no ``tree`` argument is given and no module-level ``tree`` exists.
    """
    if tree is None:
        try:
            tree = globals()['tree']
        except KeyError:
            raise NameError(
                "no decision tree available: pass `tree=` or define a "
                "module-level `tree` first"
            ) from None
    return [predict_instance(instance, tree) for _, instance in instances.iterrows()]

In [None]:
class DecisionTree:
    """ID3-style decision tree for categorical features.

    The fitted tree is stored in ``self.tree`` as nested dictionaries of the
    form ``{feature: {value: subtree}}``; leaves are plain class labels.

    Parameters
    ----------
    target_name : str, default "Tennis"
        Name of the column holding the class labels. Used by ``fit``,
        ``build_tree`` and ``find_best_split``.
    """

    def __init__(self, target_name="Tennis"):
        self.target_name = target_name
        # None until `fit` is called; `predict` checks for this.
        self.tree = None

    def entropy(self, target_col):
        """Return the Shannon entropy, in bits, of the labels in ``target_col``."""
        _, counts = np.unique(target_col, return_counts=True)
        probabilities = counts / len(target_col)
        return -np.sum(probabilities * np.log2(probabilities))

    def weighted_entropy(self, data, split_attribute_name, target_name="Tennis"):
        """Return the target entropy remaining after splitting on one attribute.

        Each partition's label entropy is weighted by the partition's share
        of the rows in ``data``.
        """
        vals, counts = np.unique(data[split_attribute_name], return_counts=True)
        total = counts.sum()
        # A boolean mask is used instead of ``data.where(mask).dropna()``,
        # which also dropped matching rows that had a missing value in any
        # other column and therefore under-counted the partition.
        return np.sum([
            (count / total)
            * self.entropy(data.loc[data[split_attribute_name] == val, target_name])
            for val, count in zip(vals, counts)
        ])

    def information_gain(self, data, split_attribute_name, target_name="Tennis"):
        """Return the entropy of the target minus the post-split weighted entropy."""
        total_entropy = self.entropy(data[target_name])
        remaining = self.weighted_entropy(data, split_attribute_name, target_name)
        return total_entropy - remaining

    def find_best_split(self, data):
        """Return the feature with the highest information gain.

        Returns ``None`` when ``data`` has no feature columns. Ties go to
        the first column encountered.
        """
        best_gain = -1
        best_feature = None
        for feature in data.columns.drop(self.target_name):
            gain = self.information_gain(data, feature, self.target_name)
            if gain > best_gain:
                best_gain, best_feature = gain, feature
        return best_feature

    def fit(self, data):
        """Build the tree from ``data`` and store it in ``self.tree``."""
        self.tree = self.build_tree(data)

    def build_tree(self, data):
        """Recursively build and return the nested-dict tree for ``data``."""
        target = self.target_name
        labels = np.unique(data[target])

        # Pure node: every remaining row carries the same label.
        if len(labels) == 1:
            return labels[0]

        # Only the target column is left: fall back to the majority label.
        if len(data.columns) == 1:
            return data[target].value_counts().idxmax()

        best_feature = self.find_best_split(data)
        branches = {}
        for value in np.unique(data[best_feature]):
            # The chosen feature is removed so it cannot be reused further down.
            subset = data[data[best_feature] == value].drop(columns=[best_feature])
            branches[value] = self.build_tree(subset)
        return {best_feature: branches}

    def predict_instance(self, instance, tree):
        """Classify one row by walking ``tree``.

        Returns the leaf label reached, or ``None`` when the row holds a
        feature value for which the current node has no branch.
        """
        if not isinstance(tree, dict):
            return tree
        feature = list(tree.keys())[0]
        value = instance[feature]
        if value not in tree[feature]:
            return None
        return self.predict_instance(instance, tree[feature][value])

    def predict(self, instances):
        """Return a list with one predicted label per row of ``instances``.

        Raises
        ------
        AttributeError
            If called before ``fit`` (same exception type as before, but
            with an explicit message).
        """
        if getattr(self, "tree", None) is None:
            raise AttributeError(
                "DecisionTree has no fitted tree; call fit(data) before predict()"
            )
        return [
            self.predict_instance(instance, self.tree)
            for _, instance in instances.iterrows()
        ]

In [None]:
# Fit the class-based tree on the full dataset and print its predictions for
# the same rows it was trained on (a resubstitution check, not a held-out test).
tree_model = DecisionTree()
tree_model.fit(data)
print(tree_model.predict(data))

['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']


In [None]:
# Small hand-written sample for sanity-checking the fitted model.
# The column must be called 'Temperature' to match the training frame; the
# previous 'Temp' key would raise KeyError if the tree ever split on it.
# NOTE(review): the training preview shows 'Medium' rather than 'Mild' in the
# Temperature column — confirm the category names; an unseen category makes
# the model return None for that row.
custom_data = pd.DataFrame({
    'Outlook': ['Sunny', 'Overcast', 'Rain', 'Sunny'],
    'Temperature': ['Hot', 'Mild', 'Cool', 'Mild'],
    'Humidity': ['High', 'Normal', 'High', 'High'],
    'Wind': ['Weak', 'Strong', 'Weak', 'Weak']
})

custom_labels = ['No', 'Yes', 'Yes', 'No']

predictions = tree_model.predict(custom_data)
print("Predictions on custom data:", predictions)

if custom_labels:
    # Compare element-wise.  `predictions == custom_labels` on two lists gives
    # a single bool, so the reported accuracy could only ever be 0.0 or 1.0.
    # dtype=object keeps None predictions (unseen categories) comparable.
    matches = np.array(predictions, dtype=object) == np.array(custom_labels, dtype=object)
    accuracy = np.mean(matches)
    print("Accuracy:", accuracy)


Predictions on custom data: ['No', 'Yes', 'Yes', 'No']
Accuracy: 1.0
