In [1]:
import pandas as pd
import numpy as np

In [2]:
# NOTE(review): absolute, machine-specific Windows path. The notebook cannot be re-run on
# another machine without editing this line; consider a path relative to a configurable
# data directory.
file_path = "C:\\Users\\santo\\OneDrive - Amrita vishwa vidyapeetham\\Documents\\4th sem Btech\\Machine Learning\\datasets\\question_answers_answerkey.xlsx"
# skiprows=1 discards the first row of the sheet, so the row after it supplies the column labels.
# NOTE(review): later cells address columns by float labels (3.5, 1.0), which suggests a data
# row is being consumed as the header -- confirm against the spreadsheet.
df = pd.read_excel(file_path, skiprows=1)

In [3]:
def calculate_entropy(column):
    """Return the Shannon entropy (in bits) of the value distribution of a Series.

    Missing values are left out of the distribution because ``value_counts``
    drops them by default.
    """
    class_shares = column.value_counts(normalize=True)
    # One term p * log2(p) per distinct value; the entropy is the negated total.
    weighted_logs = class_shares * np.log2(class_shares)
    return -weighted_logs.sum()

In [4]:
def calculate_conditional_entropy(feature_column, target_column):
    """Calculate the conditional entropy of the target given a feature, H(target | feature).

    For every distinct non-missing value of ``feature_column`` the entropy of the
    matching ``target_column`` rows is computed and weighted by the share of rows
    holding that value.

    Args:
        feature_column: Series with the feature values.
        target_column: Series with the class labels, indexed like ``feature_column``.

    Returns:
        The weighted sum of the per-category target entropies (0 when the feature
        has no non-missing values).
    """
    # value_counts() ignores NaN, so the weights are shares of the non-missing rows.
    probabilities = feature_column.value_counts(normalize=True)
    conditional_entropy = 0

    # unique() on its own would also yield NaN, which has no entry in `probabilities`
    # (KeyError) and can never match an equality mask, so missing values are dropped first.
    for category in feature_column.dropna().unique():
        # Target labels of the rows that fall into this category
        target_subset = target_column[feature_column == category]
        # .loc keeps the lookup strictly label-based, whatever the category dtype is
        conditional_entropy += probabilities.loc[category] * calculate_entropy(target_subset)

    return conditional_entropy

In [5]:
def find_root_node(df, target_column_name):
    """Pick the column whose split yields the highest information gain.

    Every column of ``df`` except ``target_column_name`` is scored as
    entropy(target) minus the conditional entropy of the target given that column.

    Returns:
        A ``(feature, gain)`` tuple for the best-scoring column. When ``df`` has no
        column besides the target, the result is ``(None, -1)``.
    """
    labels = df[target_column_name]
    baseline_entropy = calculate_entropy(labels)

    winner = None
    winner_gain = -1

    candidate_names = df.drop(columns=[target_column_name]).columns
    for name in candidate_names:
        gain = baseline_entropy - calculate_conditional_entropy(df[name], labels)
        # Strict comparison: on a tie the earliest column keeps the lead.
        if gain > winner_gain:
            winner, winner_gain = name, gain

    return winner, winner_gain

In [6]:
# Example usage: score every column of `df` against the target column.
# The target is addressed by the float column label 3.5 (not a string name).
# The gain is stored as `root_information_gain` rather than `information_gain`, because a
# later cell defines a function called `information_gain`; reusing that name here would
# overwrite the function whenever this cell is re-run after that one.
best_feature, root_information_gain = find_root_node(df, 3.5)
print(f"The best root node feature is: {best_feature} with an information gain of: {root_information_gain}")

The best root node feature is: 1.0 with an information gain of: 1.3746884964855104


In [9]:

def bin_feature(series, binning_type='equal_width', n_bins=3):
    """Discretise a numeric Series into integer bin codes.

    Args:
        series: numeric pandas Series to bin.
        binning_type: ``'equal_width'`` splits the range [min, max] into ``n_bins``
            intervals of equal length; ``'equal_freq'`` uses quantile-based bins
            (duplicate quantile edges are dropped, so fewer bins may come back).
        n_bins: number of bins requested.

    Returns:
        A Series of bin codes (0-based) aligned with ``series``.

    Raises:
        ValueError: if ``binning_type`` is unknown, or if ``n_bins`` is smaller
            than 1 for equal-width binning.
    """
    if binning_type == 'equal_freq':
        # Equal frequency binning (quantile-based)
        return pd.qcut(series, q=n_bins, labels=False, duplicates='drop')

    if binning_type != 'equal_width':
        raise ValueError("Invalid binning_type. Choose 'equal_width' or 'equal_freq'.")

    if n_bins < 1:
        raise ValueError("n_bins must be at least 1 for equal-width binning.")

    lowest, highest = series.min(), series.max()
    if lowest == highest:
        # A constant column would produce n_bins + 1 identical edges, which pd.cut
        # rejects ("Bin edges must be unique"). Every present value goes to bin 0;
        # missing values stay missing.
        codes = pd.Series(0, index=series.index, name=series.name)
        return codes.mask(series.isna()) if series.isna().any() else codes

    # Equal width binning: n_bins + 1 evenly spaced edges from min to max.
    # include_lowest=True keeps the minimum itself inside the first bin.
    bins = np.linspace(lowest, highest, n_bins + 1)
    return pd.cut(series, bins=bins, labels=False, include_lowest=True)

# Example usage:
# Bin the column labelled 1.0 (a float column label, not the string '1.0') into three
# equal-width bins and store the codes in a new column.
feature = 1.0  # Example feature name
binned_feature = bin_feature(df[feature], binning_type='equal_width', n_bins=3)
# NOTE: this adds a column to the shared `df` in place, so any cell run afterwards that
# iterates over df's columns will also see 'binned_A1'.
df['binned_A1'] = binned_feature  # Adding the binned feature back to the DataFrame
print(df['binned_A1'])


0      2
1      1
2      2
3      2
4      1
      ..
195    1
196    0
197    1
198    1
199    1
Name: binned_A1, Length: 200, dtype: int64


In [None]:
def entropy(target_col):
    """Return the Shannon entropy (in bits) of the values in ``target_col``.

    Args:
        target_col: array-like of class labels.

    Returns:
        The entropy of the label distribution; 0.0 for an empty input.
    """
    _, counts = np.unique(target_col, return_counts=True)
    # Total is computed once instead of once per class inside the sum.
    proportions = counts / np.sum(counts)
    return np.sum(-proportions * np.log2(proportions))

def InfoGain(data, split_attribute_name, target_name="target"):
    """Return the information gain of splitting ``data`` on one attribute.

    Args:
        data: DataFrame holding both the attribute and the target column.
        split_attribute_name: label of the column to split on.
        target_name: label of the target column.

    Returns:
        entropy(target) minus the size-weighted entropy of the target within each
        distinct value of the split attribute.
    """
    # Entropy of the target over the whole dataset
    total_entropy = entropy(data[target_name])

    # Distinct values of the split attribute and how often each occurs
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    total_count = np.sum(counts)
    split_column = data[split_attribute_name]

    # Select each subset with a boolean mask on the split column only. Masking the whole
    # frame with where() and then calling dropna() would also discard matching rows that
    # merely have a missing value in some unrelated column, distorting the subset entropy.
    weighted_entropy = np.sum([
        (count / total_count) * entropy(data.loc[split_column == val, target_name])
        for val, count in zip(vals, counts)
    ])

    return total_entropy - weighted_entropy


In [None]:
def build_tree(data, originaldata, features, target_attribute_name="target", parent_node_class = None):
    """Recursively grow an ID3-style decision tree as nested dictionaries.

    Note: a later cell in this notebook defines another ``build_tree`` with a
    different signature; once that cell runs, this definition is replaced.

    Args:
        data: DataFrame with the rows that reach the current node.
        originaldata: the full training DataFrame, used as a fallback when ``data``
            is empty.
        features: sequence of column labels still available for splitting.
        target_attribute_name: label of the target column.
        parent_node_class: majority target value of the parent node, returned when
            no features are left.

    Returns:
        Either a single target value (leaf) or a dict of the form
        ``{feature: {feature_value: subtree_or_leaf, ...}}``.
    """
    def _majority_class(frame):
        # Most frequent target value in `frame` (first one on ties).
        classes, counts = np.unique(frame[target_attribute_name], return_counts=True)
        return classes[np.argmax(counts)]

    # The empty check must come first: np.unique of an empty column is empty, so the
    # purity test below would index [0] on an empty array and raise IndexError.
    if len(data) == 0:
        return _majority_class(originaldata)

    # All rows share one target value: that value is the leaf
    target_values = np.unique(data[target_attribute_name])
    if len(target_values) <= 1:
        return target_values[0]

    # No features left to split on: fall back to the parent's majority class
    if len(features) == 0:
        return parent_node_class

    # Default for the children of this node: the majority class of the current rows
    parent_node_class = _majority_class(data)

    # Pick the feature with the largest information gain (first one on ties)
    gains = [InfoGain(data, feature, target_attribute_name) for feature in features]
    best_feature = features[np.argmax(gains)]

    tree = {best_feature: {}}

    # The chosen feature is not offered again further down this branch
    remaining_features = [name for name in features if name != best_feature]

    for value in np.unique(data[best_feature]):
        # Boolean-mask selection keeps every matching row and the original dtypes;
        # where().dropna() would also drop rows with a NaN in any other column.
        sub_data = data[data[best_feature] == value]
        tree[best_feature][value] = build_tree(
            sub_data, originaldata, remaining_features, target_attribute_name, parent_node_class
        )

    return tree


In [None]:
def find_best_split(data, target):
    """Return the column of ``data`` with the largest positive information gain.

    Returns:
        A ``(feature, gain)`` tuple. If no column achieves a gain above zero the
        result is ``(None, 0)``.
    """
    baseline_entropy = calculate_entropy(target)
    top_feature, top_gain = None, 0

    for column_name in data.columns:
        # Information gain obtained by splitting on this column
        candidate_gain = information_gain(data, target, column_name, baseline_entropy)
        # Strict comparison: ties keep the earlier column, zero gain is never selected.
        if candidate_gain > top_gain:
            top_feature, top_gain = column_name, candidate_gain

    return top_feature, top_gain

def information_gain(data, target, feature, current_entropy):
    """Return how much splitting on ``feature`` reduces the entropy of ``target``.

    The result is ``current_entropy`` minus the entropy of the target within each
    distinct feature value, each weighted by that value's share of the rows.
    """
    feature_values = data[feature]

    def weighted_entropy(level):
        # Entropy of the target among rows with this feature value, scaled by their share.
        members = target[feature_values == level]
        share = len(members) / len(target)
        return share * calculate_entropy(members)

    remaining_entropy = sum(weighted_entropy(level) for level in feature_values.unique())
    return current_entropy - remaining_entropy


In [None]:
def split_data(data, target, feature):
    """Partition ``data`` and ``target`` by the distinct values of one column.

    Returns:
        A dict mapping each distinct value of ``data[feature]`` to a
        ``(data_subset, target_subset)`` tuple holding the rows with that value.
    """
    column = data[feature]
    # One boolean row mask per distinct value, in order of first appearance.
    row_masks = ((level, column == level) for level in column.unique())
    return {level: (data[mask], target[mask]) for level, mask in row_masks}


In [None]:
def build_tree(data, target, max_depth=3, depth=0):
    """Recursively build a depth-limited decision tree of ``DecisionNode`` objects.

    Note: this definition reuses the name of an earlier ``build_tree`` in this
    notebook (which has a different signature) and replaces it once this cell runs.
    NOTE(review): ``DecisionNode`` is not defined in the visible cells; it is used
    here with ``value=`` / ``feature=`` keyword arguments and a ``children``
    attribute -- confirm against its definition.

    Args:
        data: DataFrame of feature columns for the rows at this node.
        target: Series of class labels aligned with ``data``.
        max_depth: depth at which growth stops and a leaf is returned.
        depth: depth of the current node (0 for the root).

    Returns:
        A leaf ``DecisionNode`` carrying the most frequent target value, or an
        internal ``DecisionNode`` whose ``children`` maps each value of the chosen
        feature to a subtree.
    """
    if depth == max_depth or len(data.columns) == 0:
        # Stopping condition met, return a leaf node
        return DecisionNode(value=target.mode()[0])

    best_feature, best_gain = find_best_split(data, target)
    if best_gain == 0:
        # No feature provides a gain, return a leaf node
        return DecisionNode(value=target.mode()[0])

    # Perform the split
    splits = split_data(data, target, best_feature)

    # Create node and recurse for each split.
    # A missing (NaN) feature value shows up as a key in `splits` but matches no rows,
    # giving an empty partition; recursing into it would fail on target.mode()[0],
    # so empty partitions are skipped.
    node = DecisionNode(feature=best_feature)
    node.children = {
        value: build_tree(sub_data, sub_target, max_depth, depth + 1)
        for value, (sub_data, sub_target) in splits.items()
        if len(sub_target) > 0
    }

    return node
