In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

## Continuous case ambiguity and error calculation

In [None]:
# Continuous-case ambiguity: share of each class's samples that lie inside the
# overlap of per-class bounding hypercubes, averaged over all classes.
def calculate_continuous_case_ambiguity(df, class_column):
    """Estimate class ambiguity from overlapping per-class bounding boxes.

    Every column other than ``class_column`` is treated as a feature. For each
    class, the axis-aligned bounding box (hypercube) of its samples is built
    from the per-feature minimum and maximum. For every pair of classes whose
    boxes intersect, the samples of either class lying inside the intersection
    (bounds inclusive) are counted. A class's ambiguity is the sum, over all
    overlaps it takes part in, of ``samples inside the overlap / total samples
    of the class``; the function returns the mean of that value over classes.

    Note: a sample lying in several pairwise overlaps is counted once per
    overlap, so with three or more classes a class's ambiguity can exceed 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the class column.
    class_column : label
        Name of the column holding the class labels. Labels may be of any
        type that pandas can group by (numbers, strings, negative values).

    Returns
    -------
    mean_ambiguity : float
        Mean per-class ambiguity (``0.0`` when there are no classes).
    overlap_regions : list of tuple
        ``(cls1, cls2, min_overlap, max_overlap)`` for each intersecting pair.
    samples_in_overlap : dict
        Maps ``f'{cls1}-{cls2}'`` to ``{cls1: count, cls2: count}``.

    Side effects
    ------------
    Prints the number of samples per class.
    """
    features = df.columns.drop(class_column)
    classes = df[class_column].unique()

    # Step 1: Calculate Hypercubes (per-class min/max of every feature)
    grouped = df.groupby(class_column)[features]
    min_values = grouped.min()
    max_values = grouped.max()
    hypercubes = {cls: (min_values.loc[cls].values, max_values.loc[cls].values) for cls in classes}

    # Step 2: Calculate Overlap Regions for every unordered pair of classes
    overlap_regions = []
    class_list = list(hypercubes.keys())
    for i, cls_a in enumerate(class_list):
        lo_a, hi_a = hypercubes[cls_a]
        for cls_b in class_list[i + 1:]:
            lo_b, hi_b = hypercubes[cls_b]
            min_overlap = np.maximum(lo_a, lo_b)
            max_overlap = np.minimum(hi_a, hi_b)
            # The boxes intersect only if they overlap along every feature.
            if np.all(min_overlap <= max_overlap):
                overlap_regions.append((cls_a, cls_b, min_overlap, max_overlap))

    samples = df[features].values
    class_labels = df[class_column].values

    total_samples_per_class = df[class_column].value_counts().to_dict()
    print("total samples per class:")
    print(total_samples_per_class)

    # Steps 3 and 4: count samples inside each overlap and accumulate ambiguity.
    # The ambiguity is accumulated with the original label objects; parsing
    # them back out of the 'cls1-cls2' key breaks for string or negative labels.
    samples_in_overlap = {}
    ambiguity_values = {cls: 0 for cls in classes}

    for cls1, cls2, min_overlap, max_overlap in overlap_regions:
        inside_region = np.all((samples >= min_overlap) & (samples <= max_overlap), axis=1)
        count_cls1 = np.sum(inside_region & (class_labels == cls1))
        count_cls2 = np.sum(inside_region & (class_labels == cls2))

        region_key = f'{cls1}-{cls2}'
        samples_in_overlap[region_key] = {cls1: count_cls1, cls2: count_cls2}

        for cls, count in ((cls1, count_cls1), (cls2, count_cls2)):
            class_total = total_samples_per_class.get(cls, 0)
            if class_total > 0:
                ambiguity_values[cls] += count / class_total

    mean_ambiguity = np.mean(list(ambiguity_values.values())) if ambiguity_values else 0.0

    return mean_ambiguity, overlap_regions, samples_in_overlap

# function for error calculation
def calculate_error_probability(df, feature_columns, label_column):
    """Sum, over decision-tree leaves, the KDE mass of the opposite class.

    A ``DecisionTreeClassifier(max_depth=None)`` is fitted on ``df``, and its
    leaves are converted to axis-aligned boxes in feature space. Gaussian KDEs
    are fitted separately on the rows labelled ``0`` and the rows labelled
    ``1``. For every leaf that contains at least one training row, the KDE of
    the *other* class is integrated over the leaf's box, and the integrals are
    summed.

    The class of a leaf is taken from the first training row inside it. Any
    label other than ``0`` is handled like class ``1`` (the class-0 KDE is
    integrated), so the function is meant for binary ``0``/``1`` labels.

    NOTE(review): ``DecisionTreeClassifier``, ``_tree`` and ``stats`` must
    already be in scope; they are not imported in the notebook's visible
    import cell. Presumably ``sklearn.tree`` and ``scipy.stats``, to confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data.
    feature_columns : list
        Names of the feature columns.
    label_column : label
        Name of the column holding the class labels.

    Returns
    -------
    float
        Sum of the per-leaf integrals (``0.0`` if no leaf holds a row).

    Side effects
    ------------
    Prints the training accuracy of the fitted tree.
    """
    # Train the decision tree
    X = df[feature_columns]
    y = df[label_column]

    clf = DecisionTreeClassifier(max_depth=None)
    clf.fit(X, y)

    training_accuracy = clf.score(X, y)
    print(f'Training Accuracy: {training_accuracy * 100:.2f}%')

    def get_rectangles_from_tree(tree):
        """Return one ``(bounds, leaf_label)`` pair per leaf of ``tree``.

        ``bounds`` is a list of ``[low, high]`` per feature; ``leaf_label`` is
        the index of the largest entry of ``tree.value`` at that leaf.
        """
        left = tree.children_left
        right = tree.children_right
        threshold = tree.threshold
        feature = tree.feature
        value = tree.value

        def recurse(node, bounds):
            if feature[node] == _tree.TREE_UNDEFINED:
                # It's a leaf node
                leaf_label = np.argmax(value[node][0])
                return [(bounds, leaf_label)]

            # Copy the bounds so siblings do not share the inner lists.
            new_bounds_left = [list(b) for b in bounds]
            new_bounds_right = [list(b) for b in bounds]

            feature_index = feature[node]
            threshold_value = threshold[node]

            # Left child: feature <= threshold; right child: feature > threshold.
            new_bounds_left[feature_index][1] = threshold_value
            new_bounds_right[feature_index][0] = threshold_value

            left_rectangles = recurse(left[node], new_bounds_left)
            right_rectangles = recurse(right[node], new_bounds_right)

            return left_rectangles + right_rectangles

        # Initialize bounds for each feature
        initial_bounds = [[-np.inf, np.inf] for _ in range(tree.n_features)]
        rectangles = recurse(0, initial_bounds)
        return rectangles

    # Extract rectangles and labels from the decision tree
    rectangles = get_rectangles_from_tree(clf.tree_)

    # Calculate KDE for each class
    class_0_data = df[df[label_column] == 0][feature_columns]
    class_1_data = df[df[label_column] == 1][feature_columns]

    kde_class_0 = stats.gaussian_kde(class_0_data.T)
    kde_class_1 = stats.gaussian_kde(class_1_data.T)

    # Calculate probabilities for the segments
    feature_frame = df[feature_columns]
    segment_probabilities = []
    for rect, _leaf_label in rectangles:
        bounds_min = [b[0] for b in rect]
        bounds_max = [b[1] for b in rect]
        # scikit-learn sends a sample to the left child when
        # feature <= threshold, so a leaf covers the half-open box (min, max].
        # Using [min, max) would put a sample that sits exactly on a threshold
        # into the neighbouring leaf.
        in_leaf = np.all((feature_frame > bounds_min) & (feature_frame <= bounds_max), axis=1)
        segment = df[in_leaf]
        if not segment.empty:
            actual_value = segment[label_column].iloc[0]
            # Integrate the density of the class the leaf does NOT belong to.
            if actual_value == 0.0:
                segment_probabilities.append(kde_class_1.integrate_box(bounds_min, bounds_max, maxpts=200000))
            else:
                segment_probabilities.append(kde_class_0.integrate_box(bounds_min, bounds_max, maxpts=200000))

    # Compute total error probability
    total_error_probability_all_segments = np.sum(segment_probabilities)

    return total_error_probability_all_segments

## Discrete case ambiguity and error calculation

In [None]:
# Function for discrete case ambiguity 
def calculate_discrete_ambiguity(df, feature_columns, class_column):
    """Compute the ambiguity of a discrete-feature dataset.

    Rows are grouped by their combination of feature values. For each
    combination the empirical class distribution is computed, and the
    combination's ambiguity is ``(1 - max class probability) * d / (d - 1)``,
    where ``d`` is the number of distinct classes. The result is the mean over
    all distinct combinations.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified; the function works on a copy.
    feature_columns : list
        Names of the feature columns that define a combination.
    class_column : label
        Name of the column holding the class labels.

    Returns
    -------
    ambiguity : float
        Mean normalised ambiguity; ``0.0`` when fewer than two classes exist
        (the normalisation ``d / (d - 1)`` is undefined there).
    df : pandas.DataFrame
        Copy of the input with the added columns ``feature_combination``,
        ``count``, ``total_count`` and ``probability`` (for the row's own
        combination and class).
    combination_probs_pivot : pandas.DataFrame
        One row per feature combination, one ``P(class=<label>)`` column per
        class, holding the class probabilities (0 where a class is absent).
    """
    def _format_class_label(label):
        # Numeric labels are shown as integers; anything that cannot be
        # converted (e.g. string class names) is shown unchanged.
        try:
            return int(label)
        except (TypeError, ValueError):
            return label

    # Work on a copy so the caller's frame does not gain a helper column.
    df = df.copy()
    d = df[class_column].nunique()

    # Combination generator: one string key per row, e.g. "1_0_2"
    df["feature_combination"] = df[feature_columns].astype(str).agg('_'.join, axis=1)

    # Number of rows per (combination, class)
    combination_counts = (
        df.groupby(['feature_combination', class_column]).size().reset_index(name='count')
    )

    # Number of rows per combination
    total_combination_counts = (
        df.groupby('feature_combination').size().reset_index(name='total_count')
    )

    combination_probs = pd.merge(combination_counts, total_combination_counts, on='feature_combination')
    combination_probs['probability'] = combination_probs['count'] / combination_probs['total_count']

    combination_probs_pivot = combination_probs.pivot(
        index='feature_combination', columns=class_column, values='probability'
    ).fillna(0)
    combination_probs_pivot.columns = [
        f'P(class={_format_class_label(col)})' for col in combination_probs_pivot.columns
    ]

    df = pd.merge(df, combination_probs, on=['feature_combination', class_column], how='left')

    if d < 2:
        # With a single class nothing can be confused.
        ambiguity = 0.0
    else:
        ambiguity = ((1 - combination_probs_pivot.max(axis=1)) * (d / (d - 1))).mean()

    return ambiguity, df, combination_probs_pivot


def calculate_discrete_classification_error(data):
    """Compute a normalised classification-error score for discrete data.

    For every row, let ``p`` be the fraction of rows with the same feature
    values that also share the row's class, and ``prior`` the fraction of all
    rows having that class. The row contributes ``(1 - p) * prior`` once for
    each *other* class present in the data. The contributions are summed and
    divided by the number of rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a column named ``'class'``; every other column is treated
        as a feature, whatever its position in the frame.

    Returns
    -------
    float
        The normalised error; ``0.0`` for an empty frame.
    """
    n_samples = len(data)
    if n_samples == 0:
        # Nothing to misclassify; also avoids dividing by zero below.
        return 0.0

    # Select features by name, not by position, so the result does not depend
    # on 'class' being the last column.
    feature_names = [col for col in data.columns if col != 'class']
    class_values = data['class'].unique()

    # Calculate P(C_j), the prior probability of class j
    def prior_probability(cls):
        return len(data[data['class'] == cls]) / n_samples

    def likelihood(sample, cls):
        """Return 1 - P(cls | sample's feature values), or 0 if no row matches."""
        feature_match_data = data

        # Narrow down to the rows that agree with the sample on every feature.
        for feature in feature_names:
            feature_match_data = feature_match_data[feature_match_data[feature] == sample[feature]]

        if len(feature_match_data) == 0:
            return 0  # Avoid division by zero if feature combination doesn't exist in the dataset

        class_feature_match_data = feature_match_data[feature_match_data['class'] == cls]
        probability = len(class_feature_match_data) / len(feature_match_data)
        return 1 - probability

    # Calculate the overall discrete classification error
    total_error = 0

    for row in data.itertuples(index=False):
        sample = pd.Series(row, index=data.columns)
        true_class = sample['class']

        # The contribution is the same for every other class, so compute it
        # once and scale by the number of other classes.
        n_other_classes = sum(1 for other_class in class_values if other_class != true_class)
        if n_other_classes == 0:
            continue

        error_component = likelihood(sample, true_class) * prior_probability(true_class)
        total_error += n_other_classes * error_component

    # Normalize by the number of samples
    normalized_error = total_error / n_samples
    return normalized_error


