In [1]:
import pandas as pd
import numpy as np
import os
import sys

src_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

import continuous, discrete, model
from data_loader import DataLoader

# Continuous case

In [2]:
# continuous case ambiguity and error calculation (calculation of error for wine and rice datasets takse a while)
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
loader = DataLoader()
dataset_list = ['iris', 'kidney_stone', 'wine', 'rice']
print("Continuous Dataset case: ")
for name in dataset_list:
    df = loader.select_dataset('continuous', name)
    class_name = df.columns[-1]
    ambiguity = continuous.calculate_continuous_ambiguity(df, class_name)
    print(f'Ambiguity for {name} dataset: {ambiguity}')
    total_error_probability = continuous.calculate_continuous_error(df, class_name)
    print(f"Error for {name} dataset: {total_error_probability:.4f}\n")

Continuous Dataset case: 
Ambiguity for iris dataset: 0.17333333333333334
Error for iris dataset: 0.3216
Ambiguity for kidney_stone dataset: 0.6777777777777778
Error for kidney_stone dataset: 0.5736


In [4]:
loader = DataLoader()
blood_df = loader.select_dataset('continuous', 'blood_transfusion')
ambiguity = continuous.calculate_continuous_ambiguity(blood_df, 'class')
print(f'Ambiguity: {ambiguity}')
# error calculation could not be done because decision can't be trained for 100 percent
# error = continuous.calculate_continuous_error(blood_df, 'class')
# print(f'Error: {error}')

Ambiguity: 0.9882416715947171


In [None]:
# glass identification
import warnings
from ucimlrepo import fetch_ucirepo 
glass_identification = fetch_ucirepo(id=42) 
X = glass_identification.data.features 
y = glass_identification.data.targets 

glass_df = pd.concat([X, y], axis=1)

ambiguity = continuous.calculate_continuous_ambiguity(glass_df, 'Type_of_glass')
print(f'Ambiguity (Continuous) for glass identification dataset: {ambiguity}')
# Error calculation
total_error_probability = continuous.calculate_continuous_error(glass_df, 'Type_of_glass')
print(f"Error (Continuous) for glass identification dataset: {total_error_probability:.4f}")

# Discrete case

In [2]:
loader = DataLoader()
dataset_list = ['lens','car', 'zoo', 'tictac', 'balance']
print("Discrete Dataset case: ")
for name in dataset_list:
    df = loader.select_dataset('discrete', name)
    class_name = df.columns[-1]
    ambiguity = discrete.calculate_discrete_ambiguity(df, class_name)
    print(f'Ambiguity for {name} dataset: {ambiguity}')
    total_error_probability, pivot = discrete.calculate_discrete_error(df, class_name)
    print(f"Error for {name} dataset: {total_error_probability:.4f}\n")

Discrete Dataset case: 
Ambiguity for car dataset: 0.44329896907216493
Error for car dataset: 0.2955

Ambiguity for social_network dataset: 0.0
Error for social_network dataset: 0.0000

Ambiguity for tictac dataset: 0.0
Error for tictac dataset: 0.0000

Ambiguity for balance dataset: 0.0
Error for balance dataset: 0.0000

Ambiguity for lens dataset: 0.125
Error for lens dataset: 0.0833



In [7]:
df = loader.select_dataset('discrete', 'lens')
df.head()


Unnamed: 0,age,spectacle_prescription,astigmatic,class
1,1,1,1,3
2,1,1,2,2
3,1,2,1,3
4,1,2,2,1
5,2,1,1,3


In [9]:
ambiguity = discrete.calculate_discrete_ambiguity(df, 'class')
print(f'Ambiguity for {name} dataset: {ambiguity}')
total_error_probability, pivot = discrete.calculate_discrete_error(df, 'class')
print(f"Error for {name} dataset: {total_error_probability:.4f}\n")

Ambiguity for lens dataset: 0.125
Error for lens dataset: 0.0833



In [10]:
pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,class,1,2,3,total_count,max_count,error
age,spectacle_prescription,astigmatic,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,1,0.0,0.0,3.0,3.0,3.0,0.0
1,1,2,0.0,2.0,1.0,3.0,2.0,0.333333
1,2,1,0.0,0.0,3.0,3.0,3.0,0.0
1,2,2,3.0,0.0,0.0,3.0,3.0,0.0
2,1,1,0.0,0.0,3.0,3.0,3.0,0.0
2,1,2,0.0,3.0,0.0,3.0,3.0,0.0
2,2,1,0.0,0.0,3.0,3.0,3.0,0.0
2,2,2,1.0,0.0,2.0,3.0,2.0,0.333333


In [14]:
# Discrete case ambiguity and error calculation on dummy datasets.
data = {
    "x1": [0, 0, 1, 1, 0, 0, 1],
    "x2": [0, 1, 0, 1, 0, 0, 0],
    "class": [0, 1, 1, 0, 1, 0, 0]
}
df1 = pd.DataFrame(data)
ambiguity = discrete.calculate_discrete_ambiguity(df1, 'class')
ambiguity

error, pivot = discrete.calculate_discrete_error(df1, 'class')
error

0.20833333333333331

In [2]:
# Social network dataset (not entirely discrete)
network_df = pd.read_csv('D:/FAU_Courses/FAU Summer 2024/Seminar/Codes/data/discrete/Social_Network_Ads.csv')

ambiguity =discrete.calculate_discrete_ambiguity(network_df, class_column="Purchased")

print("Ambiguity (discrete) for network_df dataset: ", ambiguity)

error = discrete.calculate_discrete_error(network_df, class_column='Purchased')
print(f"Error (discrete) for network_df dataset : {error}")


Ambiguity (discrete) for network_df dataset:  0.005479452054794521
Error (discrete) for network_df dataset : 0.0027397260273972603


Mixed dataset error calculation: $\LARGE \frac{P(X_c \neq k|X_\chi = \chi ) P(X_\chi = \chi)}{P(X_c \notin k)} $

In [4]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_text, _tree
from scipy import stats
def calculate_error(df, label_column, df_stat):
    """
    Calculate error for continuous featured datasets using Recatangular segmentation and calculating the probability of that segment belong to the other class.
    Args:
        df (pandas dataframe): Dataset
        class_column (string): class column name
    returns:
        float: error for conitnuous case
    """
    # Train the decision tree
    feature_columns = [col for col in df.columns if col != label_column]
    X = df[feature_columns]
    y = df[label_column]

    clf = DecisionTreeClassifier(max_depth=None)
    clf.fit(X, y)

    # training_accuracy = clf.score(X, y)
    # print(f'Training Accuracy: {training_accuracy * 100:.2f}%')

    # Function to extract rectangles and labels from a trained decision tree
    def get_rectangles_from_tree(tree):
        left = tree.children_left
        right = tree.children_right
        threshold = tree.threshold
        feature = tree.feature
        value = tree.value
        
        def recurse(node, bounds):
            if feature[node] == _tree.TREE_UNDEFINED:
                # It's a leaf node
                leaf_label = np.argmax(value[node][0])
                return [(bounds, leaf_label)]
            
            new_bounds_left = [list(b) for b in bounds]
            new_bounds_right = [list(b) for b in bounds]
            
            feature_index = feature[node]
            threshold_value = threshold[node]
            
            new_bounds_left[feature_index][1] = threshold_value
            new_bounds_right[feature_index][0] = threshold_value
            
            left_rectangles = recurse(left[node], new_bounds_left)
            right_rectangles = recurse(right[node], new_bounds_right)
            
            return left_rectangles + right_rectangles

        # Initialize bounds for each feature
        initial_bounds = [[-np.inf, np.inf] for _ in range(tree.n_features)]
        rectangles = recurse(0, initial_bounds)
        return rectangles

    # Extract rectangles and labels from the decision tree
    rectangles = get_rectangles_from_tree(clf.tree_)
    # print(rectangles)
    # Calculate KDE for each class complement
    classes = np.unique(df[label_column])
    kde_by_class = {}
    for cls in classes:
        class_data = df_stat[df_stat[label_column] != cls][feature_columns]
        kde_by_class[cls] = stats.gaussian_kde(class_data.T) # Contain the distribution for other class samples
    
    # Calculate probabilities for the segments
    segment_probabilities = []
    for rect, predicted_label in rectangles:
        bounds_min = [b[0] for b in rect]
        bounds_max = [b[1] for b in rect]
        segment = df[np.all((df[feature_columns] >= bounds_min) & (df[feature_columns] < bounds_max), axis=1)]

        if not segment.empty:
            kde = kde_by_class[predicted_label]
            error_probability = kde.integrate_box(bounds_min, bounds_max, maxpts=500000)
            segment_probabilities.append(error_probability)

    # Compute total error probability
    total_error_probability_all_segments = np.sum(segment_probabilities)
    return total_error_probability_all_segments

In [18]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
df_anaemia = pd.read_csv("D:/FAU_Courses/FAU Summer 2024/Seminar/Codes/data/continuous/anaemia.csv")
label_encoder = LabelEncoder()
df_anaemia['Anaemic'] = label_encoder.fit_transform(df_anaemia['Anaemic'])
df_anaemia = df_anaemia.drop(columns=['Number'])

df_anaemia['Sex'] = df_anaemia['Sex'].str.strip()
df_anaemia['Sex'] = label_encoder.fit_transform(df_anaemia['Sex'])
df_drop = df_anaemia.drop(columns='Sex')
# # calculation of error taking the whole dataset (continuous and categorical)
# error_all = calculate_error(df_drop, 'Anaemic', df_drop)
# error_all

df_anaemia.head()
df_drop.head()

Unnamed: 0,%Red Pixel,%Green pixel,%Blue pixel,Hb,Anaemic
0,43.2555,30.8421,25.9025,6.3,1
1,45.6033,28.19,26.2067,13.5,0
2,45.0107,28.9677,26.0215,11.7,0
3,44.5398,28.9899,26.4703,13.5,0
4,43.287,30.6972,26.0158,12.4,0


In [19]:
from model import Model
rf = Model('nb')
rf.train(df_drop)
rf.evaluate()


1.0

In [16]:
df_male = df_anaemia[df_anaemia['Sex'] == 1].drop(columns='Sex')
df_female = df_anaemia[df_anaemia['Sex'] == 0].drop(columns='Sex')
# error only taking the continuous features of male samples
error_male = calculate_error(df_male, 'Anaemic', df_drop) 
error_male

0.2418648805291078

In [18]:
# error only taking the continuous features of female samples
error_female = calculate_error(df_female, 'Anaemic', df_drop) 
error_female

  error_probability = kde.integrate_box(bounds_min, bounds_max, maxpts=500000)


0.2881341162632477

In [19]:
# probability of samples belong to male category 
m = len(df_male)/ len(df_anaemia)
# probability of samples belong to female category 
f = len(df_female)/ len(df_anaemia)
m, f

(0.5288461538461539, 0.47115384615384615)

In [22]:
(error_male*m)

0.12790931181827817

In [20]:
error_calculation = np.mean(((error_male*m)/error_all) +((error_female*f)/error_all))
error_calculation

0.9468410399888565

In [78]:
# ambiguity calculation
ambiguity, overlap_regions, samples_in_overlap = calculate_continuous_ambiguity(df_anaemia, 'Anaemic')
ambiguity

0.37820512820512825

In [87]:
ambiguity_m, overlap_regions, samples_in_overlap = calculate_continuous_ambiguity(df_male, 'Anaemic')
ambiguity_m

0.0

In [88]:
ambiguity_f, overlap_regions, samples_in_overlap = calculate_continuous_ambiguity(df_female, 'Anaemic')
ambiguity_f

0.4181985294117647