In [5]:
import numpy as np
import pandas as pd
import openpyxl
from sklearn.preprocessing import LabelEncoder

Testing the Banknote dataset

In [3]:
bankfile_path = 'Datasets/data_banknote_authentication.txt'

# Initialize lists to hold features and labels
features_list = []
labels_list = []

# Open and read the file
with open(bankfile_path, 'r') as file:
    for line in file:
        # Split the line into components and convert them to the appropriate types
        components = line.strip().split(',')
        features = [float(num) for num in components[:-1]]  # Convert feature values to floats
        label = int(components[-1])  # Convert the label to an integer
        features_list.append(features)
        labels_list.append(label)

# Convert lists to NumPy arrays
dataset = np.array(features_list)
true_labels_for_points = np.array(labels_list)
labels = np.unique(true_labels_for_points)

# Create a features array with numerical labels as strings
features = np.array([str(i) for i in range(dataset.shape[1])])

In [81]:
print(len(features))
features

4


array(['0', '1', '2', '3'], dtype='<U1')

In [82]:
print(len(dataset))
dataset

150


array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [83]:
print(len(labels))
labels

3


array([0, 1, 2])

Testing the Breast Cancer path 

In [33]:
cancer_path = 'Datasets/breast+cancer+coimbra/dataR2.csv'
df = pd.read_csv(cancer_path)

# Now, let's convert the DataFrame into the required numpy arrays
# Extracting feature values and labels
features_list = df.iloc[:, :-1].values

# Extract the last column for labels
labels_list = df.iloc[:, -1].values


# Converting lists to NumPy arrays
dataset = np.array(features_list)
true_labels_for_points = np.array(labels_list)

# Create a features array with numerical labels as strings
features = np.array([str(i) for i in range(dataset.shape[1])])

# Convert unique labels to numpy array
labels = np.unique(true_labels_for_points)

Cryotherapy

In [97]:
cryo_path = 'Datasets/Cryotherapy.xlsx'
df = pd.read_excel(cryo_path)

# Now, let's convert the DataFrame into the required numpy arrays
# Extracting feature values and labels
features_list = df.iloc[:, :-1].values

# Extract the last column for labels
labels_list = df.iloc[:, -1].values


# Converting lists to NumPy arrays
dataset = np.array(features_list)
true_labels_for_points = np.array(labels_list)

# Create a features array with numerical labels as strings
features = np.array([str(i) for i in range(dataset.shape[1])])

# Convert unique labels to numpy array
labels = np.unique(true_labels_for_points)

Immunotherapy

In [46]:
im_path = 'Datasets/Immunotherapy.xlsx'
df = pd.read_excel(im_path)

# Now, let's convert the DataFrame into the required numpy arrays
# Extracting feature values and labels
features_list = df.iloc[:, :-1].values

# Extract the last column for labels
labels_list = df.iloc[:, -1].values


# Converting lists to NumPy arrays
dataset = np.array(features_list)
true_labels_for_points = np.array(labels_list)

# Create a features array with numerical labels as strings
features = np.array([str(i) for i in range(dataset.shape[1])])

# Convert unique labels to numpy array
labels = np.unique(true_labels_for_points)

Ionosphere

In [56]:
ion_path = 'Datasets/ionosphere/ionosphere.data'

# Initialize lists to hold features and labels
features_list = []
raw_labels_list = []

# Open and read the file
with open(ion_path, 'r') as file:
    for line in file:
        # Split the line into components and convert them to the appropriate types
        components = line.strip().split(',')
        features = [float(num) for num in components[:-1]]  # Convert feature values to floats
        raw_labels_list.append(components[-1])  # Keep the original label
        features_list.append(features)

# Convert lists to NumPy arrays
dataset = np.array(features_list)

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit label encoder and return encoded labels
true_labels_for_points = label_encoder.fit_transform(raw_labels_list)

# Extract the unique labels as sorted array
labels = np.sort(np.unique(true_labels_for_points))

# Create a features array with numerical labels as strings
features = np.array([str(i) for i in range(dataset.shape[1])])

Iris dataset

In [84]:
iris_path = 'Datasets/iris/iris.data'

# Initialize lists to hold features and labels
features_list = []
raw_labels_list = []

# Open and read the file
with open(iris_path, 'r') as file:
    for line in file:
        # Split the line into components and convert them to the appropriate types
        components = line.strip().split(',')
        features = [float(num) for num in components[:-1]]  # Convert feature values to floats
        raw_labels_list.append(components[-1])  # Keep the original label
        features_list.append(features)

# Convert lists to NumPy arrays
dataset = np.array(features_list)

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit label encoder and return encoded labels
true_labels_for_points = label_encoder.fit_transform(raw_labels_list)

# Extract the unique labels as sorted array
labels = np.sort(np.unique(true_labels_for_points))

# Create a features array with numerical labels as strings
features = np.array([str(i) for i in range(dataset.shape[1])])

User Knowledge

In [111]:
user_path = 'Datasets/Data_User_Modeling_Dataset_Hamdi_Tolga_KAHRAMAN.xls'
df = pd.read_excel(user_path)

# Now, let's convert the DataFrame into the required numpy arrays
# Extracting feature values and labels
features_list = df.iloc[:, :-1].values

# Extract the last column for labels
labels_list = df.iloc[:, -1].values


# Converting lists to NumPy arrays
dataset = np.array(features_list)
true_labels_for_points = np.array(labels_list)
true_labels_for_points = label_encoder.fit_transform(true_labels_for_points)

# Extract the unique labels as sorted array
labels = np.sort(np.unique(true_labels_for_points))

# Create a features array with numerical labels as strings
features = np.array([str(i) for i in range(dataset.shape[1])])

# Convert unique labels to numpy array
labels = np.unique(true_labels_for_points)

Vertebratla column 

In [130]:
verba_path = 'Datasets/vertebral+column/verbex.data'

# Initialize lists to hold features and labels
features_list = []
raw_labels_list = []

# Open and read the file
with open(verba_path, 'r') as file:
    for line in file:
        # Split the line into components and convert them to the appropriate types
        components = line.strip().split(',')
        features = [float(num) for num in components[:-1]]  # Convert feature values to floats
        raw_labels_list.append(components[-1])  # Keep the original label
        features_list.append(features)

# Convert lists to NumPy arrays
dataset = np.array(features_list)

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit label encoder and return encoded labels
true_labels_for_points = label_encoder.fit_transform(raw_labels_list)

# Extract the unique labels as sorted array
labels = np.sort(np.unique(true_labels_for_points))

# Create a features array with numerical labels as strings
features = np.array([str(i) for i in range(dataset.shape[1])])

Wine

In [138]:
wine_path = 'Datasets/wine/wine.data'

# THIS ONE IS A BIT DIFFERENT LABEL IS ON FIRST FEATURW 
# Initialize lists to hold features and raw labels
features_list = []
raw_labels_list = []

# Open and read the file
with open(wine_path, 'r') as file:
    for line in file:
        # Split the line into components and convert them to the appropriate types
        components = line.strip().split(',')
        raw_labels_list.append(components[0])  # The first element is the label
        features = [float(num) for num in components[1:]]  # The rest are features
        features_list.append(features)

# Convert lists to NumPy arrays
dataset = np.array(features_list)

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit label encoder and return encoded labels
true_labels_for_points = label_encoder.fit_transform(raw_labels_list)

# Extract the unique labels as sorted array
labels = np.sort(np.unique(true_labels_for_points))

# Create a features array with numerical labels as strings
features = np.array([str(i) for i in range(dataset.shape[1])])

Monk-2

In [7]:
import numpy as np

# Assuming 'wine_path' is the path to the text file containing the dataset
monk_path = 'Datasets/monk+s+problems/monks-2.train' # Update this to the path of your data file

# Initialize lists to hold features and labels
features_list = []
labels_list = []

# Open and read the file
with open(monk_path, 'r') as file:
    for line in file:
        # Split the line into components based on whitespace
        components = line.strip().split()
        # The second-to-last element is the label, and the rest (excluding the last element) are features
        labels_list.append(components[0])  # Second-to-last element as label
        features = [float(num) for num in components[1:-1]]  # Exclude last two elements
        features_list.append(features)

# Convert lists to NumPy arrays
dataset = np.array(features_list)
true_labels_for_points = LabelEncoder().fit_transform(labels_list)

# The unique labels
labels = np.unique(true_labels_for_points)

# Create a features array with numerical labels as strings
features = np.array([str(i) for i in range(dataset.shape[1])])


Tree Building now!

In [8]:
from min_height_tree_module import *
min_depth_tree, min_depth_literals, min_depth,solution = find_min_depth_tree(features, labels, true_labels_for_points, dataset)
#print("Minimum Depth Tree Structure:")
#for node in min_depth_tree:
#    print(node)
print(f"Found at depth: {min_depth}")

Found at depth: 6
