# Deep Neural Decision Tree

### Imports

In [9]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

### DNDT Model

In [10]:
def build_neural_network(input_shape):
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=input_shape),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu')
    ])
    return model

def train_neural_decision_tree(X_train, y_train, X_test, y_test):
    # Neural network part
    nn_model = build_neural_network((X_train.shape[1],))
    nn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    nn_model.fit(X_train, y_train, epochs=10)

    # Get intermediate output from neural network
    intermediate_layer_model = tf.keras.models.Model(inputs=nn_model.input,
                                                     outputs=nn_model.layers[-2].output)
    intermediate_output_train = intermediate_layer_model.predict(X_train)
    intermediate_output_test = intermediate_layer_model.predict(X_test)

    # Decision tree part
    tree_model = DecisionTreeClassifier()
    tree_model.fit(intermediate_output_train, y_train)
    
    # Prediction and evaluation
    y_pred = tree_model.predict(intermediate_output_test)
    return accuracy_score(y_test, y_pred)

### Iris

In [11]:
# Load the Iris dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
column_names = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]
iris_data = pd.read_csv(url, names=column_names)

# Preprocess the data
X = iris_data.drop("class", axis=1)
y = iris_data["class"]
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2)

# Train the model and evaluate its accuracy
accuracy = train_neural_decision_tree(X_train, y_train, X_test, y_test)
print(f'Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 1.0


### Haberman's Survival

In [13]:
# Load the Haberman's Survival dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data"
column_names = ["age", "year_of_operation", "positive_axillary_nodes", "survival_status"]
haberman_data = pd.read_csv(url, names=column_names)

# Preprocess the data
X = haberman_data.drop("survival_status", axis=1)
y = haberman_data["survival_status"]
# Adjust labels to start from 0
y = y - 1

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)

# Train the model and evaluate its accuracy
accuracy = train_neural_decision_tree(X_train, y_train, X_test, y_test)
print(f'Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.7258064516129032


### Car Evaluation

In [5]:
# Load the Car Evaluation dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
column_names = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"]
car_data = pd.read_csv(url, names=column_names)

# Preprocess the data: One-hot encode categorical features
X = car_data.drop("class", axis=1)
y = car_data["class"]

one_hot_encoder = OneHotEncoder()
X_encoded = one_hot_encoder.fit_transform(X).toarray()

# Encode the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2)

# Train the model and evaluate its accuracy
accuracy = train_neural_decision_tree(X_train, y_train, X_test, y_test)
print(f'Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.8497109826589595


### Breast Cancer Wisconsin (Original)

In [16]:
# Load the Breast Cancer Wisconsin (Original) dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
column_names = ["Sample code number", "Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape",
                "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin",
                "Normal Nucleoli", "Mitoses", "Class"]
cancer_data = pd.read_csv(url, names=column_names)

# Preprocess the data
# Drop the 'Sample code number' as it's not a feature
cancer_data.drop(["Sample code number"], axis=1, inplace=True)

# Replace missing values denoted by '?' with NaN and then impute
cancer_data.replace("?", np.nan, inplace=True)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
cancer_data = pd.DataFrame(imputer.fit_transform(cancer_data), columns=cancer_data.columns)

X = cancer_data.drop("Class", axis=1)
y = cancer_data["Class"]

# Encode the labels (2 for benign, 4 for malignant) to 0 and 1
y_encoded = y.replace({2: 0, 4: 1})

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2)

# Train the model and evaluate its accuracy
accuracy = train_neural_decision_tree(X_train, y_train, X_test, y_test)
print(f'Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.9285714285714286


### Pima Indians Diabetes

In [19]:
# Load the Pima Indians Diabetes dataset
diabetes_data = pd.read_csv('./datasets/diabetes.csv')

# Preprocess the data
# Replace zeros in certain columns with NaN and then impute
columns_to_impute = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
diabetes_data[columns_to_impute] = diabetes_data[columns_to_impute].replace(0, np.nan)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
diabetes_data[columns_to_impute] = imputer.fit_transform(diabetes_data[columns_to_impute])

X = diabetes_data.drop("Outcome", axis=1)
y = diabetes_data["Outcome"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)

# Train the model and evaluate its accuracy
accuracy = train_neural_decision_tree(X_train, y_train, X_test, y_test)
print(f'Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.7142857142857143


### Poker Hands

In [14]:
# Load the Poker Hand dataset
url_train = "https://archive.ics.uci.edu/ml/machine-learning-databases/poker/poker-hand-training-true.data"
url_test = "https://archive.ics.uci.edu/ml/machine-learning-databases/poker/poker-hand-testing.data"
column_names = ["Suit1", "Card1", "Suit2", "Card2", "Suit3", "Card3", "Suit4", "Card4", "Suit5", "Card5", "Class"]
poker_data_train = pd.read_csv(url_train, names=column_names)
poker_data_test = pd.read_csv(url_test, names=column_names)

# Preprocess the data: One-hot encode categorical features
X_train = poker_data_train.drop("Class", axis=1)
y_train = poker_data_train["Class"]
X_test = poker_data_test.drop("Class", axis=1)
y_test = poker_data_test["Class"]

one_hot_encoder = OneHotEncoder()
X_train_encoded = one_hot_encoder.fit_transform(X_train).toarray()
X_test_encoded = one_hot_encoder.transform(X_test).toarray()

# Train the model and evaluate its accuracy
accuracy = train_neural_decision_tree(X_train_encoded, y_train, X_test_encoded, y_test)
print(f'Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.453695


### Statlog (German Credit)

In [58]:
# Load the German Credit dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
column_names = ["Status of existing checking account", "Duration in month", "Credit history", 
                "Purpose", "Credit amount", "Savings account/bonds", "Present employment since", 
                "Installment rate in percentage of disposable income", "Personal status and sex", 
                "Other debtors / guarantors", "Present residence since", "Property", 
                "Age in years", "Other installment plans", "Housing", "Number of existing credits at this bank", 
                "Job", "Number of people being liable to provide maintenance for", "Telephone", 
                "Foreign worker", "Credit risk"]
credit_data = pd.read_csv(url, names=column_names, sep=' ', header=None)

# Identify categorical and numerical columns
categorical_cols = ["Status of existing checking account", "Credit history", "Purpose", "Savings account/bonds", 
                    "Present employment since", "Personal status and sex", "Other debtors / guarantors", 
                    "Property", "Other installment plans", "Housing", "Job", "Telephone", "Foreign worker"]
numerical_cols = ["Duration in month", "Credit amount", "Installment rate in percentage of disposable income", 
                  "Present residence since", "Age in years", "Number of existing credits at this bank", 
                  "Number of people being liable to provide maintenance for"]

# Preprocess the data
X = credit_data.drop("Credit risk", axis=1)
y = credit_data["Credit risk"]

# Create a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

X_processed = preprocessor.fit_transform(X)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2)

# Train the model and evaluate its accuracy
accuracy = train_neural_decision_tree(X_train, y_train, X_test, y_test)
print(f'Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.755


### Connect-4

In [41]:
# Load the Connect-4 dataset
path_to_dataset = './datasets/connect-4.csv'
column_names = ["C"+str(i) for i in range(1, 43)] + ["Class"]
connect4_data = pd.read_csv(path_to_dataset, names=column_names, header=None)

# Preprocess the data: One-hot encode categorical features
X = connect4_data.drop("Class", axis=1)
y = connect4_data["Class"]

one_hot_encoder = OneHotEncoder()
X_encoded = one_hot_encoder.fit_transform(X).toarray()

# Encode the labels using LabelEncoder from scikit-learn
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

# Train the model and evaluate its accuracy
accuracy = train_neural_decision_tree(X_train, y_train, X_test, y_test)
print(f'Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.6649644760213144


### Image Segmentation

In [33]:
# Load the Image Segmentation dataset
path_to_dataset = './datasets/segmentation_train.csv'
column_names = ["Class", "Region-centroid-col", "Region-centroid-row", "Region-pixel-count", "Short-line-density-5", 
                "Short-line-density-2", "Vedge-mean", "Vegde-sd", "Hedge-mean", "Hedge-sd", "Intensity-mean", 
                "Rawred-mean", "Rawblue-mean", "Rawgreen-mean", "Exred-mean", "Exblue-mean", "Exgreen-mean", 
                "Value-mean", "Saturation-mean", "Hue-mean"]
segmentation_data = pd.read_csv(path_to_dataset, names=column_names, header=None, skiprows=5)

# Preprocess the data
X = segmentation_data.drop("Class", axis=1)
y = segmentation_data["Class"]

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2)

# Train the model and evaluate its accuracy
accuracy = train_neural_decision_tree(X_train, y_train, X_test, y_test)
print(f'Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.7317073170731707


### Covrtype

In [38]:
# Load the Covertype dataset
path_to_dataset = './datasets/covtype.csv'
column_names = [f'feature_{i}' for i in range(1, 55)]
covertype_data = pd.read_csv(path_to_dataset, names=column_names)

# Preprocess the data
# Assuming the last column is the target variable
X = covertype_data.iloc[:, :-1]
y = covertype_data.iloc[:, -1]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train the model and evaluate its accuracy
accuracy = train_neural_decision_tree(X_train, y_train, X_test, y_test)
print(f'Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.8263125736857052
