In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import pandas as pd


def evaluate_decision_tree(file_name, test_size=0.2, random_state=42, cv=5,
                           n_jobs=4, param_grid=None):
    """Grid-search a decision tree on a CSV dataset and score it on a hold-out split.

    The CSV is read with ``pd.read_csv``; every column except the last is used
    as a feature and the last column is used as the label.

    Parameters
    ----------
    file_name : str or path-like
        Path of the CSV file to load.
    test_size : float or int, default=0.2
        Forwarded to ``train_test_split`` as the size of the hold-out split.
    random_state : int or None, default=42
        Seed used for both the train/test split and the decision tree.
    cv : int, default=5
        Cross-validation setting forwarded to ``GridSearchCV``.
    n_jobs : int or None, default=4
        Number of parallel jobs forwarded to ``GridSearchCV``.
    param_grid : dict or None, default=None
        Hyperparameter grid to search. When ``None`` the built-in grid below
        is used.

    Returns
    -------
    best_params : dict
        Hyperparameter combination with the best mean cross-validated accuracy.
    best_score : float
        Mean cross-validated accuracy of that combination on the training split.
    test_accuracy : float
        Accuracy of the best estimator on the hold-out split.

    Raises
    ------
    ValueError
        If the CSV has fewer than two columns (no feature column left once the
        label column is removed).
    """
    # Load the dataset
    data = pd.read_csv(file_name)
    if data.shape[1] < 2:
        raise ValueError(
            f"{file_name!r} must contain at least one feature column and one "
            f"label column, but has {data.shape[1]} column(s)"
        )
    X = data.iloc[:, :-1].to_numpy()  # All columns except the last
    y = data.iloc[:, -1].to_numpy()  # labels

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    if param_grid is None:
        # 'log_loss' is intentionally not listed: scikit-learn documents it as
        # the same Shannon information-gain criterion as 'entropy', so searching
        # both only repeats identical fits (and 'log_loss' is rejected by
        # scikit-learn releases older than 1.1).
        param_grid = {
            'max_depth': [None, 10, 15, 20],
            'min_samples_split': [5, 10, 20],
            'criterion': ['gini', 'entropy'],
            'min_samples_leaf': [1, 2, 5, 10],
        }

    # Set up the cross-validated grid search, scored by accuracy
    grid_search = GridSearchCV(
        estimator=DecisionTreeClassifier(random_state=random_state),
        param_grid=param_grid,
        cv=cv,
        scoring='accuracy',
        n_jobs=n_jobs,
    )

    # Fit the model on the training data only; the hold-out split stays unseen
    grid_search.fit(X_train, y_train)

    # Evaluate the best model on the test data
    test_accuracy = grid_search.best_estimator_.score(X_test, y_test)

    return grid_search.best_params_, grid_search.best_score_, test_accuracy


In [2]:
# Run the decision-tree grid search on 'raw_reduced.csv' and report the
# selected hyperparameters, the CV accuracy and the hold-out accuracy.
best_params, best_score, test_accuracy = evaluate_decision_tree(
    file_name='raw_reduced.csv'
)
print("Best Hyperparameters:", best_params)
print("Best Accuracy (CV):", best_score)
print("Test Accuracy:", test_accuracy)

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 5}
Best Accuracy (CV): 0.8418392857142857
Test Accuracy: 0.8500714285714286


In [3]:
# Run the decision-tree grid search on 'sobel_hog_reduced.csv' and report the
# selected hyperparameters, the CV accuracy and the hold-out accuracy.
best_params, best_score, test_accuracy = evaluate_decision_tree(
    file_name='sobel_hog_reduced.csv'
)
print("Best Hyperparameters:", best_params)
print("Best Accuracy (CV):", best_score)
print("Test Accuracy:", test_accuracy)

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 15, 'min_samples_leaf': 10, 'min_samples_split': 5}
Best Accuracy (CV): 0.8161964285714285
Test Accuracy: 0.8214285714285714


In [None]:
# Run the decision-tree grid search on 'mnist_gaussian_hog_concatenated.csv'
# and report the selected hyperparameters, the CV accuracy and the hold-out
# accuracy.
best_params, best_score, test_accuracy = evaluate_decision_tree(
    file_name='mnist_gaussian_hog_concatenated.csv'
)
print("Best Hyperparameters:", best_params)
print("Best Accuracy (CV):", best_score)
print("Test Accuracy:", test_accuracy)

In [6]:
# Run the decision-tree grid search on 'mnist_sobel.csv' and report the
# selected hyperparameters, the CV accuracy and the hold-out accuracy.
best_params, best_score, test_accuracy = evaluate_decision_tree(
    file_name='mnist_sobel.csv'
)
print("Best Hyperparameters:", best_params)
print("Best Accuracy (CV):", best_score)
print("Test Accuracy:", test_accuracy)

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 5}
Best Accuracy (CV): 0.8431964285714285
Test Accuracy: 0.851
