In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold


def evaluatePerformance():
    '''
    Evaluate the performance of decision trees,
    averaged over 100 trials of 10-fold cross-validation
    
    Return:
      a matrix giving the performance that will contain the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of decision stump
      stats[1,1] = std deviation of decision stump
      stats[2,0] = mean accuracy of 3-level decision tree
      stats[2,1] = std deviation of 3-level decision tree
      
    ** Note that your implementation must follow this API**
    '''

    # Load Data
    filename = 'data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T
    n, d = X.shape

    # Initialize variables to store results
    decision_tree_accuracies = []
    decision_stump_accuracies = []
    dt3_accuracies = []

    # Initialize lists to store learning curve data
    learning_curve_data = {
        'Decision Tree': [],
        'Decision Stump': [],
        '3-level Decision Tree': [],
        'DT_depth_5': [],
        'DT_depth_8': [],
        ...
    }

    # Perform 100 trials of 10-fold cross-validation
    for _ in range(100):
        kf = KFold(n_splits=10, shuffle=True, random_state=42)
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Train the decision tree
            for tree_depth in [None, 1, 3, 5, 8]:
                clf = tree.DecisionTreeClassifier(max_depth=tree_depth)
                clf = clf.fit(X_train, y_train)

                # Output predictions on the test set
                y_pred = clf.predict(X_test)

                # Compute accuracy of the model
                accuracy = accuracy_score(y_test, y_pred)

                # Add accuracy to corresponding list
                if tree_depth is None:
                    decision_tree_accuracies.append(accuracy)
                elif tree_depth == 1:
                    decision_stump_accuracies.append(accuracy)
                elif tree_depth == 3:
                    dt3_accuracies.append(accuracy)
                else:
                    # Add accuracy to corresponding list for additional decision trees
                    learning_curve_data[f'DT_depth_{tree_depth}'].append(accuracy)

                # Calculate learning curve data for different subsets of training data
                for subset in np.arange(0.1, 1.1, 0.1):
                    subset_index = int(len(X_train) * subset)
                    X_subset_train = X_train[:subset_index]
                    y_subset_train = y_train[:subset_index]

                    # Train the classifier on subset of training data
                    clf_subset = tree.DecisionTreeClassifier(max_depth=tree_depth)
                    clf_subset = clf_subset.fit(X_subset_train, y_subset_train)

                    # Test accuracy on test set
                    y_subset_pred = clf_subset.predict(X_test)
                    mean_decision_tree_accuracies = np.mean(decision_tree_accuracies)
                    std_decision_tree_accuracies = np.std(decision_tree_accuracies)

                    mean_decision_stump_accuracies = np.mean(decision_stump_accuracies)
                    std_decision_stump_accuracies = np.std(decision_stump_accuracies)

                    mean_dt3_accuracies = np.mean(dt3_accuracies)
                    std_dt3_accuracies = np.std(dt3_accuracies)

                for classifier, curve_data in learning_curve_data.items():
                    mean_curve = np.mean(np.array(curve_data).reshape(-1, 10), axis=0)
                    std_curve = np.std(np.array(curve_data).reshape(-1, 10), axis=0)

                    # Calculate mean and standard deviation of accuracies
                    mean_accuracies = np.mean(curve_data)
                    std_accuracies = np.std(curve_data)

                    # Add mean and standard deviation to corresponding lists
                    learning_curve_data[classifier]['mean'] = mean_accuracies
                    learning_curve_data[classifier]['std'] = std_accuracies

                # Generate learning curves plot
                plt.figure(figsize=(10, 6))
                for classifier, curve_data in learning_curve_data.items():
                    mean_curve = np.mean(np.array(curve_data).reshape(-1, 10), axis=0)
                    std_curve = np.std(np.array(curve_data).reshape(-1, 10), axis=0)
                    plt.errorbar(np.arange(0.1, 1.1, 0.1) * 100, mean_curve, yerr=std_curve, label=classifier)

                plt.xlabel('% of Training Data')
                plt.ylabel('Accuracy')
                plt.title('Learning Curve for Different Classifiers')
                plt.legend()
                plt.grid(True)
                plt.show()

                # Return statistics matrix
                stats = np.zeros((5, 2))
                stats[0, 0] = mean_decision_tree_accuracies
                stats[0, 1] = std_decision_tree_accuracies
                stats[1, 0] = mean_decision_stump_accuracies
                stats[1, 1] = std_decision_stump_accuracies
                stats[2, 0] = mean_dt3_accuracies
                stats[2, 1] = std_dt3_accuracies
                return stats
if __name__ == "__main__":
    stats = evaluatePerformance()
    print("Decision Tree Accuracy = ", stats[0, 0], " (", stats[0, 1], ")")

SyntaxError: ':' expected after dictionary key (1852212338.py, line 44)