In [1]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import decision_tree as dt
import random_forest as rf
import logistic_regression as lr

def accuracy_score(Y_true, Y_predict):
    count = 0
    for i in range(len(Y_true)):
        if(Y_true[i] == Y_predict[i]):
            count+=1
    return count/len(Y_true)


def evaluate_performance():
    '''
    Evaluate the performance of decision trees and logistic regression,
    average over 1,000 trials of 10-fold cross validation

    Return:
      a matrix giving the performance that will contain the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of logistic regression
      stats[1,1] = std deviation of logistic regression accuracy

    ** Note that your implementation must follow this API**
    '''

    # Load Data
    filename = 'SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T
    n, d = X.shape
    
    print(X[0],y[0])
    accuraciesDecisionTree = []
    accuraciesRandomForest = []
    accuraciesLogistic = []
    for trial in range(10):
        idx = np.arange(n)
        np.random.seed(13)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]
        
        Xtrain = X[1:101, :]  # train on first 100 instances
        Xtest = X[101:, :]
        ytrain = y[1:101, :]  # test on remaining instances
        ytest = y[101:, :]
        
        # train the decision tree
        classifier = dt.DecisionTree(20)
        classifier.fit(Xtrain, ytrain)

        # output predictions on the remaining data
        y_pred = classifier.predict(Xtest)
        accuracy = accuracy_score(ytest, y_pred)
        accuraciesDecisionTree.append(accuracy)
        
        randforest = rf.RandomForest(5,20)
        randforest.fit(X,y)
        y_pred = randforest.predict(Xtest)[0]
        accuracy = accuracy_score(ytest, y_pred)
        accuraciesRandomForest.append(accuracy)
        
        beta_hat = lr.gradient_descent(Xtrain, ytrain, l=1, epsilon=1e-8, step_size=1e-2,
                                max_steps=5000)
        y_pred = lr.predict(beta_hat, Xtest)
        accuracy = accuracy_score(ytest, y_pred)
        accuraciesLogistic.append(accuracy)
        
    # compute the training accuracy of the model
    meanDecisionTreeAccuracy = np.mean(accuraciesDecisionTree)    
    stddevDecisionTreeAccuracy = np.std(accuraciesDecisionTree)
    meanLogisticRegressionAccuracy = np.mean(accuraciesLogistic)
    stddevLogisticRegressionAccuracy = np.std(accuraciesLogistic)
    meanRandomForestAccuracy = np.mean(accuraciesRandomForest)
    stddevRandomForestAccuracy = np.std(accuraciesRandomForest)
    
    # make certain that the return value matches the API specification
    stats = np.zeros((3, 2))
    stats[0, 0] = meanDecisionTreeAccuracy
    stats[0, 1] = stddevDecisionTreeAccuracy
    stats[1, 0] = meanRandomForestAccuracy
    stats[1, 1] = stddevRandomForestAccuracy
    stats[2, 0] = meanLogisticRegressionAccuracy
    stats[2, 1] = stddevLogisticRegressionAccuracy
    return stats


# Do not modify from HERE...
if __name__ == "__main__":
    stats = evaluate_performance()
    print ("Decision Tree Accuracy = ", stats[0, 0], " (", stats[0, 1], ")")
    print ("Random Forest Tree Accuracy = ", stats[1, 0], " (", stats[1, 1], ")")
    print ("Logistic Reg. Accuracy = ", stats[2, 0], " (", stats[2, 1], ")")
# ...to HERE.


Decision Tree Accuracy =  0.669879518072  ( 0.0604694462233 )
Random Forest Tree Accuracy =  0.832530120482  ( 0.0372131209915 )
Logistic Reg. Accuracy =  0.785542168675  ( 0.0143064362494 )
