# Random Forest and Logistic Regression
We use a cancer prediction dataset. The features are patients' medical data. The labels are in the first column and indicate whether a patient has cancer or not (0, 1).

After clicking "Run All", the first 5 rows of the data are shown below the second cell. Below the last cell you will see the accuracies of the Random Forest and Logistic Regression prediction methods.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
from logistic_regression import gradient_descent
from random_forest import RandomForest

In [2]:
def accuracy_score(Y_true, Y_predict):
    """Return the percentage of predictions that agree with the true labels.

    Entries are compared position by position for the first
    ``len(Y_predict)`` positions; the result lies between 0 and 100.
    An empty ``Y_predict`` raises ``ZeroDivisionError``.
    """
    total = len(Y_predict)
    # Index into Y_true (rather than zipping) so a too-short Y_true still
    # raises IndexError instead of being silently truncated.
    mismatches = sum(
        1 for idx, predicted in enumerate(Y_predict)
        if Y_true[idx] != predicted
    )
    error_pct = mismatches * 100 / total
    return 100 - error_pct


# Load Data
# The file is comma-separated; column 0 holds the label and the remaining
# columns hold the features.
filename = 'SPECTF.dat'
data = np.loadtxt(filename, delimiter=',')

# Feature matrix and its dimensions (n rows, d feature columns).
X = data[:, 1:]
n, d = X.shape
# Labels as an (n, 1) column vector, copied so it does not alias `data`.
y = data[:, 0].copy().reshape(-1, 1)

# Show the header line followed by the first five rows.
print("Data", data[:5], sep="\n")

# Per-trial accuracy accumulators, filled in by the evaluation loop.
all_accuracies, all_log_accuracies, all_rand_accuracies = [], [], []

for trial in range(7):
    # ---- Train/test split: shuffle the row indices, then cut at 90% ----
    select = list(range(n))
    random.shuffle(select)
    split_at = int(9 * n / 10)
    select_train = np.array(select[:split_at])
    select_test = np.array(select[split_at:n])

    data_train = data[select_train, :]
    data_test = data[select_test, :]

    # Column 0 is the label; everything after it is a feature.
    data_train_Y = data_train[:, 0]
    data_train_X = data_train[:, 1:]

    data_test_Y = data_test[:, 0]
    data_test_X = data_test[:, 1:]

    # ---- Logistic Regression ----
    beta = gradient_descent(data_train_X, data_train_Y)
    # A constant 1 is prepended to every test row before the dot product with
    # beta (presumably the intercept term; confirm against gradient_descent).
    # A positive score is classified as 1, anything else as 0.
    Y_log_pred = [
        1 if np.hstack((1, row)).T.dot(beta) > 0 else 0
        for row in data_test_X
    ]
    log_accuracy = accuracy_score(data_test_Y, Y_log_pred)
    all_log_accuracies.append(log_accuracy)

    # ---- Random Forest ----
    # NOTE(review): the meaning of the two constructor arguments is defined in
    # random_forest.py, and fit/predict receive full rows including the label
    # column; confirm that predict() does not read the test labels.
    trees = RandomForest(10, 15)
    trees.fit(data_train)
    predict = trees.predict(data_test)
    # The first entry of each returned row is used as the predicted label.
    y_rand_pred = [row[0] for row in predict]

    rand_accuracy = accuracy_score(data_test_Y, y_rand_pred)
    all_rand_accuracies.append(rand_accuracy)

Data
[[  1.  59.  52.  70.  67.  73.  66.  72.  61.  58.  52.  72.  71.  70.
   77.  66.  65.  67.  55.  61.  57.  68.  66.  72.  74.  63.  64.  56.
   54.  67.  54.  76.  74.  65.  67.  66.  56.  62.  56.  72.  62.  74.
   74.  64.  67.]
 [  1.  72.  62.  69.  67.  78.  82.  74.  65.  69.  63.  70.  70.  72.
   74.  70.  71.  72.  75.  66.  65.  73.  78.  74.  79.  74.  69.  69.
   70.  71.  69.  72.  70.  62.  65.  65.  71.  63.  60.  69.  73.  67.
   71.  56.  58.]
 [  1.  71.  62.  70.  64.  67.  64.  79.  65.  70.  69.  72.  71.  68.
   65.  61.  61.  73.  71.  75.  74.  80.  74.  54.  47.  53.  37.  77.
   68.  72.  59.  72.  68.  60.  60.  73.  70.  66.  65.  64.  55.  61.
   41.  51.  46.]
 [  1.  69.  71.  70.  78.  61.  63.  67.  65.  59.  59.  66.  69.  71.
   75.  65.  58.  60.  55.  62.  59.  67.  66.  74.  74.  64.  60.  57.
   54.  70.  73.  69.  76.  62.  64.  61.  61.  66.  65.  72.  73.  68.
   68.  59.  63.]
 [  1.  70.  66.  61.  66.  61.  58.  69.  69.  72.  68.  6

In [4]:
# Summarise the per-trial accuracies of each classifier by their mean and
# standard deviation (np.std's default, i.e. ddof=0).
meanRandomForestAccuracy = np.mean(all_rand_accuracies)
stddevRandomForestAccuracy = np.std(all_rand_accuracies)
meanLogisticRegressionAccuracy = np.mean(all_log_accuracies)
stddevLogisticRegressionAccuracy = np.std(all_log_accuracies)

# make certain that the return value matches the API specification:
# a 3x2 table of [mean, stddev]; row 0 stays zero, row 1 is Random Forest,
# row 2 is Logistic Regression.
stats = np.zeros((3, 2))
stats[1, :] = [meanRandomForestAccuracy, stddevRandomForestAccuracy]
stats[2, :] = [meanLogisticRegressionAccuracy, stddevLogisticRegressionAccuracy]

for label, row in (("Random Forest Tree Accuracy = ", 1),
                   ("Logistic Reg. Accuracy = ", 2)):
    print(label, stats[row, 0], " (", stats[row, 1], ")")

Random Forest Tree Accuracy =  78.835978836  ( 3.26159471057 )
Logistic Reg. Accuracy =  79.8941798942  ( 11.6882127166 )
