In [1]:
import numpy
import pandas
import random

In [2]:
def trainTestSplit(dataFrame, testSize):
    """Randomly partition dataFrame into a training part and a test part.

    testSize given as a float is treated as a fraction of the rows and is
    rounded to a row count; any other value is used as the row count as is.
    Test rows are drawn without replacement from the index labels with
    random.sample; the training part is whatever remains after dropping
    those labels.

    Returns a (dataFrameTrain, dataFrameTest) tuple.
    """
    testCount = round(testSize * len(dataFrame)) if isinstance(testSize, float) else testSize
    heldOutLabels = random.sample(population = dataFrame.index.tolist(), k = testCount)
    return dataFrame.drop(heldOutLabels), dataFrame.loc[heldOutLabels]

In [3]:
def checkPurity(data):
    """Tell whether every row of data carries the same class label.

    The label is read from the last column of the 2-D array; the result is
    True when that column holds exactly one distinct value, False otherwise.
    """
    distinctLabels = numpy.unique(data[:, -1])
    return len(distinctLabels) == 1

In [4]:
def classifyData(data):
    """Return the most frequent class label found in the last column of data."""
    labels, counts = numpy.unique(data[:, -1], return_counts = True)
    majorityPosition = numpy.argmax(counts)
    return labels[majorityPosition]

In [5]:
def getPotentialSplits(data, randomAttributes):
    """Collect candidate split thresholds for each feature column.

    Parameters:
        data: 2-D array whose last column is the class label; every other
            column is treated as a feature.
        randomAttributes: None to consider every feature column, or a
            sequence of column indices to restrict the search to. A sequence
            longer than the number of feature columns is ignored and all
            feature columns are used instead.

    Returns:
        dict mapping column index -> list of thresholds. For a column with
        several distinct values the thresholds are the midpoints between
        consecutive sorted distinct values. A column with a single distinct
        value gets that value as its only candidate, so the list is never
        empty for non-empty data (determineBestSplit draws from these lists
        with random.choice).
    """
    _, columns = data.shape
    columnsIndices = list(range(columns - 1))
    # Identity test: `!= None` would dispatch to the operand's own comparison.
    if randomAttributes is not None and len(randomAttributes) <= len(columnsIndices):
        columnsIndices = randomAttributes

    potentialSplits = {}
    for column in columnsIndices:
        uniqueValues = numpy.unique(data[:, column])
        if len(uniqueValues) == 1:
            # Stored as a list so every entry of the result has the same type.
            potentialSplits[column] = list(uniqueValues)
        else:
            potentialSplits[column] = [
                (currentValue + previousValue) / 2
                for previousValue, currentValue in zip(uniqueValues[:-1], uniqueValues[1:])
            ]
    return potentialSplits

In [6]:
def splitData(data, splitColumn, splitValue):
    """Partition the rows of data on one column.

    Returns (rows where column <= splitValue, rows where column > splitValue).
    """
    columnValues = data[:, splitColumn]
    belowMask = columnValues <= splitValue
    aboveMask = columnValues > splitValue
    return data[belowMask], data[aboveMask]

In [7]:
def calculateEntropy(data):
    """Return the Shannon entropy (base 2) of the class labels in the last column of data."""
    _, labelCounts = numpy.unique(data[:, -1], return_counts = True)
    probabilities = labelCounts / labelCounts.sum()
    surprisal = -numpy.log2(probabilities)
    return sum(probabilities * surprisal)

In [8]:
def calculateOverallEntropy(dataBelow, dataAbove):
    """Return the entropy of a split: each side's entropy weighted by its share of the rows."""
    belowCount = len(dataBelow)
    aboveCount = len(dataAbove)
    totalCount = belowCount + aboveCount
    weightedBelow = (belowCount / totalCount) * calculateEntropy(dataBelow)
    weightedAbove = (aboveCount / totalCount) * calculateEntropy(dataAbove)
    return weightedBelow + weightedAbove

In [9]:
def determineBestSplit(data, potentialSplits, randomSplits = None):
    """Pick the (column, threshold) pair with the lowest weighted entropy.

    Parameters:
        data: 2-D array passed to splitData for every candidate.
        potentialSplits: dict mapping column index -> sequence of thresholds.
        randomSplits: None to evaluate every candidate in potentialSplits;
            otherwise the number of candidates to draw at random (a random
            column, then a random threshold of that column, with replacement).

    Returns:
        (bestSplitColumn, bestSplitValue). When no candidate is evaluated the
        result is (0, 0). Ties go to the candidate evaluated last.
    """
    if randomSplits is None:
        candidates = (
            (splitColumn, splitValue)
            for splitColumn in potentialSplits
            for splitValue in potentialSplits[splitColumn]
        )
    else:
        # Built once; the set of columns does not change between draws.
        candidateColumns = list(potentialSplits)

        def drawRandomCandidates():
            for _ in range(randomSplits):
                splitColumn = random.choice(candidateColumns)
                yield splitColumn, random.choice(potentialSplits[splitColumn])

        candidates = drawRandomCandidates()

    # Infinity is a true upper bound, unlike an arbitrary large constant.
    lowestEntropy = float("inf")
    bestSplitColumn = 0
    bestSplitValue = 0
    for splitColumn, splitValue in candidates:
        dataBelow, dataAbove = splitData(data, splitColumn, splitValue)
        currentEntropy = calculateOverallEntropy(dataBelow, dataAbove)
        if currentEntropy <= lowestEntropy:
            lowestEntropy = currentEntropy
            bestSplitColumn = splitColumn
            bestSplitValue = splitValue
    return bestSplitColumn, bestSplitValue

In [10]:
def buildDecisionTree(dataFrame, currentDepth = 0, minSampleSize = 2, maxDepth = 1000, randomAttributes = None, randomSplits = None):
    """Recursively build a decision tree.

    Parameters:
        dataFrame: at the top-level call (currentDepth == 0) a pandas
            DataFrame whose last column is the class label; in recursive
            calls the already-extracted 2-D value array.
        currentDepth: depth of this node; callers normally leave it at 0.
        minSampleSize: nodes with fewer rows than this become leaves.
        maxDepth: nodes at this depth become leaves.
        randomAttributes: at the top-level call, None or the number of
            feature columns to sample for the whole tree. A number larger
            than the feature count disables the restriction. Recursive calls
            receive the sampled list of column indices (or None).
        randomSplits: forwarded to determineBestSplit.

    Returns:
        A class label for a leaf, or a dict {"<column> <= <value>": [yes, no]}
        whose two entries are subtrees built the same way.

    Side effect: the top-level call stores dataFrame.columns in the
    module-level name COLUMN_HEADERS, which is used to label questions.
    """
    global COLUMN_HEADERS
    if currentDepth == 0:
        COLUMN_HEADERS = dataFrame.columns
        data = dataFrame.values
        featureCount = len(COLUMN_HEADERS) - 1
        if randomAttributes is not None and randomAttributes <= featureCount:
            randomAttributes = random.sample(population = list(range(featureCount)), k = randomAttributes)
        else:
            randomAttributes = None
    else:
        data = dataFrame

    if checkPurity(data) or len(data) < minSampleSize or currentDepth == maxDepth:
        return classifyData(data)

    currentDepth += 1
    potentialSplits = getPotentialSplits(data, randomAttributes)
    splitColumn, splitValue = determineBestSplit(data, potentialSplits, randomSplits)
    dataBelow, dataAbove = splitData(data, splitColumn, splitValue)
    # A split that leaves one side empty separates nothing; stop here.
    if len(dataBelow) == 0 or len(dataAbove) == 0:
        return classifyData(data)

    question = str(COLUMN_HEADERS[splitColumn]) + " <= " + str(splitValue)
    yesAnswer = buildDecisionTree(dataBelow, currentDepth, minSampleSize, maxDepth, randomAttributes, randomSplits)
    noAnswer = buildDecisionTree(dataAbove, currentDepth, minSampleSize, maxDepth, randomAttributes, randomSplits)
    # Both branches giving the same answer makes the question redundant.
    if yesAnswer == noAnswer:
        return yesAnswer
    return {question: [yesAnswer, noAnswer]}

In [11]:
def classifySample(sample, decisionTree):
    """Classify one sample by walking a tree produced by buildDecisionTree.

    Parameters:
        sample: mapping-like object (for example a DataFrame row) indexed by
            column name.
        decisionTree: a leaf value, or a dict with one key of the form
            "<column> <= <value>" mapped to [yesSubtree, noSubtree].

    Returns:
        The leaf value reached by answering each question for the sample.
    """
    node = decisionTree
    # Iterative walk: tree depth is not limited by Python's recursion limit.
    while isinstance(node, dict):
        question = list(node)[0]
        # Split on the last " <= " so a column name containing that text still parses.
        attribute, value = question.rsplit(" <= ", 1)
        if sample[attribute] <= float(value):
            node = node[question][0]
        else:
            node = node[question][1]
    return node

In [12]:
def decisionTreePredictions(dataFrame, decisionTree):
    """Classify every row of dataFrame with decisionTree and return the row-wise results of DataFrame.apply."""
    return dataFrame.apply(classifySample, axis = 1, args = (decisionTree,))

In [13]:
def calculateAccuracy(predictedResults, category):
    """Return the fraction of predictions that equal the true labels (mean of the element-wise comparison)."""
    return (predictedResults == category).mean()

In [14]:
def trainTestSplit(dataFrame, testSize):
    """Randomly partition dataFrame into a training part and a test part.

    NOTE: this cell re-defines trainTestSplit, which an earlier cell (In [2])
    already defines with the same behavior; from here on this later
    definition is the one in effect.

    A float testSize is a fraction of the rows, rounded to a row count; any
    other value is used as the row count directly. Test rows are sampled
    without replacement from the index labels, and the training part is the
    frame with those labels dropped.

    Returns a (dataFrameTrain, dataFrameTest) tuple.
    """
    testCount = round(testSize * len(dataFrame)) if isinstance(testSize, float) else testSize
    heldOutLabels = random.sample(population = dataFrame.index.tolist(), k = testCount)
    return dataFrame.drop(heldOutLabels), dataFrame.loc[heldOutLabels]

In [15]:
def bootstrapSample(dataFrame, bootstrapSize):
    """Draw bootstrapSize rows from dataFrame with replacement.

    Row positions come from numpy.random.randint over [0, len(dataFrame)),
    so the same row may appear more than once in the result.
    """
    rowCount = len(dataFrame)
    drawnPositions = numpy.random.randint(low = 0, high = rowCount, size = bootstrapSize)
    return dataFrame.iloc[drawnPositions]

In [16]:
def createRandomForest(dataFrame, bootstrapSize, randomAttributes, randomSplits, forestSize = 20, treeMaxDepth = 1000):
    """Build forestSize decision trees, each trained on its own bootstrap sample.

    bootstrapSize is passed to bootstrapSample; randomAttributes,
    randomSplits and treeMaxDepth (as maxDepth) are passed to
    buildDecisionTree. Returns the trees as a list.
    """
    return [
        buildDecisionTree(
            bootstrapSample(dataFrame, bootstrapSize),
            maxDepth = treeMaxDepth,
            randomAttributes = randomAttributes,
            randomSplits = randomSplits,
        )
        for _ in range(forestSize)
    ]

In [17]:
def randomForestPredictions(dataFrame, randomForest):
    """Predict a label for every row of dataFrame by majority vote over the forest.

    Each tree's predictions form one column named "decision tree <i>"; the
    result is column 0 of the row-wise mode of that table.
    """
    perTreeVotes = {
        "decision tree " + str(treeNumber): decisionTreePredictions(dataFrame, tree)
        for treeNumber, tree in enumerate(randomForest)
    }
    return pandas.DataFrame(perTreeVotes).mode(axis = 1)[0]

In [18]:
def calculateAccuracy(predictedResults, category):
    """Return the fraction of predictions that equal the true labels.

    NOTE: this cell re-defines calculateAccuracy, which an earlier cell
    (In [13]) already defines with the same behavior.
    """
    return (predictedResults == category).mean()

## Importing Dataset

In [19]:
# Load the raw dataset; the CSV is looked up relative to the current working directory.
dataFrame = pandas.read_csv(filepath_or_buffer = "breast_cancer.csv")

In [20]:
# Remove the "id" column so it is not offered to the trees as a feature
# (presumably a row identifier - confirm against the dataset).
dataFrame = dataFrame.drop(columns = "id")

In [21]:
# Move the first column to the end: the tree functions above read the class
# label from the last column (presumably the first column is that label).
dataFrame = dataFrame[dataFrame.columns[1:].tolist() + dataFrame.columns[:1].tolist()]

In [22]:
# Seed both generators used by this notebook (random for the train/test split,
# attribute sampling and split sampling; numpy.random for bootstrapping) so a
# Restart & Run All reproduces the same split and the same forests.
random.seed(42)
numpy.random.seed(42)
# Hold out 25% of the rows as the test set.
dataFrameTrain, dataFrameTest = trainTestSplit(dataFrame, testSize = 0.25)

In [23]:
# Report the upper bounds for the two data-dependent parameters: the number of
# training rows (n) and the number of feature columns (d, all columns but the last).
print("Random Forest - Breast Cancer Dataset")
print("  Maximum bootstrap size (n) is {}".format(len(dataFrameTrain)))
print("  Maximum random subspace size (d) is {}".format(len(dataFrameTrain.columns) - 1))

Random Forest - Breast Cancer Dataset
  Maximum bootstrap size (n) is 427
  Maximum random subspace size (d) is 30


In [24]:
import time

In [25]:
# Experiment: vary the bootstrap size n (loop variable i) while d, s, k and
# maxDepth stay fixed. The fixed values are named once so the forest call and
# the printed header cannot drift apart.
fixedAttributes, fixedSplits, fixedForestSize, fixedMaxDepth = 10, 50, 30, 3
print("\n  Change n, keep other parameters")
for i in range(10, dataFrameTrain.shape[0] + 1, 50):
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() follows the adjustable wall clock.
    startTime = time.perf_counter()
    randomForest = createRandomForest(dataFrameTrain, bootstrapSize = i, randomAttributes = fixedAttributes, randomSplits = fixedSplits, forestSize = fixedForestSize, treeMaxDepth = fixedMaxDepth)
    buildingTime = time.perf_counter() - startTime
    randomForestTestResults = randomForestPredictions(dataFrameTest, randomForest)
    accuracyTest = calculateAccuracy(randomForestTestResults, dataFrameTest.iloc[:, -1]) * 100
    randomForestTrainResults = randomForestPredictions(dataFrameTrain, randomForest)
    accuracyTrain = calculateAccuracy(randomForestTrainResults, dataFrameTrain.iloc[:, -1]) * 100
    print("  n = {}, d = {}, s = {}, k = {}, maxDepth = {}:".format(i, fixedAttributes, fixedSplits, fixedForestSize, fixedMaxDepth))
    print("    accTest = {0:.2f}%, ".format(accuracyTest), end = "")
    print("accTrain = {0:.2f}%, ".format(accuracyTrain), end = "")
    print("buildTime = {0:.2f}s".format(buildingTime), end = "\n")


  Change n, keep other parameters
  n = 10, d = 10, s = 50, k = 30, maxDepth = 3:
    accTest = 92.96%, accTrain = 91.80%, buildTime = 0.20s
  n = 60, d = 10, s = 50, k = 30, maxDepth = 3:
    accTest = 94.37%, accTrain = 94.85%, buildTime = 0.53s
  n = 110, d = 10, s = 50, k = 30, maxDepth = 3:
    accTest = 95.77%, accTrain = 95.32%, buildTime = 0.69s
  n = 160, d = 10, s = 50, k = 30, maxDepth = 3:
    accTest = 95.77%, accTrain = 97.19%, buildTime = 0.83s
  n = 210, d = 10, s = 50, k = 30, maxDepth = 3:
    accTest = 93.66%, accTrain = 96.72%, buildTime = 0.96s
  n = 260, d = 10, s = 50, k = 30, maxDepth = 3:
    accTest = 96.48%, accTrain = 97.66%, buildTime = 1.11s
  n = 310, d = 10, s = 50, k = 30, maxDepth = 3:
    accTest = 97.18%, accTrain = 96.72%, buildTime = 1.22s
  n = 360, d = 10, s = 50, k = 30, maxDepth = 3:
    accTest = 95.77%, accTrain = 97.42%, buildTime = 1.32s
  n = 410, d = 10, s = 50, k = 30, maxDepth = 3:
    accTest = 94.37%, accTrain = 96.25%, buildTime = 1

In [26]:
print("\n  Change d, keep other parameters")
for i in range(10, dataFrameTrain.shape[1], 2):
    startTime = time.time()
    randomForest = createRandomForest(dataFrameTrain, bootstrapSize = 60, randomAttributes = i, randomSplits = 50, forestSize = 30, treeMaxDepth = 3)
    buildingTime = time.time() - startTime
    randomForestTestResults = randomForestPredictions(dataFrameTest, randomForest)
    accuracyTest = calculateAccuracy(randomForestTestResults, dataFrameTest.iloc[:, -1]) * 100
    randomForestTrainResults = randomForestPredictions(dataFrameTrain, randomForest)
    accuracyTrain = calculateAccuracy(randomForestTrainResults, dataFrameTrain.iloc[:, -1]) * 100
    print("  n = {}, d = {}, s = {}, k = {}, maxDepth = {}:".format(60, i, 50, 30, 3))
    print("    accTest = {0:.2f}%, ".format(accuracyTest), end = "")
    print("accTrain = {0:.2f}%, ".format(accuracyTrain), end = "")
    print("buildTime = {0:.2f}s".format(buildingTime), end = "\n")


  Change d, keep other parameters
  n = 60, d = 10, s = 50, k = 30, maxDepth = 3:
    accTest = 95.77%, accTrain = 95.78%, buildTime = 0.51s
  n = 60, d = 12, s = 50, k = 30, maxDepth = 3:
    accTest = 92.96%, accTrain = 94.15%, buildTime = 0.51s
  n = 60, d = 14, s = 50, k = 30, maxDepth = 3:
    accTest = 95.07%, accTrain = 95.78%, buildTime = 0.49s
  n = 60, d = 16, s = 50, k = 30, maxDepth = 3:
    accTest = 95.07%, accTrain = 95.55%, buildTime = 0.49s
  n = 60, d = 18, s = 50, k = 30, maxDepth = 3:
    accTest = 92.96%, accTrain = 96.02%, buildTime = 0.49s
  n = 60, d = 20, s = 50, k = 30, maxDepth = 3:
    accTest = 94.37%, accTrain = 94.85%, buildTime = 0.51s
  n = 60, d = 22, s = 50, k = 30, maxDepth = 3:
    accTest = 95.07%, accTrain = 96.25%, buildTime = 0.47s
  n = 60, d = 24, s = 50, k = 30, maxDepth = 3:
    accTest = 95.77%, accTrain = 95.55%, buildTime = 0.53s
  n = 60, d = 26, s = 50, k = 30, maxDepth = 3:
    accTest = 94.37%, accTrain = 96.02%, buildTime = 0.54s
  

In [27]:
print("\n  Change s, keep other parameters")
for i in range(10, 100 + 1, 10):
    startTime = time.time()
    randomForest = createRandomForest(dataFrameTrain, bootstrapSize = 60, randomAttributes = 10, randomSplits = i, forestSize = 30, treeMaxDepth = 3)
    buildingTime = time.time() - startTime
    randomForestTestResults = randomForestPredictions(dataFrameTest, randomForest)
    accuracyTest = calculateAccuracy(randomForestTestResults, dataFrameTest.iloc[:, -1]) * 100
    randomForestTrainResults = randomForestPredictions(dataFrameTrain, randomForest)
    accuracyTrain = calculateAccuracy(randomForestTrainResults, dataFrameTrain.iloc[:, -1]) * 100
    print("  n = {}, d = {}, s = {}, k = {}, maxDepth = {}:".format(60, 10, i, 30, 3))
    print("    accTest = {0:.2f}%, ".format(accuracyTest), end = "")
    print("accTrain = {0:.2f}%, ".format(accuracyTrain), end = "")
    print("buildTime = {0:.2f}s".format(buildingTime), end = "\n")


  Change s, keep other parameters
  n = 60, d = 10, s = 10, k = 30, maxDepth = 3:
    accTest = 92.25%, accTrain = 94.61%, buildTime = 0.17s
  n = 60, d = 10, s = 20, k = 30, maxDepth = 3:
    accTest = 95.07%, accTrain = 94.85%, buildTime = 0.25s
  n = 60, d = 10, s = 30, k = 30, maxDepth = 3:
    accTest = 95.07%, accTrain = 95.08%, buildTime = 0.32s
  n = 60, d = 10, s = 40, k = 30, maxDepth = 3:
    accTest = 94.37%, accTrain = 93.91%, buildTime = 0.42s
  n = 60, d = 10, s = 50, k = 30, maxDepth = 3:
    accTest = 93.66%, accTrain = 95.32%, buildTime = 0.53s
  n = 60, d = 10, s = 60, k = 30, maxDepth = 3:
    accTest = 95.77%, accTrain = 96.72%, buildTime = 0.69s
  n = 60, d = 10, s = 70, k = 30, maxDepth = 3:
    accTest = 96.48%, accTrain = 95.55%, buildTime = 0.73s
  n = 60, d = 10, s = 80, k = 30, maxDepth = 3:
    accTest = 97.18%, accTrain = 96.72%, buildTime = 0.76s
  n = 60, d = 10, s = 90, k = 30, maxDepth = 3:
    accTest = 95.77%, accTrain = 96.25%, buildTime = 0.79s
  

In [28]:
print("\n  Change k, keep other parameters")
for i in range(10, 100 + 1, 10):
    startTime = time.time()
    randomForest = createRandomForest(dataFrameTrain, bootstrapSize = 60, randomAttributes = 10, randomSplits = 50, forestSize = i, treeMaxDepth = 3)
    buildingTime = time.time() - startTime
    randomForestTestResults = randomForestPredictions(dataFrameTest, randomForest)
    accuracyTest = calculateAccuracy(randomForestTestResults, dataFrameTest.iloc[:, -1]) * 100
    randomForestTrainResults = randomForestPredictions(dataFrameTrain, randomForest)
    accuracyTrain = calculateAccuracy(randomForestTrainResults, dataFrameTrain.iloc[:, -1]) * 100
    print("  n = {}, d = {}, s = {}, k = {}, maxDepth = {}:".format(60, 10, 50, i, 3))
    print("    accTest = {0:.2f}%, ".format(accuracyTest), end = "")
    print("accTrain = {0:.2f}%, ".format(accuracyTrain), end = "")
    print("buildTime = {0:.2f}s".format(buildingTime), end = "\n")


  Change k, keep other parameters
  n = 60, d = 10, s = 50, k = 10, maxDepth = 3:
    accTest = 94.37%, accTrain = 93.91%, buildTime = 0.19s
  n = 60, d = 10, s = 50, k = 20, maxDepth = 3:
    accTest = 97.18%, accTrain = 96.49%, buildTime = 0.40s
  n = 60, d = 10, s = 50, k = 30, maxDepth = 3:
    accTest = 95.77%, accTrain = 95.08%, buildTime = 0.52s
  n = 60, d = 10, s = 50, k = 40, maxDepth = 3:
    accTest = 96.48%, accTrain = 96.96%, buildTime = 0.67s
  n = 60, d = 10, s = 50, k = 50, maxDepth = 3:
    accTest = 96.48%, accTrain = 96.02%, buildTime = 0.78s
  n = 60, d = 10, s = 50, k = 60, maxDepth = 3:
    accTest = 95.77%, accTrain = 94.85%, buildTime = 0.97s
  n = 60, d = 10, s = 50, k = 70, maxDepth = 3:
    accTest = 95.77%, accTrain = 96.25%, buildTime = 1.17s
  n = 60, d = 10, s = 50, k = 80, maxDepth = 3:
    accTest = 95.07%, accTrain = 96.02%, buildTime = 1.38s
  n = 60, d = 10, s = 50, k = 90, maxDepth = 3:
    accTest = 95.07%, accTrain = 95.55%, buildTime = 1.53s
  