In [1]:
#importing the required headers
import numpy as np
import pandas as pd
import random

In [2]:
#traintestSplit for dividing the data into train and test and returning those 2 data sets
def trainTestSplit(dataFrame, testSize):
    if isinstance(testSize, float):
        testSize = round(testSize * len(dataFrame))
    indices = dataFrame.index.tolist()
    testIndices = random.sample(population = indices, k = testSize)
    dataFrameTest = dataFrame.loc[testIndices]
    dataFrameTrain = dataFrame.drop(testIndices)
    return dataFrameTrain, dataFrameTest

In [3]:
#function to check whether all values of the respected attribute is same or not
def checkPurity(data):
    if len(np.unique(data.loc[:,data.columns[-1]])) == 1:
        return True
    else:
        return False
#functio used to classify data into uniqueClasses
def classifyData(data):
    uniqueClasses, uniqueClassesCounts = np.unique(data.loc[:,data.columns[-1]], return_counts = True)
    return uniqueClasses[uniqueClassesCounts.argmax()]

In [4]:
#function to obtain perfect splits based on the importance of attributes based on the previous attribute values
def getPotentialSplits(data, randomAttributes):
    potentialSplits = {}
    _, columns = data.shape
    columnsIndices = list(range(columns - 1))
    if randomAttributes != None  and len(randomAttributes) <= len(columnsIndices):
        columnsIndices = randomAttributes
    for column in columnsIndices:
        values = data.loc[:,data.columns[column]]
        uniqueValues = np.unique(values)
        if len(uniqueValues) == 1:
            potentialSplits[column] = uniqueValues
        else:
            potentialSplits[column] = []
            for i in range(len(uniqueValues)):
                if i != 0:
                    currentValue = uniqueValues[i]
                    previousValue = uniqueValues[i - 1]
                    potentialSplits[column].append((currentValue + previousValue) / 2)
    return potentialSplits
#function used to split data 
def splitData(data, splitColumn, splitValue):
    splitColumnValues = data.loc[:,data.columns[splitColumn]]
    return data[splitColumnValues <= splitValue], data[splitColumnValues > splitValue]



In [5]:
#function used to calculate entropy value for each attribute present in the data
def calculateEntropy(data):
    _, uniqueClassesCounts = np.unique(data.loc[:,data.columns[-1]], return_counts = True)
    probabilities = uniqueClassesCounts / uniqueClassesCounts.sum()
    return sum(probabilities * -np.log2(probabilities))
#function used to calculate overall entropy value for the whole data
def calculateOverallEntropy(dataBelow, dataAbove):
    pDataBelow = len(dataBelow) / (len(dataBelow) + len(dataAbove))
    pDataAbove = len(dataAbove) / (len(dataBelow) + len(dataAbove))
    return pDataBelow * calculateEntropy(dataBelow) + pDataAbove * calculateEntropy(dataAbove)

In [6]:
#function used to return the bestsplit column and value based upon the information gain and overall entropy
def Best_split(data, potentialSplits, rand_S = None):
    OE = 9999999999999
    best_Split_Column = 0
    best_Split_Value = 0
    if rand_S != None:
        for i in range(randomSplits):
            randomSplitColumn = random.choice(list(potentialSplits))
            randomSplitValue = random.choice(potentialSplits[randomSplitColumn])
            dataBelow, dataAbove = splitData(data, randomSplitColumn, randomSplitValue)
            currentOverallEntropy = calculateOverallEntropy(dataBelow, dataAbove)
            if currentOverallEntropy <= OE:
                OE = currentOverallEntropy
                best_Split_Column = randomSplitColumn
                best_Split_Value = randomSplitValue
        
    else:
        for splitColumn in potentialSplits:
            for splitValue in potentialSplits[splitColumn]:
                dataBelow, dataAbove = splitData(data, splitColumn, splitValue)
                currentOverallEntropy = calculateOverallEntropy(dataBelow, dataAbove)
                if currentOverallEntropy <= OE:
                    OE = currentOverallEntropy
                    best_Split_Column = splitColumn
                    best_Split_Value = splitValue
        
    return best_Split_Column, best_Split_Value



In [7]:
#function used to buildDecisionTree 
def buildDecisionTree(dataFrame, currentDepth = 0, minSampleSize = 2, maxDepth = 1000, randomAttributes = None, randomSplits = None):
    global COLUMN_HEADERS
    COLUMN_HEADERS = dataFrame.columns
    if currentDepth == 0:
        data = dataFrame.values
        if randomAttributes != None and randomAttributes <= len(COLUMN_HEADERS) - 1:
            randomAttributes = random.sample(population = list(range(len(COLUMN_HEADERS) - 1)), k = randomAttributes)
        else:
            randomAttributes = None
    else:
        data = dataFrame
    if checkPurity(data) or len(data) < minSampleSize or currentDepth == maxDepth:
        return classifyData(data)
    else:
        currentDepth += 1
        potentialSplits = getPotentialSplits(data, randomAttributes)
        splitColumn, splitValue = Best_split(data, potentialSplits, randomSplits)
        dataBelow, dataAbove = splitData(data, splitColumn, splitValue)
        if len(dataBelow) == 0 or len(dataAbove) == 0:
            return classifyData(data)
        else:
            question = str(COLUMN_HEADERS[splitColumn]) + " <= " + str(splitValue)
            decisionSubTree = {question: []}
            yesAnswer = buildDecisionTree(dataBelow, currentDepth, minSampleSize, maxDepth, randomAttributes, randomSplits)
            noAnswer = buildDecisionTree(dataAbove, currentDepth, minSampleSize, maxDepth, randomAttributes, randomSplits)
            if yesAnswer == noAnswer:
                decisionSubTree = yesAnswer
            else:
                decisionSubTree[question].append(yesAnswer)
                decisionSubTree[question].append(noAnswer)
            return decisionSubTree



In [8]:
def classifySample(sample, decisionTree):
    if not isinstance(decisionTree, dict):
        return decisionTree
    question = list(decisionTree.keys())[0]
    attribute, value = question.split(" <= ")
    if sample[attribute] <= float(value):
        answer = decisionTree[question][0]
    else:
        answer = decisionTree[question][1]
    return classifySample(sample, answer)



In [9]:
#function used to find predictions
def decisionTreePredictions(dataFrame, decisionTree):
    predictions = dataFrame.apply(classifySample, axis = 1, args = (decisionTree,))
    return predictions
#function used to calculate Accuracy
def calculateAccuracy(predictedResults, category):
    resultCorrect = predictedResults == category
    return resultCorrect.mean()

In [10]:
#function used for bootstrapping of data
def bootstrapping(train_df, n_bootstrap):
    bootstrap_indices = np.random.randint(low=0, high=len(train_df), size=n_bootstrap)
    df_bootstrapped = train_df.iloc[bootstrap_indices]
    
    return df_bootstrapped
#Random_forest Implementation
def RF_Implementation(train_df, n_trees, n_bootstrap, n_features, maxDepth):
    forest = []
    for i in range(n_trees):
        df_bootstrapped = bootstrapping(train_df, n_bootstrap)
        tree = buildDecisionTree(df_bootstrapped, maxDepth)
        forest.append(tree)
    
    return forest
#used for predicting the final value
def RF_predict(test_df, forest):
    df_predictions = {}
    for i in range(len(forest)):
        column_name = "tree_{}".format(i)
        predictions = decisionTreePredictions(test_df,forest[i])
        df_predictions[column_name] = predictions

    df_predictions = pd.DataFrame(df_predictions)
    RF_predict = df_predictions.mode(axis=1)[0]
    
    return RF_predict

In [11]:
#extracting the data from the data.csv taking testSize=0.3(30% for test)
file = open("data.csv", "r")
dataFrame = pd.read_csv(file, sep = " ")
dataFrameTrain, dataFrameTest = trainTestSplit(dataFrame, testSize = 0.3)
RF_forest = RF_Implementation(dataFrameTrain, n_trees=4, n_bootstrap=800, n_features=2, maxDepth=4)

In [12]:
#finding the predictions and calculating accuracy
predictions = RF_predict(dataFrameTest, RF_forest)
accuracy = calculateAccuracy(predictions, dataFrameTest.loc[:,dataFrameTest.columns[-1]])
print("Accuracy = {}".format(accuracy))

Accuracy = 0.9246376811594202
