In [10]:
import sys
from math import sqrt,log,exp
from random import choice, randrange, seed
from copy import deepcopy
import numpy as np
import json
import csv
import time

In [11]:
#Random Forest functions
def import_data_rf(filename):
    file = open(filename)
    data = []
    for line in file:
        eachline = line.rstrip().split(' ')
        data.append(eachline)
    file.close()
    return data

def split_function(attribute, value, data):
    left = []
    right = []
    for r in data:
        if r[attribute] < value:
            left.append(r)
        else:
            right.append(r)
    return left, right

def gini_value(groups, class_labels):
    num_groups = float(sum([len(g) for g in groups]))
    gini_value = 0.0
    for g in groups:
        n = float(len(g))
        if n == 0:
            continue
        score = 0.0
        for c in class_labels:
            v = [r[1] for r in g].count(c) / n
            score = score + v*v
        gini_value = gini_value + (1.0 - score) * (n / num_groups)
    return gini_value

def best_splits(data, num_features):
    class_labels = list(set(r[1] for r in data))
    attr_index, attr_value, attr_score, attr_groups = 1000, 1000, 1000, None
    features = list()
    while len(features) < num_features:
        indexes = randrange(len(data[0])-2)
        if indexes not in features:
            features.append(indexes)
    for i in features:
        for r in data:
            groups = split_function(i, r[i], data)
            gini = gini_value(groups, class_labels)
            if gini < attr_score:
                attr_index, attr_value, attr_score, attr_groups = i, r[i], gini, groups
    return {'attr_index':attr_index, 'attr_value':attr_value, 'attr_groups':attr_groups}

def leaf_nodes(group):
    class_labels = [r[1] for r in group]
    return max(set(class_labels), key=class_labels.count)

def split_checks(node, max_depth, size, num_features, depth):
    left, right = node['attr_groups']
    del(node['attr_groups'])

    if not left or not right:
        node['left'] = node['right'] = leaf_nodes(left + right)
        return

    if depth >= max_depth:
        node['left'], node['right'] = leaf_nodes(left), leaf_nodes(right)
        return

    if len(left) <= size:
        node['left'] = leaf_nodes(left)
    else:
        node['left'] = best_splits(left, num_features)
        split_checks(node['left'], max_depth, size, num_features, depth+1)

    if len(right) <= size:
        node['right'] = leaf_nodes(right)
    else:
        node['right'] = best_splits(right, num_features)
        split_checks(node['right'], max_depth, size, num_features, depth+1)
        
def decision_tree(train, max_depth, size, num_features):
    tree_root = best_splits(train, num_features)
    split_checks(tree_root, max_depth, size, num_features, 1)
    return tree_root

def dt_predictions(node, r):
    if r[node['attr_index']] < node['attr_value']:
        if isinstance(node['left'], dict):
            return dt_predictions(node['left'], r)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return dt_predictions(node['right'], r)
        else:
            return node['right']
        
def random_forest_predictions(trees, r):
    predictions = [dt_predictions(t, r) for t in trees]
    return max(set(predictions), key=predictions.count)

def sampling_function(data, ratio):
    sample = list()
    num_sample = round(len(data) * ratio)
    while len(sample) < num_sample:
        ind = randrange(len(data))
        sample.append(data[ind])
    return sample

def random_forest_train(train, ratio, max_depth, size, num_trees, num_features):
    trees = list()
    for i in range(num_trees):
        train_sample = sampling_function(train, ratio)
        tree = decision_tree(train_sample, max_depth, size, num_features)
        trees.append(tree)
    return trees

def random_forest_test(test, trees):
    predictions = [random_forest_predictions(trees, r) for r in test]
    return predictions

In [12]:
#Calculating Accuracy

def accuracy_rf(predictions,actual):
    score=0
    for i,j in zip(predictions,actual):
        if i == j[1]:
            score = score + 1
    accuracy=(score/len(actual))*100
    return accuracy

In [18]:
#Main Function

train = import_data_rf('train-data.txt')
model_file = 'randomforest_model_10.txt'
print ("Start training:")
start = time.time()
max_depth = 5
size = 1
num_features = int(sqrt(len(train[0])-2))
num_trees = 10
ratio = 0.20
trees = random_forest_train(train, ratio, max_depth, size, num_trees, num_features)
print ("Time taken:", time.time() - start)
with open(model_file,'w') as f:
    json.dump(trees,f)
f.close()

test = import_data_rf('test-data.txt')
with open(model_file, 'r') as model:
    trees_file = json.load(model)
print ("Start testing:")
start = time.time()
results = random_forest_test(test, trees_file)
print ("Time taken:", time.time() - start)
accuracy_score=accuracy_rf(results,test)
print("Random Forest Accuracy:",accuracy_score)

Start training:
Time taken: 19519.550724506378
Start testing:
Time taken: 0.05585169792175293
Random Forest Accuracy: 66.48992576882291
