In [1]:
import math
import random
import pandas as pd
import numpy as np
import sklearn.cross_validation

In [2]:
data = pd.read_csv('movie_data.csv', index_col=0)
y = np.array(data.ix[:, 'box_office'].values)
X = np.array(data.drop('box_office', 1))
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
         X, y, test_size=0.1, random_state=42)

In [7]:
class Tree(object):
    def __init__(self, parents=None):
        self.children = []
        self.split_feature = None
        self.split_feature_value = None
        self.parents = parents
        self.label = None

def std_reduction(y_train):
    return -np.std(y_train)

def split_data(x_train, y_train, feature_index):
    attribute_values = x_train[:,feature_index]
    for attribute in set(attribute_values):
        data_subset = []
        for index, point in enumerate(x_train):
            if point[feature_index] == attribute:
                data_subset.append([point, y_train[index]])
        yield data_subset

def gain(x_train, y_train, feature_index):
    reduction = std_reduction(y_train)
    for data_subset in split_data(x_train, y_train, feature_index):
        reduction -= std_reduction([label
                    for (point, label) in data_subset])
    return reduction

def homogeneous(y_train):
    return len(set(y_train)) <= 1

def majority_vote(y_train, node):
    labels = y_train
    choice = max(set(labels), key=list(labels).count)
    node.label = choice
    return node

def build_decision_tree(x_train, y_train, root, remaining_features):
    remaining_features = np.array(list(remaining_features))
    if homogeneous(y_train):
        root.label = y_train[0]
        return root
    
    if remaining_features.shape == 0:
        return majority_vote(y_train, root)
    
    indices = np.random.choice(int(remaining_features.shape[0]), int(2*remaining_features.shape[0]/3), replace = False)

    best_feature = max(remaining_features[indices], key=lambda index: 
                       gain(x_train, y_train, index))
    remaining_features = set(remaining_features)
    if gain(x_train, y_train, best_feature) == 0:
        return majority_vote(y_train, root)
    
    root.split_feature = best_feature
    
    for data_subset in split_data(x_train, y_train, best_feature):
        child = Tree(parents = root)
        child.split_feature_value = data_subset[0][0][best_feature]
        root.children.append(child)
        
        new_x = np.array([point for (point, label) in data_subset])
        new_y = np.array([label for (point, label) in data_subset])
        
        build_decision_tree(new_x, new_y, child, remaining_features - set([best_feature]))
    
    return root

def decision_tree(x_train, y_train):
    return build_decision_tree(x_train, y_train, Tree(), 
                               set(range(len(x_train[0]))))

def find_nearest(array, value):
    nearest = (np.abs(array-value)).argmin()
    return array[nearest]

def evaluate(tree, point):
    if tree.children == []:
#         print "label = %r" %(tree.label)
        return tree.label
    else:
        try:
            matching_children = [child for child in tree.children
                if child.split_feature_value == point[tree.split_feature]]
            return evaluate(matching_children[0], point)
        except:
            array = [child.split_feature_value for child in tree.children]
            point[tree.split_feature] = find_nearest(array, point[tree.split_feature])
            matching_children = [child for child in tree.children
                if child.split_feature_value == point[tree.split_feature]]
            return evaluate(matching_children[0], point)
        
def predict(x_test, tree):
    predicted_labels = [evaluate(tree, point) for point in x_test]
    return predicted_labels

def random_forest(x_train, y_train, x_test, n_estimators = 100):
    x_train_copy = x_train
    y_train_copy = y_train
    x_test_copy = x_test
    labels = []
    sample = []
    predictions = []
    for i in range(n_estimators):
        sample.append(np.random.choice(len(x_train), len(x_train),
                                       replace=True))
    for i in range(n_estimators):
        x_train = x_train_copy.copy()
        y_train = y_train_copy.copy()
        x_test_copy = x_test_copy.copy()
        
        x = x_train[sample[i]]
        y = y_train[sample[i]]
        tree = decision_tree(x, y)
        labels.append(predict(x_test_copy, tree))
    
    for index in range(len(labels[0])):
        predictions.append(np.median([item[index] for item in labels]))
        
    return predictions, labels

In [8]:
predictions, labels = random_forest(x_train, y_train, x_test, n_estimators=100)

In [12]:
# modeling
from sklearn import linear_model
from sklearn import metrics
from sklearn import ensemble

model = linear_model.LinearRegression()
y_hat = model.fit(x_train, y_train).predict(x_test)


forest = ensemble.RandomForestRegressor(n_estimators = 100)
forest_predictions = forest.fit(x_train, y_train).predict(x_test)

In [13]:
# Scikit Learn RF
print "Sci-kit Learn RF:"
print metrics.mean_absolute_error(y_test, forest_predictions)

# Scikit Learn LR
print "Sci-kit Learn Linear Regression:"
print(metrics.mean_absolute_error(y_test, y_hat))

# Scratch RF
print "Scratch RF:"
print(metrics.mean_absolute_error(y_test, predictions))

Sci-kit Learn RF:
2118848.58287
Sci-kit Learn Linear Regression:
4096437.88812
Scratch RF:
5228027.07391
