In [1]:
import pandas as pd
import numpy as np
from src.RandomForest import RandomForest
from sklearn.model_selection import train_test_split
from src.DecisionTree import DecisionTree
from sklearn.model_selection import KFold

df = pd.read_csv('data/playgolf.csv')
y = df.pop('Result').values
X = df.values
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [2]:
rf = RandomForest(num_trees=10, num_features=2)
rf.fit(X_train, y_train)
y_predict = rf.predict(X_test)
print "RF score on golf data set:", rf.score(X_test, y_test)

dt = DecisionTree()
dt.fit(X_train, y_train)
predicted_y = dt.predict(X_test)
print "DT score on golf data set:", dt.score(X_test,y_test)

RF score on golf data set: 0.25
DT score on golf data set: 0.25


In [3]:
df2 = pd.read_csv("data/congressional_voting.csv",header=None)
y = df2.iloc[:,0].values
X = df2.iloc[:,1:].values
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [4]:
print(X_train.shape,y_train.shape)

((326, 16), (326,))


In [5]:
rf = RandomForest(num_trees=10, num_features=2)
rf.fit(X_train, y_train)
y_predict = rf.predict(X_test)
print "RF score on golf data set:", rf.score(X_test, y_test)

dt = DecisionTree()
dt.fit(X_train, y_train)
predicted_y = dt.predict(X_test)
print "DT score on golf data set:", dt.score(X_test,y_test)

RF score on golf data set: 0.97247706422
DT score on golf data set: 0.889908256881


In [6]:
from sklearn.model_selection import KFold
def crossVal(model,X_train,y_train,n_fold = 8):
    Kf = KFold(n_splits = n_fold)
    score_list = []
    R2_list =[]
    for train_index, val_index in Kf.split(X_train):
        Xc_train,yc_train = X_train[train_index], y_train[train_index]
        Xc_test, yc_test = X_train[val_index],y_train[val_index]
        #print(Xc_train.shape,yc_train.shape,Xc_test.shape,yc_test.shape)
        model.fit(Xc_train, yc_train)
        score_list.append(model.score(Xc_test,yc_test))
    return score_list,np.mean(score_list)

In [7]:
rf = RandomForest(num_trees=5, num_features=3)
dt = DecisionTree()
print("Random Forest Performance ",crossVal(rf,X_train,y_train,n_fold = 5))
print("Decision Tree Performance ",crossVal(dt,X_train,y_train,n_fold = 5))

('Random Forest Performance ', ([0.9393939393939394, 0.9692307692307692, 0.9230769230769231, 0.9230769230769231, 0.9692307692307692], 0.94480186480186501))
('Decision Tree Performance ', ([0.9545454545454546, 0.9692307692307692, 0.9230769230769231, 0.9230769230769231, 0.9384615384615385], 0.94167832167832177))


In [8]:
rf = RandomForest(num_trees=10, num_features=4)
dt = DecisionTree()
print("Random Forest Performance ",crossVal(rf,X_train,y_train,n_fold = 5))
print("Decision Tree Performance ",crossVal(dt,X_train,y_train,n_fold = 5))

('Random Forest Performance ', ([0.9393939393939394, 0.9384615384615385, 0.8769230769230769, 0.9538461538461539, 0.9538461538461539], 0.93249417249417255))
('Decision Tree Performance ', ([0.9545454545454546, 0.9692307692307692, 0.9230769230769231, 0.9230769230769231, 0.9384615384615385], 0.94167832167832177))


In [9]:
rf = RandomForest(num_trees=50, num_features=4)
dt = DecisionTree()
print("Random Forest Performance ",crossVal(rf,X_train,y_train,n_fold = 5))
print("Decision Tree Performance ",crossVal(dt,X_train,y_train,n_fold = 5))

('Random Forest Performance ', ([0.9545454545454546, 0.9692307692307692, 0.9384615384615385, 0.9230769230769231, 0.9692307692307692], 0.95090909090909093))
('Decision Tree Performance ', ([0.9545454545454546, 0.9692307692307692, 0.9230769230769231, 0.9230769230769231, 0.9384615384615385], 0.94167832167832177))


In [10]:
rf = RandomForest(num_trees=100, num_features=4)
dt = DecisionTree()
print("Random Forest Performance ",crossVal(rf,X_train,y_train,n_fold = 5))
print("Decision Tree Performance ",crossVal(dt,X_train,y_train,n_fold = 5))

('Random Forest Performance ', ([0.9545454545454546, 0.9692307692307692, 0.9230769230769231, 0.9230769230769231, 0.9692307692307692], 0.94783216783216795))
('Decision Tree Performance ', ([0.9545454545454546, 0.9692307692307692, 0.9230769230769231, 0.9230769230769231, 0.9384615384615385], 0.94167832167832177))


In [11]:
n = len(rf.forest)
forests = rf.forest
forests_scores = []
print(forests[0].score(X_test,y_test))
for i in xrange(n):
    forests_scores.append(forests[i].score(X_test,y_test))
rf.score(X_test,y_test)

0.917431192661


0.963302752293578

## Compare to Sklearn RandomForestClassifier

In [13]:
from sklearn import preprocessing
from collections import defaultdict

df2 = pd.read_csv("data/congressional_voting.csv",header=None)
d = defaultdict(preprocessing.LabelEncoder)
# Encoding the variable
fit = df2.apply(lambda x: d[x.name].fit_transform(x))

y = fit.iloc[:,0].values
X = fit.iloc[:,1:].values
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [17]:
## Compare to Sklearn RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
rf_sk  = RandomForestClassifier(n_estimators=100,max_features=4)
rf = RandomForest(num_trees=100, num_features=4)
rf_sk.fit(X_train,y_train)
rf.fit(X_train,y_train)
print("SK RF score is ", rf_sk.score(X_test,y_test))
print("GV RF score is ", rf.score(X_test,y_test))

('SK RF score is ', 0.96330275229357798)
('GV RF score is ', 0.963302752293578)
