In [15]:
import numpy as np
import import_ipynb
import DecisionTree
from random import random
from sklearn import datasets
from sklearn.model_selection import train_test_split


In [32]:
class RandomForest:
    """Bagging ensemble of ``DecisionTree.DecisionTree`` models with majority voting.

    Every tree is trained on its own bootstrap resample of the training data;
    ``predict`` returns, for each sample, the label most trees voted for.

    Parameters
    ----------
    max_depth : int, default 10
        Forwarded to every tree as ``max_depth``.
    min_samples : int, default 2
        Forwarded to every tree as ``min_sample``.
    n_trees : int, default 10
        Number of trees built by ``fit``.
    n_features : int or None, default None
        Forwarded to every tree as ``n_feature``.
    """

    def __init__(self, max_depth = 10 , min_samples = 2 ,n_trees=10, n_features = None):
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.n_features = n_features
        self.n_trees = n_trees
        self.trees = []

    def fit(self, X, y):
        """Train ``n_trees`` fresh trees, each on a bootstrap resample of ``(X, y)``.

        Any trees from a previous ``fit`` call are discarded, so refitting
        replaces the ensemble instead of growing it.

        Returns
        -------
        RandomForest
            ``self``, to allow call chaining.
        """
        # Build into a local list and swap it in at the end: a failure halfway
        # through never leaves a mix of old and new trees on the instance.
        fitted_trees = []
        for _ in range(self.n_trees):
            tree = DecisionTree.DecisionTree(max_depth=self.max_depth,
                                             min_sample=self.min_samples,
                                             n_feature=self.n_features)
            X_sample, y_sample = self.bootstrap_samples(X, y)
            tree.fit(X_sample, y_sample)
            fitted_trees.append(tree)
        self.trees = fitted_trees
        return self

    def bootstrap_samples(self, X, y):
        """Draw ``len(X)`` rows with replacement, keeping ``X`` rows and ``y`` entries paired.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of samples.
        """
        # asarray lets list inputs use the integer-array indexing below.
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        if y.shape[0] != n_samples:
            raise ValueError(
                "X and y must have the same number of samples, got "
                f"{n_samples} and {y.shape[0]}"
            )

        # Uses NumPy's global generator, so np.random.seed controls the draw.
        idxs = np.random.choice(n_samples, n_samples)
        return X[idxs], y[idxs]

    def most_common_label(self, y):
        """Return the most frequent value in ``y``; ties go to the smallest value.

        ``np.unique`` is used rather than ``np.bincount`` so that negative or
        non-integer labels are counted too. For non-negative integers the
        winner and the tie-break are the same as with ``bincount``.
        """
        values, counts = np.unique(y, return_counts=True)
        # values is sorted and argmax returns the first maximum -> smallest label wins ties.
        return values[np.argmax(counts)]

    def predict(self, X):
        """Predict one label per sample in ``X`` by majority vote over the trees.

        Raises
        ------
        ValueError
            If the forest holds no trees (``fit`` has not been run).
        """
        if not self.trees:
            raise ValueError("RandomForest has no fitted trees; call fit() before predict().")

        # One row per tree (presumably each tree returns one label per sample).
        predictions = np.array([tree.predict(X) for tree in self.trees])
        # Swap the first two axes so each row gathers every tree's vote for one sample.
        tree_preds = np.swapaxes(predictions, 0, 1)
        return np.array([self.most_common_label(pred) for pred in tree_preds])
        

In [16]:
# Load scikit-learn's bundled breast-cancer dataset and pull out the
# feature matrix and the target vector.
data = datasets.load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

def accuracy(y_true, y_pred):
    """Return the fraction of entries where ``y_pred`` equals ``y_true``.

    Both inputs are converted with ``np.asarray`` first, so plain Python lists
    are compared element by element (a bare ``list == list`` yields a single
    bool and would give a wrong score).

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Predicted labels, same shape as ``y_true``.

    Returns
    -------
    numpy.float64
        Number of matching entries divided by ``len(y_true)``.

    Raises
    ------
    ValueError
        If the inputs have different shapes or contain no elements.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        # Avoid the 0/0 -> NaN (plus RuntimeWarning) an empty input would produce.
        raise ValueError("accuracy is undefined for empty input")
    return np.sum(y_true == y_pred) / len(y_true)

In [33]:
# Seed NumPy's global generator so the bootstrap draws made through
# np.random.choice in RandomForest.bootstrap_samples repeat on every
# Restart & Run All. Same value as the train/test split's random_state.
# NOTE(review): randomness inside DecisionTree that does not come from
# np.random's global state is not covered by this seed -- confirm.
np.random.seed(1234)

# Train a 20-tree forest on the training split and score it on the held-out split.
clf = RandomForest(n_trees=20)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

acc =  accuracy(y_test, predictions)
print(acc)

0.9122807017543859
