In [76]:
import numpy as np

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn import datasets

from sklearn.datasets import make_classification

In [77]:
# Download MNIST from OpenML as plain NumPy arrays (as_frame=False) and
# show the shapes of the feature matrix and the label vector.
X, y = fetch_openml(
    "mnist_784",
    version=1,
    return_X_y=True,
    as_frame=False,
)
print(X.shape, y.shape)

  warn(


(70000, 784) (70000,)


In [78]:
from sklearn.tree import DecisionTreeClassifier


class RFClassifier:
    """Bagged ensemble of decision trees combined by majority vote.

    Every tree is trained on a bootstrap sample (rows drawn with
    replacement) of the training data and, when ``subfeature_size`` is
    below 1.0, on its own random subset of the feature columns. Each tree
    additionally considers ``sqrt(n_features)`` candidate features per split.

    Parameters
    ----------
    n_trees : int
        Number of trees in the ensemble; must be at least 1.
    subset_size : float
        Fraction of the training rows drawn (with replacement) for each tree.
    subfeature_size : float
        Fraction of the feature columns handed to each tree. Values of 1.0
        or more mean "use every column".
    random_state : int
        Seed that drives the row sampling, the column sampling and the
        per-tree seeds, so that two fits on the same data are identical.

    Attributes
    ----------
    classifiers : list
        The underlying ``DecisionTreeClassifier`` instances.
    feature_indices : list
        One entry per tree: the sorted column indices the tree was trained
        on, or ``None`` when the tree uses every column.

    Notes
    -----
    ``predict`` returns an ``int64`` array, so class labels must be integers
    or values NumPy can convert to integers (for example the digit strings
    ``"0"`` .. ``"9"``).
    """

    def __init__(self, n_trees: int, subset_size: float=0.5, subfeature_size: float=1.0, random_state: int=0) -> None:
        if n_trees < 1:
            raise ValueError(f"n_trees must be at least 1, got {n_trees}")
        self.classifiers = [DecisionTreeClassifier(criterion='gini', max_features='sqrt') for _ in range(n_trees)]
        self.subset_size = subset_size
        self.subfeature_size = subfeature_size
        self.random_state = random_state
        # None means "all columns"; fit() overwrites this with the columns
        # actually sampled for each tree.
        self.feature_indices = [None] * n_trees

    def fit(self, x: np.ndarray, y: np.ndarray) -> None:
        """Train every tree on its own bootstrap sample of ``(x, y)``.

        Parameters
        ----------
        x : array of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Training labels.

        Raises
        ------
        ValueError
            If ``x`` and ``y`` disagree on the number of samples, or if
            ``subset_size`` / ``subfeature_size`` would select no rows or
            no columns.
        """
        y = np.asarray(y)
        n_samples, n_features = x.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                f"x and y have inconsistent lengths: {n_samples} != {y.shape[0]}"
            )

        n_rows = int(n_samples * self.subset_size)
        # Columns are drawn without replacement, so never ask for more
        # than there are.
        n_cols = min(n_features, int(n_features * self.subfeature_size))
        if n_rows < 1:
            raise ValueError(
                f"subset_size={self.subset_size} selects no rows out of {n_samples}"
            )
        if n_cols < 1:
            raise ValueError(
                f"subfeature_size={self.subfeature_size} selects no columns out of {n_features}"
            )

        gen = np.random.RandomState(self.random_state)
        max_seed = np.iinfo(np.int32).max
        feature_indices = []

        for tree in self.classifiers:
            # Bootstrap sample: rows are drawn with replacement.
            rows = gen.choice(n_samples, n_rows)

            cols = None
            if n_cols < n_features:
                cols = np.sort(gen.choice(n_features, n_cols, replace=False))

            # The tree shuffles candidate features internally
            # (max_features='sqrt'); seed it from the same generator so the
            # whole fit is reproducible for a given random_state.
            tree.set_params(random_state=int(gen.randint(max_seed)))

            x_subset = x[rows, :]
            if cols is not None:
                x_subset = x_subset[:, cols]
            tree.fit(x_subset, y[rows])
            feature_indices.append(cols)

        self.feature_indices = feature_indices

    def predict(self, x) -> np.ndarray:
        """Predict a label for every row of ``x`` by majority vote.

        Ties are resolved in favour of the smallest label value.

        Parameters
        ----------
        x : array of shape (n_samples, n_features)
            Samples to classify, with the same columns used in ``fit``.

        Returns
        -------
        np.ndarray of shape (n_samples,), dtype int64
            The most frequently predicted label per sample.
        """
        n_samples = x.shape[0]

        # pred_table[r, t] is the label tree t assigns to sample r.
        pred_table = np.zeros((n_samples, len(self.classifiers)), dtype=np.int64)
        for i, (tree, cols) in enumerate(zip(self.classifiers, self.feature_indices)):
            x_tree = x if cols is None else x[:, cols]
            pred_table[:, i] = tree.predict(x_tree)

        result_pred = np.zeros(n_samples, dtype=np.int64)
        for i, votes in enumerate(pred_table):
            # np.unique returns labels sorted ascending and argmax picks the
            # first maximum, hence the "smallest label wins" tie-break.
            classes, counts = np.unique(votes, return_counts=True)
            result_pred[i] = classes[np.argmax(counts)]

        return result_pred

In [79]:
# Hold out 30% of the samples for evaluation. The split is shuffled, so pin
# random_state: otherwise every Restart & Run All yields a different split
# and the accuracies reported below cannot be reproduced.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=0)

In [80]:
# Sanity check: show the shapes of the training features and labels.
print(x_train.shape, y_train.shape)

(49000, 784) (49000,)


In [81]:
# Train the hand-written forest with 100 trees; every other
# hyper-parameter keeps its default value.
my_rf = RFClassifier(n_trees=100)
my_rf.fit(x_train, y_train)

In [82]:
# Score the hand-written forest on the held-out split. The ground-truth
# labels are cast to integers so they can be compared with the forest's
# integer predictions.
y_pred_mrf = my_rf.predict(x_test)
y_test_int = y_test.astype(int)
ac = accuracy_score(y_test_int, y_pred_mrf)
print(f"Accuracy of Random Forest: {ac}")

Accuracy of Random Forest: 0.963952380952381


In [83]:
from sklearn.ensemble import RandomForestClassifier

# Reference implementation to compare against the hand-written forest.
# n_estimators is spelled out so both models visibly use 100 trees, and
# random_state is pinned so the baseline accuracy is reproducible.
rforest = RandomForestClassifier(n_estimators=100, random_state=0)
rforest.fit(x_train, y_train)

In [84]:
# Cast both the ground truth and the sklearn predictions to integers so
# they are compared in the same dtype.
y_test_ = y_test.astype(int)
y_pred = rforest.predict(x_test).astype(int)

In [85]:
# Accuracy of the sklearn baseline on the held-out split.
ac = accuracy_score(y_true=y_test_, y_pred=y_pred)
print(f"Accuracy of Random Forest: {ac}")

Accuracy of Random Forest: 0.9670952380952381
