In [1]:
import torch
import torchvision.transforms as transforms
import torchvision
import sklearn
from sklearn import tree
from sklearn import ensemble
from matplotlib import pyplot as plt
import utils
import numpy as np
from joblib import dump, load
from tqdm.notebook import tqdm

In [2]:
BS = 50000
trainset = torchvision.datasets.MNIST(root='~/Private/data', train=True,
                                        download=False, transform=transforms.ToTensor())
trainloader = torch.utils.data.DataLoader(trainset, batch_size=BS,
                                        shuffle=True, num_workers=2)

testset = torchvision.datasets.MNIST(root='~/Private/data', train=False,
                                    download=False, transform=transforms.ToTensor())
testloader = torch.utils.data.DataLoader(testset, batch_size=BS,
                                            shuffle=False, num_workers=2)

In [7]:
clf = tree.DecisionTreeClassifier(max_depth=5)
clf.ccp_alpha = 0.01
for images, labels in trainloader:
    clf.fit(images.view(images.size(0),-1).numpy(), labels.numpy())
for images, labels in testloader:
    print(clf.score(images.view(images.size(0),-1).numpy(), labels.numpy()))
utils.find_size(clf)

0.6596


272

In [6]:
clf = tree.DecisionTreeClassifier(max_depth=5)
# clf.ccp_alpha = 0.01
for images, labels in trainloader:
    clf.fit(images.view(images.size(0),-1).numpy(), labels.numpy())
for images, labels in testloader:
    print(clf.score(images.view(images.size(0),-1).numpy(), labels.numpy()))
utils.find_size(clf)

0.67


960

In [3]:
# clf = ensemble.RandomForestClassifier(max_depth=5)
# # clf = tree.DecisionTreeClassifier()
# for images, labels in trainloader:
#     clf.fit(images.view(images.size(0),-1).numpy(), labels.numpy())
# dump(clf, 'trained_RandomForest.joblib') 
clf = load('trained_RandomForest.joblib') 

In [4]:
for images, labels in testloader:
    print(clf.score(images.view(images.size(0),-1).numpy(), labels.numpy()))

0.8576


In [5]:
ttt = clf.estimators_
sparsity = []
for tt in ttt:
    sp, table = utils.find_sparsity(tt)
    sparsity.append(sp)
print(np.mean(sparsity))

0.4114805646411945


In [6]:
tt = ttt[0]
tt.n_features_in_

784

In [5]:
class HanpiDecisionTree():
    def __init__(self, DT):
        self.DT = DT
        self.depth = self.DT.get_depth()
        self.features = utils.find_features(self.DT)
        self.selected_features = []
    
    def select_features(self):
        if self.depth > len(self.features):
            self.selected_features = self.features
        else:
            self.selected_features = np.random.choice(self.features, self.depth, replace=False)
    
    def generate_input(self, X):
        T = []
        for i in self.selected_features:
            line = X[:,i]
            T.append(line)
        T = np.array(T)
        T = T.transpose()
        return T

    def predict(self, X):
        if len(self.selected_features) == 0:
            return self.DT.predict(X)
        else:
            XX = self.generate_input(X)
            return self.DT.predict(XX)
    
    def fit(self, X, Y):
        XX = self.generate_input(X)
        self.DT.fit(XX, Y)

class MoRandomForest():
    def __init__(self, estimators):
        self.estimators_ = []
        self.transform(estimators)
        
    
    def transform(self, estimators):
        for i in range(len(estimators)):
            self.estimators_.append(HanpiDecisionTree(estimators[i]))

    def predict(self, X):
        results = []
        predicted = []
        for est in self.estimators_:
            pred = est.predict(X)
            results.append(pred)
        results = np.array(results)
        for i in range(len(results[0])):
            this = results[:,i]
            res = np.bincount(this.astype(np.int32)).argmax()
            predicted.append(res)
        predicted = np.array(predicted)
        return predicted

clf = load('trained_RandomForest.joblib') 
m = MoRandomForest(clf.estimators_)
for images, labels in testloader:
    pred = m.predict(images.view(images.size(0),-1).numpy())
    print((pred == labels.numpy()).sum() / len(labels))

0.8438


In [8]:
for dt in tqdm(m.estimators_):
    # dt = m.estimators_[0]
    # for images, labels in testloader:
    #     pred = dt.predict(images.view(images.size(0),-1).numpy())
    #     acc = (pred == labels.numpy()).sum() / len(labels)
    #     sparsity, table = utils.find_sparsity(dt.DT)
    #     print(f"acc: {acc:.4f}, spasity: {sparsity:.4f}")
    dt.select_features()
    for images, labels in trainloader:
        dt.fit(images.view(images.size(0),-1).numpy(), labels.numpy())
    # for images, labels in testloader:
    #     pred = dt.predict(images.view(images.size(0),-1).numpy())
    #     acc = (pred == labels.numpy()).sum() / len(labels)
    #     sparsity, table = utils.find_sparsity(dt.DT)
    #     print(f"acc: {acc:.4f}, spasity: {sparsity:.4f}")

  0%|          | 0/100 [00:00<?, ?it/s]

In [9]:
for images, labels in testloader:
    pred = m.predict(images.view(images.size(0),-1).numpy())
    print((pred == labels.numpy()).sum() / len(labels))

0.7463


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  res = np.bincount(this.astype(np.int)).argmax()


In [10]:
ttt = m.estimators_
sparsity = []
for tt in ttt:
    sp, table = utils.find_sparsity(tt.DT)
    sparsity.append(sp)
print(np.mean(sparsity))

0.022881451612903225


In [6]:
for dt in tqdm(m.estimators_):
    dt.DT.ccp_alpha = 0.01
    for images, labels in trainloader:
        dt.DT.fit(images.view(images.size(0),-1).numpy(), labels.numpy())
    # for images, labels in testloader:
    #     pred = dt.predict(images.view(images.size(0),-1).numpy())
    #     acc = (pred == labels.numpy()).sum() / len(labels)
    #     sparsity, table = utils.find_sparsity(dt.DT)
    #     print(f"acc: {acc:.4f}, spasity: {sparsity:.4f}")
for images, labels in testloader:
    pred = m.predict(images.view(images.size(0),-1).numpy())
    print((pred == labels.numpy()).sum() / len(labels))

  0%|          | 0/100 [00:00<?, ?it/s]

0.7932


In [8]:
sizes = []
sps = []
for dt in tqdm(m.estimators_):
    size = utils.find_size(dt.DT)
    sp, tb = utils.find_sparsity(dt.DT)
    sps.append(sp)
    sizes.append(size)
print(np.mean(sizes))
print(np.mean(sps))

  0%|          | 0/100 [00:00<?, ?it/s]

154.87
0.42896058435029033


In [11]:
print(tree.export_text(dt.DT))

|--- feature_405 <= 0.00
|   |--- feature_155 <= 0.00
|   |   |--- feature_372 <= 0.00
|   |   |   |--- feature_486 <= 0.00
|   |   |   |   |--- class: 7
|   |   |   |--- feature_486 >  0.00
|   |   |   |   |--- class: 4
|   |   |--- feature_372 >  0.00
|   |   |   |--- class: 7
|   |--- feature_155 >  0.00
|   |   |--- feature_516 <= 0.19
|   |   |   |--- class: 0
|   |   |--- feature_516 >  0.19
|   |   |   |--- class: 2
|--- feature_405 >  0.00
|   |--- feature_381 <= 0.22
|   |   |--- feature_373 <= 0.00
|   |   |   |--- feature_179 <= 0.01
|   |   |   |   |--- feature_520 <= 0.25
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_520 >  0.25
|   |   |   |   |   |--- class: 1
|   |   |   |--- feature_179 >  0.01
|   |   |   |   |--- class: 3
|   |   |--- feature_373 >  0.00
|   |   |   |--- class: 5
|   |--- feature_381 >  0.22
|   |   |--- feature_183 <= 0.05
|   |   |   |--- feature_210 <= 0.07
|   |   |   |   |--- class: 4
|   |   |   |--- feature_210 >  0.07
|   |  

In [12]:
utils.find_features(dt.DT)

[516, 100, 486, 520, 210, 179, 372, 405, 373, 183, 154, 155, 381]

In [13]:
dt.DT.get_n_leaves()

14