In [17]:
class Node:
    def __init__(self, feature = None, threshold = None, left = None, right = None,*,value = None):
        self.feature = feature #which feature was devived with
        self.threshold = threshold #threshold value
        self.left = left #left child
        self.right = right #right child
        self.value = value #value of the node

        def is_leaf_node(self):
            return self.value is not None


In [67]:
import numpy as np
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from collections import Counter

class DecisionTree:

    def  __init__(self, max_depth=100, min_samples_split=2, min_samples_leaf=5, random_state=None, criterion='gini'):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state #seed
        self.criterion = criterion
        self.root = None   # root of the tree 

    def _most_common_label(self, y):
        #leggyakoribb label meghatarozasa
        counter = Counter(y)
        return counter.most_common(1)[0][0]
    
    def _entropy(self, y): 
        #entropy meghatarozasa
        hist = np.bincount(y)
        ps = hist / len(y)
        return -np.sum([p * np.log2(p) for p in ps if p > 0])
    
    def _split(self, X_column, split_treshhold):
        #adatok elvalasztasa
        print(split_treshhold)
        left_mask = np.argwhere(X_column <= split_treshhold).flatten()
        right_mask = np.argwhere(X_column > split_treshhold).flatten()
        return left_mask, right_mask

    def _information_gain(self, y, X_column, split_treshhold):
        #information gain meghatarozasa
        #parent entropy
        parent_entropy = self._entropy(y)
        left_mask, right_mask = self._split(X_column, split_treshhold)

        if (len(left_mask) == 0 or len(right_mask) == 0):
            return 0
        #child entropy

        n = len(y)
        n_l, n_r = len(left_mask), len(right_mask)
        e_l, e_r = self._entropy(y[left_mask]), self._entropy(y[right_mask])

        child_entropy = (n_l/n)*e_l + (n_r/n)*e_r


        #calculate information gain
        ig = parent_entropy - child_entropy

        return ig
    
    def _best_split(self, X, y, feat_idxs):
        #legjobb elvalasztas meghatarozasa
        best_gain = -1
        split_idx, split_treshold = None, None

        for feat_idx in feat_idxs:
            X_column = X[:,feat_idx]
            thresholds = np.unique(X_column)

            #print("Thresholds: ", thresholds)

            for threshold in thresholds:
                #calculate information gain
                #print("feat_idx: ", feat_idx, "threshold: ", threshold)
                gain = self._information_gain(y, X_column, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_treshold = threshold

        return split_idx, split_treshold

    def _grow_tree(self, X, y, depth=0):
        number_of_samples, number_of_features = X.shape
        number_of_labels = len(np.unique(y))
        #check stopping criteria
        if(depth >= self.max_depth | number_of_labels == 1 | number_of_samples < self.min_samples_split):
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)
        
        #find best split
        feat_idxs = np.random.choice(number_of_features, self.random_state, replace=False)
        #print("feat_idxs: ", feat_idxs)
        best_feature,best_treshhold = self._best_split(X, y, feat_idxs)

        #create children nodes
        #print("Best treshhold: ", best_treshhold)
        left_idxs, right_idxs = self._split(X[:,best_feature], best_treshhold)

        #recursively call the function
        left = self._grow_tree(X[left_idxs,:], y[left_idxs], depth+1)
        right = self._grow_tree(X[right_idxs,:], y[right_idxs], depth+1)

        return Node(best_feature, best_treshhold, left, right)

    def fit(self, X, y):
        #döntési fa tanítása
        self.random_state = X.shape[1] if not self.random_state else min(X.shape[1],self.random_state)
        #print("Random state", self.random_state)
        self.root = self._grow_tree(X, y)
        return None
    
    def _traverse_tree(self, x, node):
        #döntési fa bejárása
        if node.is_leaf_node():
            return node.value

        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def predict(self, X):
        #tbp
        #regresszios ertek meghatarozasa
        return np.array([self._traverse_tree(x, self.root) for x in X])
    
    def print():
        #döntési fa ábrázolása
        #tbp
        return None

In [69]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

#Minta adatforrás
wine = datasets.load_breast_cancer()
X = wine.data # adatok
y = wine.target # címkék/célváltozók

#Adatok felosztása tanító és tesztelő halmazra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # 80% tanító, 20% tesztelő

#Döntési fa tanítása
dt = DecisionTree(max_depth=10)
dt.fit(X_train, y_train)

#Döntési fa kiértékelése
y_pred = dt.predict(X_test)
print(y_pred)

dt.print()

0.0
0.000692
0.0007929
0.0009737
0.001128
0.001184
0.001487
0.001595
0.001597
0.00186
0.002074
0.00262
0.002817
0.002831
0.003223
0.003297
0.003681
0.003846
0.004174
0.004272
0.004826
0.005254
0.005308
0.005325
0.005383
0.005717
0.005812
0.005832
0.006021
0.006416
0.006493
0.006564
0.006972
0.007004
0.007066
0.007078
0.007276
0.007508
0.007665
0.007741
0.007816
0.007936
0.007975
0.008342
0.008347
0.008496
0.008534
0.008732
0.009075
0.009127
0.009398
0.00941
0.009904
0.01003
0.01018
0.01031
0.01051
0.01056
0.01062
0.01065
0.01072
0.01079
0.01081
0.01099
0.01123
0.01131
0.01132
0.01153
0.01162
0.01163
0.01168
0.01169
0.01186
0.01196
0.01245
0.01246
0.01256
0.01267
0.01272
0.01277
0.01282
0.01307
0.01311
0.01328
0.01329
0.01341
0.01346
0.01349
0.01358
0.01376
0.0139
0.01397
0.01401
0.01412
0.01423
0.01434
0.0145
0.01452
0.01453
0.01457
0.01482
0.01498
0.01509
0.0151
0.01514
0.01529
0.0153
0.01536
0.01551
0.01564
0.01585
0.01596
0.01603
0.01608
0.01613
0.01615
0.01622
0.0163
0.01651
0.0165

RecursionError: maximum recursion depth exceeded while calling a Python object