In [1]:
import pandas as pd
import ssl
import numpy as np
ssl._create_default_https_context = ssl._create_unverified_context
df = pd.read_csv("https://raw.githubusercontent.com/cwhitz/git_intro_to_ml/master/heart.csv")
import math

In [2]:
class Combos:
    def findOptions(self,values):
        if len(values) == 0:
            return [values]
        smaller = self.findOptions(values[1:])
        return smaller + [[values[0]] + x for x in smaller]

    def findCombos(self,vals):
        options = []
        duplicate_options = [([x for x in vals if x not in option], option) for option in self.findOptions(vals)]
        for option in duplicate_options:
            if (option[1],option[0]) not in options and len(option[0]) != len(vals) and len(option[1]) != len(vals):
                options.append(option)
        return options

In [3]:
class SplitFunctions:
    def getColValues(self,node, col):
        return list(node[col].unique())
   
    def getNodesIndex(self,node, col, dtype,split):
        if dtype == "binary":
            return node[col]==split
        if dtype == "numerical":
            return node[col]<=split
        if dtype == "categorical":
            return node[col].isin(split[0])
        
    def findDtype(self, vals):
        if len(vals)==2:
            return "binary"
        elif len(vals)>10:
            return "numerical" 
        elif len(vals)<=10 and len(vals)>2:
            return "categorical"
        return None  
       

In [4]:

class Entropy: 
    def entropy(self,node):
        entropy = 0
        for k in self.types:
            p = np.count_nonzero(node==k)/len(node)
            if p != 0:
                entropy += -p*math.log(p,2)
        return entropy
    def entropy_score(self,node_left,node_right):
        m = len(node_left)+len(node_right)
        i = len(node_left)
        return i/m*self.entropy(node_left.target) + (m-i)/m*self.entropy(node_right.target)
    
    def findEntropyFromCol(self,node,col,dtype):
        vals = self.getColValues(node,col)
        if dtype == "binary":
            splits = vals[:1]
        elif dtype == "numerical":
            splits = sorted(vals)[:-1]
        elif dtype == "categorical":
            splits = self.findCombos(vals)

        nodes = [self.getNodesIndex(node, col, dtype,split) for split in splits]       
        impurities = [self.entropy_score(node[indx],node[~indx]) for indx in nodes]
        return {"entropy":min(impurities),"col":col,"dtype": dtype, "split": splits[impurities.index(min(impurities))]}
  
    
    
class Node:
    def __init__(self,minEntropy):
        self.col = minEntropy["col"]
        self.dtype = minEntropy["dtype"]
        self.split = minEntropy["split"]
        self.entropy = minEntropy["entropy"]
    def getCol(self):
        return self.col
    def getSplit(self):
        return self.split
    def newLevel(self,x):
        if self.dtype == "binary":
            return x[self.col]==self.split
        if self.dtype == "numerical":
            return x[self.col]<=self.split
        if self.dtype == "categorical":
            return x[self.col] in self.split[0]

In [9]:
class DecisionTree(Entropy,Combos, SplitFunctions):
    def __init__(self,X,y,maxDepth):
        self.node = pd.concat([X,y],axis=1)
        self.types = np.unique(y)
        self.maxDepth = maxDepth
        self.cols = {col : self.findDtype(self.getColValues(df,col)) for col in df.columns if col != "target"}
        self.Tree = self.DecisionTree(df,0)
        
    def findMajority(self,node):
        counts = [np.count_nonzero(node.target==k) for k in self.types]
        return self.types[counts.index(max(counts))]
    
    def DecisionTree(self,node, depth):
        cols_with_var_dist = np.count_nonzero([ self.findDtype(self.getColValues(node,col)) for col in node.columns if col!="target"])
        if depth == self.maxDepth or self.entropy(node)==0 or cols_with_var_dist==0:
            return [self.findMajority(node)]
       
        minEntropy = {"entropy":1,"col":None,"dtype": None, "split": None}
        
        for col,dtype in self.cols.items():
            
            if self.findDtype(self.getColValues(node,col)) == None: continue
            newEntropy = self.findEntropyFromCol(node, col,dtype)

            if newEntropy["entropy"] <= minEntropy["entropy"]:
                minEntropy = newEntropy
        
        indx = self.getNodesIndex(node,minEntropy["col"], minEntropy["dtype"],minEntropy["split"])
        node_left, node_right = node[indx],node[~indx]
        return [Node(minEntropy),self.DecisionTree(node_left,depth+1),self.DecisionTree(node_right,depth+1)]
    
    
    

In [10]:
Tree = DecisionTree(Xtrain,ytrain,1).Tree
print(Tree)

[<__main__.Node object at 0x7fdaecea7790>, [1], [0]]


In [2]:
from sklearn.utils import shuffle
df = shuffle(df)
y = df.target
X = df.drop("target",axis=1)
Xtrain= X.head(200)
Xtest = X.tail(103)
ytrain= y.head(200)
ytest = y.tail(103)

In [156]:
class Predict:
    def __init__(self,x,Tree):
        
        while len(Tree) > 1:
            treeLevel = Tree[0]
            direction = 1 if treeLevel.newLevel(x) else 2
            Tree = Tree[direction]
        self.pred = Tree[0]

In [157]:
false = 0
for i in range(len(Xtest)):
    if Predict(Xtest.iloc[i],Tree).pred!=y[i]: 
        false += 1

print(false/len(Xtest))

0.44660194174757284


In [4]:
Xtrain = Xtrain.replace(0,0.3)

In [5]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state = 0, max_depth = 10)
clf.fit(Xtrain, ytrain)
print(clf.score(Xtest, ytest))
print(clf.tree_.max_depth)

0.7087378640776699
9


In [141]:
Tree[0].col

'thal'

In [7]:
from sklearn import tree
from subprocess import check_call

In [8]:
text_representation = tree.export_text(clf)
print(text_representation)

|--- feature_2 <= 0.65
|   |--- feature_11 <= 0.65
|   |   |--- feature_12 <= 2.50
|   |   |   |--- feature_9 <= 1.70
|   |   |   |   |--- feature_7 <= 96.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_7 >  96.50
|   |   |   |   |   |--- feature_0 <= 58.50
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- feature_0 >  58.50
|   |   |   |   |   |   |--- feature_0 <= 61.50
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- feature_0 >  61.50
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |--- feature_9 >  1.70
|   |   |   |   |--- feature_6 <= 0.65
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_6 >  0.65
|   |   |   |   |   |--- class: 0
|   |   |--- feature_12 >  2.50
|   |   |   |--- feature_0 <= 58.00
|   |   |   |   |--- feature_10 <= 1.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_10 >  1.50
|   |   |   |   |   |--- feature_0 <= 42.00
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |

In [158]:
math.log(.5,2)

-1.0