In [75]:
import pandas as pd
import ssl
import numpy as np
ssl._create_default_https_context = ssl._create_unverified_context
df = pd.read_csv("https://raw.githubusercontent.com/cwhitz/git_intro_to_ml/master/heart.csv")


In [56]:
class Combos:
    def findOptions(self,values):
        if len(values) == 0:
            return [values]
        smaller = self.findOptions(values[1:])
        return smaller + [[values[0]] + x for x in smaller]

    def findCombos(self,vals):
        options = []
        duplicate_options = [([x for x in vals if x not in option], option) for option in self.findOptions(vals)]
        for option in duplicate_options:
            if (option[1],option[0]) not in options and len(option[0]) != len(vals) and len(option[1]) != len(vals):
                options.append(option)
        return options

In [57]:
class SplitFunctions:
    def getColValues(self,node, col):
        return list(node[col].unique())
   
    def getNodesIndex(self,node, col, dtype,split):
        if dtype == "binary":
            return node[col]==split
        if dtype == "numerical":
            return node[col]<=split
        if dtype == "categorical":
            return node[col].isin(split[0])
        
    def findDtype(self, vals):
        if len(vals)==2:
            return "binary"
        elif len(vals)>10:
            return "numerical" 
        elif len(vals)<=10 and len(vals)>2:
            return "categorical"
        return None  
       

In [58]:

class Gini: 
    def gini_impurity(self,node):
        return 1-sum([(np.count_nonzero(node==k)/len(node))**2 for k in self.types])
    def gini_impurity_score(self,node_left,node_right):
        m = len(node_left)+len(node_right)
        i = len(node_left)
        return i/m*self.gini_impurity(node_left.target) + (m-i)/m*self.gini_impurity(node_right.target)
    
    def findGiniImpurityFromCol(self,node,col,dtype):
        vals = self.getColValues(node,col)
        if dtype == "binary":
            splits = vals[:1]
        elif dtype == "numerical":
            splits = sorted(vals)[:-1]
        elif dtype == "categorical":
            splits = self.findCombos(vals)

        nodes = [self.getNodesIndex(node, col, dtype,split) for split in splits]       
        impurities = [self.gini_impurity_score(node[indx],node[~indx]) for indx in nodes]
        return {"gini":min(impurities),"col":col,"dtype": dtype, "split": splits[impurities.index(min(impurities))]}
  
    
    
class Node:
    def __init__(self,minGini):
        self.col = minGini["col"]
        self.dtype = minGini["dtype"]
        self.split = minGini["split"]
        self.gini = minGini["gini"]
    def getCol(self):
        return self.col
    def getSplit(self):
        return self.split

In [163]:
class DecisionTree(Gini,Combos, SplitFunctions):
    def __init__(self,X,y,maxDepth):
        self.node = pd.concat([X,y],axis=1)
        self.types = np.unique(y)
        self.maxDepth = maxDepth
        self.cols = {col : self.findDtype(self.getColValues(df,col)) for col in df.columns if col != "target"}
        self.Tree = self.DecisionTree(df,0)
        
    def findMajority(self,node):
        counts = [np.count_nonzero(node.target==k) for k in self.types]
        return self.types[counts.index(max(counts))]
    
    def DecisionTree(self,node, depth):
        cols_with_var_dist = np.count_nonzero([ self.findDtype(self.getColValues(node,col)) for col in node.columns if col!="target"])
        if depth == self.maxDepth or self.gini_impurity(node)==0 or cols_with_var_dist==0:
            return [self.findMajority(node)]
        minGini = {"gini":.5,"col":None,"dtype": None, "split": None}
        for col,dtype in self.cols.items():
            if self.findDtype(self.getColValues(node,col)) == None: continue
            newGini = self.findGiniImpurityFromCol(node, col,dtype)
            print(col, newGini["gini"],newGini["split"])
            if newGini["gini"] <= minGini["gini"]:
                minGini = newGini
        indx = self.getNodesIndex(node,minGini["col"], minGini["dtype"],minGini["split"])
        node_left, node_right = node[indx],node[~indx]
        return [Node(minGini),self.DecisionTree(node_left,depth+1),self.DecisionTree(node_right,depth+1)]
    
    
    

In [164]:
class Predict:
    def newLevel(self,x, col, dtype,split): #decides whether the x should go right or left based on whether the condition is false or true
        if dtype == "binary":
            return x[col]==split
        if dtype == "numerical":
            return x[col]<=split
        if dtype == "categorical":
            return x[col] in split[1]
    def __init__(self,x,Tree):
        
        while len(Tree) > 1:
            treeLevel = Tree[0]
            direction = 1 if self.newLevel(x,treeLevel.col,treeLevel.dtype,treeLevel.split) else 2
            Tree = Tree[direction]
        self.pred = Tree[0]

In [165]:
Tree2 = DecisionTree(Xtrain,ytrain,1).Tree
print(Tree2)

age 0.4553348416602667 54
sex 0.45688047065576126 1
cp 0.3639513951395139 ([2, 1, 3], [0])
trestbps 0.48575283174147954 142
chol 0.48403402368662907 245
fbs 0.49563964148352824 0
restecg 0.4807829685024968 ([1], [0, 2])
thalach 0.4091608964046012 147
exang 0.40140876832781325 1
oldpeak 0.41441363539162834 1.6
slope 0.41900217490911884 ([2], [0, 1])
ca 0.37908538821361815 ([1, 2, 3], [0, 4])
thal 0.35809349996647716 ([3, 1, 0], [2])
[<__main__.Node object at 0x7fc922221880>, [0], [1]]


In [115]:
from sklearn.utils import shuffle
df = shuffle(df)
y = df.target
X = df.drop("target",axis=1)
Xtrain= X.head(200)
Xtest = X.tail(103)
ytrain= y.head(200)
ytest = y.tail(103)

In [120]:
false = 0
for i in range(len(Xtest)):
    if Predict(Xtest.iloc[i],Tree).pred!=y[i]: false += 1
print(false)
print(false/len(Xtest))

52
0.5048543689320388


In [127]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state = 0, max_depth = 10)
clf.fit(Xtrain, ytrain)
print(clf.score(Xtest, ytest))
print(clf.tree_.max_depth)

0.7669902912621359
8


In [141]:
Tree[0].col

'thal'

In [131]:
from sklearn import tree
from subprocess import check_call

In [137]:
text_representation = tree.export_text(clf)
print(text_representation)

|--- feature_2 <= 0.50
|   |--- feature_11 <= 0.50
|   |   |--- feature_12 <= 2.50
|   |   |   |--- feature_7 <= 146.00
|   |   |   |   |--- feature_6 <= 0.50
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_6 >  0.50
|   |   |   |   |   |--- feature_3 <= 115.00
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- feature_3 >  115.00
|   |   |   |   |   |   |--- class: 0
|   |   |   |--- feature_7 >  146.00
|   |   |   |   |--- feature_4 <= 327.50
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_4 >  327.50
|   |   |   |   |   |--- feature_9 <= 0.30
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- feature_9 >  0.30
|   |   |   |   |   |   |--- class: 1
|   |   |--- feature_12 >  2.50
|   |   |   |--- feature_9 <= 0.45
|   |   |   |   |--- feature_4 <= 254.00
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_4 >  254.00
|   |   |   |   |   |--- class: 0
|   |   |   |--- feature_9 >  0.45
|   |   |   |   |--- class: 0
|   |

In [None]:
if len(vals)>10:
                find_gini = [.5,vals[0]]
                for val in vals[1:]:
                    node_left = df[df[col]<val].target
                    node_right = df[df[col]>=val].target
                    impurity = self.gini_impurity_score(node_left,node_right)
                    if impurity < find_gini[0]: find_gini=[impurity,val]

In [166]:
int(7.9)

7

In [168]:
def hey():

    print("k")
    return
hey()

k


In [170]:
int(-1.22)

-1