# Improved ID3 algorithm for clinical data classification

In [1]:
# Import the libraries used
import unicodecsv
from collections import defaultdict
from math import cos,sqrt,log2

In [2]:
diagnosis = []
f = open('bcw-des.csv', 'rb')
reader = unicodecsv.DictReader(f)
for row in reader:
    diagnosis.append(row)
f.close()

In [3]:
# Length of read data set
print(len(diagnosis))
# A tuple in the data set 
diagnosis[0]

683


OrderedDict([('Clump Thickness', '1.0_5.0'),
             ('Cell Size', '1.0_3.0'),
             ('Cell Shape', '1.0_3.0'),
             ('Marginal Adhesion', '1.0_3.0'),
             ('Single Epithelial Cell Size', '1.0_2.0'),
             ('Bare Nuclei', '1.0_3.0'),
             ('Bland Chromatin', '1.0_3.0'),
             ('Normal Nucleoli', '1.0_2.0'),
             ('Mitoses', '1.0'),
             ('Class', '2')])

In [4]:
# Data Wrangling: If the tuple contains missing values remove that tuple
def check(data):
    for v in data.values():
        if v=='?':
            return False
    return True

In [5]:
# clean the data and store it in the cleanedData variable
cleanedData=[]
for diag in diagnosis:
    if check(diag):
        cleanedData.append(diag)
    

In [45]:
# Node in a ID3 tree
class Node:
    def __init__(self,testAttr,predictedClass):
        self.testAttr=testAttr
        self.predictedClass=predictedClass
        self.children={}
    def __str__(self):
        return self.testAttr
    def childFill(self,trainData):
        for k,v in self.children.items():
            if v.testAttr=='' and v.predictedClass == '':
                classCount = defaultdict(int)
                for ex in trainData:
                    if ex[testAttr] == k:
                        classCount[ex[className]] += 1
                vals = list(classCount.values())
                clss = lsit(classCount.keys())
                self.children[k] = Node('',clss[vals.index(max( vals))])
def getChild(n,curAttVal,trainData):
    classCount = defauldict(int)
    for ex in trainData:
        if ex[testAttr] == curAttVal:
            classCount[ex[className]] += 1
            vals = list(classCount.values())
            clss = list(classCount.keys())
            return Node('',clss[vals.index(max( vals))])
def printTree(n):
    print(n)
    if not isinstance(n,Node) or len(n.children)==0:
        return
    print('children are:')
    for k,v in n.children.items():
        print(k,v)
    for v in n.children.values():
        printTree(v)

In [46]:
# Import breastAttr.txt into breastAttr
# The breastAttr contains attributes in the data set and their possible values
with open('bcw-datt.txt','r') as f:
    breastAttr=f.read()
s=breastAttr.split('\n')

In [47]:
# possVals is a dictionary containing attrname and its possible values
possVals={}
for line in s:
    temp=line.split(':')
    possVals[temp[0]]=temp[1].split(',')
possVals

{'Bare Nuclei': ['1.0_3.0', '4.0_10.0'],
 'Bland Chromatin': ['1.0_3.0', '4.0_10.0'],
 'Cell Shape': ['1.0_3.0', '4.0_10.0'],
 'Cell Size': ['1.0_3.0', '4.0_10.0'],
 'Class': ['2', '4'],
 'Clump Thickness': ['1.0_5.0', '6.0_10.0'],
 'Marginal Adhesion': ['1.0_3.0', '4.0_10.0'],
 'Mitoses': ['1.0', '2.0_8.0_10.0'],
 'Normal Nucleoli': ['1.0_2.0', '3.0_10.0'],
 'Single Epithelial Cell Size': ['1.0_2.0', '3.0_10.0']}

In [48]:
# From breastClass get the classes present and their respective values
possClassValues=[]
with open('bcw-catt.txt','r') as f:
    breastClass=f.read()
classLine=breastClass.split(':')
className = classLine[0]
possClassValues = classLine[1].split(',')


In [49]:
print(possClassValues,className)

['2', '4'] Class


In [50]:
# Balance function which decides on which attribute the node should split the tree
def entropy(edata):
    temp = defaultdict(int)
    for eavpair in edata:
        temp[eavpair[className]]+=1
    en = len(edata)
    e=1
    for epv in temp.values():
        e*=(epv/en)*log2(epv/en)
    return e

In [34]:
# IID3 improvisation: balance function to reduce multivariate splits
def balance(attrLen,imp = False):
    if not imp:
        return 1
    ans=cos(3.5*attrLen-1.5)/1.8
    ans*=log2(sqrt(attrLen+1))
    return abs(ans)

In [54]:
# The Improved ID3 algorithm written here as train
# The algorithm return the root of the tree
def train(trainData,attributes):
    e = entropy(trainData)
    if e==0:
        return Node('',trainData[0][className])
    if len(attributes) == 0:
        classCount = defaultdict(int)
        for ex in trainData:
            classCount[ex[className]] += 1
        maxClass= possClassValues[0]
        maxCount= classCount[maxClass]
        for k,v in classCount.items():
            if maxCount<v:
                maxCount = v
                maxClass = k
        return Node('',maxClass)
    info = {}
    gain = {}
    n = len(trainData)
    for attr in attributes:
        attrVals = possVals[attr]
        attrTrainData = {}
        attrTrainData = defaultdict(list)
        for avpair in trainData:
            attrTrainData[avpair[attr]].append(avpair)
        info[attr] = 1
        for attrVal in attrTrainData.values():
            info[attr] += (len(attrVal)/n)*entropy(attrVal)
            gain[attr] = (e-info[attr])/balance(len(attrVals),imp = True)
    maxAttr = attributes[0]
    maxGain = gain[maxAttr]
    for k,v in gain.items():
        if maxGain < v:
            maxGain = v
            maxAttr = k
    n=Node(maxAttr,'')
    splitTrainData = defaultdict(list)
    for ex in trainData:
        splitTrainData[ex[maxAttr]].append(ex)
    newAttrs = attributes[:]
    newAttrs.remove(maxAttr)
    for splitAttrVal, splitData in splitTrainData.items():
        if len(splitData)==0:
            print('entereed')
            n.children[splitAttrVal] = getChild(n, splitAttrVal, trainData)
        else:    
            n.children[splitAttrVal]=train(splitData,newAttrs)
#     n.childFill(trainData)
    return

In [57]:
import numpy as np
size = len(cleanedData)
trainSize = 4*size//5
print(trainSize,size)
cleanedData = np.random.permutation(cleanedData)
n=train(cleanedData[:trainSize],list(possVals.keys()))

546 683


In [56]:
# Testing function to check the accuracy of the algorithm
def testing1(n,trainData):
    ab=0
    x=0
    for ex in trainData:
        p=n
        try:
            while p.testAttr!='':
                child=ex[p.testAttr]
                p=p.children[child]
#             print(p.predictedClass,ex[className])
            if p.predictedClass==ex[className]:
                x+=1
        except:
            ab+=1
    return x,ab

In [43]:
cor,incor=testing1(n,cleanedData[trainSize+1:])

In [44]:
print('accuracy={0}'.format(cor/136*100))

accuracy=0.0


In [41]:
printTree(n)

Class
children are:
4 
2 


