In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math
from matplotlib import style
from collections import Counter
style.use('fivethirtyeight') #Shows Grid
import pandas as pd
import random

In [3]:
# Load the data set; '?' entries in the CSV are parsed as missing values (NaN).
df = pd.read_csv('Breast-Cancer.csv', na_values=['?'])
# Per-column means of the raw frame. This is taken before 'id' is dropped,
# so the dictionary also contains an entry for 'id'.
means = df.mean().to_dict()
# Remove the identifier column. The column is named via `columns=` because the
# positional form `drop(labels, 1)` raises TypeError on pandas >= 2.0.
df = df.drop(columns=['id'])
# Column names of the remaining frame; Question.__repr__ indexes into this list.
header = list(df)
# Replace every missing value with the mean of its column.
df = df.fillna(df.mean())
# Plain list-of-lists of floats, one inner list per row.
full_data = df.astype(float).values.tolist()
full_data

[[5.0, 1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0],
 [5.0, 4.0, 4.0, 5.0, 7.0, 10.0, 3.0, 2.0, 1.0, 2.0],
 [3.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 1.0, 1.0, 2.0],
 [6.0, 8.0, 8.0, 1.0, 3.0, 4.0, 3.0, 7.0, 1.0, 2.0],
 [4.0, 1.0, 1.0, 3.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0],
 [8.0, 10.0, 10.0, 8.0, 7.0, 10.0, 9.0, 7.0, 1.0, 4.0],
 [1.0, 1.0, 1.0, 1.0, 2.0, 10.0, 3.0, 1.0, 1.0, 2.0],
 [2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0],
 [2.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 5.0, 2.0],
 [4.0, 2.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0],
 [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 2.0],
 [2.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0],
 [5.0, 3.0, 3.0, 3.0, 2.0, 3.0, 4.0, 4.0, 1.0, 4.0],
 [1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 1.0, 1.0, 2.0],
 [8.0, 7.0, 5.0, 10.0, 7.0, 9.0, 5.0, 5.0, 4.0, 4.0],
 [7.0, 4.0, 6.0, 4.0, 6.0, 1.0, 4.0, 3.0, 1.0, 4.0],
 [4.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0],
 [4.0, 1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0],
 [10.0, 7.0, 7.0, 6.0, 4.0, 10.0, 4.0, 1

In [3]:
# First split: the trailing 50% of full_data is held out for testing,
# everything before it is used for training (no shuffling is applied).
test_size1 = 0.5
n_test1 = int(test_size1 * len(full_data))
train_data1, test_data1 = full_data[:-n_test1], full_data[-n_test1:]
len(test_data1)

349

In [4]:
# Second split: the trailing 10% of full_data is held out for testing,
# everything before it is used for training (no shuffling is applied).
test_size2 = 0.1
n_test2 = int(test_size2 * len(full_data))
train_data2, test_data2 = full_data[:-n_test2], full_data[-n_test2:]
len(test_data2)

69

In [5]:
# Third split: the trailing 30% of full_data is held out for testing,
# everything before it is used for training (no shuffling is applied).
test_size3 = 0.3
n_test3 = int(test_size3 * len(full_data))
train_data3, test_data3 = full_data[:-n_test3], full_data[-n_test3:]
len(test_data3)

209

In [6]:
def unique_vals(Data,col):
    """Return the set of distinct values found at index ``col`` across the rows of ``Data``."""
    return {record[col] for record in Data}

In [7]:
def class_counts(Data):
    """Tally how often each label occurs in ``Data``.

    The label of a row is its last element. The result is a plain dict
    mapping label -> number of rows, with keys in first-seen order.
    """
    tally = {}
    for record in Data:
        tally[record[-1]] = tally.get(record[-1], 0) + 1
    return tally

In [8]:
class Question:
    """An equality test on a single column, used to split rows into two groups.

    ``column`` is the index inspected in each row and ``value`` is the value
    that the entry at that index is compared against.
    """

    def __init__(self,column,value):
        self.column, self.value = column, value

    def match(self,example):
        """Return the result of ``example[column] == value``."""
        return example[self.column] == self.value

    def __repr__(self):
        # The column name comes from the module-level `header` list.
        operator = "=="
        column_name = header[self.column]
        return "Is %s %s %s?" % (column_name, operator, str(self.value))

In [9]:
def partition(Data,question):
    """Split ``Data`` into the rows that satisfy ``question`` and those that do not.

    Returns a ``(true_rows, false_rows)`` tuple of lists; row order within
    each list follows the order in ``Data``.
    """
    matched, unmatched = [], []
    for record in Data:
        bucket = matched if question.match(record) else unmatched
        bucket.append(record)
    return matched, unmatched

In [10]:
def gini(Data):
    """Gini impurity of the labels in ``Data``.

    Starts from 1 and subtracts the squared relative frequency of every
    label reported by ``class_counts``. With no rows nothing is subtracted,
    so the result is 1.
    """
    n_rows = float(len(Data))
    impurity = 1
    for label_count in class_counts(Data).values():
        share = label_count / n_rows
        impurity -= share ** 2
    return impurity

In [11]:
def info_gain(left,right,current_uncertainty):
    """Impurity reduction achieved by splitting into ``left`` and ``right``.

    The result is ``current_uncertainty`` minus the Gini impurity of each
    side, weighted by that side's share of the combined row count.
    """
    n_left = len(left)
    p = float(n_left) / (n_left + len(right))
    weighted_left = p * gini(left)
    weighted_right = (1 - p) * gini(right)
    return current_uncertainty - weighted_left - weighted_right

In [12]:
def find_best_split(Data):
    """Search every (column, value) equality question for the highest gain.

    Every column except the last one (the label) is considered, and every
    distinct value in that column is tried as a ``Question``. Questions that
    leave one side of the partition empty are skipped.

    Returns a ``(best_gain, best_question)`` tuple. ``best_question`` is
    ``None`` when no usable question exists. An empty ``Data`` yields
    ``(0, None)``.
    """
    # With no rows there is nothing to split on, and Data[0] below would
    # raise IndexError. Report "no gain" so callers fall through to a leaf.
    if len(Data) == 0:
        return 0, None

    best_gain = 0
    best_question = None
    current_uncertainty = gini(Data)
    n_features = len(Data[0]) - 1  # last element of each row is the label
    for col in range(n_features):
        for val in unique_vals(Data, col):
            question = Question(col, val)
            true_rows, false_rows = partition(Data, question)
            # A question that does not divide the rows is useless.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue
            gain = info_gain(true_rows, false_rows, current_uncertainty)
            # `>=` means that among equally good questions the one seen
            # last wins.
            if gain >= best_gain:
                best_gain, best_question = gain, question
    return best_gain, best_question

In [13]:
class Leaf:
    """Terminal tree node.

    ``predictions`` holds the label counts (as produced by ``class_counts``)
    of the rows that reached this node.
    """

    def __init__(self,Data):
        label_tally = class_counts(Data)
        self.predictions = label_tally

In [14]:
class Decision_Node:
    """Internal tree node: a question plus the subtree for each answer.

    ``true_branch`` is followed by rows that match ``question``;
    ``false_branch`` by rows that do not.
    """

    def __init__(self, question, true_branch,false_branch):
        self.question = question
        self.true_branch, self.false_branch = true_branch, false_branch

In [15]:
def build_tree(Data,i=0):
    """Recursively grow a decision tree from ``Data``.

    When the best available split has zero gain the rows become a ``Leaf``.
    Otherwise the rows are partitioned by the best question and a subtree is
    built for each side (matching rows first). ``i`` is only forwarded to
    the recursive calls; it is not otherwise used here.
    """
    gain, question = find_best_split(Data)

    if gain != 0:
        matched, unmatched = partition(Data, question)
        return Decision_Node(
            question,
            build_tree(matched, i),
            build_tree(unmatched, i),
        )
    return Leaf(Data)

In [16]:
def print_tree(node,spacing=""):
    """Print the tree rooted at ``node``, indenting one space per level.

    A leaf prints its label counts; a decision node prints its question
    followed by its true subtree and then its false subtree.
    """
    if isinstance(node, Leaf):
        print(spacing + "Predict",node.predictions)
        return

    print(spacing+str(node.question))
    branches = (
        ("--> True:", node.true_branch),
        ("--> False:", node.false_branch),
    )
    for caption, subtree in branches:
        print(spacing + caption)
        print_tree(subtree, spacing + " ")
    

In [17]:
def print_leaf(counts):
    """Convert a label -> count mapping into label -> percentage strings.

    Each percentage is the label's share of the total count, rounded down
    to a whole number and suffixed with "%". An empty mapping yields an
    empty dict.
    """
    total = sum(counts.values())
    probs = {}
    for lbl, count in counts.items():
        # Multiply before dividing: `count / total * 100` suffers from
        # binary floating-point rounding (29 / 100 * 100 evaluates to
        # 28.999999999999996, which truncates to 28 instead of 29).
        probs[lbl] = str(int(count * 100 // total)) + "%"
    return probs

In [18]:
def classify(row,node):
    """Walk ``row`` down the tree starting at ``node`` and return the leaf's label counts.

    At each decision node the true branch is taken when the node's question
    matches the row, otherwise the false branch, until a ``Leaf`` is reached.
    """
    current = node
    while not isinstance(current, Leaf):
        if current.question.match(row):
            current = current.true_branch
        else:
            current = current.false_branch
    return current.predictions

In [19]:
# Grow a tree from the training part of the first split (test_size1)
# and print its structure.
my_tree = build_tree(train_data1)
print_tree(my_tree)

Is x2 == 1.0?
--> True:
 Is x6 == 10.0?
 --> True:
  Is x8 == 1.0?
  --> True:
   Predict {2.0: 1}
  --> False:
   Predict {4.0: 2}
 --> False:
  Is x8 == 10.0?
  --> True:
   Predict {4.0: 1}
  --> False:
   Is x6 == 5.0?
   --> True:
    Is x7 == 2.0?
    --> True:
     Predict {4.0: 1}
    --> False:
     Predict {2.0: 4}
   --> False:
    Predict {2.0: 155}
--> False:
 Is x3 == 1.0?
 --> True:
  Predict {2.0: 9}
 --> False:
  Is x3 == 2.0?
  --> True:
   Is x9 == 1.0?
   --> True:
    Predict {2.0: 8}
   --> False:
    Predict {4.0: 3}
  --> False:
   Is x8 == 2.0?
   --> True:
    Is x9 == 1.0?
    --> True:
     Predict {2.0: 3}
    --> False:
     Predict {4.0: 2}
   --> False:
    Is x6 == 3.5446559297218156?
    --> True:
     Is x1 == 8.0?
     --> True:
      Predict {4.0: 2}
     --> False:
      Predict {2.0: 3}
    --> False:
     Is x1 == 4.0?
     --> True:
      Is x8 == 3.0?
      --> True:
       Predict {2.0: 1}
      --> False:
       Predict {4.0: 2}
     --> Fals

In [20]:
def calc_accuracy(test_data, my_tree):
    """Fraction of rows in ``test_data`` whose label the tree predicts correctly.

    The predicted label for a row is the label with the highest count in the
    leaf that ``classify`` reaches; on a tie the label listed first in the
    leaf's counts is used. A row whose leaf has no counts is scored as
    incorrect. Returns 0.0 when ``test_data`` is empty.
    """
    correct, total = 0, 0
    for row in test_data:
        predictions = classify(row, my_tree)
        # Compare against the majority label only. Accepting any label that
        # merely appears in the leaf would score impure leaves as correct
        # for every class they contain.
        if predictions and row[-1] == max(predictions, key=predictions.get):
            correct += 1
        total += 1
    # Avoid ZeroDivisionError when there is nothing to evaluate.
    if total == 0:
        return 0.0
    return correct / total

In [21]:
# For each held-out row of the first split, show its actual label next to the
# percentage breakdown of the leaf it lands in, then score the whole split.
for row in test_data1:
    leaf_summary = print_leaf(classify(row, my_tree))
    print("Actual: %s. Predicted: %s" % (row[-1], leaf_summary))
accuracy = calc_accuracy(test_data1, my_tree)

Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {4.0: '100%'}
Actual: 4.0. Predicted: {4.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 4.0. Predicted: {4.0: '100%'}
Actual: 4.0. Predicted: {4.0: '100%'}
Actual: 4.0. Predicted: {4.0: '100%'}
Actual: 4.0. Predicted: {4.0: '100%'}
Actual: 4.0. Predicted: {4.0: '100%'}
Actual: 4.0. Predicted: {4.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {4.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 4.0. Predicted: {4.0: '100%'}
Actual: 4.0. Predicted: {4.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0.

In [22]:
# Report the accuracy measured on test_data1 (test_size1 = 0.5).
print(accuracy,"accuracy for 50% train data and 50% test data")

0.9455587392550143 accuracy for 50% train data and 50% test data


In [23]:
# Grow a tree from the training part of the second split (test_size2)
# and print its structure.
my_tree2 = build_tree(train_data2)
print_tree(my_tree2)

Is x2 == 1.0?
--> True:
 Is x6 == 10.0?
 --> True:
  Is x8 == 1.0?
  --> True:
   Predict {2.0: 1}
  --> False:
   Predict {4.0: 2}
 --> False:
  Is x8 == 10.0?
  --> True:
   Predict {4.0: 1}
  --> False:
   Is x6 == 5.0?
   --> True:
    Is x5 == 1.0?
    --> True:
     Predict {4.0: 1}
    --> False:
     Predict {2.0: 7}
   --> False:
    Predict {2.0: 324}
--> False:
 Is x6 == 1.0?
 --> True:
  Is x3 == 10.0?
  --> True:
   Predict {4.0: 6}
  --> False:
   Is x2 == 4.0?
   --> True:
    Is x5 == 3.0?
    --> True:
     Predict {2.0: 1}
    --> False:
     Predict {4.0: 3}
   --> False:
    Is x5 == 10.0?
    --> True:
     Predict {4.0: 2}
    --> False:
     Is x9 == 4.0?
     --> True:
      Predict {4.0: 1}
     --> False:
      Is x7 == 10.0?
      --> True:
       Predict {4.0: 1}
      --> False:
       Predict {2.0: 43}
 --> False:
  Is x3 == 2.0?
  --> True:
   Is x7 == 3.0?
   --> True:
    Predict {2.0: 5}
   --> False:
    Is x7 == 2.0?
    --> True:
     Predict {2.0: 

In [24]:
# For each held-out row of the second split, show its actual label next to the
# percentage breakdown of the leaf it lands in, then score the whole split.
for row in test_data2:
    leaf_summary = print_leaf(classify(row, my_tree2))
    print("Actual: %s. Predicted: %s" % (row[-1], leaf_summary))
accuracy2 = calc_accuracy(test_data2, my_tree2)

Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 4.0. Predicted: {4.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 4.0. Predicted: {4.0: '100%'}
Actual: 2.0. Predicted: {4.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 4.0. Predicted: {4.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0.

In [25]:
# Report the accuracy measured on test_data2 (test_size2 = 0.1).
print(accuracy2,"accuracy for 90% train data and 10% test data")

0.9565217391304348 accuracy for 90% train data and 10% test data


In [26]:
# Grow a tree from the training part of the third split (test_size3)
# and print its structure.
my_tree3 = build_tree(train_data3)
print_tree(my_tree3)

Is x2 == 1.0?
--> True:
 Is x6 == 10.0?
 --> True:
  Is x8 == 1.0?
  --> True:
   Predict {2.0: 1}
  --> False:
   Predict {4.0: 2}
 --> False:
  Is x8 == 10.0?
  --> True:
   Predict {4.0: 1}
  --> False:
   Is x6 == 5.0?
   --> True:
    Is x7 == 2.0?
    --> True:
     Predict {4.0: 1}
    --> False:
     Predict {2.0: 5}
   --> False:
    Predict {2.0: 230}
--> False:
 Is x6 == 1.0?
 --> True:
  Is x3 == 10.0?
  --> True:
   Predict {4.0: 6}
  --> False:
   Is x3 == 4.0?
   --> True:
    Is x8 == 1.0?
    --> True:
     Predict {2.0: 1}
    --> False:
     Predict {4.0: 3}
   --> False:
    Is x5 == 10.0?
    --> True:
     Predict {4.0: 1}
    --> False:
     Is x5 == 6.0?
     --> True:
      Predict {4.0: 1}
     --> False:
      Predict {2.0: 34}
 --> False:
  Is x3 == 2.0?
  --> True:
   Is x7 == 3.0?
   --> True:
    Predict {2.0: 5}
   --> False:
    Is x9 == 1.0?
    --> True:
     Is x7 == 4.0?
     --> True:
      Predict {4.0: 1}
     --> False:
      Predict {2.0: 2}
  

In [27]:
# For each held-out row of the third split, show its actual label next to the
# percentage breakdown of the leaf it lands in, then score the whole split.
for row in test_data3:
    leaf_summary = print_leaf(classify(row, my_tree3))
    print("Actual: %s. Predicted: %s" % (row[-1], leaf_summary))
accuracy3 = calc_accuracy(test_data3, my_tree3)

Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 4.0. Predicted: {4.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 4.0. Predicted: {4.0: '100%'}
Actual: 2.0. Predicted: {4.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 4.0. Predicted: {4.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 2.0. Predicted: {2.0: '100%'}
Actual: 4.0. Predicted: {4.0: '100%'}
Actual: 4.0. Predicted: {2.0: '100%'}
Actual: 2.0.

In [28]:
# Report the accuracy measured on test_data3 (test_size3 = 0.3).
print(accuracy3,"accuracy for 70% train data and 30% test data")

0.9617224880382775 accuracy for 70% train data and 30% test data
