In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import sys
from scipy.stats import mode

# Part 2: Naive Bayes Method

In [2]:
# Load the training and test splits. "Unnamed: 0" is an id column that the
# model does not need, so it is dropped from both frames right after reading.
trainingSet = pd.read_csv('breast-cancer-training.csv').drop(columns=["Unnamed: 0"])

testSet = pd.read_csv('breast-cancer-test.csv').drop(columns=["Unnamed: 0"])

In [3]:
# Column names of the training frame as an array. Per the displayed output,
# index 0 is 'class' and the remaining entries are the feature names.
trainingFeatures = trainingSet.columns.values
trainingFeatures

array(['class', 'age', 'menopause', 'tumor-size', 'inv-nodes',
       'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiat'],
      dtype=object)

In [4]:
# Raw rows of the training frame; column 0 of each row is the class label.
trainingValues = trainingSet.values
rows, cols = trainingValues.shape
# Collect the distinct labels that occur in the class column.
classLabels = {record[0] for record in trainingValues}
classLabels

{'no-recurrence-events', 'recurrence-events'}

In [5]:
# Every value each feature may take, keyed by feature name. The evenly spaced
# 'lo-hi' bin labels are generated from their ranges instead of typed by hand.
featuresToFeatureValues = {
    # 10-year bins: '10-19' ... '90-99'
    'age': ['{}-{}'.format(lo, lo + 9) for lo in range(10, 100, 10)],
    'menopause': ['lt40', 'ge40', 'premeno'],
    # 5-unit bins: '0-4' ... '55-59'
    'tumor-size': ['{}-{}'.format(lo, lo + 4) for lo in range(0, 60, 5)],
    # 3-unit bins '0-2' ... '33-35'; the final bin '36-39' is one unit wider,
    # so it is listed explicitly.
    'inv-nodes': ['{}-{}'.format(lo, lo + 2) for lo in range(0, 36, 3)] + ['36-39'],
    'node-caps': ['yes', 'no'],
    'deg-malig': [1, 2, 3],
    'breast': ['left', 'right'],
    'breast-quad': ['left_up', 'left_low', 'right_up', 'right_low', 'central'],
    'irradiat': ['yes', 'no'],
}
len(featuresToFeatureValues)

9

In [6]:
#returns a probability table
def naiveBayesTraining(trainSet, features, classes, featureValues):
    """Estimate Naive Bayes probabilities with add-one smoothing.

    Every class count and every (feature, value, class) count starts at 1
    before the training rows are tallied, so no estimated probability is zero.

    Parameters
    ----------
    trainSet : 2-D array or sequence of rows
        Each row holds the class label at index 0 and the value of
        ``features[j]`` at index ``j``.
    features : 1-D array or sequence
        Column names aligned with the rows; index 0 is the class column and
        is skipped.
    classes : iterable
        The class labels. Repeated labels are counted once.
    featureValues : dict
        Maps each feature name to the list of values it may take. Repeated
        values in a list are counted once.

    Returns
    -------
    tuple
        ``(classProbabilities, featureToValueToClassProb)`` where
        ``classProbabilities[c]`` is the smoothed P(Y = c) and
        ``featureToValueToClassProb[feat][value][c]`` is the smoothed
        P(feat = value | Y = c).

    Raises
    ------
    KeyError
        If a training row contains a class label that is not in ``classes``
        or a feature value that is not listed in ``featureValues``.
    """
    # Materialise once (the labels are iterated several times) and drop
    # repeats while keeping first-seen order.
    classLabels = list(dict.fromkeys(classes))
    # Index 0 of `features` is the class column; the rest are real features.
    featureNames = list(features[1:])

    # Initialise every count to 1 (the smoothing pseudo-count).
    classCount = dict()
    featureToValueToClassProb = dict()
    for label in classLabels:
        classCount[label] = 1
        for feat in featureNames:
            perValue = featureToValueToClassProb.setdefault(feat, dict())
            for value in featureValues[feat]:
                perValue.setdefault(value, dict())[label] = 1

    # Tally the classes and feature values seen in the training rows.
    for row in trainSet:
        label = row[0]
        classCount[label] += 1
        for j, feat in enumerate(featureNames, start=1):
            featureToValueToClassProb[feat][row[j]][label] += 1

    # Class priors: smoothed class count over the sum of all class counts.
    class_total = sum(classCount.values())
    classProbabilities = {
        label: classCount[label] / class_total for label in classLabels
    }

    # Conditionals: within one feature and one class, divide each value's
    # count by the sum over that feature's distinct values. Iterating the
    # dict (not the caller's list) visits each distinct value exactly once.
    for perValue in featureToValueToClassProb.values():
        for label in classLabels:
            denominator = sum(byClass[label] for byClass in perValue.values())
            for byClass in perValue.values():
                byClass[label] /= denominator
    return classProbabilities, featureToValueToClassProb

In [7]:
# Fit the model on the training rows and display both probability tables.
cProb, featProb = naiveBayesTraining(
    trainSet=trainingValues,
    features=trainingFeatures,
    classes=classLabels,
    featureValues=featuresToFeatureValues,
)
cProb, featProb

({'no-recurrence-events': 0.7063197026022305,
  'recurrence-events': 0.2936802973977695},
 {'age': {'10-19': {'no-recurrence-events': 0.005050505050505051,
    'recurrence-events': 0.011494252873563218},
   '20-29': {'no-recurrence-events': 0.010101010101010102,
    'recurrence-events': 0.011494252873563218},
   '30-39': {'no-recurrence-events': 0.1111111111111111,
    'recurrence-events': 0.1839080459770115},
   '40-49': {'no-recurrence-events': 0.31313131313131315,
    'recurrence-events': 0.3103448275862069},
   '50-59': {'no-recurrence-events': 0.3282828282828283,
    'recurrence-events': 0.25287356321839083},
   '60-69': {'no-recurrence-events': 0.1919191919191919,
    'recurrence-events': 0.19540229885057472},
   '70-79': {'no-recurrence-events': 0.030303030303030304,
    'recurrence-events': 0.011494252873563218},
   '80-89': {'no-recurrence-events': 0.005050505050505051,
    'recurrence-events': 0.011494252873563218},
   '90-99': {'no-recurrence-events': 0.005050505050505051,
 

In [8]:
# Report the class probability P(Y = c) for each class.
for label in cProb:
    print('P(Y =', label, ') =', cProb[label])

P(Y = no-recurrence-events ) = 0.7063197026022305
P(Y = recurrence-events ) = 0.2936802973977695


In [9]:
# Report every conditional P(feature = value | Y = class) in the table.
for feat in featProb:
    byValue = featProb[feat]
    for featVal in byValue:
        for classVal, prob in byValue[featVal].items():
            print('P(',feat, '=', featVal, '| Y =', classVal, ') = ', prob)

P( age = 10-19 | Y = no-recurrence-events ) =  0.005050505050505051
P( age = 10-19 | Y = recurrence-events ) =  0.011494252873563218
P( age = 20-29 | Y = no-recurrence-events ) =  0.010101010101010102
P( age = 20-29 | Y = recurrence-events ) =  0.011494252873563218
P( age = 30-39 | Y = no-recurrence-events ) =  0.1111111111111111
P( age = 30-39 | Y = recurrence-events ) =  0.1839080459770115
P( age = 40-49 | Y = no-recurrence-events ) =  0.31313131313131315
P( age = 40-49 | Y = recurrence-events ) =  0.3103448275862069
P( age = 50-59 | Y = no-recurrence-events ) =  0.3282828282828283
P( age = 50-59 | Y = recurrence-events ) =  0.25287356321839083
P( age = 60-69 | Y = no-recurrence-events ) =  0.1919191919191919
P( age = 60-69 | Y = recurrence-events ) =  0.19540229885057472
P( age = 70-79 | Y = no-recurrence-events ) =  0.030303030303030304
P( age = 70-79 | Y = recurrence-events ) =  0.011494252873563218
P( age = 80-89 | Y = no-recurrence-events ) =  0.005050505050505051
P( age = 80-89

In [12]:
def predictInstance(testInstance, features, classes, classProb, featProb):
    """Score one instance against every class and pick the best-scoring class.

    The score of a class is ``classProb[c]`` multiplied by
    ``featProb[feat][value][c]`` for each feature, where ``value`` is the
    instance's entry for that feature.

    Parameters
    ----------
    testInstance : sequence
        Values aligned with ``features``; index 0 (the class column) is
        ignored.
    features : 1-D array or sequence
        Column names; index 0 is the class column and is skipped.
    classes : iterable
        Candidate class labels, visited in iteration order.
    classProb : dict
        Maps class label to its probability.
    featProb : dict
        Nested mapping ``featProb[feat][value][c]``.

    Returns
    -------
    tuple
        ``(scores, predictedClass)`` where ``scores`` maps each class to its
        score and ``predictedClass`` is the class with the highest score.
        On a tie the class visited first wins; ``predictedClass`` is ``None``
        when ``classes`` is empty.

    Raises
    ------
    KeyError
        If the instance holds a feature value missing from ``featProb``.
    """
    # Index 0 of `features` is the class column; the rest are real features.
    featureNames = list(features[1:])

    scores = dict()
    # -1 is a floor: assumes scores are non-negative (products of
    # probabilities), so the first class always beats it.
    maxScore = -1
    predictedClass = None
    for classValue in classes:
        score = classProb.get(classValue)
        for j, feat in enumerate(featureNames, start=1):
            score *= featProb[feat][testInstance[j]][classValue]
        scores[classValue] = score
        # Strict '>' keeps the earliest class when scores tie.
        if score > maxScore:
            maxScore = score
            predictedClass = classValue
    return scores, predictedClass

In [16]:
# Classify every test row, print the per-class scores and the prediction next
# to the actual label, then report the fraction of correct predictions.
testValues = testSet.values
rows, cols = testValues.shape
correctPredictions = 0
for instance in testValues:
    actualClass = instance[0]  # column 0 holds the true label
    scores, predictedClass = predictInstance(instance, trainingFeatures, classLabels, cProb, featProb)
    if predictedClass == actualClass:
        correctPredictions += 1
    for classVal in scores:
        print('score(Y = ', classVal, ',', instance[1:], ') = ', scores[classVal])
    print('Predicted class: ', predictedClass, ', Actual class: ', actualClass)
    print()
print('accuracy: ', correctPredictions/rows)

score(Y =  no-recurrence-events , ['50-59' 'premeno' '50-54' '0-2' 'yes' 2 'right' 'left_up' 'yes'] ) =  4.017731924138001e-06
score(Y =  recurrence-events , ['50-59' 'premeno' '50-54' '0-2' 'yes' 2 'right' 'left_up' 'yes'] ) =  7.096642912782161e-06
Predicted class:  recurrence-events , Actual class:  no-recurrence-events

score(Y =  no-recurrence-events , ['50-59' 'ge40' '35-39' '0-2' 'no' 2 'left' 'left_up' 'no'] ) =  0.00033365875538186835
score(Y =  recurrence-events , ['50-59' 'ge40' '35-39' '0-2' 'no' 2 'left' 'left_up' 'no'] ) =  2.6021024013534593e-05
Predicted class:  no-recurrence-events , Actual class:  no-recurrence-events

score(Y =  no-recurrence-events , ['50-59' 'premeno' '10-14' '3-5' 'no' 1 'right' 'left_up' 'no'] ) =  4.707378797592419e-05
score(Y =  recurrence-events , ['50-59' 'premeno' '10-14' '3-5' 'no' 1 'right' 'left_up' 'no'] ) =  9.715041077797865e-07
Predicted class:  no-recurrence-events , Actual class:  no-recurrence-events

score(Y =  no-recurrence-event