# Naive Bayes
Matthew Dacre (2091295)

Alex Vogt (2152320)

James Alence (2117129)

Joshua Wacks (2143116)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import openpyxl
from pathlib import Path

This code is used to predict voting habits of responsees using the following factors:


    Urban or rural primary sample unit
    Views on countries current economic situation
    If they discuss politics
    Requests government action
    Views on corruption
    Education
    



# Binarisation:

## Urban/Rural:

&emsp;Urban: 0

&emsp;Rural: 1

&emsp;Tribal: 2

&emsp;Rural farm = 3


## Views on country's economic condition:


&emsp;Very Bad:0

&emsp;Fairly Bad: 1

&emsp;Fairly Good: 2

&emsp;Very Good: 3

&emsp;Neither good nor bad: 4

&emsp;Don't know: -1



## Discuss Politics:

&emsp;Never: 0

&emsp;Occasionally: 1

&emsp;Frequently: 2

&emsp;Don't know: -1



## Voted:

&emsp;Did Voted: 1

&emsp;Don't know: -1

&emsp;All others: 0


## Requests gov action

&emsp;No, would never do this: 0

&emsp;No, but would do if had the chance: 1

&emsp;Yes, once or twice: 2

&emsp;Yes, several times: 3 

&emsp;Yes, often: 4

&emsp;Don't know: -1


## Corruption:

&emsp;Decreased a lot: 0

&emsp;Decreased somewhat: 1

&emsp;Stayed the same: 2

&emsp;Increased somewhat: 3

&emsp;Increased a lot: 4

&emsp;Don't know: -1



## Education:

&emsp; No formal schooling: 0

&emsp;Informal schooling only: 1

&emsp;Some primary schooling: 2

&emsp;Primary school completed, Some secondary school / high school: 3

&emsp;Secondary school / high school completed: 4

&emsp;Some university: 5

&emsp;University completed, Post-graduate, Post-secondary qualifications, other than university: 6

&emsp;Don't know, Inside the house, Outside the compound: -1





&emsp;

The Data is stored in an excel file called "MLData_Trimmed2.xlsx". Using opnpyxl, the data from the file is loaded into the variable "sheet", and then into an $2389x8$ array

In [2]:
xlsx_file = Path('MLData_Trimmed2.xlsx')
wb_obj = openpyxl.load_workbook(xlsx_file)
sheet = wb_obj.active 

In [3]:
data = np.zeros((sheet.max_row-2, sheet.max_column))

ir = 0
ic = 0

#Iterates trhough each row/cell and places data in $data
for row in sheet.iter_rows(2, max_row = sheet.max_row - 1):
    for cell in row:
        if ic == 0:
            #RESPNO
            data[ir, ic] = cell.value
            
        elif ic == 1:
            #Urban/Rural
            if cell.value == 'Urban':
                data[ir, ic] = 0
            elif cell.value == 'Rural':
                data[ir, ic] = 1
            elif cell.value == 'Tribal':
                data[ir, ic] = 2
            else:
                data[ir, ic] = 3
                
        elif ic == 2:
            #Views on Country's Economic Condition
            if cell.value == "Don't know":
                data[ir, ic] = -1
            elif cell.value == 'Very bad':
                data[ir, ic] = 0
            elif cell.value == 'Fairly bad':
                data[ir, ic] = 1
            elif cell.value == 'Fairly Good':
                data[ir, ic] = 2
            elif cell.value == 'Very good':
                data[ir, ic] = 3
            elif cell.value == 'Neither good nor bad':
                data[ir, ic] = 4
                
        elif ic == 3:
            #Discuss Politics?
            if cell.value == "Don't know":
                data[ir, ic] = -1
            elif cell.value == "Never":
                data[ir, ic] = 0
            elif cell.value == "Occasionally":
                data[ir, ic] = 1
            elif cell.value == "Frequently":
                data[ir, ic] = 2
                
        elif ic == 4:
            #Voted?
            if cell.value == "Don't know":
                data[ir, ic] = -1
            elif cell.value == "You voted in the elections":
                data[ir, ic] = 1
            else:
                data[ir, ic] = 0
            
        elif ic == 5:
            #Requests gov assistance
            if cell.value == "Don't know":
                data[ir, ic] = -1
            elif cell.value == "No, would never do this":
                data[ir, ic] = 0
            elif cell.value == "No, but would do if had the chance":
                data[ir, ic] = 1
            elif cell.value == "Yes, once or twice":
                data[ir, ic] = 2
            elif cell.value == "Yes, several times":
                data[ir, ic] = 3
            elif cell.value == "Yes, often":
                data[ir, ic] = 4
            
            
        elif ic == 6:
            #Corruption
            if cell.value == "Don't know":
                data[ir, ic] = -1
            elif cell.value == "Decreased a lot":
                data[ir, ic] = 0
            elif cell.value == "Decreased somewhat":
                data[ir, ic] = 1
            elif cell.value == "Stayed the same":
                data[ir, ic] = 2
            elif cell.value == "Increased somewhat":
                data[ir, ic] = 3
            elif cell.value == "Increased a lot":
                data[ir, ic] = 4
        elif ic == 7:
            #Education
            invalidStrings = ["Don't know", "Outside the compound", "Inside the house"]
            if cell.value in invalidStrings:
                data[ir, ic] = -1
            elif cell.value == "No formal schooling":
                data[ir, ic] = 0
            elif cell.value == "Informal schooling only":
                data[ir, ic] = 1
            elif cell.value == "Some primary schooling":
                data[ir, ic] = 2
            elif cell.value == "Primary school completed" or cell.value == "Some secondary school / high school":
                data[ir, ic] = 3
            elif cell.value == "Secondary school / high school completed":
                data[ir, ic] = 4
            elif cell.value == "Some university":
                data[ir, ic] = 5
            elif cell.value == "University completed" or cell.value == "Post-graduate" or cell.value == "Post-secondary qualifications, other than university":
                data[ir, ic] = 6
        ic += 1
    ic = 0
    ir += 1

The next cell splits the data into Training (60%), Validation (20%) and Testing (20%) sets

In [4]:
ind = np.arange(2389)
np.random.shuffle(ind)
trainingInd = ind[0:1433]
validationInd = ind[1433:1910]
testingInd = ind[1910:]

The next two cells initialise and populate dictionaries that hold the number of occurrences of each response. These are stored as integers according to the binarisation detailed above.

In [5]:
dictHomeVoted = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0
}
dictHome = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0
}
dictEcoVoted = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0,
    4 : 0
}
dictEco = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0,
    4 : 0
}
dictDiscussVoted = {
    0 : 0,
    1 : 0,
    2 : 0
}
dictDiscuss = {
    0 : 0,
    1 : 0,
    2 : 0
}
dictGovActionVoted = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0,
    4 : 0   
}
dictGovAction = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0,
    4 : 0   
}
dictCorruptVoted = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0,
    4 : 0
}
dictCorrupt = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0,
    4 : 0
}
dictEduVoted = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0,
    4 : 0,
    5 : 0,
    6 : 0
}
dictEdu = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0,
    4 : 0,
    5 : 0,
    6 : 0
}
dictVoted = {
    0 : 0,
    1 : 0
}

dictList = [dictVoted, dictHome, dictEco, dictDiscuss, 0, dictGovAction, dictCorrupt, dictEdu, 1, dictHomeVoted, dictEcoVoted, dictDiscussVoted, 1, dictGovActionVoted, dictCorruptVoted, dictEduVoted]

#Splitting into Training, Validation and Testing data
dataTrain = np.take(data, trainingInd, axis=0)
dataVal = np.take(data, validationInd, axis=0)
dataTest = np.take(data, testingInd, axis=0)

In [6]:
#Storing counts of each entry from the training data

for rowCount, row in enumerate(dataTrain):
    if row[4] != -1:
        dictList[0][row[4]] += 1
    for colCount, col in enumerate(row):
        #Need to skip index 0 (RespNo) and index 4 (Voted)
        if colCount == 0 or colCount == 4:
            continue
        else:
            if not -1 in row:
                if row[4] == 0:
                    dictList[colCount][col] += 1
                else:
                    dictList[colCount + 8][col] += 1

Dictlist contains the count of each response, according to if the respondent voted or not. WE can now use Naive Bayes calssification to predict if a respondant voted or not. The counts are stored in $dictList$. $dictList[0:8]$ stores the counts from the respondants that did not vote, and $dictList[8:16]$ contains the counts from the respondants that voted

The cells below find and report the Training, Validation and Tetsing accuracies

In [7]:
#Training accuracy
resultsTrain = np.zeros((len(dataTrain), 2))
for rowCount, row in enumerate(dataTrain):
    voteProb = 1
    notVoteProb = 1
    #P(row|not voted)
    for count, _dict in enumerate(dictList[:8]):
        if count != 4 and count != 0:
            for i in range(len(_dict)):
                if i == row[count]:
                    voteProb *= (dictList[count+8][i] + 1)/(dictList[0][1] + len(_dict))
                    notVoteProb *= (_dict[i] + 1)/(dictList[0][0] + len(_dict))
                else:
                    voteProb *= (1 - (dictList[count+8][i] + 1)/(dictList[0][1] + len(_dict)))
                    notVoteProb *= (1- (_dict[i] + 1)/(dictList[0][0] + len(_dict)))
    votePrediction = (voteProb*(dictList[0][1]/(dictList[0][0] + dictList[0][1])))/(voteProb*(dictList[0][1]/(dictList[0][0] + dictList[0][1])) + notVoteProb*(dictList[0][0]/(dictList[0][0] + dictList[0][1])))
    if votePrediction > 0.5:
        resultsTrain[rowCount] = np.array([1, row[4]])
    else:
        resultsTrain[rowCount] = np.array([0, row[4]])
        
accuracyTraining = 0
correct = [0, 0]
incorrect = [0, 0]
for row in resultsTrain:
    if row[0] == row[1]:
        accuracyTraining += 1
        correct[int(row[1])] += 1
    else:
        incorrect[int(row[1])] += 1
        

print("Training Accuracy = " + str(accuracyTraining/len(resultsTrain)*100)+"%")
print("Confusion Matrix:")
print("\t\tPredicted: 0\t Predicted:1")
print("\t\t---------------------------------")
print("Actual: 0" + "\t|\t" + str(correct[0]) + "\t|\t" + str(incorrect[0]) + "\t|")
print("\t\t---------------------------------")
print("Actual: 1" + "\t|\t" + str(incorrect[1]) + "\t|\t" + str(correct[1]) + "\t|")
print("\t\t---------------------------------")

Training Accuracy = 71.31891137473832%
Confusion Matrix:
		Predicted: 0	 Predicted:1
		---------------------------------
Actual: 0	|	45	|	358	|
		---------------------------------
Actual: 1	|	53	|	977	|
		---------------------------------


In [8]:
#Validation accuracy
resultsValidate = np.zeros((len(dataVal), 2))
for rowCount, row in enumerate(dataVal):
    voteProb = 1
    notVoteProb = 1
    #P(row|not voted)
    for count, _dict in enumerate(dictList[:8]):
        if count != 4 and count != 0:
            for i in range(len(_dict)):
                if i == row[count]:
                    voteProb *= (dictList[count+8][i] + 1)/(dictList[0][1] + len(_dict))
                    notVoteProb *= (_dict[i] + 1)/(dictList[0][0] + len(_dict))
                else:
                    voteProb *= (1 - (dictList[count+8][i] + 1)/(dictList[0][1] + len(_dict)))
                    notVoteProb *= (1- (_dict[i] + 1)/(dictList[0][0] + len(_dict)))
    votePrediction = (voteProb*(dictList[0][1]/(dictList[0][0] + dictList[0][1])))/(voteProb*(dictList[0][1]/(dictList[0][0] + dictList[0][1])) + notVoteProb*(dictList[0][0]/(dictList[0][0] + dictList[0][1])))
    if votePrediction > 0.5:
        resultsValidate[rowCount] = np.array([1, row[4]])
    else:
        resultsValidate[rowCount] = np.array([0, row[4]])
        
accuracyValidation = 0
correct = [0, 0]
incorrect = [0, 0]
for row in resultsValidate:
    if row[0] == row[1]:
        accuracyValidation += 1
        correct[int(row[1])] += 1
    else:
        incorrect[int(row[1])] += 1

print("Validation Accuracy = " + str(accuracyValidation/len(resultsValidate)*100)+"%")
print("Confusion Matrix:")
print("\t\tPredicted: 0\t Predicted:1")
print("\t\t---------------------------------")
print("Actual: 0" + "\t|\t" + str(correct[0]) + "\t|\t" + str(incorrect[0]) + "\t|")
print("\t\t---------------------------------")
print("Actual: 1" + "\t|\t" + str(incorrect[1]) + "\t|\t" + str(correct[1]) + "\t|")
print("\t\t---------------------------------")

Validation Accuracy = 72.53668763102725%
Confusion Matrix:
		Predicted: 0	 Predicted:1
		---------------------------------
Actual: 0	|	15	|	117	|
		---------------------------------
Actual: 1	|	14	|	331	|
		---------------------------------


In [9]:
#Testing accuracy
resultsTesting = np.zeros((len(dataTest), 2))
for rowCount, row in enumerate(dataTest):
    voteProb = 1
    notVoteProb = 1
    #P(row|not voted)
    for count, _dict in enumerate(dictList[:8]):
        if count != 4 and count != 0:
            for i in range(len(_dict)):
                if i == row[count]:
                    voteProb *= (dictList[count+8][i] + 1)/(dictList[0][1] + len(_dict))
                    notVoteProb *= (_dict[i] + 1)/(dictList[0][0] + len(_dict))
                else:
                    voteProb *= (1 - (dictList[count+8][i] + 1)/(dictList[0][1] + len(_dict)))
                    notVoteProb *= (1- (_dict[i] + 1)/(dictList[0][0] + len(_dict)))
    votePrediction = (voteProb*(dictList[0][1]/(dictList[0][0] + dictList[0][1])))/(voteProb*(dictList[0][1]/(dictList[0][0] + dictList[0][1])) + notVoteProb*(dictList[0][0]/(dictList[0][0] + dictList[0][1])))
    if votePrediction > 0.5:
        resultsTesting[rowCount] = np.array([1, row[4]])
    else:
        resultsTesting[rowCount] = np.array([0, row[4]])
        
accuracyTesting = 0
correct = [0, 0]
incorrect = [0, 0]
for row in resultsTesting:
    if row[0] == row[1]:
        accuracyTesting += 1
        correct[int(row[1])] += 1
    else:
        incorrect[int(row[1])] += 1
print("Testing Accuracy = " + str(accuracyTesting/len(resultsTesting)*100)+"%")
print("Confusion Matrix:")
print("\t\tPredicted: 0\t Predicted:1")
print("\t\t---------------------------------")
print("Actual: 0" + "\t|\t" + str(correct[0]) + "\t|\t" + str(incorrect[0]) + "\t|")
print("\t\t---------------------------------")
print("Actual: 1" + "\t|\t" + str(incorrect[1]) + "\t|\t" + str(correct[1]) + "\t|")
print("\t\t---------------------------------")

Testing Accuracy = 71.18997912317327%
Confusion Matrix:
		Predicted: 0	 Predicted:1
		---------------------------------
Actual: 0	|	14	|	121	|
		---------------------------------
Actual: 1	|	17	|	327	|
		---------------------------------
