In [1]:
import numpy as np
import matplotlib.pyplot as plt
import openpyxl
from pathlib import Path

# Naive Bayes Implementation

This code is used to predict voting habits of responsees using the following factors:


    Urban or rural primary sample unit
    Views on countries current economic situation
    If they discuss politics
    Requests government action
    Views on corruption
    Education
    



# Binarisation:

## Urban/Rural:

Urban: 0

Rural: 1

Tribal: 2

Rural farm = 3


## Views on country's economic condition:


Very Bad:0

Fairly Bad: 1

Fairly Good: 2

Very Good: 3

Neither good nor bad: 4

Don't know: -1



## Discuss Politics:

Never: 0

Occasionally: 1

Frequently: 2

Don't know: -1




## Voted:

Did Voted: 1

Don't know: -1

All others: 0


## Requests gov action

No, would never do this: 0

No, but would do if had the chance: 1

Yes, once or twice: 2

Yes, several times: 3 

Yes, often: 4

Don't know: -1


## Corruption:

Decreased a lot: 0

Decreased somewhat: 1

Stayed the same: 2

Increased somewhat: 3

Increased a lot: 4

Don't know: -1



## Education:

No formal schooling: 0

Informal schooling only: 1

Some primary schooling: 2

Primary school completed, Some secondary school / high school: 3

Secondary school / high school completed: 4

Some university: 5

University completed, Post-graduate, Post-secondary qualifications, other than university: 6

Don't know, Inside the house, Outside the compound: -1


In [5]:
#Storing the excel sheet in $sheet

xlsx_file = Path('MLData_Trimmed2.xlsx')
wb_obj = openpyxl.load_workbook(xlsx_file)
sheet = wb_obj.active 

In [12]:
#This cell stores all of the data in $data, a 
#[sheet.max_row -2, sheet.max_column - 1] size arrray of ints. Data is stored
#according to Binarisation cell

#Initialising $data
data = np.zeros((sheet.max_row-2, sheet.max_column))

ir = 0
ic = 0

#Iterates trhough each row/cell and places data in $data
for row in sheet.iter_rows(2, max_row = sheet.max_row - 1):
    for cell in row:
        if ic == 0:
            #RESPNO
            data[ir, ic] = cell.value
            
        elif ic == 1:
            #Urban/Rural
            if cell.value == 'Urban':
                data[ir, ic] = 0
            elif cell.value == 'Rural':
                data[ir, ic] = 1
            elif cell.value == 'Tribal':
                data[ir, ic] = 2
            else:
                data[ir, ic] = 3
                
        elif ic == 2:
            #Views on Country's Economic Condition
            if cell.value == "Don't know":
                data[ir, ic] = -1
            elif cell.value == 'Very bad':
                data[ir, ic] = 0
            elif cell.value == 'Fairly bad':
                data[ir, ic] = 1
            elif cell.value == 'Fairly Good':
                data[ir, ic] = 2
            elif cell.value == 'Very good':
                data[ir, ic] = 3
            elif cell.value == 'Neither good nor bad':
                data[ir, ic] = 4
                
        elif ic == 3:
            #Discuss Politics?
            if cell.value == "Don't know":
                data[ir, ic] = -1
            elif cell.value == "Never":
                data[ir, ic] = 0
            elif cell.value == "Occasionally":
                data[ir, ic] = 1
            elif cell.value == "Frequently":
                data[ir, ic] = 2
                
        elif ic == 4:
            #Voted?
            if cell.value == "Don't know":
                data[ir, ic] = -1
            elif cell.value == "You voted in the elections":
                data[ir, ic] = 1
            else:
                data[ir, ic] = 0
            
        elif ic == 5:
            #Requests gov assistance
            if cell.value == "Don't know":
                data[ir, ic] = -1
            elif cell.value == "No, would never do this":
                data[ir, ic] = 0
            elif cell.value == "No, but would do if had the chance":
                data[ir, ic] = 1
            elif cell.value == "Yes, once or twice":
                data[ir, ic] = 2
            elif cell.value == "Yes, several times":
                data[ir, ic] = 3
            elif cell.value == "Yes, often":
                data[ir, ic] = 4
            
            
        elif ic == 6:
            #Corruption
            if cell.value == "Don't know":
                data[ir, ic] = -1
            elif cell.value == "Decreased a lot":
                data[ir, ic] = 0
            elif cell.value == "Decreased somewhat":
                data[ir, ic] = 1
            elif cell.value == "Stayed the same":
                data[ir, ic] = 2
            elif cell.value == "Increased somewhat":
                data[ir, ic] = 3
            elif cell.value == "Increased a lot":
                data[ir, ic] = 4
        elif ic == 7:
            #Education
            invalidStrings = ["Don't know", "Outside the compound", "Inside the house"]
            if cell.value in invalidStrings:
                data[ir, ic] = -1
            elif cell.value == "No formal schooling":
                data[ir, ic] = 0
            elif cell.value == "Informal schooling only":
                data[ir, ic] = 1
            elif cell.value == "Some primary schooling":
                data[ir, ic] = 2
            elif cell.value == "Primary school completed" or cell.value == "Some secondary school / high school":
                data[ir, ic] = 3
            elif cell.value == "Secondary school / high school completed":
                data[ir, ic] = 4
            elif cell.value == "Some university":
                data[ir, ic] = 5
            elif cell.value == "University completed" or cell.value == "Post-graduate" or cell.value == "Post-secondary qualifications, other than university":
                data[ir, ic] = 6
        ic += 1
    ic = 0
    ir += 1

In [38]:
#Spliting the indeces of the data set into training, validation, and testing

ind = np.arange(2389)
np.random.shuffle(ind)
trainingInd = ind[0:1433]
validationInd = ind[1433:1910]
testingInd = ind[1910:]

In [43]:
#Now need to set up naive bayes tables

#Setting up list of dicts
dictHomeVoted = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0
}
dictHome = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0
}
dictEcoVoted = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0,
    4 : 0
}
dictEco = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0,
    4 : 0
}
dictDiscussVoted = {
    0 : 0,
    1 : 0,
    2 : 0
}
dictDiscuss = {
    0 : 0,
    1 : 0,
    2 : 0
}
dictGovActionVoted = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0,
    4 : 0   
}
dictGovAction = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0,
    4 : 0   
}
dictCorruptVoted = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0,
    4 : 0
}
dictCorrupt = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0,
    4 : 0
}
dictEduVoted = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0,
    4 : 0,
    5 : 0,
    6 : 0
}
dictEdu = {
    0 : 0,
    1 : 0,
    2 : 0,
    3 : 0,
    4 : 0,
    5 : 0,
    6 : 0
}
dictVoted = {
    0 : 0,
    1 : 0
}

dictList = [dictVoted, dictHome, dictEco, dictDiscuss, 0, dictGovAction, dictCorrupt, dictEdu, 1, dictHomeVoted, dictEcoVoted, dictDiscussVoted, 1, dictGovActionVoted, dictCorruptVoted, dictEduVoted]

#Pulling from data into dataTest, dataVal, and dataTrain
dataTrain = np.take(data, trainingInd, axis=0)
dataVal = np.take(data, validationInd, axis=0)
dataTest = np.take(data, testingInd, axis=0)

In [48]:
#Storing counts of each entry from the training data

for rowCount, row in enumerate(dataTrain):
    if row[4] != -1:
        dictList[0][row[4]] += 1
    for colCount, col in enumerate(row):
        #Need to skip index 0 (RespNo) and index 4 (Voted)
        if colCount == 0 or colCount == 4:
            continue
        else:
            if not -1 in row:
                if row[4] == 0:
                    dictList[colCount][col] += 1
                else:
                    dictList[colCount + 8][col] += 1

In [27]:
#Dictlist contains the count of each response, based on if they voted or not
#Can now use naive bayes to find the probability of voting
#The counts of voted/not voted are started in dictList[0]

In [85]:
#Find model accuracy using training data set
resultsTrain = np.zeros((len(dataTrain), 2))
for rowCount, row in enumerate(dataTrain):
    voteProb = 1
    notVoteProb = 1
    #P(row|not voted)
    for count, _dict in enumerate(dictList[:8]):
        if count != 4 and count != 0:
            for i in range(len(_dict)):
                if i == row[count]:
                    voteProb *= (dictList[count+8][i] + 1)/(dictList[0][1] + len(_dict))
                    notVoteProb *= (_dict[i] + 1)/(dictList[0][0] + len(_dict))
                else:
                    voteProb *= (1 - (dictList[count+8][i] + 1)/(dictList[0][1] + len(_dict)))
                    notVoteProb *= (1- (_dict[i] + 1)/(dictList[0][0] + len(_dict)))
    votePrediction = (voteProb*(dictList[0][1]/(dictList[0][0] + dictList[0][1])))/(voteProb*(dictList[0][1]/(dictList[0][0] + dictList[0][1])) + notVoteProb*(dictList[0][0]/(dictList[0][0] + dictList[0][1])))
    if votePrediction > 0.5:
        resultsTrain[rowCount] = np.array([1, row[4]])
    else:
        resultsTrain[rowCount] = np.array([0, row[4]])
                                                                                    

In [87]:
accuracyTraining = 0
correct = [0, 0]
for row in resultsTrain:
    if row[0] == row[1]:
        accuracyTraining += 1
        correct[int(row[1])] += 1
print("Training Accuracy = " + str(accuracyTraining/len(results)))
print("Confusion Matrix Diagonal:")
print(correct)

Training Accuracy = 0.7048150732728542
Confusion Matrix Diagonal:
[67, 943]


In [None]:
#Validation accuracy

In [None]:
#Testing accuracy