In [1]:
# To import relevant libraries
import numpy as np
import pandas as pd
import random
import sys
from random import randrange
from math import sqrt
from math import pi
from math import exp
from math import isnan

### Function to stratify dataset

In [2]:
# Stratifying the data into training data set and testing data set
# The testSize is specified as ratio

def trainTestSplit(data, testSize):
    # Converting the test size (ratio) into numbers of rows 
    if isinstance(testSize, float):
        testSize = round(testSize * len(data))
        
    # Getting the index of the data from the dataset
    dataIndex = data.index.tolist()
    
    # Generate random indexes to split the dataset
    testIndex = random.sample(population = dataIndex, k = testSize)
    
    # Allocate the testing set according to the random indexes (testIndex)
    testSet = data.loc[testIndex]
    
    # Dropping the testset rows from the original data set, and make the remaining rows as the training set
    trainSet = data.drop(testIndex)
    
    # Return the train and test set data
    return trainSet, testSet

### Gaussian Naive Bayes Function

In [3]:
# Calculate the Gaussian probability distribution for continuous features
# To prevent numerical underflow, when the exponent underflows is out of Pythons' float precision, set
# exponent to the smallest possible float value by the system
# To prevent the zero frequency/count problem, the function catches occurences of 0/null standard deviation
# and returns the probability of 1/number of observations

def calculateGaussianProbability(x, mean, stdev, totalRows):
    if stdev == 0 or isnan(stdev):
        return 1/totalRows # if stdev is 0, return the probability of "Add one count"
    
    exponent = exp(-((float(x) - mean)**2 / (2 * stdev**2 )))

    
    if exponent == 0:
        # Set exponent to smallest possible float supported by the system
        exponent = sys.float_info.min
        
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

### Function to calculate probability for categorical features

In [4]:
# Calculate probability for categorical features
# To prevent the zero frequency/count problem, the function catches the occurences of 0/null standard deviation

def calculateProbability(x, X1, count1, X2, count2, classCount, totalRows):
    if x == X1:
        # If zero frequency occurs, add 1 to count and return the probability
        if count1/classCount == 0:
            return 1/totalRows
        
        return count1/classCount
    else:
        # If zero frequency occurs, add 1 to count and return the probability
        if count2/classCount == 0:
            return 1/totalRows
        
        return count2/classCount
    

### Function to calculate class probability

In [5]:
# Calculate the probabilities of predicting each class for a given row:
# For continuous features use Gaussian probability function
# For categorical feature calculate_probability function

def calculateClassProbabilities(summaries, row):
    
    # Get the length of the dataset
    # Sum up all the counts of each label class
    totalRows = sum([summaries[label][0][2] for label in summaries])
    
    # Instantiate a dictionary to store probability of each label class for a given row
    probabilities = dict()
    
    # Get the class value: classValue
    # Get the summaries for each class: classSummaries
    for classValue, classSummaries in summaries.items():
        
        # Get the probability of each label class e.g., If class label 1 has a length of 12345
        # and length of dataset is 234567 then this probability is 12345/234567
        probabilities[classValue] = summaries[classValue][0][2]/float(totalRows)
        
        #looping through each summaries
        for i in range(len(classSummaries)):
            # if categorical feature
            if len(classSummaries[i]) > 3:
                X1, count1, X2, count2, classCount = classSummaries[i]
                probabilities[classValue] = \
                probabilities[classValue] * calculateProbability(row[i], X1, count1, X2, count2, classCount, totalRows)
            # if continuous feature
            else:
                mean, stdev, _ = classSummaries[i]
                probabilities[classValue] = \
                probabilities[classValue] * calculateGaussianProbability(row[i], mean, stdev, totalRows)
                
    return probabilities

### Make prediction

In [6]:
# Predict the class for a given row
def predictClass(summaries, row):
    # Storing the probabilities by calling the method
    probabilities = calculateClassProbabilities(summaries, row)
    
    # Initializing the variables
    bestLabel, bestProb = None, -1
    
    # Looping through to find the best label with the best probability
    for classVal, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classVal
    return bestLabel

### Function to get a summary dictionary of dataset

In [7]:
def summarizeDataset(data):
    # Creating a empty dict to store the target class
    summaries = {}
    # For loop to loop the target column that are unique
    for i in data.iloc[:,-1].unique():
        # listing the feature variable of each unique class
        feature = []
        # looping through all the feature variable except the target column
        for j in range(len(data.columns)-1):
            # Storing the size of unique values in a variable
            uniqueValueSize = len(data.iloc[:,j].unique())
            
            # If categorical feature
            if(uniqueValueSize < 5):
                aList = list()
                # Creating a subset for each class
                df = data[data.iloc[:,-1] == i]
                
                # looping through the categorical feature for unique vals
                for k in data.iloc[:,j].unique():
                    # storing the unique value and counting the val
                    aList.append(k)
                    aList.append(len(df[df.iloc[:,j] == k]))
                feature.append(tuple([aList[i] for i in range(len(aList))] + [len(data[data.iloc[:,-1] == i])]))
            # else continuous feature
            else:
                feature.append((data[data.iloc[:,-1] == i].mean(axis = 0)[j], \
                                data[data.iloc[:,-1] == i].std(axis = 0)[j], len(data[data.iloc[:,-1] == i])))
        # storing the summaries     
        summaries[i] = feature
    return summaries

In [8]:
# Naive Bayes Classifier Algorithm to predict the train set and the test set
def naiveBayesian(trainSet, testSet):
    # Method to get the summary
    summary = summarizeDataset(trainSet)
    
    # Creating an empty list
    predictions = list()
    
    # looping through the test set values 
    for row in testSet.values:
        output = predictClass(summary, row)
        predictions.append(output)
        
    return(predictions)

### Function to determine regression metrics

In [9]:
# Calculated as:
# check for equality of predicted value and labels in test_set
# calculates the sum of correct prediction
# divides the sum by length of test_set

    
def accuracy(predictions, testSet):
    yTest = list(testSet.iloc[:,-1])
    correctCount = 0
    sumError = 0.0
    rsmeError = 0.0
    for i in range(len(yTest)):
        if predictions[i] == yTest[i]:
            correctCount += 1
        sumError += abs(predictions[i] - yTest[i])
        predictionError = abs(predictions[i] - yTest[i])
        rsmeError = (predictionError**2)
        
    print(f'Number of exact matches in predictions: {correctCount}/{len(yTest)}')        
    print(f'Mean Squared Error(MSE): {np.square(np.subtract(yTest,predictions)).mean()}')
    print(f'Root Mean Squared Error (RMSE): {sqrt(rsmeError/float(len(yTest)))}')
    print(f'Mean Absolute Error(MSE): {sumError/float(len(yTest))}')
    
    return (round(correctCount/len(testSet)*100,3))

## Preprocessing

In [10]:
# Define headers
colNames = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation',
                                      'relationship','race','sex','capital-gain','capital-loss','hours-per-week',
                                      'native-country','income']

df = pd.read_csv('adult.data', header = None, names = colNames)

#print the created dataframe 
df.head().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             5 non-null      int64 
 1   workclass       5 non-null      object
 2   fnlwgt          5 non-null      int64 
 3   education       5 non-null      object
 4   education-num   5 non-null      int64 
 5   marital-status  5 non-null      object
 6   occupation      5 non-null      object
 7   relationship    5 non-null      object
 8   race            5 non-null      object
 9   sex             5 non-null      object
 10  capital-gain    5 non-null      int64 
 11  capital-loss    5 non-null      int64 
 12  hours-per-week  5 non-null      int64 
 13  native-country  5 non-null      object
 14  income          5 non-null      object
dtypes: int64(6), object(9)
memory usage: 728.0+ bytes


In [11]:
for column in df.columns:
    if df[column].dtype == object:
        newdf = pd.Series(df[column], dtype='category')
        df[column] = newdf.cat.codes

In [12]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0


In [13]:
# shuffling dataset with sample
df = df.sample(frac=1, random_state=1).reset_index(drop=True)
# df shape
print(df.shape)
#handle missing values and replace them with a '0'
df.isnull().sum()

(32561, 15)


age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [14]:
# Split the dataset into training and testing
trainSet, testSet = trainTestSplit(df, 0.3)
trainSet.describe()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
count,22793.0,22793.0,22793.0,22793.0,22793.0,22793.0,22793.0,22793.0,22793.0,22793.0,22793.0,22793.0,22793.0,22793.0,22793.0
mean,38.658316,3.868512,189819.8,10.279252,10.072917,2.609354,6.592901,1.439345,3.663493,0.671873,1095.611109,87.297855,40.51121,36.733646,0.241697
std,13.608335,1.450223,106200.1,3.873117,2.578845,1.511998,4.230034,1.606389,0.850613,0.469542,7545.500698,401.380878,12.319507,7.800896,0.428121
min,17.0,0.0,12285.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,28.0,4.0,117674.0,9.0,9.0,2.0,3.0,0.0,4.0,0.0,0.0,0.0,40.0,39.0,0.0
50%,37.0,4.0,178341.0,11.0,10.0,2.0,7.0,1.0,4.0,1.0,0.0,0.0,40.0,39.0,0.0
75%,48.0,4.0,237865.0,12.0,12.0,4.0,10.0,3.0,4.0,1.0,0.0,0.0,45.0,39.0,0.0
max,90.0,8.0,1484705.0,15.0,16.0,6.0,14.0,5.0,4.0,1.0,99999.0,4356.0,99.0,41.0,1.0


In [15]:
# An insight into the test data set to see if any 
testSet.describe()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
count,9768.0,9768.0,9768.0,9768.0,9768.0,9768.0,9768.0,9768.0,9768.0,9768.0,9768.0,9768.0,9768.0,9768.0,9768.0
mean,38.402744,3.869779,189681.7,10.342445,10.098792,2.617629,6.525696,1.462735,3.671376,0.662981,1035.735053,87.317772,40.265356,36.684378,0.238739
std,13.71407,1.469333,104022.5,3.863435,2.558413,1.492716,4.225951,1.607626,0.84459,0.472716,6997.40036,406.642307,12.411267,7.877219,0.426335
min,17.0,0.0,19214.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,27.0,4.0,118497.0,9.0,9.0,2.0,3.0,0.0,4.0,0.0,0.0,0.0,40.0,39.0,0.0
50%,37.0,4.0,178520.0,11.0,10.0,2.0,7.0,1.0,4.0,1.0,0.0,0.0,40.0,39.0,0.0
75%,47.0,4.0,235640.8,12.0,12.0,4.0,10.0,3.0,4.0,1.0,0.0,0.0,45.0,39.0,0.0
max,90.0,8.0,1455435.0,15.0,16.0,6.0,14.0,5.0,4.0,1.0,99999.0,4356.0,99.0,41.0,1.0


In [16]:
# Test the model on training set
trainPred = naiveBayesian(trainSet, trainSet)
print('Accuracy of prediction for training set:', accuracy(trainPred, trainSet))

Number of exact matches in predictions: 18234/22793
Mean Squared Error(MSE): 0.200017549247576
Root Mean Squared Error (RMSE): 0.0
Mean Absolute Error(MSE): 0.200017549247576
Accuracy of prediction for training set: 79.998


In [17]:
# # Test the model on testing set
testPred = naiveBayesian(trainSet, testSet)
print('Accuracy of prediction for testing set:', accuracy(testPred, testSet))

Number of exact matches in predictions: 7849/9768
Mean Squared Error(MSE): 0.19645782145782145
Root Mean Squared Error (RMSE): 0.0
Mean Absolute Error(MSE): 0.19645782145782145
Accuracy of prediction for testing set: 80.354
