In [1]:
#import csv file
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import defaultdict
import numpy as np
import math
#read the dataset from 'adult_csv.csv' (path is relative to the working directory)
f = pd.read_csv('adult_csv.csv')
#display (number of rows, number of columns) of the loaded frame
f.shape

(48842, 15)

In [2]:
#remove the three unused columns, then discard every row that still contains a null
f = f.drop(columns = ["capitalgain","capitalloss","native-country"]).dropna()
f.shape

(46033, 12)

In [3]:
#split into 90% training and 10% testing set
#random_state pins the shuffle so a fresh "Restart & Run All" reproduces the same
#split (and therefore the same counts, probabilities and accuracy further down)
train, test = train_test_split(f, test_size=0.1, random_state=42)

In [4]:
#separate the training features from the 'class' target
#trainX is only used later to read the attribute names and dtypes
trainX = train.drop(columns='class')
trainY = train['class']


In [5]:
#partition the training rows by their 'class' value: {class value : rows with that value}
by_class = train.groupby('class')
datasets = {class_value: rows for class_value, rows in by_class}

In [6]:
#display the {class value : DataFrame} dictionary built from the groupby
datasets

{'<=50K':        age  workclass  fnlwgt     education  education-num  \
 1154     2  State-gov  165309       HS-grad              9   
 15967    0    Private   22201       HS-grad              9   
 8493     3    Private  105431       HS-grad              9   
 5564     3    Private  191357     Bachelors             13   
 14973    0    Private   68577     Assoc-voc             11   
 ...    ...        ...     ...           ...            ...   
 1640     3    Private  158993       HS-grad              9   
 13167    1    Private   56150          11th              7   
 19991    0    Private  117767  Some-college             10   
 15135    3    Private  341762       HS-grad              9   
 15282    2    Private  174308          11th              7   
 
            marital-status         occupation    relationship  \
 1154   Married-civ-spouse    Protective-serv         Husband   
 15967  Married-civ-spouse    Protective-serv         Husband   
 8493             Divorced      Other-

In [7]:
#take the rows of each class, without the 'class' column itself
trainXLess = datasets['<=50K'].drop(columns='class')
trainXGreater = datasets['>50K'].drop(columns='class')

#number of training rows in each class
NoOfClassGreater50k = trainXGreater.shape[0]
NoOfClassLess50k = trainXLess.shape[0]
print(NoOfClassGreater50k)
print(NoOfClassLess50k)

10231
31198


In [8]:
#names of the nominal attributes, i.e. the trainX columns whose dtype is object
nominal = [name for name, dtype in trainX.dtypes.items() if dtype == 'O']
nominal

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex']

In [9]:
#per-class dictionaries {label : count of that label}
value_dictLess, value_dictGreater = dict(), dict()

#{attribute : count of unique labels in that attribute}
NoOfLabels = dict()

#{label : the attribute this label belongs to}; a missing label maps to ''
labelsmap = defaultdict(str)


In [10]:
#for each nominal attribute, using the <=50K rows:
#  - record how often every label occurs
#  - remember which attribute each label belongs to
#  - record how many distinct labels the attribute has in the whole training set
for attribute in nominal:
    label_counts = trainXLess[attribute].value_counts().to_dict()
    value_dictLess.update(label_counts)
    labelsmap.update(dict.fromkeys(label_counts, attribute))

    #dropna=False keeps the count identical to len(pd.unique(...))
    NoOfLabels[attribute] = train[attribute].nunique(dropna=False)



In [11]:
#for each nominal attribute, using the >50K rows:
#record how often every label occurs and which attribute the label belongs to
for attribute in nominal:
    label_counts = trainXGreater[attribute].value_counts().to_dict()
    value_dictGreater.update(label_counts)
    labelsmap.update(dict.fromkeys(label_counts, attribute))

In [12]:
#display the {label : attribute} mapping filled in by the two loops over nominal
labelsmap

defaultdict(str,
            {'Private': 'workclass',
             'Self-emp-not-inc': 'workclass',
             'Local-gov': 'workclass',
             'State-gov': 'workclass',
             'Federal-gov': 'workclass',
             'Self-emp-inc': 'workclass',
             'Without-pay': 'workclass',
             'HS-grad': 'education',
             'Some-college': 'education',
             'Bachelors': 'education',
             '11th': 'education',
             'Assoc-voc': 'education',
             'Masters': 'education',
             '10th': 'education',
             'Assoc-acdm': 'education',
             '7th-8th': 'education',
             '9th': 'education',
             '12th': 'education',
             '5th-6th': 'education',
             '1st-4th': 'education',
             'Prof-school': 'education',
             'Doctorate': 'education',
             'Preschool': 'education',
             'Never-married': 'marital-status',
             'Married-civ-spouse': 'marital-status'

In [13]:
#print the {label : count} dictionary collected from the <=50K training rows
print(value_dictLess)

{'Private': 23925, 'Self-emp-not-inc': 2505, 'Local-gov': 1991, 'State-gov': 1310, 'Federal-gov': 771, 'Self-emp-inc': 677, 'Without-pay': 19, 'HS-grad': 11317, 'Some-college': 7182, 'Bachelors': 4080, '11th': 1388, 'Assoc-voc': 1313, 'Masters': 1062, '10th': 1041, 'Assoc-acdm': 1012, '7th-8th': 716, '9th': 586, '12th': 500, '5th-6th': 404, '1st-4th': 201, 'Prof-school': 187, 'Doctorate': 143, 'Preschool': 66, 'Never-married': 12724, 'Married-civ-spouse': 10599, 'Divorced': 5135, 'Separated': 1197, 'Widowed': 1054, 'Married-spouse-absent': 471, 'Married-AF-spouse': 18, 'Adm-clerical': 4346, 'Other-service': 4262, 'Craft-repair': 4250, 'Sales': 3648, 'Prof-specialty': 3084, 'Exec-managerial': 2888, 'Machine-op-inspct': 2389, 'Handlers-cleaners': 1742, 'Transport-moving': 1678, 'Farming-fishing': 1181, 'Tech-support': 921, 'Protective-serv': 591, 'Priv-house-serv': 208, 'Armed-Forces': 10, 'Not-in-family': 9602, 'Husband': 9358, 'Own-child': 5926, 'Unmarried': 4087, 'Other-relative': 122

In [14]:
#print the {label : count} dictionary collected from the >50K training rows
print(value_dictGreater)

{'Private': 6617, 'Self-emp-not-inc': 964, 'Self-emp-inc': 835, 'Local-gov': 833, 'Federal-gov': 505, 'State-gov': 476, 'Without-pay': 1, 'Bachelors': 2886, 'HS-grad': 2195, 'Some-college': 1824, 'Masters': 1281, 'Prof-school': 539, 'Assoc-voc': 457, 'Doctorate': 378, 'Assoc-acdm': 369, '11th': 73, '10th': 73, '7th-8th': 51, '12th': 41, '9th': 32, '5th-6th': 23, '1st-4th': 8, 'Preschool': 1, 'Married-civ-spouse': 8727, 'Never-married': 648, 'Divorced': 596, 'Widowed': 111, 'Separated': 90, 'Married-spouse-absent': 46, 'Married-AF-spouse': 13, 'Exec-managerial': 2625, 'Prof-specialty': 2478, 'Sales': 1314, 'Craft-repair': 1241, 'Adm-clerical': 688, 'Transport-moving': 421, 'Tech-support': 384, 'Machine-op-inspct': 335, 'Protective-serv': 283, 'Other-service': 179, 'Farming-fishing': 158, 'Handlers-cleaners': 118, 'Armed-Forces': 4, 'Priv-house-serv': 3, 'Husband': 7746, 'Not-in-family': 1116, 'Wife': 942, 'Unmarried': 283, 'Own-child': 97, 'Other-relative': 47, 'White': 9293, 'Black': 4

In [15]:
#display the number of distinct labels per nominal attribute (used as c in the smoothing)
NoOfLabels

{'workclass': 7,
 'education': 16,
 'marital-status': 7,
 'occupation': 14,
 'relationship': 6,
 'race': 5,
 'sex': 2}

In [16]:
#per-class dictionaries {label : probability of this label given the class}
#a label that was never stored reads back as 0.0
probLess50, probGreater50 = defaultdict(float), defaultdict(float)

In [17]:
#calculate P(label | >50K) using Laplace (add-one) smoothing:
#    (count of label in class + 1) / (rows in class + number of labels of the attribute)
#iterate over labelsmap (every label seen in either class) instead of value_dictGreater:
#a label that never occurs with >50K must still get (0 + 1) / (N + c); otherwise it
#reads back as 0.0 from the defaultdict and zeroes the whole product at prediction time
for i in labelsmap:
    attribute = labelsmap[i]
    c = NoOfLabels[attribute]
    count = value_dictGreater.get(i, 0)
    prob = (count+1) / (NoOfClassGreater50k + c)

    probGreater50[i] = prob


In [18]:
#display the smoothed {label : probability} dictionary for the >50K class
probGreater50

defaultdict(float,
            {'Private': 0.6464153154913069,
             'Self-emp-not-inc': 0.09425669075991405,
             'Self-emp-inc': 0.0816565735495214,
             'Local-gov': 0.0814612228950967,
             'Federal-gov': 0.049423715569447156,
             'State-gov': 0.04659113108028912,
             'Without-pay': 0.0001953506544246923,
             'Bachelors': 0.28174099736508246,
             'HS-grad': 0.21430662632965747,
             'Some-college': 0.17810090758270714,
             'Masters': 0.12510978823070168,
             'Prof-school': 0.052698350736801014,
             'Assoc-voc': 0.04469600858787938,
             'Doctorate': 0.036986435054162194,
             'Assoc-acdm': 0.03610812920854884,
             '11th': 0.0072216258417097685,
             '10th': 0.0072216258417097685,
             '7th-8th': 0.005074655996877135,
             '12th': 0.004098760612862301,
             '9th': 0.0032204547672489508,
             '5th-6th': 0.00234214892163

In [19]:
#calculate P(label | <=50K) using Laplace (add-one) smoothing:
#    (count of label in class + 1) / (rows in class + number of labels of the attribute)
#iterate over labelsmap (every label seen in either class) instead of value_dictLess:
#a label that never occurs with <=50K must still get (0 + 1) / (N + c); otherwise it
#reads back as 0.0 from the defaultdict and zeroes the whole product at prediction time
for i in labelsmap:
    attribute = labelsmap[i]
    c = NoOfLabels[attribute]
    count = value_dictLess.get(i, 0)
    prob = (count+1) / (NoOfClassLess50k + c)

    probLess50[i] = prob

In [20]:
#display the smoothed {label : probability} dictionary for the <=50K class
probLess50

defaultdict(float,
            {'Private': 0.766736099983977,
             'Self-emp-not-inc': 0.08030764300592853,
             'Local-gov': 0.06383592373017144,
             'State-gov': 0.042012497997115844,
             'Federal-gov': 0.024739625060086524,
             'Self-emp-inc': 0.02172728729370293,
             'Without-pay': 0.0006409229290177856,
             'HS-grad': 0.3625937079515602,
             'Some-college': 0.23012109950663165,
             'Bachelors': 0.130742615493048,
             '11th': 0.044499263151150124,
             'Assoc-voc': 0.04209649516242712,
             'Masters': 0.03405523162683411,
             '10th': 0.03338245658999167,
             'Assoc-acdm': 0.03245338630101877,
             '7th-8th': 0.022970461972191967,
             '9th': 0.018805664125072084,
             '12th': 0.0160504901646697,
             '5th-6th': 0.012974947139104248,
             '1st-4th': 0.006471455116293971,
             'Prof-school': 0.006022938425065676,
   

In [21]:
#get the numerical attributes into a list
#dtypes are read from trainX, the same frame whose columns are iterated and the same
#frame the nominal list is built from
numerical = [i for i in trainX.columns if trainX[i].dtype!='O']

In [22]:
#display the names of the non-object (numerical) attributes
numerical

['age', 'fnlwgt', 'education-num', 'hoursperweek']

In [23]:
#per-class dictionaries {numerical attribute : (mean, std)}
Numvalue_dictLess, Numvalue_dictGreater = dict(), dict()


In [24]:
#calculate the per-class mean and std of every numerical attribute
#uses the vectorised Series methods instead of a Python-level sum();
#ddof=0 (population std) is spelled out because it is what np.std computed before
for i in numerical:
    lessColumn = trainXLess[i]
    greaterColumn = trainXGreater[i]
    Numvalue_dictLess[i] = (lessColumn.mean(), lessColumn.std(ddof=0))
    Numvalue_dictGreater[i] = (greaterColumn.mean(), greaterColumn.std(ddof=0))

In [25]:
#display the {attribute : (mean, std)} pairs computed from the >50K training rows
Numvalue_dictGreater

{'age': (2.325872348744013, 0.9999287233560142),
 'fnlwgt': (188546.9170169094, 102408.16473671778),
 'education-num': (11.608053953670218, 2.372139308361044),
 'hoursperweek': (2.348646271136741, 0.7420367819417936)}

In [26]:
#separate the test target from the test features
#testY is kept to calculate the accuracy later
testY = test['class']
testX = test.drop(columns='class')

In [27]:
#display the feature column names of the test frame, in column order
testX.columns.values

array(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'hoursperweek'], dtype=object)

In [28]:
#probability density of a numerical value under a class-specific normal distribution
def calculateP(x, less, att):
    """Evaluate the normal density at x for attribute att.

    x    -- value of the testing data
    less -- True to take (mean, std) from Numvalue_dictLess,
            otherwise they are taken from Numvalue_dictGreater
    att  -- the attribute name used as key into that dictionary

    Returns the density value as a float.
    """
    stats = Numvalue_dictLess if less else Numvalue_dictGreater
    mu, sigma = stats[att]

    scale = 1 / (math.sqrt(2 * math.pi) * sigma)
    decay = math.exp(-((x-mu)**2 / (2 * sigma**2 )))
    return scale * decay

#probability of each class (prior), estimated from the training split
ProbL = NoOfClassLess50k / float(len(train))
ProbG = NoOfClassGreater50k / float(len(train))

#store result 
ProbTest = []

def _logOrNegInf(p):
    """Return log(p), or -inf when p is not positive, so a zero probability
    rules the class out exactly as a zero factor did in a plain product."""
    if p > 0:
        return math.log(p)
    return float('-inf')

#for each value in each row, depending on its attribute,
#nominal: get probability from corresponding maps calculated earlier
#numerical: get probability from above function
#the log-probabilities are summed per class instead of multiplying the raw
#probabilities: the ranking is the same, but many small factors cannot underflow
#attributes are looked up by column name, so the column order does not matter
#take the class with the larger score (a tie goes to '>50K')
#store in the result array
for i, data in testX.iterrows():
    less = _logOrNegInf(ProbL)
    greater = _logOrNegInf(ProbG)

    for att in testX.columns:
        value = data[att]
        if att in Numvalue_dictLess:
            #numerical attribute: less = True to use Numvalue_dictLess, else False
            pLess = calculateP(value, True, att)
            pGreater = calculateP(value, False, att)
        else:
            #nominal attribute: a label missing from the map counts as probability 0
            pLess = probLess50.get(value, 0.0)
            pGreater = probGreater50.get(value, 0.0)

        less += _logOrNegInf(pLess)
        greater += _logOrNegInf(pGreater)

    if less > greater:
        ProbTest.append('<=50K')
    else:
        ProbTest.append('>50K')
        
        

In [29]:
#count the test rows whose predicted class equals the original class,
#then divide by the total number of test cases to get the accuracy
testY = list(testY)
counter = sum(1 for k in range(len(testX)) if ProbTest[k] == testY[k])
accuracy = counter/len(testX)


In [30]:
#display the fraction of test rows whose predicted class matches testY
accuracy

0.8108166811468288