# Earnings Predictor
## Using Adult Dataset CSV, Applied knowledge of the Naive Bayes Model and mathematical equations to code an analytic model for predicting earnings given a set of large set of features. Developed from scratch in python without using any libraries for Naive Bayes Algorithm.

#### This function should prepare the data by reading it from a file and converting it into a useful format for training and testing and implement 90-10 splitting as specified in the project description.

In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from collections import Counter

#headers
numeric = ["age", "education num", "hours per week"]
category = ["work class", "education", "marital status", "occupation", "relationship", "race", "sex", "native country (region)"]
classifers = [" <=50K", " >50K"]

#reads files and splits it
def preprocess(filename):    
    df = pd.read_csv(filename)
    df.head()
    
    #removes rows with question marks in them
    for num in numeric:
        df = df[df[num] != ' ?']

    df_train, df_test = train_test_split(df, test_size=0.1, shuffle = False)
    
    return df_train, df_test, df
    

#### This function should calculat prior probabilities and likelihoods (conditional probabilities) from the training data and using to build a naive Bayes model

In [2]:


#slpits into the to labels
def splitClassifers(df, lis):
    #creates 2 datafreams for both classifiers
    d0 = pd.DataFrame(columns=lis)
    d1 = pd.DataFrame(columns=lis)
    count0 = 0
    count1 = 0
    
    #splits into correct labels by looping through
    #dataframe
    for index, row in df.iterrows():
        
        if row["label"] == classifers[0]:
            d0 = d0.append(row[lis])
            count0 += 1
            
        if row["label"] == classifers[1]:
            d1 = d1.append(row[lis])
            count1 += 1
    
    #returns dataframes and size of data frames
    return d0, d1, count0, count1


#gets the nominal probability using naive bayes
def getProbNominal(c0,c1,prior0,prior1,cCount0,cCount1, df_train):
        
        #creates a dictionary within a dictionary
        #first dimension stores all headers
        #second store all possible values and their probabilities
        probNominal0 = {}
        probNominal1 = {}
        
        
        #loops and assigns values in the right place
        for cat in category:
            catlis = []
            for index, row in df_train.iterrows():
                if row[cat] not in catlis:
                    catlis.append(row[cat])
                

            probNominal0[cat] = {}
            probNominal1[cat] = {}
            #calculates probabilitlies for each item within category
            for item in catlis:
                rec0 = 0
                rec1 = 0
                

                if item in c0[cat].values:
                    rec0 = c0[cat].value_counts()[item]
                    
                if item in c1[cat].values:
                    rec1 = c1[cat].value_counts()[item]
                    
                probNominal0[cat][item] = (1 + rec0)/(cCount0 + len(catlis))
                probNominal1[cat][item] = (1 + rec1)/(cCount1 + len(catlis))
            
        
        #returns probability for both classifiers in the form of the
        #2d dictionary
        return probNominal0, probNominal1
    

#calculates gussian probability
def getProbNumericGuassian(value, category, nPrior0, nPrior1, n0, n1):
    mean0 = n0[category].mean()
    std0 = n0[category].std()
    mean1 = n1[category].mean()
    std1 = n1[category].std()
    
    prob0 = (1/ (np.sqrt(2 * np.pi) * std0))* np.exp(-((value-mean0)**2 / (2 * std0 *2)))
    prob1 = (1/ (np.sqrt(2 * np.pi) * std1))* np.exp(-((value-mean1)**2 / (2 * std1 *2)))
    
    return prob0, prob1

#trains data by geting pirors and calculating all nessasary probabilities
def train(df_train):
    
    
    c0, c1, cCount0, cCount1 =splitClassifers(df_train, category)
    n0, n1, nCount0, nCount1 =splitClassifers(df_train, numeric)
    
    
    cPrior0 = cCount0/(cCount0+cCount1)
    cPrior1 = cCount1/(cCount1+cCount0)
    nPrior0 = nCount0/(nCount0+nCount1)
    nPrior1 = nCount1/(nCount1+nCount0)
    

    probC0, probC1 = getProbNominal(c0,c1,cPrior0,cPrior1,cCount0,cCount1, df_train)
    
    return probC0, probC1, cPrior0, cPrior1, nPrior0, nPrior1, n0, n1

#### This function should predict classes for new items in the testing data

In [3]:

def predict(probC0, probC1, df_test, cPrior0, cPrior1, nPrior0, nPrior1, n0, n1):
    
    
    #lists saving predicted values and their probabilites
    predictedValues = []
    logprobabilities = []
    
    for index, row in df_test.iterrows():
        
        
        probnom0 = cPrior0
        probnom1 = cPrior1
        totalProb0 = 0
        totalProb1 = 0
        
        #caculates total probability for each test cases category
        for nominal in category:
                probnom0 *= probC0[nominal][row[nominal]]    
                probnom1 *= probC1[nominal][row[nominal]]
        
        probnum0 = nPrior0
        probnum1 = nPrior1
        
        #caculates total probability for each test cases numeric category
        for numCat in numeric:
            
            p0, p1 = getProbNumericGuassian(row[numCat], numCat, nPrior0, nPrior1, n0, n1)
            
            probnum0 *= p0
            probnum1 *= p1
            
        totalProb0 = probnum0 * probnom0
        totalProb1 = probnum1 * probnom1
        
        #finds higher probability between both classifiers and saves accordingly
        if totalProb0 > totalProb1:

            predictedValues.append(classifers[0])
            
        else:
            
            predictedValues.append(classifers[1])
        
        logprobabilities.append([np.log(totalProb0),np.log(totalProb1)])
    
    
    
    return predictedValues, logprobabilities

#### This function should evaliate the prediction performance by comparing your model’s class outputs to ground truth labels, return and output accuracy, confusion matrix and F1 score.

In [5]:


#prints relevant values
def evaluate(df_test, predictedValues):
    
    #gets accuracy score
    print("accuracy score: ",accuracy_score(df_test['label'], predictedValues))
    print()
    
    #gets confusion matrix
    print("confusion matrix: ")
    print(confusion_matrix(df_test['label'], predictedValues))
    print()
    
    #gets all values in confusion matrix, will be useful later
    tp, fp, fn, tn = confusion_matrix(df_test['label'], predictedValues).ravel()
    
    #calculates fscore
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    
    f1 = (2*precision*recall)/(precision+recall)
    
    print("F score: ",f1)
    print()
    return tp, fp, fn, tn 

#### This cell acts like a "main" function calling all functions above

In [6]:
#  reads in the data and apply your NB model to the ADULT data
df_train, df_test, df = preprocess("adult.csv")


#trans dataset and outputs probabilities for related categories
probC0, probC1, cPrior0, cPrior1, nPrior0, nPrior1, n0, n1= train(df_train)

#predicts using test data and outputs predicted values and log probabilities
predictedValues, logporbabilites = predict(probC0, probC1, df_test, cPrior0, cPrior1, nPrior0, nPrior1, n0, n1)



# Second, print the full evaluation results from the evaluate() function

tp, fp, fn, tn = evaluate(df_test, predictedValues)

# Third, print data statistics and model predictions, as instructed below 
# N is the total number of instances, F the total number of attributes, L the total number of labels
# The "class probabilities" may be unnormalized
# The "predicted class ID" must be in range (0, L)

#print("Attribute vectors of instances [0, 1, 2]: ", [df.iloc[[0]].values.tolist(),df.iloc[[1]].values.tolist(),df.iloc[[2]].values.tolist()]) # of the first three records in adult.csv

#print("\nNumber of instances (N): ", len(df))
#print("Number of attributes (F): ", len(df.columns))
#print("Number of labels (L): ", len(pd.unique(df["label"])))


# print out the prediction results of the last three instances
print("\n\nPredicted class log-probabilities for instance N-3: ",logporbabilites[len(logporbabilites)-3] )
print("Predicted class ID for instance N-3: ",predictedValues[len(predictedValues)-3] )
print("\nPredicted class log-probabilities for instance N-2: ", logporbabilites[len(logporbabilites)-2])
print("Predicted class ID for instance N-2: ", predictedValues[len(predictedValues)-2])
print("\nPredicted class log-probabilities for instance N-1: ", logporbabilites[len(logporbabilites)-1])
print("Predicted class ID for instance N-1: ", predictedValues[len(predictedValues)-1])



accuracy score:  0.86

confusion matrix: 
[[69  8]
 [ 6 17]]

F score:  0.9078947368421053



Predicted class log-probabilities for instance N-3:  [-23.34177986125614, -21.359906228009308]
Predicted class ID for instance N-3:   >50K

Predicted class log-probabilities for instance N-2:  [-34.67407731543075, -27.981720618223235]
Predicted class ID for instance N-2:   >50K

Predicted class log-probabilities for instance N-1:  [-17.46728258073397, -19.216525280553885]
Predicted class ID for instance N-1:   <=50K
