In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing, model_selection, metrics
from sklearn.naive_bayes import CategoricalNB, GaussianNB

In [None]:
# Read the dataset from disk and preview its first five rows.
csv_path = 'adult.csv'
df = pd.read_csv(csv_path)
df.head(5)

What are we missing in the data?

In [None]:
# Per-column tally of the cells that hold the "?" placeholder.
(df == "?").sum()

Let's remove every row that has any missing value, since such rows make up only a negligible fraction of the total data.

In [None]:
# remove messy rows
# Cells equal to '?' are masked to NaN so that dropna() discards every row
# containing at least one of them.
print('total rows:', len(df))
df_clean = df.mask(df == '?').dropna()
print('rows removed:', len(df) - len(df_clean))

# Fail loudly if any placeholder survived, rather than relying on eyeballing
# the counts displayed below.
assert not (df_clean == '?').any().any(), "'?' placeholders remain after cleaning"
(df_clean.where(df_clean == "?")).count()

Here, we will make our training and test splits:

In [None]:
# Every column except the 'income' target is used as a feature.
var_names = df_clean.columns.tolist()
var_names.remove('income')

# 67/33 split with a fixed seed so the partition is reproducible.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    df_clean[var_names],
    df_clean['income'],
    test_size=0.33,
    random_state=42,
)

# Size of each of the four partitions, one per line.
print(len(X_train), len(X_test), len(Y_train), len(Y_test), sep='\n')

For our preprocessing, we will label encode all of our categorical attributes, and do a standard scaling for all of our continuous attributes.

In [None]:
# preprocess categorical data
cat_names = [
    'workclass', 'education', 'marital.status', 'occupation',
    'relationship', 'race', 'sex', 'native.country',
]
df_cat_str = df_clean[cat_names]

# One LabelEncoder per categorical column, fitted on that column of the
# cleaned frame.
cat_encoders = {
    name: preprocessing.LabelEncoder().fit(df_cat_str[name])
    for name in cat_names
}

# Translate the string categories of each split into their integer labels.
X_train_cat = pd.DataFrame(
    {name: enc.transform(X_train[name]) for name, enc in cat_encoders.items()}
)
X_test_cat = pd.DataFrame(
    {name: enc.transform(X_test[name]) for name, enc in cat_encoders.items()}
)

In [None]:
cont_names = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
# No longer used to fit the scaler; retained in case other cells reference it.
df_cont = df_clean[cont_names]

# Fit the scaler on the training split only. Fitting on the whole cleaned
# frame would let the test rows influence the mean/variance used for
# preprocessing, i.e. leak test-set information into the pipeline.
scaler = preprocessing.StandardScaler().fit(X_train[cont_names])

# Standardise both splits with the training-set statistics.
X_train_cont = pd.DataFrame(scaler.transform(X_train[cont_names]), columns=cont_names)
X_test_cont = pd.DataFrame(scaler.transform(X_test[cont_names]), columns=cont_names)

And now, we train our models over the categorical and continuous data separately:

In [None]:
# The label encoders were fitted on the full cleaned frame, while this model
# only sees the training split. Declare the number of categories of every
# column up front so that a category code absent from the training rows is
# still within range (and smoothed) when the model is asked to predict it.
cat_NB = CategoricalNB(
    min_categories=[len(cat_encoders[col].classes_) for col in cat_names]
)
cat_NB.fit(X_train_cat, Y_train)

In [None]:
# Gaussian Naive Bayes trained on the scaled continuous features.
cont_NB = GaussianNB().fit(X_train_cont, Y_train)
cont_NB

In order to predict with both of our models, we can multiply the inferred probabilities from both of our models and choose the class with the greatest likelihood. It is actually implemented using the log probabilities, which are all negative, so instead we find the minimum of their multiplication.

In [None]:
def predict(df):
    """Predict the class of every row of ``df`` with both Naive Bayes models.

    The categorical columns are label-encoded with ``cat_encoders`` and the
    continuous columns are standardised with ``scaler``; the input frame is
    not modified. Each model then yields per-class log-posteriors, which are
    combined under the Naive Bayes independence assumption:

        log P(y | x_cat, x_cont)
            = log P(y | x_cat) + log P(y | x_cont) - log P(y) + const

    Parameters
    ----------
    df : pandas.DataFrame
        Raw (un-encoded, un-scaled) rows containing at least the columns in
        ``cat_names`` and ``cont_names``.

    Returns
    -------
    numpy.ndarray
        The predicted class label for each row, taken from
        ``cont_NB.classes_``.
    """
    encoded = df.copy()
    for col, encoder in cat_encoders.items():
        encoded[col] = encoder.transform(encoded[col])
    encoded[cont_names] = scaler.transform(encoded[cont_names])

    # log probabilities to avoid underflow
    cat_log_ps = cat_NB.predict_log_proba(encoded[cat_names])
    cont_log_ps = cont_NB.predict_log_proba(encoded[cont_names])

    # Multiplying probabilities means *adding* their logs. Both posteriors
    # already include the class prior, so it is subtracted once to avoid
    # counting it twice. The most likely class is the arg-max of the sum.
    combined_log_ps = cat_log_ps + cont_log_ps - cat_NB.class_log_prior_
    return cont_NB.classes_[combined_log_ps.argmax(axis=1)]
predict(df_clean)

And here is our accuracy:

In [None]:
# Score the combined model on the held-out split.
Y_hat = predict(X_test)
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
metrics.accuracy_score(Y_test, Y_hat)