In [14]:
# Import some dependencies
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import pandas as pd

# GET target for the live dataset: the JSON endpoint that is downloaded when
# the dataset is loaded with `pd.read_json(url)`.
# NOTE(review): the endpoint serves live data, so the scores printed further
# down may not be reproducible on a later run -- consider saving a snapshot.
url = 'https://serene-forest-99801.herokuapp.com/api/allData'

In [15]:
# Text-to-feature transformers used to turn the raw writing samples into a
# numeric matrix:
#   * `vectorizer` turns text into token counts
#   * `tfidf` re-weights a count matrix with TF-IDF
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

vectorizer, tfidf = CountVectorizer(), TfidfTransformer()

In [16]:
# Import the live dataset: fetch the JSON served at `url` and parse it into a
# DataFrame (requires network access).
# Later cells read the `race` and `writing` columns, so the payload must
# provide both.
dataset = pd.read_json(url)

In [17]:
# An attempt to massage the dataset to produce better results:
# collapse the `race` labels into two buckets ('White' / 'PoC').
# * Needed so each category has 10+ entries
#   (presumably samples per class, to support 10-fold CV -- TODO confirm)

# Regex for everything not in a defined bucket.
# It matches a whole value that is NOT exactly 'White'; because of the `.`
# followed by `.+`, the value must be at least two characters long, so
# single-character labels are left untouched.
regex = '^((?!(^White$)).).+$'

# Remove 'unspecified' rows since their label is unknowable.
# `.copy()` makes the filtered frame independent of the original, so the
# column assignment below writes to a real frame rather than to a slice
# (this is what avoids pandas' SettingWithCopyWarning).
dataset = dataset[dataset.race != 'unspecified'].copy()

# Bucket every remaining non-'White' label into 'PoC' for model evaluation.
dataset['race'] = dataset['race'].replace(
    to_replace=regex,
    value='PoC',
    regex=True
)

In [18]:
# Build the `X` and `y` variables used for training / testing:
#   X -- TF-IDF-weighted token-count matrix of the `writing` column
#   y -- the bucketed `race` label for each row
# NOTE(review): both transformers are fit on the full dataset before
# cross-validation, so every fold shares vocabulary / IDF statistics with its
# held-out rows -- consider fitting inside a Pipeline per fold.
term_counts = vectorizer.fit_transform(dataset['writing'])
X = tfidf.fit_transform(term_counts)

y = dataset['race']

In [19]:
# Test Multinomial Naive Bayes with K-Fold (10-fold) cross-validation.
# Prints the mean accuracy over the 10 folds.
from sklearn.naive_bayes import MultinomialNB
multiNB = MultinomialNB()
multiNB_scores = cross_val_score(multiNB, X, y, cv=10, scoring='accuracy')
print(multiNB_scores.mean())

0.668650793651


In [20]:
# Test Bernoulli Naive Bayes with K-Fold (10-fold) cross-validation.
# Prints the mean accuracy over the 10 folds.
# NOTE(review): BernoulliNB binarizes its input (default threshold 0.0), so
# the TF-IDF weights in `X` are effectively reduced to term presence/absence.
from sklearn.naive_bayes import BernoulliNB
bernoulliNB = BernoulliNB()
bernoulliNB_scores = cross_val_score(bernoulliNB, X, y, cv=10, scoring='accuracy')
print(bernoulliNB_scores.mean())

0.569642857143


In [21]:
# Evaluate Logistic Regression (default settings) with 10-fold
# cross-validation and print the mean accuracy across the folds.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg_scores = cross_val_score(logreg, X, y, cv=10, scoring='accuracy')
print(logreg_scores.mean())

0.60753968254
