In [6]:
# Load training and prediction data
training_data = TrainingData(["./filteredtweets/hurricane0.json", "./filteredtweets/hurricane1.json"], ["./filteredtweets/weinstein0.json"], train_frac=0.1)
prediction_data = read_json(["./filteredtweets/noMatch0.json", "./filteredtweets/noMatch1.json"])

In [7]:
# Build Multinomial Naive Bayes Classifier
nb = MNBClassifier(training_data, prediction_data)

In [8]:
# Predict our prediction data
nb.predict()

In [9]:
# Print average train accuracy from kfold cross validation with k=5
print("Mean accuracy on training data: ", nb.mean_accuracy())

Mean accuracy on training data:  0.9931223077857171


In [10]:
print("Total training tweets: ", len(training_data.X))
print("Total training Hurricane Harvey: ", len(training_data.hh_train))
print("Total training Harvey Weinstein: ", len(training_data.hw_train))
print("Total prediction tweets: ", len(nb.prediction_data))
print("Total predicted as Hurricane Harvey: ", len(nb.hhind))
print("Total predicted as Harvey Weinstein: ", len(nb.hwind))

Total training tweets:  66156
Total training Hurricane Harvey:  51374
Total training Harvey Weinstein:  14782
Total prediction tweets:  1171276
Total predicted as Hurricane Harvey:  971421
Total predicted as Harvey Weinstein:  199855


In [11]:
print(nb.hhind[30000:30100])
print(nb.hwind[100000:100100])
print(nb.prediction_data[35807])
print(nb.prediction_data[1070460])

[33431 33432 33433 33434 33435 33436 33438 33439 33440 33443 33444 33446
 33447 33449 33450 33451 33453 33454 33455 33456 33457 33458 33460 33463
 33465 33468 33475 33476 33480 33482 33483 33484 33485 33488 33489 33490
 33492 33494 33495 33497 33498 33499 33500 33504 33506 33507 33508 33510
 33512 33514 33515 33517 33519 33520 33521 33522 33523 33524 33525 33526
 33527 33528 33529 33530 33535 33537 33538 33539 33544 33545 33546 33549
 33550 33551 33552 33554 33555 33556 33557 33558 33559 33560 33561 33562
 33563 33564 33565 33567 33568 33569 33570 33571 33572 33573 33575 33576
 33577 33578 33580 33581]
[1070376 1070377 1070378 1070379 1070380 1070381 1070382 1070383 1070384
 1070385 1070386 1070387 1070388 1070389 1070390 1070391 1070392 1070393
 1070394 1070395 1070396 1070397 1070398 1070399 1070400 1070401 1070402
 1070403 1070404 1070405 1070406 1070407 1070408 1070409 1070410 1070411
 1070412 1070413 1070414 1070415 1070416 1070417 1070418 1070419 1070420
 1070421 1070422 1070423 

In [12]:
# Top words for each classifier
tophh, tophw = nb.top_features(n=10)
print("Top Hurricane Harvey features: ", tophh)
print("Top Harvey Weinstein features: ", tophw)

Top Hurricane Harvey features:  ['hashtagtexassearchandrescue' 'hashtaghelpandhopeforhouston'
 'hashtaghelpforharvey' 'hashtaghelpforhouston' 'hashtaghelpharvey'
 'hashtaghelphouston' 'hashtaghelpincrisis' 'hashtaghelping'
 'hashtaghelpacripple' 'hashtaghelpinghand']
Top Harvey Weinstein features:  ['men' 'trump' 'rt' 'spacey' 'kevin' 'did' 'brad' 'harvey' 'weinstein'
 'fuck']


## Helper Classes & Functions

In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import cross_validate, StratifiedKFold, KFold
import re
import numpy as np
import json

In [2]:
def read_json(files):
    data = []
    for file in files:
        with open(file) as json_data:
            data = data + [' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", tweet["text"].replace("#", "hashtag")).split()) for tweet in json.load(json_data)]
    return data

In [3]:
class MNBClassifier(object):
    def __init__(self, training_data, prediction_data):
        self.training_data = training_data
        self.prediction_data = prediction_data+training_data.hh_test+training_data.hw_test
        self.nb = None
        self.traincv = None
        self.X_train_tfidf = None
        self.hhind = None
        self.hwind = None

    def train(self):
        self.nb = MultinomialNB()
        self.traincv = CountVectorizer(stop_words='english')
        X_train_counts = self.traincv.fit_transform(self.training_data.X)
        self.X_train_tfidf = TfidfTransformer().fit_transform(X_train_counts)
        self.nb.fit(self.X_train_tfidf, self.training_data.y)
    
    def mean_accuracy(self):
        if self.nb is None:
            self.train()
        nbValidation = Cross_Validator(estimator=self.nb, X=self.X_train_tfidf, y=self.training_data.y)
        nbValidation.run()
        return nbValidation.get_mean_score()
        
    def top_features(self, n=10):
        if self.nb is None:
            self.train() 
        feature_names = np.asarray(self.traincv.get_feature_names())
        bottom = np.argsort(self.nb.coef_[0])[:n]
        top = np.argsort(self.nb.coef_[0])[-n:]

        return feature_names[bottom], feature_names[top]
    
    def predict(self):
        if self.nb is None:
            self.train()
        cv = CountVectorizer(stop_words='english', vocabulary=self.traincv.vocabulary_)
        predict_counts = cv.transform(self.prediction_data)
        predict_tfidf = TfidfTransformer().fit_transform(predict_counts)
        yhat = self.nb.predict(predict_tfidf)
        self.hhind = np.where(yhat == 0)[0]
        self.hwind = np.where(yhat == 1)[0]    

In [4]:
class TrainingData(object):
    def __init__(self, hurricane_filepath, weinstein_filepath, train_frac=1):
        self.train_frac = train_frac
        self.hh_train, self.hh_test = self.load_data(hurricane_filepath)
        self.hw_train, self.hw_test = self.load_data(weinstein_filepath)
        self.X, self.y = self.get_training_data()
        
    def load_data(self, file):
        raw = read_json(file)
        return raw[:int(len(raw)*self.train_frac)], raw[int(len(raw)*self.train_frac):len(raw)]
        
    def get_training_data(self):
        return np.array(self.hh_train + self.hw_train), np.array(([0]*len(self.hh_train)) + ([1]*len(self.hw_train)))

In [5]:
class Cross_Validator(object):
    def __init__(self, estimator, X, y, folds=5, trials=1):
        self.estimator = estimator
        self.X = X
        self.y = y
        self.folds = folds
        self.scores = None
        self.trials = trials
        
    def run(self):
        all_test_scores = []

        for ii in range(self.trials):
            scores = cross_validate(self.estimator, self.X, self.y, cv=StratifiedKFold(n_splits=self.folds, shuffle=True, random_state=None), scoring="accuracy")
            all_test_scores = all_test_scores + list(scores["test_score"])
        self.scores = all_test_scores
        
    def get_mean_score(self):
        if self.scores is None: self.run()
        return np.mean(self.scores)
    
    def print_scores(self):
        if self.scores is None: self.run()
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8,4))
        pd.DataFrame(self.scores).hist(ax=ax, color=mycolors["blue"], edgecolor="white")
        ax.set_axisbelow(True)
        ax.set_title("Bootstrapped Cross-Val Accuracies")