In [12]:
# Load training and prediction data
training_data = TrainingData(["./filteredtweets/hurricane0.json", "./filteredtweets/hurricane1.json"], ["./filteredtweets/weinstein0.json"], train_frac=0.1)
prediction_data = read_json(["./filteredtweets/noMatch0.json", "./filteredtweets/noMatch1.json"])

In [13]:
# Build Multinomial Naive Bayes Classifier
nb = MNBClassifier(training_data, prediction_data)

In [14]:
# Predict our prediction data
nb.predict()

In [10]:
nb.to_geojson()

KeyError: 'coordinates'

In [None]:
nb.to_json()

## Helper Classes & Functions

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import cross_validate, StratifiedKFold, KFold
import re
import numpy as np
import json
import pandas as pd
import codecs

In [8]:
def read_json(files):
    data = []
    for file in files:
        with open(file) as json_data:
            for tweet in json.load(json_data):
                tweet["text"] = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", tweet["text"].replace("#", "hashtag")).split())
                data.append(tweet)
    return data

In [9]:
class MNBClassifier(object):
    def __init__(self, training_data, prediction_data):
        self.raw_train = np.array(training_data)
        self.raw_test = np.array(prediction_data+training_data.hh_test+training_data.hw_test)
        self.X_train, self.y_train = [tweet["text"] for tweet in training_data.X], training_data.y
        self.prediction_data = [tweet["text"] for tweet in prediction_data+training_data.hh_test+training_data.hw_test]
        self.nb = None
        self.traincv = None
        self.X_train_tfidf = None
        self.hhind = None
        self.hwind = None

    def train(self):
        self.nb = MultinomialNB()
        self.traincv = CountVectorizer(stop_words='english')
        X_train_counts = self.traincv.fit_transform(self.X_train)
        self.X_train_tfidf = TfidfTransformer().fit_transform(X_train_counts)
        self.nb.fit(self.X_train_tfidf, self.y_train)
    
    def mean_accuracy(self):
        if self.nb is None:
            self.train()
        nbValidation = Cross_Validator(estimator=self.nb, X=self.X_train_tfidf, y=self.y_train)
        nbValidation.run()
        return nbValidation.get_mean_score()
        
    def top_features(self, n=10):
        if self.nb is None:
            self.train() 
        feature_names = np.asarray(self.traincv.get_feature_names())
        bottom = np.argsort(self.nb.coef_[0])[:n]
        top = np.argsort(self.nb.coef_[0])[-n:]

        return feature_names[bottom], feature_names[top]
    
    def predict(self):
        if self.nb is None:
            self.train()
        cv = CountVectorizer(stop_words='english', vocabulary=self.traincv.vocabulary_)
        predict_counts = cv.transform(self.prediction_data)
        predict_tfidf = TfidfTransformer().fit_transform(predict_counts)
        yhat = self.nb.predict(predict_tfidf)
        self.hhind = np.where(yhat == 0)[0]
        self.hwind = np.where(yhat == 1)[0]
        
    def to_csv(self, hhloc="./classifiedtweets/hhtweets", hwloc="./classifiedtweets/hwtweets"):
        hhtweets = self.raw_test[self.hhind]
        hwtweets = self.raw_test[self.hwind]
        
        hhtweetsdate = pd.DataFrame([{"text": tweet["text"], "date": tweet["date"]} for tweet in hhtweets])
        hhtweetsloc = pd.DataFrame([{"text": tweet["text"], "location": tweet["location"]} for tweet in hhtweets])
        
        hwtweetsdate = pd.DataFrame([{"text": tweet["text"], "date": tweet["date"]} for tweet in hwtweets])
        hwtweetsloc = pd.DataFrame([{"text": tweet["text"], "location": tweet["location"]} for tweet in hwtweets])
        
        hhtweetsdate.to_csv(hhloc + 'date' + '.csv', index=False)
        hhtweetsloc.to_csv(hhloc + 'location' + '.csv', index=False)
        
        hwtweetsdate.to_csv(hwloc + 'date' + '.csv', index=False)
        hwtweetsloc.to_csv(hwloc + 'location' + '.csv', index=False)
        
    def to_geojson(self, hhloc="./classifiedtweets/hhtweets", hwloc="./classifiedtweets/hwtweets"):
        hhtweets = self.raw_test[self.hhind]
        hwtweets = self.raw_test[self.hwind]
        
        geo_data = {
            "type" : "FeatureCollection",
            "features": []
        }
        
        for tweet in hhtweets [1:]:
            geo_json_feature = {
                "type": "Feature",
                "geometry": tweet[''],
                "properties": {
                    "text": tweet['text'],
                    "created_at": tweet['created_at']
                }
            }
            geo_data['features'].append(geo_json_feature)
 
        # Save geo data
        with open('geo_data.json', 'w') as fout:
            fout.write(json.dumps(geo_data, indent=4))
    
    def to_json(self, hhloc="./classifiedtweets/hhtweets", hwloc="./classifiedtweets/hwtweets"):
        hhtweets = self.raw_test[self.hhind]
        hwtweets = self.raw_test[self.hwind]
        b = hhtweets.tolist() # nested lists with same data, indices
        file_path = "geo_list.json" ## your path variable
        json.dump(b, codecs.open(file_path, 'w', encoding='utf-8'), separators=(',', ':'), sort_keys=True, indent=4) ### this saves the array in .json forma

In [10]:
class TrainingData(object):
    def __init__(self, hurricane_filepath, weinstein_filepath, train_frac=1):
        self.train_frac = train_frac
        self.hh_train, self.hh_test = self.load_data(hurricane_filepath)
        self.hw_train, self.hw_test = self.load_data(weinstein_filepath)
        self.X, self.y = self.get_training_data()
        
    def load_data(self, file):
        raw = read_json(file)
        return raw[:int(len(raw)*self.train_frac)], raw[int(len(raw)*self.train_frac):len(raw)]
        
    def get_training_data(self):
        return np.array(self.hh_train + self.hw_train), np.array(([0]*len(self.hh_train)) + ([1]*len(self.hw_train)))

In [11]:
class Cross_Validator(object):
    def __init__(self, estimator, X, y, folds=5, trials=1):
        self.estimator = estimator
        self.X = X
        self.y = y
        self.folds = folds
        self.scores = None
        self.trials = trials
        
    def run(self):
        all_test_scores = []

        for ii in range(self.trials):
            scores = cross_validate(self.estimator, self.X, self.y, cv=StratifiedKFold(n_splits=self.folds, shuffle=True, random_state=None), scoring="accuracy")
            all_test_scores = all_test_scores + list(scores["test_score"])
        self.scores = all_test_scores
        
    def get_mean_score(self):
        if self.scores is None: self.run()
        return np.mean(self.scores)
    
    def print_scores(self):
        if self.scores is None: self.run()
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8,4))
        pd.DataFrame(self.scores).hist(ax=ax, color=mycolors["blue"], edgecolor="white")
        ax.set_axisbelow(True)
        ax.set_title("Bootstrapped Cross-Val Accuracies")