In [1]:
import numpy as np
import json

In [2]:
class TextNB:
    """
    Multinomial Naive Bayes classifier for whitespace-tokenized text.

    Labels are used directly as row indices into the count arrays, so they
    must be the integers 0 .. num_classes-1.
    """

    def __init__(self, X_train, y_train, alpha=1.0):
        """
        :param X_train: a list or ndarray of text strings to use as training data 
        :param y_train: a list or ndarray of integer labels (0 .. num_classes-1)
                        associated with the text data 
        :param alpha: the Laplace smoothing parameter 
        """
        # store training data 
        self.X_train = X_train 
        self.y_train = y_train 

        # store smoothing parameter
        self.alpha = alpha 

        # get number of classes 
        self.num_classes = len(set(y_train))

        # start with empty vocabulary and zeroed counts
        self._reset_counts()

    def _reset_counts(self):
        """Reset the vocabulary and all learned counts to their untrained state."""
        # vocab maps each word to its column index in feature_counts
        self.vocab = dict()

        # number of training documents seen per class
        self.class_counts = np.zeros(self.num_classes, dtype=int)

        # total number of training documents seen
        self.total_docs = 0

        # per-class word counts; resized to (num_classes, len(vocab)) by train()
        self.feature_counts = np.zeros((self.num_classes, 0), dtype=int)

    def train(self):
        """
        Learn the vocabulary, class_counts, and feature counts from the training data.

        Calling train() more than once is safe: all counts are rebuilt from
        scratch on every call instead of being accumulated.
        """
        # Without this reset, re-running train() would double class_counts and
        # total_docs and skew the class priors.
        self._reset_counts()

        # Tokenize each document once and reuse the tokens in both passes.
        tokenized = [text.split() for text in self.X_train]

        # First pass: build the vocabulary and count documents per class.
        for words, label in zip(tokenized, self.y_train):
            for word in words:
                if word not in self.vocab:
                    # next free column index
                    self.vocab[word] = len(self.vocab)
            self.class_counts[label] += 1
            self.total_docs += 1

        # The vocabulary size is known now, so the count matrix can be allocated.
        self.feature_counts = np.zeros((self.num_classes, len(self.vocab)), dtype=int)

        # Second pass: count word occurrences per class.
        for words, label in zip(tokenized, self.y_train):
            for word in words:
                self.feature_counts[label][self.vocab[word]] += 1

    def predict_log_score(self, text_str):
        """
        Get the log-probability score for each class
        for a query string

        Words that are not in the training vocabulary are ignored.

        :param text_str: a single string of text to compute the log_score for 
        :return: ndarray of length num_classes holding, for each class, the
                 smoothed log prior plus the sum of smoothed word log-likelihoods
        """
        class_scores = np.zeros(self.num_classes) 

        # Resolve known words to their columns once; this is the same for every class.
        columns = [self.vocab[word] for word in text_str.split() if word in self.vocab]
        vocab_size = len(self.vocab)
        prior_denominator = self.total_docs + (self.alpha * self.num_classes)

        for c in range(0, self.num_classes):
            # The smoothed total word count for class c does not depend on the
            # word, so compute it once per class rather than once per word.
            word_denominator = np.sum(self.feature_counts[c]) + (self.alpha * vocab_size)
            for col in columns:
                class_scores[c] += np.log((self.feature_counts[c][col] + self.alpha) / word_denominator)
            # smoothed class prior
            class_scores[c] += np.log((self.class_counts[c] + self.alpha) / prior_denominator)

        return class_scores

    def predict(self, text_list):
        """
        Predict the class of each example in text_list  

        :param text_list: a list or ndarray of text strings to make predictions on 
        :return: integer ndarray with the highest-scoring class for each text
        """
        yhat = np.zeros(len(text_list), dtype=int)

        for ii, x in enumerate(text_list):
            class_scores = self.predict_log_score(x)
            yhat[ii] = np.argmax(class_scores)

        return yhat 

    def accuracy(self, text_list, y_true):
        """
        Make predictions on texts in text_list and compute accuracy relative to 
        true labels in y_true 

        :param text_list: a list or ndarray of text strings to make predictions on 
        :param y_true: a list or ndarray of true labels associated with the text data 
        :return: fraction of predictions that equal the corresponding true label
        """
        yhat = self.predict(text_list)
        return np.sum(yhat == y_true)/len(y_true)

In [93]:
# Load the hurricane and Weinstein tweet JSON files, label them (0 = hurricane,
# 1 = weinstein), and randomly split them into training and validation sets.
# NOTE: TrainingData is defined in the "Helper Classes & Functions" cell further
# down this notebook; on a fresh kernel that cell must be run before this one.
training_data = TrainingData("./filteredtweets/hurricane.json", "./filteredtweets/weinstein.json")

In [94]:
# Create the Naive Bayes classifier from the training split, using the default
# Laplace smoothing (alpha=1.0). No counts are learned until train() is called.
nb = TextNB(training_data.X_train, training_data.y_train)

In [96]:
# Train the model: build the vocabulary and the per-class document and word counts
nb.train()

In [98]:
# Check accuracy on the training data: the fraction of training tweets whose
# predicted label equals the true label
nb.accuracy(training_data.X_train, training_data.y_train)

0.995475429310134

In [99]:
# Check accuracy on the validation split, which was held out from the data
# passed to the classifier for training
nb.accuracy(training_data.x_valid, training_data.y_valid)

0.9965354202331458

In [100]:
#TODO:
#load data to classify
#predict labels on new data

## Helper Classes & Functions

In [92]:
class TrainingData(object):
    """
    Loads two JSON files of tweets, labels them (0 = hurricane, 1 = weinstein),
    and randomly splits them into training and validation sets.

    After construction the splits are available as X_train / y_train and
    x_valid / y_valid (X_valid is an alias of x_valid).
    """

    def __init__(self, hurricane_filepath, weinstein_filepath, train_frac=0.7, seed=None):
        """
        :param hurricane_filepath: path to a JSON file holding a list of tweet
                                   objects, each with a "text" key (label 0)
        :param weinstein_filepath: path to a JSON file holding a list of tweet
                                   objects, each with a "text" key (label 1)
        :param train_frac: float indicating fraction of data to train on 
        :param seed: optional integer seed for a reproducible split; when None
                     the global numpy random state is used
        """
        self.train_frac = train_frac
        self.seed = seed
        self.hurricane = self.read_json(hurricane_filepath)
        self.weinstein = self.read_json(weinstein_filepath)
        X, y = self.get_training_data()
        self.X_train, self.x_valid, self.y_train, self.y_valid = self.train_valid_split(X, y)
        # Alias matching the capitalization of X_train; x_valid is kept for
        # existing callers.
        self.X_valid = self.x_valid

    def read_json(self, filepath):
        """
        Parse the JSON file at filepath and return the decoded object.

        :param filepath: path to the JSON file to read
        """
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(filepath, encoding="utf-8") as json_data:
            return json.load(json_data) 

    def get_training_data(self):
        """
        Build the full labelled data set from the loaded tweets.

        :return: tuple (X, y) where X is an ndarray of tweet texts (hurricane
                 tweets first, then weinstein tweets) and y is an ndarray of
                 matching labels (0 for hurricane, 1 for weinstein)
        """
        hurricane_texts = [tweet["text"] for tweet in self.hurricane]
        weinstein_texts = [tweet["text"] for tweet in self.weinstein]
        labels = ([0] * len(hurricane_texts)) + ([1] * len(weinstein_texts))
        return np.array(hurricane_texts + weinstein_texts), np.array(labels)

    def train_valid_split(self, X, y):
        """
        Randomly splits the data into training and validation sets 

        The first int(n * self.train_frac) shuffled examples form the training
        set and the remainder form the validation set.

        :param X: ndarray (or list) of n examples
        :param y: ndarray (or list) of n labels/targets  
        :return: tuple (X_train, X_valid, y_train, y_valid); the label arrays
                 are cast to int
        :raises ValueError: if X and y do not have the same length
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                "X and y must have the same length (got %d and %d)" % (len(X), len(y))
            )

        # Shuffle both arrays with one shared permutation of row indices. This
        # keeps texts and labels aligned without packing them into a single
        # string array, and works for empty input.
        seed = getattr(self, "seed", None)
        rng = np.random if seed is None else np.random.RandomState(seed)
        order = rng.permutation(len(X))
        X = X[order]
        y = y[order]

        # Split based on train_frac percentage
        cut = int(len(X) * self.train_frac)
        X_train, X_valid = X[:cut], X[cut:]
        y_train, y_valid = y[:cut], y[cut:]

        return np.array(X_train), np.array(X_valid), np.array(y_train).astype(int), np.array(y_valid).astype(int)