## Baselines

In [29]:
from __future__ import print_function 
import numpy as np
import tensorflow as tf 

import os
import sys
import time 
import pickle 

from sklearn import svm 

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, Dense, Flatten, Lambda
from keras.layers import Conv1D, MaxPooling1D, Embedding

import keras.backend as K

In [30]:
class Classifier(object):
    """Shared base class for the baseline classifiers.

    Stores the training hyper-parameters and, optionally, loads the raw
    (pickled) data and/or the embedded (``.npz``) data from disk.

    Attributes set by ``__init__``:
        batch_size, epochs: values passed by the caller.
        x_train, x_val, x_test, y_train, y_val, y_test: ``None`` until
            ``_load_embedded_data`` fills them in.
        model: ``None`` until a subclass builds a model.
    """
    def __init__(self, batch_size, epochs, raw_data_path=None, embedded_data_path=None):
        """Store hyper-parameters and load whichever datasets were requested.

        Args:
            batch_size: stored on the instance as ``self.batch_size``.
            epochs: stored on the instance as ``self.epochs``.
            raw_data_path: optional path to the pickled raw data; loaded
                with ``_load_raw_data`` when not ``None``.
            embedded_data_path: optional path to the ``.npz`` archive; loaded
                with ``_load_embedded_data`` when not ``None``.
        """
        self.batch_size = batch_size
        self.epochs = epochs

        # data placeholders
        self.x_train = None
        self.x_val = None
        self.x_test = None
        self.y_train = None
        self.y_val = None
        self.y_test = None

        # load data selectively (identity check: `!= None` can misbehave on
        # objects that overload comparison)
        if raw_data_path is not None:
            self._load_raw_data(raw_data_path)
        if embedded_data_path is not None:
            self._load_embedded_data(embedded_data_path)

        # variable to hold the model
        self.model = None

    def set_batch_size(self, new_batch_size):
        """Replace the stored batch size."""
        self.batch_size = new_batch_size

    def set_epochs(self, new_epochs):
        """Replace the stored number of epochs."""
        self.epochs = new_epochs

    def _load_raw_data(self, raw_data_path):
        """Load the pickled raw data and copy its fields onto the instance.

        saved data format
        processed_data = {
            'texts': filtered_texts,
            'scores': scores,
            'scores_dict':scores_dict,
            'count': count,
            'embeddings_index': embeddings_index
        }

        Sets ``self.texts``, ``self.scores``, ``self.scores_dict`` and
        ``self.count``. 'embeddings_index' is not read.

        Raises:
            IOError/OSError: if the file cannot be opened.
            KeyError: if one of the expected keys is missing.
        """
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this at files produced by your own preprocessing step.
        with open(raw_data_path, 'rb') as f:
            raw_data = pickle.load(f)

        self.texts = raw_data['texts']
        self.scores = raw_data['scores']
        self.scores_dict = raw_data['scores_dict']
        self.count = raw_data['count']
        # The attribute used to be stored under the misspelled name `coun`;
        # keep that alias so existing code reading it still works.
        self.coun = self.count
        print('loaded raw processed data')

    def _load_embedded_data(self, embedded_data_path, validation_split=0.1):
        """Load the embedded dataset and carve a validation set out of it.

        Reads the scalars 'num_labels', 'num_words', 'embedding_dim' and
        'max_sequence_length', the arrays 'x_train', 'y_train', 'x_test',
        'y_test' and 'embedding_matrix' from the ``.npz`` archive. The
        training rows are shuffled (x and y with the same permutation, drawn
        from NumPy's global RNG) and the last
        ``int(validation_split * n_rows)`` of them become the validation set.

        Args:
            embedded_data_path: path to the ``.npz`` archive.
            validation_split: fraction of the training rows held out for
                validation; must lie in [0, 1].

        Raises:
            ValueError: if ``validation_split`` is outside [0, 1].
            KeyError: if the archive lacks one of the expected entries.
        """
        if not 0.0 <= validation_split <= 1.0:
            raise ValueError(
                'validation_split must be in [0, 1], got %r' % (validation_split,))

        # f = np.load('data_and_embedding100.npz')
        # The context manager closes the underlying zip file once the arrays
        # have been read into memory.
        with np.load(embedded_data_path) as f:
            self.num_labels = int(f['num_labels'])
            self.num_words = int(f['num_words'])
            self.embedding_dim = int(f['embedding_dim'])
            self.max_sequence_length = int(f['max_sequence_length'])

            self.x = f['x_train']
            self.y = f['y_train']
            self.x_test = f['x_test']
            self.y_test = f['y_test']

            self.embedding_matrix = f['embedding_matrix']

        indices = np.arange(self.x.shape[0])
        np.random.shuffle(indices)
        self.x = self.x[indices]
        self.y = self.y[indices]
        num_validation_samples = int(validation_split * self.x.shape[0])

        # Slice from an explicit split index. The previous `[:-n]` / `[-n:]`
        # form inverted the split when n == 0 (`[:-0]` is empty and `[-0:]`
        # is the whole array).
        split_at = self.x.shape[0] - num_validation_samples
        self.x_train = self.x[:split_at]
        self.y_train = self.y[:split_at]
        self.x_val = self.x[split_at:]
        self.y_val = self.y[split_at:]
        print('loaded embedded datasets')
    
    

In [31]:
class MajorityClass(Classifier):
    """Baseline that always answers with a single label.

    After ``build_majority_predictor`` runs, ``self.model`` is that label
    rather than a trained estimator.
    """
    def __init__(self, batch_size, epochs, raw_data_path='data/raw_processed_data.pkl'):
        """Initialise the base class with the raw (pickled) data only."""
        super(MajorityClass, self).__init__(
            batch_size,
            epochs,
            raw_data_path=raw_data_path,
        )

    def build_majority_predictor(self):
        """Pick the label with the largest value in ``self.scores_dict``.

        The result is stored in ``self.model``. When several labels share the
        largest value, the one visited last while iterating the dict is kept.
        """
        label_counts = self.scores_dict
        top_count = max(list(label_counts.values()))
        for label, occurrences in label_counts.items():
            if occurrences == top_count:
                self.model = label

    def predict_majority_predictor(self, test_data):
        """Return one copy of the majority label per row of ``test_data``.

        Only ``test_data.shape[0]`` is used; the contents are ignored.
        """
        n_samples = test_data.shape[0]
        return self.model * np.ones(n_samples)

    def evaluate_majority_predictor(self):
        """Placeholder: evaluation is not implemented for this baseline."""
        return None

    def save_model(self):
        """Pickle the majority label to 'models/majority_class.pkl'."""
        with open('models/majority_class.pkl', 'wb') as out_file:
            pickle.dump(self.model, out_file)
    

In [32]:
class LogisticRegression(Classifier):
    """Softmax regression on the mean of non-trainable word embeddings.

    The Keras model built by ``build_logistic_regression`` is:
    integer sequence -> Embedding (``trainable=False``) -> mean over axis 1
    -> Dense(``num_labels``, softmax).
    """
    def __init__(self, batch_size, epochs, raw_data_path=None, embedded_data_path='data/data_and_embedding100.npz', embedding_dim=100):
        """Load the data and prepare the embedding layer.

        Args:
            batch_size: mini-batch size passed to ``model.fit``.
            epochs: number of epochs passed to ``model.fit``.
            raw_data_path: optional path to the pickled raw data. It is now
                forwarded to the base class (it used to be ignored).
            embedded_data_path: path to the ``.npz`` archive holding the
                datasets and the embedding matrix.
            embedding_dim: width of the embedding vectors; must equal the
                number of columns of the loaded embedding matrix.

        Raises:
            ValueError: if ``embedding_dim`` disagrees with the loaded
                embedding matrix.
        """
        super(LogisticRegression, self).__init__(batch_size, epochs, raw_data_path=raw_data_path, embedded_data_path=embedded_data_path)

        # Fail early with a clear message instead of letting Keras reject the
        # weights later with a shape error.
        matrix_dim = self.embedding_matrix.shape[1]
        if embedding_dim != matrix_dim:
            raise ValueError(
                'embedding_dim=%r does not match the loaded embedding matrix '
                '(%r columns)' % (embedding_dim, matrix_dim))

        # construct an embedding layer (only necessary for logistic regression)
        self.embedding_dim = embedding_dim
        self.embedding_layer = self._construct_embedding_layer()

    def _construct_embedding_layer(self):
        """Return an ``Embedding`` layer initialised from ``self.embedding_matrix``.

        The layer is created with ``trainable=False``.
        """
        return Embedding(self.num_words,
                        self.embedding_dim,
                        weights=[self.embedding_matrix],
                        input_length=self.max_sequence_length,
                        trainable=False)

    def build_logistic_regression(self):
        """Assemble the Keras model, print its summary, store it in ``self.model``."""
        sequence_input = Input(shape=(self.max_sequence_length, ), dtype='int32')
        embedded_sequences = self.embedding_layer(sequence_input)
        x = Lambda(self.embedding_mean)(embedded_sequences)
        preds = Dense(self.num_labels, activation='softmax')(x)

        model = Model(sequence_input, preds)
        model.summary()
        self.model = model

    def train_logistic_regression(self, loss='categorical_crossentropy', optimizer='adam', ):
        """Compile and fit ``self.model`` on the training split.

        ``build_logistic_regression`` must have been called first. The
        validation split is passed as ``validation_data`` and the elapsed
        wall-clock time is printed afterwards.

        Args:
            loss: loss passed to ``model.compile``.
            optimizer: optimizer passed to ``model.compile``.
        """
        self.model.compile(loss=loss,
              optimizer=optimizer,
              metrics=['acc'])

        start_time = time.time()

        self.model.fit(self.x_train, self.y_train,
                  batch_size=self.batch_size,
                  epochs=self.epochs,
                  validation_data=(self.x_val, self.y_val))

        print("Training time: ", time.time() - start_time)

    def predict_logistic_regression(self, test_data):
        """Return ``self.model.predict(test_data)``."""
        predictions = self.model.predict(test_data)
        return predictions

    def evaluate_logistic_regression(self, x_test_data, y_test_data):
        """Evaluate the model, print the accuracy and return it.

        Args:
            x_test_data: inputs passed to ``model.evaluate``.
            y_test_data: targets passed to ``model.evaluate``.

        Returns:
            The second entry of ``model.evaluate``'s result, i.e. the 'acc'
            metric requested at compile time.
        """
        res = self.model.evaluate(x_test_data, y_test_data)
        accuracy = res[1]  # model.metrics_names[1] is acc
        print(accuracy)
        return accuracy

    def embedding_mean(self, x):
        """Average ``x`` over axis 1 (used inside the ``Lambda`` layer)."""
        return tf.reduce_mean(x, axis=1)

    def save_model(self):
        """Save the model to 'models/logistic_regression.h5'."""
        self.model.save('models/logistic_regression.h5')


In [35]:
class SVM(Classifier):
    """Linear SVM baseline on bag-of-words counts or mean word embeddings.

    ``model_type`` selects the document representation:
      * 'bow'       -> per-document word-count vectors (``x_*_bow``)
      * 'embedding' -> per-document mean embedding vectors (``x_*_embedded``)

    The derived features are cached as ``.npz`` archives under 'data/' so
    they are only computed once.
    """

    # cache archive per representation; np.savez always writes a '.npz' file
    _CACHE_PATHS = {
        'bow': 'data/svm_bow_data.npz',
        'embedding': 'data/svm_embedding_data.npz',
    }

    def __init__(self, batch_size, epochs=50000, raw_data_path=None,
                 embedded_data_path='data/data_and_embedding100.npz', model_type='embedding'):
        """Load the datasets and prepare the SVM feature matrices.

        Args:
            batch_size: stored by the base class (not used by the SVM itself).
            epochs: used as ``max_iter`` of ``LinearSVC``.
            raw_data_path: optional path to the pickled raw data. It is now
                forwarded to the base class (it used to be ignored).
            embedded_data_path: path to the ``.npz`` archive with the
                datasets and the embedding matrix.
            model_type: 'bow' or 'embedding'.

        Raises:
            ValueError: if ``model_type`` is not 'bow' or 'embedding'.
        """
        if model_type not in self._CACHE_PATHS:
            raise ValueError(
                "model_type must be 'bow' or 'embedding', got %r" % (model_type,))

        super(SVM, self).__init__(batch_size, epochs=epochs, raw_data_path=raw_data_path, embedded_data_path=embedded_data_path)
        self.model_type = model_type   # bow (bag of words) or embedding

        # load or construct dataset for SVM
        self._construct_SVM_data()

    def _construct_SVM_data(self):
        """Load the cached SVM features, or build and cache them.

        Any failure to read the cache (missing file, missing entry, corrupt
        archive) is reported and followed by a rebuild from the loaded
        datasets.
        """
        try:
            self._load_SVM_data()
        except Exception as err:  # best effort: fall back to rebuilding
            print('could not load cached SVM data (%s); rebuilding' % (err,))
            self._build_SVM_data()
            self._save_SVM_data()

    def _load_SVM_data(self):
        """Read the cached features and labels for the current ``model_type``."""
        # np.savez appends '.npz' to the name it is given, so the archive must
        # be opened with that extension; without it the cache was never found.
        with np.load(self._CACHE_PATHS[self.model_type]) as f:
            if self.model_type == 'bow':
                self.x_train_bow = f['x_train_bow']
                self.x_val_bow = f['x_val_bow']
                self.x_test_bow = f['x_test_bow']
            elif self.model_type == 'embedding':
                self.x_train_embedded = f['x_train_embedded']
                self.x_val_embedded = f['x_val_embedded']
                self.x_test_embedded = f['x_test_embedded']

            self.y_train_svm = f['y_train_svm']
            self.y_val_svm = f['y_val_svm']
            self.y_test_svm = f['y_test_svm']

    def _build_SVM_data(self):
        """Compute the features and integer labels from the loaded datasets."""
        if self.model_type == 'bow':
            self.x_train_bow = self.convert_doc_feature_vec(self.x_train, self.embedding_matrix)
            self.x_val_bow = self.convert_doc_feature_vec(self.x_val, self.embedding_matrix)
            # the test split must use the same bag-of-words representation as
            # train/val (it used to be built with embed_doc by mistake)
            self.x_test_bow = self.convert_doc_feature_vec(self.x_test, self.embedding_matrix)
        elif self.model_type == 'embedding':
            self.x_train_embedded = self.embed_doc(self.x_train, self.embedding_matrix)
            self.x_val_embedded = self.embed_doc(self.x_val, self.embedding_matrix)
            self.x_test_embedded = self.embed_doc(self.x_test, self.embedding_matrix)

        self.y_train_svm = self.convert_labels(self.y_train)
        self.y_val_svm = self.convert_labels(self.y_val)
        self.y_test_svm = self.convert_labels(self.y_test)

    def _save_SVM_data(self):
        """Write the features and labels for the current ``model_type`` to the cache."""
        if self.model_type == 'bow':
            np.savez(self._CACHE_PATHS['bow'],
                x_train_bow = self.x_train_bow,
                x_val_bow = self.x_val_bow,
                x_test_bow = self.x_test_bow,
                y_train_svm = self.y_train_svm,
                y_val_svm = self.y_val_svm,
                y_test_svm = self.y_test_svm)
        elif self.model_type == 'embedding':
            np.savez(self._CACHE_PATHS['embedding'],
                x_train_embedded = self.x_train_embedded,
                x_val_embedded = self.x_val_embedded,
                x_test_embedded = self.x_test_embedded,
                y_train_svm = self.y_train_svm,
                y_val_svm = self.y_val_svm,
                y_test_svm = self.y_test_svm)

    def build_SVM(self):
        """Create the ``LinearSVC`` estimator with ``max_iter=self.epochs``."""
        self.model = svm.LinearSVC(max_iter=self.epochs, verbose=1)

    def train_SVM(self):
        """Fit the estimator on the training features of the current ``model_type``."""
        if self.model_type == 'bow':
            self.model.fit(self.x_train_bow, self.y_train_svm)
        elif self.model_type == 'embedding':
            self.model.fit(self.x_train_embedded, self.y_train_svm)

    def predict_SVM(self, x_test_data):
        """Return ``self.model.predict(x_test_data)``."""
        preds = self.model.predict(x_test_data)
        return preds

    def evaluate_SVM(self):
        """Print and return the accuracy on the test split.

        Returns:
            The fraction of test documents whose predicted label equals the
            label in ``self.y_test_svm``.
        """
        if self.model_type == 'bow':
            features = self.x_test_bow
        else:
            features = self.x_test_embedded
        preds = self.model.predict(features)
        acc = np.mean(np.asarray(self.y_test_svm) == preds)
        print("accuracy: %g" % (acc*100), end='')
        print("%")
        return acc

    # Bag of words
    # implementation is flawed, consuming too much memory
    def construct_feature_vec(self, text, embedding_matrix):
        """Count word occurrences in one document.

        Leading entries smaller than 1 are skipped; from the first entry that
        is >= 1 onwards every entry is counted, including later zeros.

        Args:
            text: sequence of integer word indices.
            embedding_matrix: only its first dimension is used, as the length
                of the returned vector.

        Returns:
            A list of length ``embedding_matrix.shape[0]`` with the counts.
        """
        text_vec = [0] * embedding_matrix.shape[0]
        in_leading_padding = True
        for word in text:
            if in_leading_padding and word < 1:
                continue
            in_leading_padding = False
            text_vec[word] += 1
        return text_vec

    def convert_doc_feature_vec(self, doc, embedding_matrix):
        """Apply ``construct_feature_vec`` to every document in ``doc``."""
        return [self.construct_feature_vec(text, embedding_matrix) for text in doc]

    # Word embedding
    def embed_text(self, text, embedding_matrix):
        """Return the mean embedding of the non-zero word indices in ``text``.

        Index 0 is skipped. A document with no non-zero index yields a zero
        vector instead of dividing by zero.

        Args:
            text: sequence of integer word indices.
            embedding_matrix: 2-D array indexed by word index.

        Returns:
            A float64 vector with the shape of one row of ``embedding_matrix``.
        """
        word_ids = np.asarray(text)
        word_ids = word_ids[word_ids != 0]
        if word_ids.size == 0:
            return np.zeros(embedding_matrix[0].shape)
        summed = embedding_matrix[word_ids].sum(axis=0, dtype=np.float64)
        return summed / word_ids.size

    def embed_doc(self, doc, embedding_matrix):
        """Apply ``embed_text`` to every document in ``doc``."""
        return [self.embed_text(text, embedding_matrix) for text in doc]

    def convert_labels(self, one_hot_labels):
        """Convert each label row to the position of its first 1.0.

        Raises:
            ValueError: if a row contains no 1.0.
        """
        return [list(label).index(1.0) for label in one_hot_labels]
    

### main code

In [36]:
if __name__ == '__main__':
    # Smoke test: construct each baseline classifier in turn and report
    # success. Construction loads the corresponding data from 'data/'.

    majority_classifier = MajorityClass(
        128, 10,
        raw_data_path='data/raw_processed_data.pkl',
    )
    print('constructed majority class classifier')

    logistic_classifier = LogisticRegression(
        128, 10,
        raw_data_path=None,
        embedded_data_path='data/data_and_embedding100.npz',
        embedding_dim=100,
    )
    print('constructed logitic regression classifier')

    svm_classifier = SVM(
        128,
        epochs=50000,
        raw_data_path=None,
        embedded_data_path='data/data_and_embedding100.npz',
        model_type='embedding',
    )
    print('constructed SVM classifier')
    

loaded raw processed data
constructed majority class classifier
loaded embedded datasets
constructed logitic regression classifier
loaded embedded datasets
constructed SVM classifier
