### Naive Bayes + top 9000 bag-of-words
Author: Jeanne Elizabeth Daniel

Serves as baseline for MomConnect answer selection task.

Note: we train and test in batches as the method is memory intensive.

In [1]:
import pandas as pd

In [9]:
import gensim
from gensim.utils import simple_preprocess
import preprocess_data

In [19]:
from sklearn import naive_bayes
import numpy as np

In [10]:
def preprocess(text):
    
     """ Method for preprocessing text
    
    Args:
        text: string of text
        min_token_length: integer value indicating min number of characters in a token
        join: boolean indicating if function should join the list of tokens into the string or not
    
    Returns:
        list of cleaned words
    """
    
    if type(text) != str:
        return []
    
    result = []
    for token in gensim.utils.simple_preprocess(text, min_len =1, max_len = 40):
        if len(token) > 2:
            result.append(token)
    
    return result

In [12]:
def label_preprocess(entry):
    
    """ Returns integer ID corresponding to response for easy comparison and classification
    
    Args:
        entry: query item 
        responses: dict containing all the template responses with their corresponding IDs
        
    Return: 
        integer corresponding to each response     
        
    """
        
    if responses.get(entry) != None:
        return responses[entry]
    else:
        return len(responses) #default unknown class
    

In [18]:
def create_sparse_feature_vector(text, features):
    
    """ Constructs a sparse feature vector containing list of word IDs
    
    Args:
        text: text sentence
        features: dict with words as keys and corresponding ids as values
        
    Returns:
        list of word IDs for sequence of words
    
    """
    
    sparse = []
    
    for n in preprocess(text):
        f = features.get(n)
        if f != None:
            sparse.append(f)
    return sparse

In [20]:
def create_batch(df, features): 
    
    """ Create batch of feature vectors in matrix form
    
    Args:
        df: dataset of questions
        features: vocabulary of words in dict form
        
    Returns:
        matrix where rows are bag-of-words representation of questions
        and columns represent the features (words)
    
    """
    
    matrix = np.zeros((df.shape[0], len(features)))
    all_text = list(df['helpdesk_question']) 
    
    for i in range(len(all_text)):
        sparse = create_sparse_feature_vector(all_text[i], features)
        for s in sparse:
            matrix[i][s] = 1
            
    return matrix


In [21]:
def batch_train(df, features, increment= 10000):
    
    """ Train MNB in batches
        
    Args:
        df: dataframe of questions
        features: vocabulary of words in dict form
        increment: size of batch
        
    Returns:
        Trained MNB classifier
        
    """
    
    print("Number of features:", len(features))
    clf = naive_bayes.MultinomialNB()

    for i in range(0, df.shape[0], increment):
        df_subset = df[i:i+increment]
        train_batch = create_batch(df_subset, features)
        clf.partial_fit(train_batch, 
                        df_subset['helpdesk_reply'].apply(label_preprocess), 
                        classes = list(range(len(responses) + 1)))
        print("Train accuracy on batch", i, 
              clf.score(train_batch, 
                        df_subset['helpdesk_reply'].apply(label_preprocess)))
        
    return clf
        

In [22]:
def classifier_validate(df, features, clf, increment= 10000):
    
    """ Test the accuracy of the MNB classifier 
    
    Args:
        df: dataframe of questions and responses
        features: vocabulary of words in dict form
        clf: trained classifier
        increment: batch size
        
    Returns:
        None
        
    """
    
    sum_of_scores = 0
    for i in range(0, df.shape[0], increment):
        df_subset = df[i:i+increment]
        valid_batch = create_batch(df_subset, features)
        score = clf.score(valid_batch, 
                          df_subset['helpdesk_reply'].apply(label_preprocess))
        print("Validation Accuracy on batch:", score)
        sum_of_scores += int(score*df_subset.shape[0])
        del valid_batch
        
    print("Overall Accuracy:", sum_of_scores/df.shape[0])
    


In [23]:
def classifier_validate_top_5(df, features, clf, increment= 10000):
    
    """ Test the top-5 accuracy of the MNB classifier 
    
    Args:
        df: dataframe of questions and responses
        features: vocabulary of words in dict form
        clf: trained classifier
        increment: batch size
        
    Returns:
        None
        
    """
    
    sum_of_scores = 0
    for i in range(0, df.shape[0], increment):
        df_subset = df[i:i+increment]
        valid_batch = create_batch(df_subset, features)
        valid_responses = list(df_subset['helpdesk_reply'].apply(label_preprocess))
        score = 0
        for i in range(len(valid_batch)):
            if valid_responses[i] in np.argsort(clf.predict_proba(valid_batch[i].reshape(1, -1)))[0][-5:]:
                score += 1
        
        #score = clf.score(valid_batch, df_subset['helpdesk_reply'].apply(label_preprocess))
        print("Validation Accuracy on batch:", score/len(valid_batch))
        sum_of_scores += score
        del valid_batch, valid_responses
        
    print("Overall Accuracy:", sum_of_scores/df.shape[0])

In [2]:
data = pd.read_csv('dataset_7B', delimiter = ';', engine = 'python')

In [3]:
data_text = data.loc[data['set'] == 'Train'][['helpdesk_question']]

In [4]:
number_of_classes = data.loc[data['set'] == 
                             'Train']['helpdesk_reply'].value_counts().shape[0]

In [5]:
responses = pd.DataFrame(data.loc[data['set'] == 
                            'Train']['helpdesk_reply'].value_counts()).reset_index()

In [42]:
all_responses = dict(data['helpdesk_reply'].value_counts())

In [7]:
responses['reply'] = responses['index']
responses['index'] = responses.index
responses = dict(responses.set_index('reply')['index'])

In [8]:
data_text['index'] = data_text.index
documents = data_text

In [11]:
samples = data['helpdesk_question'].sample(10)

In [14]:
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=5, no_above=0.3, keep_n=950000)
print(len(dict(dictionary)))

9286


In [15]:
dictionary_of_words = pd.DataFrame(pd.Series(dict(dictionary)))

In [16]:
dictionary_of_words['index'] = dictionary_of_words.index 

In [17]:
dictionary_of_words = dictionary_of_words.set_index(0)['index'].to_dict()

### Fitting the Naive Bayes classifier to the training set

In [24]:
clf = batch_train(data.loc[data['set'] == 'Train'], dictionary_of_words, increment = data.loc[data['set'] == 'Train'].shape[0])

Number of features: 9286
Train accuracy on batch 0 0.585673982491806


We achieve a training accuracy of 58.57%


### Here we validate the model on the Full and Low-resource Test Set

In [26]:
classifier_validate(data.loc[(data['set'] == 'Valid') ], dictionary_of_words, clf)

Validation Accuracy on batch: 0.5205
Validation Accuracy on batch: 0.521
Validation Accuracy on batch: 0.5188
Validation Accuracy on batch: 0.5207161125319693
Overall Accuracy: 0.5201063996244719


In [27]:
classifier_validate(data.loc[(data['set'] == 'Valid') & (data['low_resource'] == 'True')], dictionary_of_words, clf)

Validation Accuracy on batch: 0.4380952380952381
Overall Accuracy: 0.4380952380952381


### Model Testing

We test the top-5 accuracy on the full test set and low-resource test set

In [28]:
classifier_validate_top_5(data.loc[(data['set'] == 'Test') ], dictionary_of_words, clf)

Validation Accuracy on batch: 0.8234
Validation Accuracy on batch: 0.8241
Validation Accuracy on batch: 0.819
Validation Accuracy on batch: 0.8298253470667264
Overall Accuracy: 0.8226972357521795


In [29]:
classifier_validate_top_5(data.loc[(data['set'] == 'Test') & (data['low_resource'] == 'True')], dictionary_of_words, clf)

Validation Accuracy on batch: 0.7451923076923077
Overall Accuracy: 0.7451923076923077


We test the top-1 accuracy on the full test set and low-resource test set

In [30]:
classifier_validate(data.loc[(data['set'] == 'Test') ], dictionary_of_words, clf)

Validation Accuracy on batch: 0.5235
Validation Accuracy on batch: 0.5119
Validation Accuracy on batch: 0.5269
Validation Accuracy on batch: 0.5315718763994626
Overall Accuracy: 0.5215152173238606


In [31]:
classifier_validate(data.loc[(data['set'] == 'Test') & (data['low_resource'] == 'True')], dictionary_of_words, clf)

Validation Accuracy on batch: 0.43960336538461536
Overall Accuracy: 0.43960336538461536
