### Naive Bayes in movie review data
 
 * Pang, B., Lee, L., & Vaithyanathan, S. (2002, July). [Thumbs up?: sentiment classification using machine learning techniques.](http://www.cs.cornell.edu/home/llee/papers/sentiment.pdf) In Proceedings of the ACL-02 conference on Empirical methods in natural language processing-Volume 10 (pp. 79-86). Association for Computational Linguistics.
 
 * You can download the data from [this website](http://www.cs.cornell.edu/people/pabo/movie-review-data/) (There are different versions, let's download the 1.1 version of the polarity dataset (`polarity dataset v1.1 (2.2Mb) (includes README.1.1):...`))

In [1]:
import os
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# download the dataset (positive and negative tokens) first
cwd = os.getcwd()
pos_path = cwd + '/tokens/pos/'
neg_path = cwd + '/tokens/neg/'

In [3]:
def filetowordlist(path, sfx):
    words = []
    for item in os.listdir(path):
        if sfx in item:
            f=open(path + item, encoding="iso8859-1")
            lines = [line.strip() for line in f]
            f.close()
            wordsinfile = ""
            for l in lines:
                wordsinfile = wordsinfile + l + " "
            
            words.append(wordsinfile)
    return words

possents_all = np.array(filetowordlist(pos_path, ".txt"))
negsents_all = np.array(filetowordlist(neg_path, ".txt"))

In [4]:
def text_preprocessing(sents: list):
    """
    This function computes preprocessing. The prefix NOT is prepended to every word 
    after a token of logical negation (i.e., n’t, not, no, never) until the next punctuation mark.
    
    Newly formed 'words' like NOT like, NOT recommend will thus occur more often
    in negative document and act as cues for negative sentiment, while words like
    NOT bored, NOT dismiss will acquire positive associations.
    
    Non-alphanumerical characters are removed and any digit will be converted to #.
    """
    vocab = dict()
    new_sents = []
    
    for sent in sents:
        
        new_sent = []
        sent = sent.split()
        idx = 0
        
        for i, word in enumerate(sent):
            
            # list comprehension is computationally more efficient (i.e., faster) than regex
            word = re.sub("[^a-zA-Z0-9]+", "", word)
            # word = ''.join([char for char in word if char.isalnum()]) 
            word = re.sub("[0-9]+", "#", word)
            
            if i > 0:
                
                if (re.search(r"(nt|n't|\b[nN]ot\b|\b[nN]o\b|\b[nN]ever\b)", sent[idx-1]) 
                    and not re.search("^[\.,:;?!]", sent[i]) 
                    and not re.search("[\.,:;?!]$", sent[i-1])):
                    
                    word = "NOT_" + word
                    
                else:
                    idx += int(abs(i-idx) + 1)
            else:
                idx += 1
            
            if len(word) > 0:
                
                new_sent.append(word)

                if word in vocab:
                    vocab[word] += 1
                else:
                    vocab[word] = 1
            
        new_sents.append(' '.join(new_sent))
        
    return vocab, new_sents

In [5]:
def extract_vocabulary(pos_sents, neg_sents, threshold = 3):
    
    vocab_pos, pos_sents = text_preprocessing(pos_sents)
    vocab_neg, neg_sents = text_preprocessing(neg_sents)
    
    # concatenate the vocabs
    for word, freq in vocab_neg.items():
        if word in vocab_pos:
            vocab_pos[word] += freq
        else:
            vocab_pos[word] = freq
    
    # only keep words that occur more often than a specific threshold
    vocab = [word for word, freq in vocab_pos.items() if freq >= threshold]
    
    return vocab, pos_sents, neg_sents

In [6]:
vocab, pos_sents, neg_sents = extract_vocabulary(possents_all, negsents_all)

#### Each document will be represented as a unigram bag-of-words representation

In [7]:
def vectorize(pos_sents: list, neg_sents: list, vocab: list, n_best_factor = 1):
    """
    Word unigram document representation using tf-idf weighting.
    This function takes a precomputed vocabulary, that is passed to the tf-idf vectorizer.
    """
    
    X = np.concatenate((pos_sents, neg_sents))
    y = np.concatenate((np.ones(len(pos_sents), dtype = int), np.zeros(len(neg_sents), dtype = int)))
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
    
    vectorizer = TfidfVectorizer(encoding = 'utf-8', lowercase = False, ngram_range = (1, 1), 
                                 analyzer = 'word', norm = 'l2', use_idf = True, smooth_idf = True,
                                 sublinear_tf = True, vocabulary = vocab)
    
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
        
    #n_best = int(len(vectorizer.idf_) * n_best_factor)
    #idx_best = np.argsort(vectorizer.idf_)[:n_best]
        
    #X_train = X_train[:, idx_best]
    #X_test = X_test[:, idx_best]
    
    return X_train, y_train, X_test, y_test

In [8]:
X_train, y_train, X_test, y_test = vectorize(pos_sents, neg_sents, vocab)

#### Training and testing using sklearn's Multinomial Naive Bayes classifier

In [9]:
MNB = MultinomialNB(fit_prior = True)
MNB.fit(X_train, y_train)
y_pred = MNB.predict(X_test)

In [10]:
print(accuracy_score(y_test, y_pred))

0.8125
