# Import packages

In [1]:
import os.path as op
import numpy as np
import pandas as pd
from glob import glob
from IPython.display import Image
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# Load the data

The data come from the movie data review from Cornell v2.0 polarity dataset available at  http://www.cs.cornell.edu/people/pabo/movie-review-data/.

In [2]:
filenames_neg = sorted(glob(op.join('/PATH/TO/DATA', 'data', 'imdb1', 'neg', '*.txt')))
filenames_pos = sorted(glob(op.join('/PATH/TO/DATA', 'data', 'imdb1', 'pos', '*.txt')))

stop_words = pd.read_csv('/PATH/TO/DATA/data/english.stop.txt', header=None)

texts_neg = [open(f).read() for f in filenames_neg]
texts_pos = [open(f).read() for f in filenames_pos]
texts = texts_neg + texts_pos
y = np.ones(len(texts), dtype=np.int)
y[:len(texts_neg)] = 0.

print("%d documents" % len(texts))

2000 documents


# Naïve Bayes from scratch for sentiment analysis

For text mining we first built a homemade CountVectorizer (which is also available on sklearn).

In [3]:
def count_meaningful_words(texts, stop_words):
    """Vectorize text : return count of each word in the text snippets

    Parameters
    ----------
    texts : list of str
        The texts

    Returns
    -------
    vocabulary : dict
        A dictionary that points to an index in counts for each word.
        
    counts : ndarray, shape (n_samples, n_features)
        The counts of each word in each text.
        n_samples == number of documents.
        n_features == number of words in vocabulary.
    """
    vocabulary = {}
    words = set()
    for text in texts:
        words = words.union(set(text.split()))
    words = [x for x in words if x not in stop_words[0].values]
    vocabulary = dict(zip(words, range(len(words)))) 
    
    n_features = len(vocabulary)
    n_samples = len(texts)
    counts = np.zeros((n_samples, n_features))
    
    for i, text in enumerate(texts):
        tmp_words_list = text.split()
        tmp_words = set(tmp_words_list)
        for word in tmp_words:
            if word not in stop_words[0].values:
                count = tmp_words_list.count(word)
                counts[i, vocabulary[word]] = count
            
    return vocabulary, counts

The pseudo-code used to build our NB classifier comes from : https://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf p.260

In [4]:
class NB():
    
    def __init__(self):
        pass

    def fit(self, X, y):        
        texts_to_matrix = count_meaningful_words(X, stop_words)
        
        self.vocabulary = texts_to_matrix[0]
        words_freq_matrix = texts_to_matrix[1]
        number_docs = len(X)
        self.classes = np.unique(y)
        self.proba_c = []
        self.cond_prob = np.zeros(shape=(len(self.vocabulary), len(self.classes)))
        
        for c in self.classes:
            id_class = np.where(y == c)
            words_from_c = np.take(words_freq_matrix, id_class, axis=0)[0]   
            self.proba_c.append(len(words_from_c)/number_docs)
            T_c = words_from_c.sum(axis=0)
            for i in range(0, len(T_c)):
                self.cond_prob[i][c] = (T_c[i] + 1) / (len(words_from_c) + np.sum(T_c))
        
        return self

    
    def predict(self, X_new):
        predictions = []
        for text in X_new:
            score = np.zeros(shape=(len(self.classes)))
            for c in self.classes:
                words = text.split()
                score[c] = np.log(self.proba_c[c])
                for word in words:
                    if word in self.vocabulary:
                        score[c] += np.log(self.cond_prob[self.vocabulary.get(word)][c])               
            predictions.append(np.argmax(score))   
            
        return predictions

    def score(self, X, y):
        return np.mean(self.predict(X) == y)


# Comparaison of our homemade NB to sklearn NB 

In [5]:
X = texts
kf = KFold(n_splits=5, shuffle=True)
kf.get_n_splits(X)

i = 0
scores = np.zeros(shape=5)
for train_index, test_index in kf.split(X):
    X_train, X_test = np.take(X, train_index, axis=0), np.take(X, test_index, axis=0)
    y_train, y_test = np.take(y, train_index, axis=0), np.take(y, test_index, axis=0)
    
    naive_bayes_cv = NB()
    naive_bayes_cv.fit(X_train, y_train)
    score = naive_bayes_cv.score(X_test, y_test)
    scores[i] = score
    print('Score on fold', i+1,'=', score)
    i += 1
    
print('Average score of homemade NB classifier =', np.mean(scores))

Score on fold 1 = 0.8175
Score on fold 2 = 0.8075
Score on fold 3 = 0.79
Score on fold 4 = 0.77
Score on fold 5 = 0.8425
Average score of homemade NB classifier = 0.8055


In [6]:
# Define a pipeline combining a text feature extractor with a simple classifier
text_classification = Pipeline([
    ('vect', CountVectorizer()),
    ('naive_bayes', MultinomialNB()),
])


X = texts
kf = KFold(n_splits=5, shuffle=True)
kf.get_n_splits(X)

i = 0
scores = np.zeros(shape=5)
for train_index, test_index in kf.split(X):
    X_train, X_test = np.take(X, train_index, axis=0), np.take(X, test_index, axis=0)
    y_train, y_test = np.take(y, train_index, axis=0), np.take(y, test_index, axis=0)
    
    text_classification.fit(X_train, y_train)
    score = text_classification.score(X_test, y_test)
    scores[i] = score
    print('Score on fold', i+1,'=', score)
    i += 1
    
print('Average score of sklearn NB classifier =', np.mean(scores))

Score on fold 1 = 0.8325
Score on fold 2 = 0.8075
Score on fold 3 = 0.8125
Score on fold 4 = 0.79
Score on fold 5 = 0.8275
Average score of sklearn NB classifier = 0.8140000000000001
