# Extract word counts

In [None]:
import spacy
from sklearn.base import TransformerMixin

# Create a spaCy parser.
# The English language data must be installed before this line will work;
# in a Conda environment it can be downloaded with:
#     python3 -m spacy download en
# NOTE(review): the 'en' shortcut name presumably matches the spaCy version
# this notebook was written for -- confirm it still resolves in your install.
nlp = spacy.load('en')


class BagOfWords(TransformerMixin):
    """Transformer that turns raw documents into binary word-presence dicts."""

    def fit(self, X, y=None):
        """Nothing is learned from the data, so the transformer is returned as-is."""
        return self

    def transform(self, X):
        """
        Tokenize every document and flag each word that appears in it.

        @return list of dictionaries, one per document and in the same order
        as X. In each dictionary the key is a word and the value is True,
        meaning the word was found in that document. Any word missing from a
        dictionary is assumed not to have occurred in the document.
        """
        rows = []
        for document in X:
            tokens = nlp(document, tag=False, parse=False, entity=False)
            # Binary features only: a word is either present (True) or absent
            # (no key at all). Whitespace-only tokens are dropped.
            rows.append({token.text: True for token in tokens if token.text.strip()})
        return rows

# Converts dictionaries into a matrix
`DictVectorizer` takes a list of dictionaries and converts them into a matrix. The features in this matrix are the keys in each of the dictionaries, and the values correspond to the occurrence of those features in each sample. 

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import BernoulliNB # BernoulliNB is designed for binary features
import json

input_filename = "data/datasets/twitter/python_tweets.json"
labels_filename = "data/datasets/twitter/python_classes.json"

# The tweets file holds one JSON object per line; blank lines are skipped
# and only the 'text' field of each tweet is kept.
tweets = []
with open(input_filename) as tweets_file:
    tweets.extend(
        json.loads(raw_line)['text']
        for raw_line in tweets_file
        if raw_line.strip()
    )

# The labels file is a single JSON document.
with open(labels_filename) as labels_file:
    labels = json.load(labels_file)

# Keep only as many tweets as there are labels, so that unclassified
# tweets at the end of the file are ignored.
tweets = tweets[:len(labels)]
assert len(labels) == len(tweets)

#### Putting it all together

In [None]:
from sklearn.pipeline import Pipeline

# Chain the three stages: tokenize each tweet into a word dictionary, turn
# the dictionaries into a feature matrix, then classify with Naive Bayes.
pipeline = Pipeline([
    ('bag-of-words', BagOfWords()),
    ('vectorizer', DictVectorizer()),
    ('naive-bayes', BernoulliNB()),
])

In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Cross-validate the whole pipeline using the F1 scorer, then report the
# mean of the per-fold scores.
scores = cross_val_score(pipeline, tweets, labels, scoring='f1')
mean_score = np.mean(scores)
print("Score: {:.3f}".format(mean_score))

# What are the most important features to the model

In [None]:
# Fit the full pipeline on every labelled tweet
model = pipeline.fit(tweets, labels)

# Pull the fitted Naive Bayes step out of the pipeline
nb = model.named_steps['naive-bayes']

# Each unique word is a feature.
# For every class the classifier stores log(P(f|C)): the log of the
# probability of feature f given class C.
# Logs are stored because the raw probabilities are very low; working in log
# space avoids underflow errors where tiny probabilities get rounded to zero.
# Since Naive Bayes multiplies the probabilities together, a single zero
# would force the whole result to zero.
feature_probabilities = nb.feature_log_prob_

# Indices of the 50 features with the highest log probability in row 1
# (negating the values makes argsort return them in descending order).
# NOTE(review): row 1 is presumably the positive class -- confirm via nb.classes_.
top_features = np.argsort(-feature_probabilities[1])[:50]

# Translate the feature indices back into the words they stand for
dv = model.named_steps['vectorizer']
for rank, feature_index in enumerate(top_features):
    word = dv.feature_names_[feature_index]
    probability = np.exp(feature_probabilities[1][feature_index])
    print(rank, word, probability)

#### Sample results of running the cell

0 : 0.53125

1 # 0.51875

2 Python 0.4875

3 python 0.40625

4 RT 0.26875

5 in 0.21875

6 - 0.2

7 to 0.19375

8 , 0.1875

9 for 0.175

10 and 0.1375

There are a few features, like `:` or `#`, that are likely to be noise. Collecting more data is critical to smoothing out these issues.

Looking through these features gives us quite a few benefits. We could train people to recognize these tweets, look for commonalities (which give insight into a topic), or even get rid of features that make no sense. For example, RT ranks quite high because it is the common Twitter abbreviation for retweet &rightarrow; an expert could decide to remove this word from the list, making the classifier less prone to noise.