# Challenge: Build your own NLP model

For this challenge, you will need to choose a corpus of data from nltk or another source that includes categories you can predict and create an analysis pipeline that includes the following steps:

Data cleaning / processing / language parsing
Create features using two different NLP methods: For example, BoW vs tf-idf.
Use the features to fit supervised learning models for each feature set to predict the category outcomes.
Assess your models using cross-validation and determine whether one model performed better.
Pick one of the models and try to increase accuracy by at least 5 percentage points.

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import spacy
from nltk.corpus import state_union, stopwords
from collections import Counter

In [4]:
import nltk
nltk.download('state_union')

[nltk_data] Downloading package state_union to
[nltk_data]     C:\Users\Mike\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\state_union.zip.


True

In [13]:
obama = open("obama2016.txt")
trump = open("trump2019.txt")
obama = obama.read()
trump = trump.read()

In [14]:
# Parse using SpaCy
nlp = spacy.load('en')
obama_doc = nlp(obama)
trump_doc = nlp(trump)

In [16]:
# Group into sentences
trump_sents = [[sent, 'Trump'] for sent in trump_doc.sents]
obama_sents = [[sent, 'Obama'] for sent in obama_doc.sents]

# Combine
sentences = pd.DataFrame(trump_sents + obama_sents)
sentences.head()

Unnamed: 0,0,1
0,"(February, 5, ,, 2019, \n, 9:07, P.M., EST, \n\n)",Trump
1,"(THE, PRESIDENT, :, , Madam, Speaker, ,, Mr.,...",Trump
2,"(—, and, my, fellow, Americans, :, \n\n)",Trump
3,"(We, meet, tonight, at, a, moment, of, unlimit...",Trump
4,"(As, we, begin, a, new, Congress, ,, I, stand,...",Trump


In [17]:
# Look at excerpts from each 
print(trump_doc[:100])
print('\nTrump speech length:', len(trump_doc))

print('\n', obama_doc[:100])
print('\nObama speech length:', len(obama_doc))

February 5, 2019
9:07 P.M. EST

THE PRESIDENT:  Madam Speaker, Mr. Vice President, Members of Congress, the First Lady of the United States — (applause) — and my fellow Americans:

We meet tonight at a moment of unlimited potential.  As we begin a new Congress, I stand here ready to work with you to achieve historic breakthroughs for all Americans.

Millions of our fellow citizens are watching us now, gathered in this great chamber, hoping that we will govern not as

Trump speech length: 7498

 9:10 P.M. EST

Mr. Speaker, Mr. Vice President, Members of Congress, my fellow Americans:

Tonight marks the eighth year that I’ve come here to report on the State of the Union. And for this final one, I’m going to try to make it a little shorter. (Applause.) I know some of you are antsy to get back to Iowa. (Laughter.) I've been there. I'll be shaking hands afterwards if you want some tips. (Laughter.)



Obama speech length: 7473


# Bag of Words Features

In [19]:
# Create bag of words function for each text
def bag_of_words(text):
    
    # filter out punctuation and stop words
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return most common words
    return [item[0] for item in Counter(allwords).most_common(500)]

# Get bags 
trump_words = bag_of_words(trump_doc)
obama_words = bag_of_words(obama_doc)

# Combine bags to create common set of unique words
common_words = set(trump_words + obama_words)

In [20]:
# Create bag of words data frame using combined common words and sentences
def bow_features(sentences, common_words):
    
    # Build data frame
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentences in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentences
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
    
    return df

In [21]:
# Create bow features 
bow = bow_features(sentences, common_words)
bow.head()

Unnamed: 0,imperative,city,moment,focus,moral,hand,ask,story,modern,work,...,take,entrepreneur,want,question,this,find,intend,infrastructure,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(February, 5, ,, 2019, \n, 9:07, P.M., EST, \n\n)",Trump
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(THE, PRESIDENT, :, , Madam, Speaker, ,, Mr.,...",Trump
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(—, and, my, fellow, Americans, :, \n\n)",Trump
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(We, meet, tonight, at, a, moment, of, unlimit...",Trump
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,"(As, we, begin, a, new, Congress, ,, I, stand,...",Trump


# TF-IDF Features

In [33]:
# Grab sentence level documents in NLTK
obama = open('obama2016.txt')
trump = open('trump2019.txt')
obama = obama.read()
trump = trump.read()

In [36]:
def get_doc(sent):
    doc_info = []
    i = 0
    for sent in text_sents_clean:
        i += 1
        count = count_words(sents)
        temp = {'doc_id' : i, 'doc_lenth': count}
        doc_info.append(temp)
    return doc_info

In [37]:
# Create list of text 
obama_list = [" ".join(sents) for sents in obama]
trump_list = [" ".join(sents) for sents in trump]
joined = obama_list + trump_list

In [38]:
# Vectorize
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5, 
                             min_df=2, 
                             stop_words='english',   
                             use_idf=True,
                             norm=u'l2', 
                             smooth_idf=True 
                            )

tfidf = vectorizer.fit_transform(joined).tocsr()

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [41]:
# Here is a list of the stopwords identified by NLTK.
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mike\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
import spacy
nlp = spacy.load('en')

# All the processing work is done here, so it may take a while.
obama_doc = nlp(obama)
trump_doc = nlp(trump)

In [43]:
# Let's explore the objects we've built.
print("The obama_doc object is a {} object.".format(type(obama_doc)))
print("It is {} tokens long".format(len(obama_doc)))
print("The first three tokens are '{}'".format(obama_doc[:3]))
print("The type of each token is {}".format(type(obama_doc[0])))

The obama_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 7473 tokens long
The first three tokens are '9:10 P.M. EST'
The type of each token is <class 'spacy.tokens.token.Token'>


In [44]:
# Let's explore the objects we've built.
print("The trump_doc object is a {} object.".format(type(trump_doc)))
print("It is {} tokens long".format(len(trump_doc)))
print("The first three tokens are '{}'".format(trump_doc[:3]))
print("The type of each token is {}".format(type(trump_doc[0])))

The trump_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 7498 tokens long
The first three tokens are 'February 5,'
The type of each token is <class 'spacy.tokens.token.Token'>


In [46]:
from collections import Counter

# Utility function to calculate how frequently words appear in the text.
def word_frequencies(text, include_stop=True):
    
    # Build a list of words.
    # Strip out punctuation and, optionally, stop words.
    words = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            words.append(token.text)
            
    # Build and return a Counter object containing word counts.
    return Counter(words)
    
# The most frequent words:
obama_freq = word_frequencies(obama_doc).most_common(100)
trump_freq = word_frequencies(trump_doc).most_common(100)
print('Obama:', obama_freq)
print('Trump:', trump_freq)

Obama: [('the', 274), ('to', 206), ('of', 149), ('and', 142), ('that', 130), ('we', 121), ('a', 117), ('in', 99), ('\n\n', 86), ('our', 85), ('Applause', 84), ('is', 73), ('I', 69), ('’s', 68), ('for', 54), ('it', 53), ('And', 51), ('or', 51), ('who', 41), ('not', 41), ('on', 40), ('It', 37), ('have', 37), ('this', 36), ('do', 35), ('n’t', 34), ('us', 34), ('America', 33), ('as', 33), ('are', 31), ('We', 31), ('you', 30), ('they', 28), ('But', 26), ('people', 26), ('That', 26), ('their', 26), ('be', 25), ('will', 25), ('just', 24), ('world', 23), ('work', 22), ('more', 22), ('with', 22), ("'s", 22), ('American', 22), ('’ve', 21), ('can', 20), ('all', 20), ('make', 19), ('if', 19), ('want', 19), ('when', 19), ('by', 19), ('should', 19), ('up', 18), ('years', 18), ('change', 18), ('new', 18), ('year', 17), ('there', 17), ('because', 17), ('even', 17), ('out', 17), ('now', 16), ('economy', 16), ('Americans', 15), ('from', 15), ('better', 15), ('need', 15), ('every', 15), ('over', 15), ('h

In [48]:
# Use our optional keyword argument to remove stop words.
obama_freq = word_frequencies(obama_doc, include_stop=False).most_common(100)
trump_freq = word_frequencies(trump_doc, include_stop=False).most_common(100)
print('Obama:', obama_freq)
print('Trump:', trump_freq)

Obama: [('\n\n', 86), ('Applause', 84), ('I', 69), ('’s', 68), ('And', 51), ('It', 37), ('n’t', 34), ('America', 33), ('We', 31), ('But', 26), ('people', 26), ('That', 26), ('world', 23), ('work', 22), ("'s", 22), ('American', 22), ('’ve', 21), ('want', 19), ('years', 18), ('change', 18), ('new', 18), ('year', 17), ('economy', 16), ('Americans', 15), ('better', 15), ('need', 15), ('like', 13), ('way', 13), ('’re', 12), ('right', 12), ('future', 12), ('country', 12), ('job', 12), ('The', 11), ('going', 10), ("n't", 10), ('know', 9), ('believe', 9), ('system', 9), ('time', 9), ('workers', 9), ('past', 9), ('Our', 9), ('security', 9), ('politics', 9), ('lot', 9), ('ISIL', 9), ('families', 8), ('jobs', 8), ('vote', 8), ('’m', 7), ('got', 7), ('big', 7), ('care', 7), ('energy', 7), ('best', 7), ('good', 7), ('’ll', 7), ('power', 7), ('leadership', 7), ('So', 6), ('thing', 6), ('opportunity', 6), ('cut', 6), ('Now', 6), ('long', 6), ('everybody', 6), ('agree', 6), ('sure', 6), ('help', 6), (

In [49]:
# Pull out just the text from our frequency lists.
obama_common = [pair[0] for pair in obama_freq]
trump_common = [pair[0] for pair in trump_freq]

# Use sets to find the unique values in each top ten.
print('Unique to Obama:', set(obama_common) - set(trump_common))
print('Unique to Trump:', set(trump_common) - set(obama_common))

Unique to Obama: {'everybody', 'lot', 'job', 'ISIL', 'security', 'way', 'thing', 'long', 'politics', 'future', 'love', 'workers', '’re', 'big', 'hard', 'believe', 'For', 'opportunity', 'matter', 'energy', 'especially', 'spirit', 'good', 'change', 'sure', '’m', 'That', 'families', 'leadership', 'Let', 'fellow', 'place', "'s", 'start', '’ve', 'vote', 'help', 'military', 'care', 'best', '’ll', 'business', 'So', 'past', 'system', "n't", 'got', 'cut', 'better', 'needs', 'planet', 'power', 'seven', 'basic', 'want', 'political', 'right', 'Laughter', 'kids', 'need', 'voices', 'agree'}
Unique to Trump: {'drugs', 'trade', 'Elvin', 'southern', ' ', 'working', 'United', 'agenda', 'USA', 'possible', 'As', 'administration', 'Grace', 'class', 'life', 'Herman', 'million', 'freedom', 'Judah', 'young', 'Thank', 'illegal', 'ago', 'PRESIDENT', 'border', 'choose', 'They', 'law', 'When', 'secure', 'home', 'build', 'AUDIENCE', 'brave', 'decades', 'heroes', 'Alice', 'fight', 'To', 'immigration', 'historic', '

# Lemmas

So far we've just looked at whether certain words are present and how frequently they appear. We can process these words further to remove a little more noise from our data. Consider the words "think", "thought", and "thinking". They're related. They all share the same root word: the verb "think". Sometimes we want to focus on the fact that the act of thinking comes up a lot in data, and not have that information split across all the different forms of "think".

To focus in like this, we can reduce each word to its root, or lemma, and do our counts again. This time we're building a count of concepts rather than just words:

In [50]:
# Utility function to calculate how frequently lemas appear in the text.
def lemma_frequencies(text, include_stop=True):
    
    # Build a list of lemas.
    # Strip out punctuation and, optionally, stop words.
    lemmas = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            lemmas.append(token.lemma_)
            
    # Build and return a Counter object containing word counts.
    return Counter(lemmas)

# Instantiate our list of most common lemmas.
obama_lemma_freq = lemma_frequencies(obama_doc, include_stop=False).most_common(10)
trump_lemma_freq = lemma_frequencies(trump_doc, include_stop=False).most_common(10)
print('\nObama:', obama_lemma_freq)
print('Trump:', trump_lemma_freq)

# Again, identify the lemmas common to one text but not the other.
obama_lemma_common = [pair[0] for pair in obama_lemma_freq]
trump_lemma_common = [pair[0] for pair in trump_lemma_freq]
print('Unique to Obama:', set(obama_lemma_common) - set(trump_lemma_common))
print('Unique to Trump:', set(trump_lemma_common) - set(obama_lemma_common))


Obama: [('-PRON-', 155), ('applause', 89), ('\n\n', 86), ('’', 54), ('and', 51), ('be', 45), ('not', 44), ('year', 35), ('america', 33), ('work', 30)]
Trump: [(' ', 347), ('\n\n', 155), ('-PRON-', 127), ('applause', 104), ('year', 38), ('american', 34), ('thank', 31), ('’s', 28), ('america', 26), ('new', 22)]
Unique to Obama: {'not', 'be', 'work', '’', 'and'}
Unique to Trump: {'new', ' ', 'american', 'thank', '’s'}
