# Build an NLP Model

In [1]:
# Import Modules:
import time
import math
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score, auc, roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

import spacy
# Source texts.
from nltk.corpus import stopwords, gutenberg, inaugural
from collections import Counter

# Aesthetics.
%matplotlib inline

  from numpy.core.umath_tests import inner1d


## Data cleaning / Processing / Language Processing

In [2]:
# Capture the speeches.
gutenberg_addresses = inaugural.fileids()

# We can view the available speeches.
for speech in gutenberg_addresses:
    print(speech)

1789-Washington.txt
1793-Washington.txt
1797-Adams.txt
1801-Jefferson.txt
1805-Jefferson.txt
1809-Madison.txt
1813-Madison.txt
1817-Monroe.txt
1821-Monroe.txt
1825-Adams.txt
1829-Jackson.txt
1833-Jackson.txt
1837-VanBuren.txt
1841-Harrison.txt
1845-Polk.txt
1849-Taylor.txt
1853-Pierce.txt
1857-Buchanan.txt
1861-Lincoln.txt
1865-Lincoln.txt
1869-Grant.txt
1873-Grant.txt
1877-Hayes.txt
1881-Garfield.txt
1885-Cleveland.txt
1889-Harrison.txt
1893-Cleveland.txt
1897-McKinley.txt
1901-McKinley.txt
1905-Roosevelt.txt
1909-Taft.txt
1913-Wilson.txt
1917-Wilson.txt
1921-Harding.txt
1925-Coolidge.txt
1929-Hoover.txt
1933-Roosevelt.txt
1937-Roosevelt.txt
1941-Roosevelt.txt
1945-Roosevelt.txt
1949-Truman.txt
1953-Eisenhower.txt
1957-Eisenhower.txt
1961-Kennedy.txt
1965-Johnson.txt
1969-Nixon.txt
1973-Nixon.txt
1977-Carter.txt
1981-Reagan.txt
1985-Reagan.txt
1989-Bush.txt
1993-Clinton.txt
1997-Clinton.txt
2001-Bush.txt
2005-Bush.txt
2009-Obama.txt


In [3]:
# Loading the speeches.
# Data source: http://www.nltk.org/nltk_data/
kennedy = inaugural.raw('1961-Kennedy.txt')
johnson = inaugural.raw('1965-Johnson.txt')



In [4]:
# Parse using SpaCy.
#nlp = spacy.load('en')
nlp = spacy.load('en_core_web_sm')
kennedy_doc = nlp(kennedy)
johnson_doc = nlp(johnson)

In [5]:
# Group into sentences.
kennedy_sents = [[sent, 'Kennedy'] for sent in kennedy_doc.sents]
johnson_sents = [[sent, 'Johnson'] for sent in johnson_doc.sents]

In [6]:
# Combine the sentences from the speeches into one dataframe.
sentences = pd.DataFrame(kennedy_sents + johnson_sents)

# View.
sentences.head()

Unnamed: 0,0,1
0,"(Vice, President, Johnson, ,, Mr., Speaker, ,,...",Kennedy
1,"(For, I, have, sworn, I, before, you, and, Alm...",Kennedy
2,"(The, world, is, very, different, now, .)",Kennedy
3,"(For, man, holds, in, his, mortal, hands, the,...",Kennedy
4,"(And, yet, the, same, revolutionary, beliefs, ...",Kennedy


In [7]:
sentences.tail()

Unnamed: 0,0,1
146,"(We, welcome, it, --, and, we, will, bend, it,...",Johnson
147,"(To, these, trusted, public, servants, and, to...",Johnson
148,"(But, you, must, look, within, your, own, hear...",Johnson
149,"(They, will, lead, you, best, of, all, ., \n\n)",Johnson
150,"(For, myself, ,, I, ask, only, ,, in, the, wor...",Johnson


In [8]:
kennedy_doc[:100]

Vice President Johnson, Mr. Speaker, Mr. Chief Justice, President Eisenhower, Vice President Nixon, President Truman, reverend clergy, fellow citizens, we observe today not a victory of party, but a celebration of freedom -- symbolizing an end, as well as a beginning -- signifying renewal, as well as change. For I have sworn I before you and Almighty God the same solemn oath our forebears l prescribed nearly a century and three quarters ago.

The world is very different now. For man holds in his mortal

In [9]:
johnson_doc[:100]

My fellow countrymen, on this occasion, the oath I have taken before you and before God is not mine alone, but ours together. We are one nation and one people. Our fate as a nation and our future as a people rest not upon one citizen, but upon all citizens.

This is the majesty and the meaning of this moment.

For every generation, there is a destiny. For some, history decides. For this generation, the choice must be our own.

Even now,

## Create features using BoW

In [10]:
# Utility function to create a list of the 500 most common words.
def bag_of_words(text):
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    return [item[0] for item in Counter(allwords).most_common(500)]

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    for i, sentence in enumerate(df['text_sentence']):
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        for word in words:
            df.loc[i, word] += 1
        if i % 50 == 0:
            print("Processing row {}".format(i))
    return df

In [11]:
# Set up the bags.
kennedy_words = bag_of_words(kennedy_doc)
johnson_words = bag_of_words(johnson_doc)

In [12]:
# Combine bags to create common set of unique words.
common_words = set(kennedy_words + johnson_words)

In [13]:
# Create BoW features.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 50
Processing row 100
Processing row 150


Unnamed: 0,dot,final,achieve,colonial,grant,scholar,help,defeat,hatred,assure,...,common,freedom,steady,lifetime,leader,overburden,conquer,fight,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,"(Vice, President, Johnson, ,, Mr., Speaker, ,,...",Kennedy
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(For, I, have, sworn, I, before, you, and, Alm...",Kennedy
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(The, world, is, very, different, now, .)",Kennedy
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(For, man, holds, in, his, mortal, hands, the,...",Kennedy
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,"(And, yet, the, same, revolutionary, beliefs, ...",Kennedy


## Create features using TF-IDF

In [14]:
# Loading the speeches, this time as sentences.
# Data source: http://www.nltk.org/nltk_data/
kennedy = inaugural.sents('1961-Kennedy.txt')
johnson = inaugural.sents('1965-Johnson.txt')

In [15]:
# Create list of text.
kennedy_list = [" ".join(sent) for sent in kennedy]
johnson_list = [" ".join(sent) for sent in johnson]

In [16]:
# Combine.
combined = kennedy_list + johnson_list

In [17]:
# Vectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df = 0.5,
                             min_df = 2,
                             stop_words = 'english',
                             use_idf = True,
                             norm = u'l2',
                             smooth_idf = True)

tfidf = vectorizer.fit_transform(combined).tocsr()

In [18]:
mle_tfidf = vectorizer.fit_transform(combined)

## Assign Features & Target

In [19]:
# BoW.
X_bow = word_counts.drop(['text_sentence', 'text_source'], 1)
Y_bow = word_counts['text_source']

# tfidf.
X_tfidf = tfidf
Y_tfidf = ['Kennedy']*len(kennedy_list) + ['Johnson']*len(johnson_list)

## Supervised Learning - BoW

In [20]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn_bow = knn.fit(X_bow, Y_bow)
cvScores = cross_val_score(knn_bow, X_bow, Y_bow, cv = 5)
print('CV:', cvScores)
print('Mean CV:', np.mean(cvScores))
print('Std CV:', np.std(cvScores))

CV: [0.74193548 0.70967742 0.64516129 0.68965517 0.65517241]
Mean CV: 0.6883203559510568
Std CV: 0.03547951197330906


In [21]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr_bow = lr.fit(X_bow, Y_bow)
cvScores = cross_val_score(lr_bow, X_bow, Y_bow, cv = 5)
print('CV:', cvScores)
print('Mean CV:', np.mean(cvScores))
print('Std CV:', np.std(cvScores))

CV: [0.70967742 0.77419355 0.77419355 0.65517241 0.65517241]
Mean CV: 0.7136818687430477
Std CV: 0.05326551416943215


In [22]:
from sklearn import ensemble
rfc = ensemble.RandomForestClassifier()
rfc_bow = rfc.fit(X_bow, Y_bow)
cvScores = cross_val_score(rfc_bow, X_bow, Y_bow, cv = 5)
print('CV:', cvScores)
print('Mean CV:', np.mean(cvScores))
print('Std CV:', np.std(cvScores))

CV: [0.74193548 0.70967742 0.70967742 0.72413793 0.65517241]
Mean CV: 0.7081201334816463
Std CV: 0.02901158065047279


In [23]:
# Gradient Boosting.
from sklearn import ensemble
clf = ensemble.GradientBoostingClassifier()
clf_bow = clf.fit(X_bow, Y_bow)
cvScores = cross_val_score(clf_bow, X_bow, Y_bow, cv = 5)
print('CV:', cvScores)
print('Mean CV:', np.mean(cvScores))
print('Std CV:', np.std(cvScores))

CV: [0.77419355 0.77419355 0.83870968 0.62068966 0.68965517]
Mean CV: 0.739488320355951
Std CV: 0.07596084371056748


# Supervised Learning - TFIDF

In [24]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn_bow = knn.fit(X_bow, Y_bow)
cvScores = cross_val_score(knn_bow, X_bow, Y_bow, cv = 5)
print('CV:', cvScores)
print('Mean CV:', np.mean(cvScores))
print('Std CV:', np.std(cvScores))

CV: [0.74193548 0.70967742 0.64516129 0.68965517 0.65517241]
Mean CV: 0.6883203559510568
Std CV: 0.03547951197330906


In [25]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr_tfidf = lr.fit(X_tfidf, Y_tfidf)
cvScores = cross_val_score(lr_tfidf, X_tfidf, Y_tfidf, cv = 5)
print('CV:', cvScores)
print('Mean CV:', np.mean(cvScores))
print('Std CV:', np.std(cvScores))

CV: [0.63333333 0.66666667 0.65517241 0.68965517 0.64285714]
Mean CV: 0.6575369458128079
Std CV: 0.019605278709610614


In [26]:
from sklearn import ensemble
rfc = ensemble.RandomForestClassifier()
rfc_tfidf = rfc.fit(X_tfidf, Y_tfidf)
cvScores = cross_val_score(rfc_tfidf, X_tfidf, Y_tfidf, cv = 5)
print('CV:', cvScores)
print('Mean CV:', np.mean(cvScores))
print('Std CV:', np.std(cvScores))

CV: [0.66666667 0.7        0.82758621 0.65517241 0.67857143]
Mean CV: 0.7055993431855502
Std CV: 0.06277146523607541


In [27]:
# Gradient Boosting.
from sklearn import ensemble
clf = ensemble.GradientBoostingClassifier()
clf_tfidf = clf.fit(X_tfidf, Y_tfidf)
cvScores = cross_val_score(clf_tfidf, X_tfidf, Y_tfidf, cv = 5)
print('CV:', cvScores)
print('Mean CV:', np.mean(cvScores))
print('Std CV:', np.std(cvScores))

CV: [0.73333333 0.76666667 0.86206897 0.68965517 0.64285714]
Mean CV: 0.7389162561576355
Std CV: 0.07432610813568558


Gradient Boosting with TF-IDF features are best performer, but had the most variability. GB with BoW had similar results. Random Forest BoW was second best & had more consistent scores.

## Try to increase accuracy of Gradient Boosting model

In [28]:
# Increase common words in BoW.

# Utility function to create a list of the 750 most common words.
def bag_of_words(text):
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    return [item[0] for item in Counter(allwords).most_common(750)]

# Set up the bags.
kennedy_words = bag_of_words(kennedy_doc)
johnson_words = bag_of_words(johnson_doc)

# Combine bags to create common set of unique words.
common_words = set(kennedy_words + johnson_words)

# Creating the new features.
word_counts_750 = bow_features(sentences, common_words)
word_counts_750.head()

Processing row 0
Processing row 50
Processing row 100
Processing row 150


Unnamed: 0,dot,final,achieve,colonial,grant,scholar,help,defeat,hatred,assure,...,common,freedom,steady,lifetime,leader,overburden,conquer,fight,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,"(Vice, President, Johnson, ,, Mr., Speaker, ,,...",Kennedy
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(For, I, have, sworn, I, before, you, and, Alm...",Kennedy
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(The, world, is, very, different, now, .)",Kennedy
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(For, man, holds, in, his, mortal, hands, the,...",Kennedy
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,"(And, yet, the, same, revolutionary, beliefs, ...",Kennedy


In [29]:
# Assign X & Y.
X_bow_750 = word_counts_750.drop(['text_sentence', 'text_source'], 1)
Y_bow_750 = word_counts_750['text_source']

In [30]:
# Gradient Boosting.
from sklearn import ensemble
clf = ensemble.GradientBoostingClassifier()
clf_bow_750 = clf.fit(X_bow_750, Y_bow_750)
cvScores = cross_val_score(clf_bow_750, X_bow_750, Y_bow_750, cv = 5)
print('CV:', cvScores)
print('Mean CV:', np.mean(cvScores))
print('Std CV:', np.std(cvScores))

CV: [0.80645161 0.77419355 0.83870968 0.62068966 0.68965517]
Mean CV: 0.7459399332591768
Std CV: 0.07990214269582391


Nope, let's try decreasing the number of common words.

In [31]:
# Increase common words in BoW.

# Utility function to create a list of the 250 most common words.
def bag_of_words(text):
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    return [item[0] for item in Counter(allwords).most_common(250)]

# Set up the bags.
kennedy_words = bag_of_words(kennedy_doc)
johnson_words = bag_of_words(johnson_doc)

# Combine bags to create common set of unique words.
common_words = set(kennedy_words + johnson_words)

# Creating the new features.
word_counts_250 = bow_features(sentences, common_words)
word_counts_250.head()

Processing row 0
Processing row 50
Processing row 100
Processing row 150


Unnamed: 0,dot,final,achieve,colonial,scholar,help,hatred,assure,american,stand,...,family,clergy,require,state,victory,freedom,conquer,fight,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,0,0,"(Vice, President, Johnson, ,, Mr., Speaker, ,,...",Kennedy
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(For, I, have, sworn, I, before, you, and, Alm...",Kennedy
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(The, world, is, very, different, now, .)",Kennedy
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(For, man, holds, in, his, mortal, hands, the,...",Kennedy
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,"(And, yet, the, same, revolutionary, beliefs, ...",Kennedy


In [32]:
# Assign X & Y.
X_bow_250 = word_counts_250.drop(['text_sentence', 'text_source'], 1)
Y_bow_250 = word_counts_250['text_source']

In [33]:
# Gradient Boosting.
from sklearn import ensemble
clf = ensemble.GradientBoostingClassifier()
clf_bow_250 = clf.fit(X_bow_250, Y_bow_250)
cvScores = cross_val_score(clf_bow_250, X_bow_250, Y_bow_250, cv = 5)
print('CV:', cvScores)
print('Mean CV:', np.mean(cvScores))
print('Std CV:', np.std(cvScores))

CV: [0.80645161 0.77419355 0.80645161 0.62068966 0.68965517]
Mean CV: 0.739488320355951
Std CV: 0.07316977987318643


Nope, similar results. Let's include punctuation in BoW.

In [34]:
# Utility function to create a list of the 500 most common words.
def bag_of_words(text):
    allwords = [token.lemma_
                for token in text
                if not token.is_stop]
    return [item[0] for item in Counter(allwords).most_common(500)]

In [35]:
# Set up the bags.
kennedy_words = bag_of_words(kennedy_doc)
johnson_words = bag_of_words(johnson_doc)

# Combine bags to create common set of unique words.
common_words = set(kennedy_words + johnson_words)

# Creating the new features.
word_counts_punct = bow_features(sentences, common_words)
word_counts_punct.head()

Processing row 0
Processing row 50
Processing row 100
Processing row 150


Unnamed: 0,dot,final,achieve,colonial,grant,scholar,help,defeat,hatred,assure,...,common,freedom,steady,lifetime,leader,overburden,conquer,fight,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,"(Vice, President, Johnson, ,, Mr., Speaker, ,,...",Kennedy
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(For, I, have, sworn, I, before, you, and, Alm...",Kennedy
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(The, world, is, very, different, now, .)",Kennedy
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(For, man, holds, in, his, mortal, hands, the,...",Kennedy
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,"(And, yet, the, same, revolutionary, beliefs, ...",Kennedy


In [36]:
# Assign X & Y.
X_bow_punct = word_counts_punct.drop(['text_sentence', 'text_source'], 1)
Y_bow_punct = word_counts_punct['text_source']

In [37]:
# Gradient Boosting.
from sklearn import ensemble
clf = ensemble.GradientBoostingClassifier()
clf_bow_punct = clf.fit(X_bow_punct, Y_bow_punct)
cvScores = cross_val_score(clf_bow_punct, X_bow_punct, Y_bow_punct, cv = 5)
print('CV:', cvScores)
print('Mean CV:', np.mean(cvScores))
print('Std CV:', np.std(cvScores))

CV: [0.80645161 0.77419355 0.83870968 0.62068966 0.68965517]
Mean CV: 0.7459399332591768
Std CV: 0.07990214269582391
