In [3]:
# Import modules.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import sklearn
import time

import spacy
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter

# Aesthetics.
%matplotlib inline

In [4]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'. Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

In [6]:
# The Chapter indicator is idiosyncratic.
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)

In [8]:
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

In [10]:
# Parse the cleaned novels.
#nlp = spacy.load('en')
nlp = spacy.load('en_core_web_sm')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [12]:
# Group into sentences.
alice_sents = [[sent, 'Carroll'] for sent in alice_doc.sents]
persuasion_sents = [[sent, 'Austen'] for sent in persuasion_doc.sents]

In [14]:
# Combine the sentences from the 2 novels into 1 dataframe.
sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [15]:
sentences.shape

(5318, 2)

In [16]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    
# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
    return df

# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

In [17]:
# Create our dataframe with features. This can take HOURS.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000


Unnamed: 0,corner,instantly,jar,pen,shape,cake,ball,acquaint,bristle,perception,...,graze,nonsense,english,disappear,thoughtful,lean,moment,green,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, shall, be, late, !, ')",Carroll


In [18]:
word_counts.shape

(5318, 3064)

# Bag of Words
1. Random Forest
2. Logistic Regression
3. Gradient Boosting

In [19]:
# Assign X & Y.
X = np.array(word_counts.drop(['text_sentence', 'text_source'], 1))
Y = word_counts['text_source']
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=0)

In [20]:
# Random Forest.
def bow_rf(X, Y):
    start_time = time.time()
    from sklearn import ensemble
    rfc = ensemble.RandomForestClassifier()
    train = rfc.fit(X_train, Y_train)
    print('Training set score:', rfc.score(X_train, Y_train))
    print('Testing set score:', rfc.score(X_test, Y_test))
    print('Time taken: {} seconds'.format('%.1f' % (time.time() - start_time)))

In [21]:
# Logistic Regression.
def bow_lr(X, Y):
    start_time = time.time()
    from sklearn.linear_model import LogisticRegression
    lr = LogisticRegression()
    train = lr.fit(X_train, Y_train)
    print('Shape:', X_train.shape, Y_train.shape)
    print('Training set score:', lr.score(X_train, Y_train))
    print('Testing set score:', lr.score(X_test, Y_test))
    print('Time taken: {} seconds'.format('%.1f' % (time.time() - start_time)))

In [22]:
# Gradient Boosting.
def bow_gb(X, Y):
    start_time = time.time()
    from sklearn import ensemble
    clf = ensemble.GradientBoostingClassifier()
    train = clf.fit(X_train, Y_train)
    print('Training set score:', clf.score(X_train, Y_train))
    print('Testing set score:', clf.score(X_test, Y_test))
    print('Time taken: {} seconds'.format('%.1f' % (time.time() - start_time)))

In [23]:
# Output.
print('Random Forest:\n', bow_rf(X, Y))
print('Logistic Regression:\n', bow_lr(X, Y))
print('Gradient Boosting:\n', bow_gb(X, Y))

  from numpy.core.umath_tests import inner1d


Training set score: 0.9871473354231975
Testing set score: 0.8961466165413534
Time taken: 5.5 seconds
Random Forest:
 None
Shape: (3190, 3062) (3190,)
Training set score: 0.9579937304075236
Testing set score: 0.9158834586466166
Time taken: 0.7 seconds
Logistic Regression:
 None
Training set score: 0.886833855799373
Testing set score: 0.8735902255639098
Time taken: 35.5 seconds
Gradient Boosting:
 None


# Same model, new inputs
Feed the model a differnt novel by Jane Austen, _Emma_. Will it be able to distinguish Austen from Carroll with the same level of accuracy if we insert a different sample of Austen's writing? First, we need to process _Emma_ & combine with the Alice data.

In [24]:
# Clean the Emma data.
emma = gutenberg.raw('austen-emma.txt')
emma = re.sub(r'VOLUME \w+', '', emma)
emma = re.sub(r'CHAPTER \w+', '', emma)
emma = text_cleaner(emma)
print(emma[:100])

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to


In [25]:
# Parse our cleaned data.
emma_doc = nlp(emma)

In [26]:
# Group into sentences.
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]
emma_sents = [[sent, "Austen"] for sent in emma_doc.sents]

# Emma is quite long, let's cut it down to the same length as Alice.
emma_sents = emma_sents[0:len(alice_sents)]

In [27]:
# Build a new Bag of Words data frame for Emma word counts.
# We'll use the same common words from Alice and Persuasion.
emma_sentences = pd.DataFrame(emma_sents)
emma_bow = bow_features(emma_sentences, common_words)

print('done')

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
done


In [28]:
# Now we can model it!
# Let's use logistic regression again.

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
train = lr.fit(X_train, Y_train)

# Combine the Emma sentence data with the Alice data from the test set.
X_Emma_test = np.concatenate((
    X_train[Y_train[Y_train=='Carroll'].index],
    emma_bow.drop(['text_sentence','text_source'], 1)
), axis=0)
y_Emma_test = pd.concat([Y_train[Y_train=='Carroll'],
                         pd.Series(['Austen'] * emma_bow.shape[0])])

# Model.
print('\nTest set score:', lr.score(X_Emma_test, y_Emma_test))
lr_Emma_predicted = lr.predict(X_Emma_test)
pd.crosstab(y_Emma_test, lr_Emma_predicted)


Test set score: 0.6976137211036539


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1564,105
Carroll,706,307


Well look at that!  NLP approaches are generally effective on the same type of material as they were trained on. It looks like this model is actually able to differentiate multiple works by Austen from Alice in Wonderland.  Now the question is whether the model is very good at identifying Austen, or very good at identifying Alice in Wonderland, or both...

# Challenge 0: Improve Logistic Regression performance of 93%.
Options include:
1. Other modeling techniques (SVM)
2. Create more features that take advantage of the spaCy information (include grammar, phrases, POS, etc).
3. Create sentence-level features (number of words, amount of punctuation)
4. Including contextual information (length of previous and next sentences, words repeated from 1 sentence to the next, etc).

Design models on the test set, or use cross_validation with multiple folds, and see if you can improve accuracy > 90%.

In [45]:
# View our features again.
df = word_counts
print(df.shape)
df.head()

(5318, 3064)


Unnamed: 0,corner,instantly,jar,pen,shape,cake,ball,acquaint,bristle,perception,...,graze,nonsense,english,disappear,thoughtful,lean,moment,green,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, shall, be, late, !, ')",Carroll


In [36]:
# Support Vector Classifier
def bow_svc():
    start_time = time.time()
    from sklearn.svm import SVC
    svc = SVC()
    train = svc.fit(X_train, Y_train)
    print('Training set score:', svc.score(X_train, Y_train))
    print('Testing set score:', svc.score(X_test, Y_test))
    print('Time taken: {} seconds'.format('%.1f' % (time.time() - start_time)))

# View results.
print(bow_svc())

Training set score: 0.6824451410658308
Testing set score: 0.6917293233082706
Time taken: 89.6 seconds
None


In [64]:
# Creating more features taking advantage of spaCy info...

# Sentence length.
df['sentence_length'] = df.text_sentence.map(lambda x: len(x))

# Average word length.
df['average_word_length'] = [np.mean(
    [len(token) for token in df['text_sentence'][i]]) for i in range(0, len(df['text_sentence']))]

# Parts of Speech (POS).
df['n_propn'] = [sum([1 for x in t if x.pos_ == 'PROPN']) for t in df.text_sentence]
df['n_verb'] = [sum([1 for x in t if x.pos_ == 'VERB']) for t in df.text_sentence]
df['n_adp'] = [sum([1 for x in t if x.pos_ == 'ADP']) for t in df.text_sentence]
df['n_adj'] = [sum([1 for x in t if x.pos_ == 'ADJ']) for t in df.text_sentence]
df['n_adv'] = [sum([1 for x in t if x.pos_ == 'ADV']) for t in df.text_sentence]
df['n_noun'] = [sum([1 for x in t if x.pos_ == 'NOUN']) for t in df.text_sentence]
df['n_sym'] = [sum([1 for x in t if x.pos_ == 'SYM']) for t in df.text_sentence]
df['n_num'] = [sum([1 for x in t if x.pos_ == 'NUM']) for t in df.text_sentence]

In [66]:
# Reordering columns...
cols = list(df)
cols.insert(0, cols.pop(cols.index('text_source')))
cols.insert(1, cols.pop(cols.index('text_sentence')))
df = df.loc[:, cols]
df.head()

Unnamed: 0,text_source,text_sentence,corner,instantly,jar,pen,shape,cake,ball,acquaint,...,sentence_length,average_word_length,n_propn,n_verb,n_adp,n_adj,n_adv,n_noun,n_sym,n_num
0,Carroll,"(Alice, was, beginning, to, get, very, tired, ...",0,0,0,0,0,0,0,0,...,67,3.656716,2,13,8,3,3,12,0,0
1,Carroll,"(So, she, was, considering, in, her, own, mind...",0,0,0,0,0,0,0,0,...,63,3.730159,2,11,8,7,7,8,0,0
2,Carroll,"(There, was, nothing, so, VERY, remarkable, in...",0,0,0,0,0,0,0,0,...,33,3.393939,2,5,4,1,6,2,0,0
3,Carroll,"(Oh, dear, !)",0,0,0,0,0,0,0,0,...,3,2.333333,0,0,0,0,0,0,0,0
4,Carroll,"(I, shall, be, late, !, ')",0,0,0,0,0,0,0,0,...,6,2.333333,0,2,0,1,0,0,0,0


In [76]:
# Assign training & test sets.
X = np.array(df.drop(['text_source', 'text_sentence'], 1))
Y = df['text_source']
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.4, random_state = 0)

In [81]:
# Run models with new dataframe.
print('Logistic Regression:')
bow_lr(X, Y)
print('\n')

print('Random Forest:')
bow_rf(X, Y)
print('\n')

print('Gradient Boost:')
bow_gb(X, Y)
print('\n')

print('Support Vector Classifier:')
bow_svc()

Logistic Regression:
Shape: (3190, 3072) (3190,)
Training set score: 0.961755485893417
Testing set score: 0.9163533834586466
Time taken: 0.7 seconds


Random Forest:
Training set score: 0.9924764890282132
Testing set score: 0.8660714285714286
Time taken: 0.9 seconds


Gradient Boost:
Training set score: 0.8949843260188087
Testing set score: 0.8773496240601504
Time taken: 36.2 seconds


Support Vector Classifier:
Training set score: 0.6824451410658308
Testing set score: 0.6917293233082706
Time taken: 66.6 seconds


# Challenge 1: Comparing Alice/Persuasion/Austen vs any other work.
ie: <br>
1. _Alice in Wonderland_ vs any other work
2. _Persuasion_ vs any other work
3. Austen vs any other work

This will involve pulling a new book from the Project Gutenberg corpus (print(gutenberg.fileids()) for a list) and processing it.

In [83]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [109]:
# Persuasion vs Blake.
blake = gutenberg.raw('blake-poems.txt')

# Cleaning the text.
blake = re.sub(r'VOLUME \w+', '', blake)
blake = re.sub(r'CHAPTER \w+', '', blake)
blake = text_cleaner(blake)

# Parse the cleaned novels. This can take a bit.
blake_doc = nlp(blake)

# Group into sentences.
blake_sents = [[sent, "blake"] for sent in blake_doc.sents]

# Persuasion is quite long, let's cut it down to the same length as Blake.
persuasion_sents = persuasion_sents[0:len(blake_sents)]

In [111]:
# Combine the sentences from the 2 novels into 1 dataframe.
sentences1 = pd.DataFrame(blake_sents + persuasion_sents)
sentences1.head()

Unnamed: 0,0,1
0,"(SONGS, OF, INNOCENCE, AND, OF, EXPERIENCE, an...",blake
1,"(INTRODUCTION, Piping, down, the, valleys, wil...",blake
2,"(So, I, piped, with, merry, cheer, ., "")",blake
3,"(Piper, ,, pipe, that, song, again)",blake
4,"(;, "", So, I, piped, :, he, wept, to, hear, ., "")",blake


In [115]:
# Set up the bags.
#persuasionwords = bag_of_words(persuasion_doc)
blakewords = bag_of_words(blake_doc)

# Combine the bags to create a set of unique words.
common_words1 = set(persuasionwords + blakewords)

In [116]:
# Create our dataframe with features. This can take a while to run.
word_counts1 = bow_features(sentences1, common_words1)
word_counts1.head()

Processing row 0
Processing row 500


Unnamed: 0,corner,instantly,wanderer,walketh,pen,ball,vale,acquaint,perception,charter,...,impress'd,nonsense,distrust,disappear,thoughtful,lean,moment,green,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(SONGS, OF, INNOCENCE, AND, OF, EXPERIENCE, an...",blake
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(INTRODUCTION, Piping, down, the, valleys, wil...",blake
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(So, I, piped, with, merry, cheer, ., "")",blake
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Piper, ,, pipe, that, song, again)",blake
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(;, "", So, I, piped, :, he, wept, to, hear, ., "")",blake


In [117]:
# Assign training & test sets.
X = np.array(word_counts1.drop(['text_source', 'text_sentence'], 1))
Y = word_counts1['text_source']
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.4, random_state = 0)

In [118]:
# Run models with new dataframe.
print('Logistic Regression:')
bow_lr(X, Y)
print('\n')

print('Random Forest:')
bow_rf(X, Y)
print('\n')

print('Gradient Boost:')
bow_gb(X, Y)
print('\n')

print('Support Vector Classifier:')
bow_svc()

Logistic Regression:
Shape: (597, 2715) (597,)
Training set score: 0.9882747068676717
Testing set score: 0.9147869674185464
Time taken: 0.2 seconds


Random Forest:
Training set score: 0.9932998324958124
Testing set score: 0.87468671679198
Time taken: 0.2 seconds


Gradient Boost:
Training set score: 0.9179229480737019
Testing set score: 0.8596491228070176
Time taken: 2.5 seconds


Support Vector Classifier:
Training set score: 0.5125628140703518
Testing set score: 0.48120300751879697
Time taken: 3.1 seconds
