In [11]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import nltk
nltk.download('gutenberg')
!python -m spacy download en

[nltk_data] Downloading package gutenberg to /Users/luke/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!

[93m    Linking successful[0m
    /anaconda3/lib/python3.6/site-packages/en_core_web_sm -->
    /anaconda3/lib/python3.6/site-packages/spacy/data/en

    You can now load the model via spacy.load('en')



In [12]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [13]:
# Select some options and load the data

whitman_leaves = gutenberg.raw('whitman-leaves.txt')
shakespeare_macbeth = gutenberg.raw('shakespeare-macbeth.txt')

In [14]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

In [15]:
whitman_leaves = text_cleaner(whitman_leaves)
shakespeare_macbeth = text_cleaner(shakespeare_macbeth)

In [16]:
# Taking a tweet-sized look at Whitman
whitman_leaves[0:280]

"Come, said my soul, Such verses for my Body let us write, (for we are one,) That should I after return, Or, long, long hence, in other spheres, There to some group of mates the chants resuming, (Tallying Earth's soil, trees, winds, tumultuous waves,) Ever with pleas'd smile I may"

In [17]:
# Same with Shakespeare
shakespeare_macbeth[0:280]

"Actus Primus. Scoena Prima. Thunder and Lightning. Enter three Witches. 1. When shall we three meet againe? In Thunder, Lightning, or in Raine? 2. When the Hurley-burley's done, When the Battaile's lost, and wonne 3. That will be ere the set of Sunne 1. Where the place? 2. Vpon t"

In [19]:
# Parse the cleaned works...
nlp = spacy.load('en')
whitman_doc = nlp(whitman_leaves)
shakespeare_doc = nlp(shakespeare_macbeth)

In [20]:
# Group into sentences.
leaves_sents = [[sent, "Whitman"] for sent in whitman_doc.sents]
macbeth_sents = [[sent, "Shakespeare"] for sent in shakespeare_doc.sents]

# Combine the sentences from the two works into one data frame.
sentences = pd.DataFrame(leaves_sents + macbeth_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Come, ,, said, my, soul, ,, Such, verses, for...",Whitman
1,"(That, should, I, after, return, ,, Or, ,, lon...",Whitman
2,"(Ever, with, pleas'd, smile, I, may, keep, on,...",Whitman
3,"(I, here, and, now, Signing, for, Soul, and, B...",Whitman
4,(One's),Whitman


In [21]:
# Utility function to create a list of the 2000 most common words. Augmented to exclude titles - 
# if not line.text.isupper()

def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence'][::100]):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 50 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
whitmanwords = bag_of_words(whitman_doc)
shakespearewords = bag_of_words(shakespeare_doc)

# Combine bags to create a set of unique words.
common_words = set(whitmanwords + shakespearewords)

In [26]:
common_words

{'lesse',
 'taste',
 'fetch',
 'strongly',
 'lecture',
 'stirre',
 'adhere',
 'mouth',
 'seat',
 'impassive',
 'region',
 'wander',
 'bird',
 '6',
 'precedent',
 'vast',
 'missouri',
 'lead',
 'make',
 'highly',
 'print',
 'branch',
 'honor',
 'yong',
 'live',
 'reuolt',
 'sweaty',
 'sexta',
 'stabs',
 'tangle',
 'cheere',
 'stock',
 'wisdom',
 'tear',
 'silver',
 'man',
 'cawdor',
 'clamor',
 'onward',
 'spiritual',
 'afternoon',
 'arrogant',
 'mur',
 'sake',
 'after',
 'always',
 'houre',
 'orbic',
 'registr',
 'thine',
 'include',
 'sleeper',
 'damnation',
 'seize',
 'nimbly',
 'wofull',
 'appar',
 'shaft',
 'repeat',
 'pulse',
 "hang'd",
 'note',
 'shoot',
 'ayde',
 'c.',
 'dreadfull',
 'chamber',
 'increase',
 "powr'd",
 'skinne',
 'hayle',
 'didst',
 'woman',
 'shut',
 'patch',
 'pleade',
 'permit',
 'neyther',
 'guilt',
 'guest',
 'might',
 'mighty',
 'pennant',
 'want',
 'wing',
 'briefely',
 'ready',
 'seeme',
 'regard',
 'perfectly',
 'dis',
 'treason',
 'sin',
 'slave',
 'dy

In [27]:
# create our data with features...
word_counts = bow_features(sentences, common_words)

Processing row 0
Processing row 50


In [28]:
# adding another feature using TextBlob to assess sentiment

from textblob import TextBlob

word_counts['text_sentence_sentiment_polarity'] = word_counts['text_sentence'].apply(str).apply(lambda x: TextBlob(x).sentiment.polarity)
word_counts['text_sentence_sentiment_subjectivity'] = word_counts['text_sentence'].apply(str).apply(lambda x: TextBlob(x).sentiment.subjectivity)

In [29]:
word_counts['text_sentence_avg_word_length'] = word_counts['text_sentence'].apply(str).apply(lambda x: x.split())
word_counts['text_sentence_avg_word_length'] = word_counts['text_sentence_avg_word_length'].apply(lambda words: sum(len(word) for word in words) / len(words))

In [30]:
word_counts.head()

Unnamed: 0,lesse,taste,fetch,strongly,lecture,stirre,adhere,mouth,seat,impassive,...,continent,truths,stream,sit,macbeths,text_sentence,text_source,text_sentence_sentiment_polarity,text_sentence_sentiment_subjectivity,text_sentence_avg_word_length
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,"(Come, ,, said, my, soul, ,, Such, verses, for...",Whitman,0.0,0.5,3.75
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,"(That, should, I, after, return, ,, Or, ,, lon...",Whitman,-0.075,0.391667,5.25
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,"(Ever, with, pleas'd, smile, I, may, keep, on,...",Whitman,0.275,0.216667,4.058824
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,"(I, here, and, now, Signing, for, Soul, and, B...",Whitman,0.0,0.0,3.588235
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,(One's),Whitman,0.0,0.0,5.0


### BoW with Logistic Regression

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 

Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)

lr = LogisticRegression()
train = lr.fit(X_train, y_train)

print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(4569, 3445) (4569,)
Training set score: 0.744364193477785

Test set score: 0.7535280603872662


In [33]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

pred = lr.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[   1  747]
 [   4 2295]]
             precision    recall  f1-score   support

Shakespeare       0.20      0.00      0.00       748
    Whitman       0.75      1.00      0.86      2299

avg / total       0.62      0.75      0.65      3047



### SVM

In [34]:
# Using SVM as an alternative
from sklearn.svm import LinearSVC

svm = LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
                intercept_scaling=1, loss='hinge', max_iter=1000,
                multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
                verbose=0)

train = svm.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', svm.score(X_train, y_train))
print('\nTest set score:', svm.score(X_test, y_test))

(4569, 3445) (4569,)
Training set score: 0.7463339899321515

Test set score: 0.7545126353790613


In [35]:
pred = svm.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[   0  748]
 [   0 2299]]
             precision    recall  f1-score   support

Shakespeare       0.00      0.00      0.00       748
    Whitman       0.75      1.00      0.86      2299

avg / total       0.57      0.75      0.65      3047



  'precision', 'predicted', average, warn_for)


### Gradient Boosting

In [36]:
from sklearn import ensemble

clf = ensemble.GradientBoostingClassifier()

train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

  from numpy.core.umath_tests import inner1d


Training set score: 0.7609980302035456

Test set score: 0.7505743354118806


In [37]:
pred = clf.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[  47  701]
 [  59 2240]]
             precision    recall  f1-score   support

Shakespeare       0.44      0.06      0.11       748
    Whitman       0.76      0.97      0.85      2299

avg / total       0.68      0.75      0.67      3047



### Takeaway

It would seem that gradient boosting is performing the best of the three models above, particularly with regard to correctly classifying Shakespeare. Even a 0.11 f1-score is significantly better than a 0.0 scores it garnered in SVM and Logistic Regression. 

Furthermore, judging by the confusion matrices, particularly in regression and SVM, it seems that we really aren't doing much Shakespeare guessing.

Although we're coming out of our tests looking at accuracy scores in the mid-70s, one must really question how good a measure accuracy is when a class imbalance such as the one above exists. For example, there are nearly 3 times as many instances of Whitman as there are Shakespeare, so guessing Whitman 100% of the time would lead to near 75% accuracy, which doesn't effectively evaluate the poor performance.