In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter

## Bag of Word 

In [2]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'pg11.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [3]:
# utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

In [4]:
# load and clean the data.
moby = gutenberg.raw('melville-moby_dick.txt')
caesar = gutenberg.raw('shakespeare-caesar.txt')

moby = text_cleaner(moby)
caesar = text_cleaner(caesar)

In [5]:
# parse the novels
nlp = spacy.load('en')
moby_doc = nlp(moby)
caesar_doc = nlp(caesar)

In [6]:
# group into sentences
moby_sents = [[sent, 'Melville'] for sent in moby_doc.sents]
caesar_sents = [[sent, 'Shakespeare'] for sent in caesar_doc.sents]

# combine them into one data frame
sentences = pd.DataFrame(moby_sents + caesar_sents)
sentences.head()

Unnamed: 0,0,1
0,"(ETYMOLOGY, .)",Melville
1,"((, Supplied, by, a, Late, Consumptive)",Melville
2,"(Usher, to, a, Grammar, School, ))",Melville
3,"(The, pale, Usher, threadbare, in, coat, ,, he...",Melville
4,"(;, I, see, him, now, .)",Melville


In [7]:
# utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]

In [8]:
# find the most common words
mobywords = bag_of_words(moby_doc)
caesarwords = bag_of_words(caesar_doc)

# combine the words to create set of unique words
common_words = set(mobywords + caesarwords)

In [9]:
# creates a data frame with features for each word in our common word set.
# each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

In [10]:
# create bag of words feature
word_counts = bow_features(sentences, common_words)
word_counts.head(3)

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000
Processing row 5500
Processing row 6000
Processing row 6500
Processing row 7000
Processing row 7500
Processing row 8000
Processing row 8500
Processing row 9000
Processing row 9500
Processing row 10000
Processing row 10500
Processing row 11000
Processing row 11500
Processing row 12000


Unnamed: 0,commanders,retentiue,cruell,yes,gold,lusty,winds,careful,write,fierce,...,acquaint,shooke,quite,breathlesse,precise,forge,daye,frequently,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(ETYMOLOGY, .)",Melville
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"((, Supplied, by, a, Late, Consumptive)",Melville
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Usher, to, a, Grammar, School, ))",Melville


In [11]:
# find the number of punctuation marks in each sentences
def number_of_punctuation(sentences):
    punct = []
    
    for i in range(len(sentences)):
        temp = []                          # temporary array to hold the tokens
        for token in sentences[0][i]:      # list the type of speech for each word in sentence
            temp.append(token.pos_)        

        temp = pd.DataFrame(temp)
        punct.append(len(temp[temp[0] == 'PUNCT']))    # number of punctuation
    
    return np.array(punct)

# create a feature for number of punctuation in a sentence
num_punct = number_of_punctuation(sentences)
word_counts['Num_Punct'] = num_punct

In [12]:
# find the number of verb in each sentences
def number_of_verb(sentences):
    verbs = []
    
    for i in range(len(sentences)):
        temp = []                          # temporary array to hold the tokens
        for token in sentences[0][i]:      # list the type of speech for each word in sentence
            temp.append(token.pos_)        

        temp = pd.DataFrame(temp)
        verbs.append(len(temp[temp[0] == 'VERB']))    # number of verbs
    
    return np.array(verbs)

# create a feature for number of punctuation in a sentence
num_verb = number_of_verb(sentences)
word_counts['Num_Verb'] = num_verb

In [13]:
# find the number of nouns in each sentences
def number_of_noun(sentences):
    nouns = []
    
    for i in range(len(sentences)):
        temp = []                          # temporary array to hold the tokens
        for token in sentences[0][i]:      # list the type of speech for each word in sentence
            temp.append(token.pos_)        

        temp = pd.DataFrame(temp)
        nouns.append(len(temp[temp[0] == 'NOUN']))    # number of nouns
    
    return np.array(nouns)

# create a feature for number of punctuation in a sentence
num_noun = number_of_noun(sentences)
word_counts['Num_Noun'] = num_noun

In [14]:
# find the number of adverbs in each sentences
def number_of_adverb(sentences):
    adverbs = []
    
    for i in range(len(sentences)):
        temp = []                          # temporary array to hold the tokens
        for token in sentences[0][i]:      # list the type of speech for each word in sentence
            temp.append(token.pos_)        

        temp = pd.DataFrame(temp)
        adverbs.append(len(temp[temp[0] == 'ADV']))    # number of punctuation
    
    return np.array(adverbs)

# create a feature for number of punctuation in a sentence
num_adv = number_of_adverb(sentences)
word_counts['Num_Adv'] = num_adv

In [15]:
# Find the number of adjectives in each sentences
def number_of_adjective(sentences):
    adjectives = []
    
    for i in range(len(sentences)):
        temp = []                          # temporary array to hold the tokens
        for token in sentences[0][i]:      # list the type of speech for each word in sentence
            temp.append(token.pos_)        

        temp = pd.DataFrame(temp)
        adjectives.append(len(temp[temp[0] == 'ADJ']))    # number of punctuation
    
    return np.array(adjectives)

# create a feature for number of punctuation in a sentence
num_adj = number_of_punctuation(sentences)
word_counts['Num_Adj'] = num_adj

In [16]:
# Number of words in a sentence
def words_in_sentence(sentences):
    words = []
    
    for i in range(len(sentences)):
        temp = []
        for token in sentences[0][i]:
            if not token.is_punct:
                temp.append(token)
        words.append(len(temp))
        
    return np.array(words)

# create a feature for number of words in a sentence
num_words = words_in_sentence(sentences)
word_counts['Num_Words'] = num_words

In [17]:
word_counts.head(3)

Unnamed: 0,commanders,retentiue,cruell,yes,gold,lusty,winds,careful,write,fierce,...,daye,frequently,text_sentence,text_source,Num_Punct,Num_Verb,Num_Noun,Num_Adv,Num_Adj,Num_Words
0,0,0,0,0,0,0,0,0,0,0,...,0,0,"(ETYMOLOGY, .)",Melville,1,0,0,0,1,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,"((, Supplied, by, a, Late, Consumptive)",Melville,1,1,0,0,1,5
2,0,0,0,0,0,0,0,0,0,0,...,0,0,"(Usher, to, a, Grammar, School, ))",Melville,1,0,0,0,1,5


**---**

## tf-idf

In [18]:
# reading in the data, this time in the form of paragraphs
moby_para = gutenberg.paras('melville-moby_dick.txt')
caesar_para = gutenberg.paras('shakespeare-caesar.txt')

    note: merge the two paragraphs instead of creating two separate feature data frames

In [31]:
# processing paragraphs for Moby Dick
moby_paragraph = []
for paragraph in moby_para:
    para = paragraph[0]     # remove the brackets
    # removing the double-dash from all words
    para = [re.sub(r'--','',word) for word in para]
    # forming each paragraph into a string and adding it to the list of strings.
    moby_paragraph.append(' '.join(para))
    
# processing paragraphs for Caesar    
caesar_paragraph = []
for paragraph in caesar_para:
    para = paragraph[0]     # remove the brackets
    # removing the double-dash from all words
    para = [re.sub(r'--','',word) for word in para]
    # Forming each paragraph into a string and adding it to the list of strings.
    caesar_paragraph.append(' '.join(para))

In [32]:
# store the paragraph into dataframes
df_moby = pd.DataFrame(moby_paragraph)
df_caesar = pd.DataFrame(np.array(caesar_paragraph))

df_moby['text_source'] = pd.Series(['Melville'] * df_moby.shape[0])
df_caesar['text_source'] = pd.Series(['Shakespear'] * df_caesar.shape[0])

In [33]:
df_moby

Unnamed: 0,0,text_source
0,[ Moby Dick by Herman Melville 1851 ],Melville
1,ETYMOLOGY .,Melville
2,( Supplied by a Late Consumptive Usher to a Gr...,Melville
3,"The pale Usher threadbare in coat , heart , b...",Melville
4,""" While you take in hand to school others , an...",Melville
5,""" WHALE .",Melville
6,""" WHALE .",Melville
7,"KETOS , GREEK .",Melville
8,EXTRACTS ( Supplied by a Sub - Sub - Librarian ).,Melville
9,It will be seen that this mere painstaking bur...,Melville


In [21]:
# concatenate the two data frame into one
df_para = pd.concat([df_moby, df_caesar], ignore_index = True)

    note: tweakin the tfidfvectorizer to increase features

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
# calculate the tf-idf scores
vectorizer = TfidfVectorizer(max_df = 0.5, # drop words that occur in more than half the paragraphs
                             min_df = 2, # only use words that appear at least twice
                             stop_words ='english', 
                             lowercase = True, # convert everything to lower case 
                             use_idf = True,# we definitely want to use inverse document frequencies in our weighting
                             norm = u'l2', # applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf = True # adds 1 to all document frequencies, as if an extra document existed that used every word once.  
                                             ## prevents divide-by-zero errors
                            )


# applying the vectorizer
para_tfidf = vectorizer.fit_transform(df_para[0])
print("Number of features for Moby Dick and Caesar: %d" % para_tfidf.get_shape()[1])

Number of features for Moby Dick and Caesar: 3907


In [23]:
# turn csr matrix into a dataframe
df_para_tfidf = pd.DataFrame(para_tfidf.toarray())
df_para_tfidf['text_source'] = df_para['text_source']

### Prepare the Model

In [80]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression

In [90]:
# set target variable and training/test sets
Y_bow = word_counts['text_source']
X_bow = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, Y_train, Y_test = train_test_split(X_bow, Y_bow, test_size = 0.2, random_state = 0)

In [91]:
print(X_train.shape)
print(X_test.shape)

(9924, 3449)
(2482, 3449)


** -- **

In [92]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [93]:
# split tf-idf vectorizers into training and test sets
X = np.array(df_para_tfidf.drop(['text_source'], 1))
Y = df_para_tfidf['text_source']
X_train_tfidf, X_test_tfidf, Y_train_tfidf, Y_test_tfidf = train_test_split(X, Y, test_size = 0.4, random_state = 0)

In [94]:
# SVD data reducer, set this to 700 in order to have 90% variance explained
svd = TruncatedSVD(700)
lsa = make_pipeline(svd, Normalizer(copy=False))

# run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)
X_test_lsa = lsa.fit_transform(X_test_tfidf)

Y_train_lsa = Y_train_tfidf
Y_test_lsa = Y_test_tfidf

variance_explained = svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:", total_variance * 100)

# for cross-validation
X_lsa = lsa.fit_transform(X)
Y_lsa = Y

Percent variance captured by all components: 90.758667403


    note: the accuracy is so high because the text is from two different period of times. If using text from the same author, the results will be lower

##### Random Forest

In [95]:
# bag of word 
rfc = ensemble.RandomForestClassifier()
train = rfc.fit(X_train, Y_train)

print('Training set score:', rfc.score(X_train, Y_train))
print('\nTest set score:', rfc.score(X_test, Y_test))
print('\nCross Validation\n', cross_val_score(rfc, X_bow, Y_bow, cv = 5))

Training set score: 0.990528012898

Test set score: 0.925463336019

Cross Validation
 [ 0.90652699  0.9113618   0.93389762  0.92664248  0.91209677]


In [96]:
# tf-idf 
train = rfc.fit(X_train_lsa, Y_train_lsa)

print('Training set score:', rfc.score(X_train_lsa, Y_train_lsa))
print('\nTest set score:', rfc.score(X_test_lsa, Y_test_lsa))
print('\nCross Validation\n', cross_val_score(rfc, X_lsa, Y_lsa, cv = 5))

Training set score: 0.980207351555

Test set score: 0.875618374558

Cross Validation
 [ 0.96610169  0.90536723  0.90819209  0.97595474  0.96458924]


##### Logistic Regression

In [97]:
# bag of word
lr = LogisticRegression()
train = lr.fit(X_train, Y_train)

print('Training set score:', lr.score(X_train, Y_train))
print('\nTest set score:', lr.score(X_test, Y_test))
print('\nCross Validation\n', cross_val_score(lr, X_bow, Y_bow, cv = 5))

Training set score: 0.971584038694

Test set score: 0.953666398066

Cross Validation
 [ 0.92788074  0.94037067  0.94921403  0.93913744  0.93548387]


In [101]:
# tf-idf 
train = lr.fit(X_train_lsa, Y_train_lsa)

print('Training set score:', lr.score(X_train_lsa, Y_train_lsa))
print('\nTest set score:', lr.score(X_test_lsa, Y_test_lsa))
print('\nCross Validation\n', cross_val_score(lr, X_lsa, Y_lsa, cv = 5))

Training set score: 0.942035815269

Test set score: 0.765371024735

Cross Validation
 [ 0.94774011  0.89548023  0.88418079  0.9688826   0.90651558]


##### Gradient Boost

In [105]:
clf = ensemble.GradientBoostingClassifier()

train = clf.fit(X_train, Y_train)

print('Training set score:', clf.score(X_train, Y_train))
print('\nTest set score:', clf.score(X_test, Y_test))
print('\nCross Validation\n', cross_val_score(clf, X_bow, Y_bow, cv = 5))

Training set score: 0.911225312374

Test set score: 0.908138597905

Cross Validation
 [ 0.88678485  0.89967768  0.91132608  0.89762193  0.89637097]


In [104]:
# tf-idf 
train = clf.fit(X_train_lsa, Y_train_lsa)

print('Training set score:', clf.score(X_train_lsa, Y_train_lsa))
print('\nTest set score:', clf.score(X_test_lsa, Y_test_lsa))
print('\nCross Validation\n', cross_val_score(clf, X_lsa, Y_lsa, cv = 5))

Training set score: 0.982092365693

Test set score: 0.607067137809

Cross Validation
 [ 0.97457627  0.9420904   0.94350282  0.95756719  0.94334278]


    Seems like the original selection of training and test set is unaccurate representation of the model. However, after cross-validation, it seems to be working better.