In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
import nltk
# Uncomment the next line to launch the NLTK downloader and fetch the "gutenberg" and "stopwords" corpora.
#nltk.download()

In [3]:
from nltk.corpus import gutenberg, stopwords
# List the ids of the texts available in the Gutenberg corpus.
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [4]:
# Load the raw text of the two books that the rest of the notebook compares.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')
# Preview the first 100 characters to see what needs cleaning.
print('Raw:\n',alice[0:100])

Raw:
 [Alice's Adventures in Wonderland by Lewis Carroll 1865]

CHAPTER I. Down the Rabbit-Hole

Alice was


# Regex

[regex cheatsheet](https://www.debuggex.com/cheatsheet/regex/python)

In [5]:
# Remove every bracketed span, e.g. the "[... by Lewis Carroll 1865]" title line.
# A raw string passes the backslashes straight to the regex engine; in a normal
# string "\[" and "\]" are invalid escape sequences that newer Python versions
# warn about.
# The lazy ".*?" stops at the first "]" and "." does not cross newlines, so only
# bracketed text that sits on a single line is removed.
pattern = r"\[.*?\]"
persuasion = re.sub(pattern, "", persuasion)
alice = re.sub(pattern, "", alice)
print('Title removed:\n', alice[0:100])

Title removed:
 

CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on


In [6]:
# Each book gets its own chapter-heading pattern.
# Persuasion: drop the word "Chapter" followed by a number.
persuasion = re.sub(r'Chapter \d+', '', persuasion)
# Alice: drop "CHAPTER" and the rest of that line (numeral and chapter title).
alice = re.sub(r'CHAPTER .*', '', alice)
print('Chapter headings removed:\n', alice[0:100])

Chapter headings removed:
 



Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothin


In [7]:
# split() with no arguments breaks on any run of whitespace (newlines included),
# so re-joining with single spaces collapses each text onto one line.
persuasion = ' '.join(persuasion.split())
alice = ' '.join(alice.split())
print('Extra whitespace removed:\n', alice[0:100])

Extra whitespace removed:
 Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to


# SpaCy(thon)

In [8]:
# Peek at the first ten entries of NLTK's English stop word list.
stopwords.words('english')[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [9]:
import spacy
# Load the small English pipeline by its full package name. The 'en' shortcut
# depended on a model link, which spaCy 3 no longer supports.
nlp = spacy.load('en_core_web_sm')

In [10]:
# Run the spaCy pipeline over each cleaned text to get a parsed document.
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [11]:
# Inspect the parsed document: its type, its length in tokens, its first
# three tokens, and the type of a single token.
print("The alice_doc object is a {} object.".format(type(alice_doc)))
print("It is {} tokens long".format(len(alice_doc)))
print("The first three tokens are '{}'".format(alice_doc[:3]))
print("The type of each token is {}".format(type(alice_doc[0])))

The alice_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 34420 tokens long
The first three tokens are 'Alice was beginning'
The type of each token is <class 'spacy.tokens.token.Token'>


# Tokens

In [12]:
from collections import Counter

def word_freq(text, include_stop=True):
    """Count how often each word appears in a parsed text.

    Args:
        text: Iterable of token objects exposing ``text``, ``is_punct``,
            ``is_space`` and ``is_stop`` (for example a spaCy ``Doc``).
        include_stop: When False, tokens flagged as stop words are skipped.

    Returns:
        A ``Counter`` mapping each token's text to its number of occurrences.
        Counting is case-sensitive.
    """
    return Counter(
        token.text
        for token in text
        # Whitespace-only tokens are not flagged as punctuation, so they must
        # be excluded explicitly or they would be counted as words.
        if not (token.is_punct or token.is_space)
        and (include_stop or not token.is_stop)
    )

In [13]:
# Count every non-punctuation token in each book, stop words included (the default).
alice_freq = word_freq(alice_doc)
persuasion_freq = word_freq(persuasion_doc)

In [14]:
# The ten most frequent words in Alice, with their counts.
alice_freq.most_common(10)

[('the', 1524),
 ('and', 796),
 ('to', 724),
 ('a', 611),
 ('I', 534),
 ('it', 524),
 ('she', 508),
 ('of', 499),
 ('said', 453),
 ('Alice', 394)]

In [15]:
# Top ten words per book with stop words excluded.
# NOTE: this rebinds alice_freq / persuasion_freq from a Counter to a list of
# (word, count) pairs, so re-running an earlier cell that calls
# alice_freq.most_common(...) after this one raises AttributeError.
alice_freq = word_freq(alice_doc, include_stop=False).most_common(10)
persuasion_freq = word_freq(persuasion_doc, include_stop=False).most_common(10)

In [16]:
# Show the current value of alice_freq.
alice_freq

[('said', 453),
 ('Alice', 394),
 ("n't", 205),
 ("'s", 190),
 ('little', 124),
 ('like', 84),
 ('know', 83),
 ('went', 83),
 ('thought', 74),
 ('Queen', 73)]

In [17]:
# Keep only the word from each (word, count) pair.
alice_common = [word for word, _count in alice_freq]
persuasion_common = [word for word, _count in persuasion_freq]

In [18]:
# Words that are in one book's list of common words but not in the other's.
print('Unique to Alice: ', set(alice_common).difference(set(persuasion_common)))
print('Unique to Persuasion: ', set(persuasion_common).difference(set(alice_common)))

Unique to Alice:  {'went', "n't", 'thought', 'Alice', 'like', 'said', 'Queen', 'know'}
Unique to Persuasion:  {'Mr', 'Mrs', 'good', 'Wentworth', 'Anne', 'Captain', 'Lady', 'Elliot'}


# Lemmas

In [22]:
from collections import Counter

def lemma_freq(text, include_stop=True):
    """Count how often each lemma appears in a parsed text.

    Args:
        text: Iterable of token objects exposing ``lemma_``, ``is_punct``,
            ``is_space`` and ``is_stop`` (for example a spaCy ``Doc``).
        include_stop: When False, tokens flagged as stop words are skipped.

    Returns:
        A ``Counter`` mapping each token's ``lemma_`` to its number of
        occurrences.
    """
    return Counter(
        token.lemma_
        for token in text
        # Whitespace-only tokens are not flagged as punctuation, so they must
        # be excluded explicitly or they would be counted as lemmas.
        if not (token.is_punct or token.is_space)
        and (include_stop or not token.is_stop)
    )

In [23]:
# Ten most frequent lemmas per book as (lemma, count) pairs, stop words excluded.
alice_lemma_freq = lemma_freq(alice_doc, include_stop=False).most_common(10)
persuasion_lemma_freq = lemma_freq(persuasion_doc, include_stop=False).most_common(10)

# Sentences

In [24]:
# Split the document into sentences and take the third one as a worked example.
sentences = list(alice_doc.sents)
example_sentence = sentences[2]
# Drop punctuation tokens, then collect the distinct surface forms.
example_words = [tok for tok in example_sentence if not tok.is_punct]
unique_words = {tok.text for tok in example_words}

# Parts of Speech

In [25]:
# Homographs: the same spelling ("break") gets a different part-of-speech tag
# depending on the sentence it appears in.
print(nlp("I need a break")[3].pos_)
print(nlp("I need to break the glass")[3].pos_)

NOUN
VERB


In [26]:
# View the part of speech for the first nine tokens of the example sentence.
print('\nParts of speech:')
for token in example_sentence[:9]:
    print(token.orth_, token.pos_)


Parts of speech:
There ADV
was VERB
nothing NOUN
so ADP
VERY PROPN
remarkable ADJ
in ADP
that DET
; PUNCT


In [27]:
# Dependencies: for each of the first nine tokens print its text, its
# dependency label, and the text of its head token.
print('\nDependencies:')
for token in example_sentence[:9]:
    print(token.orth_, token.dep_, token.head.orth_)


Dependencies:
There expl was
was ROOT was
nothing attr was
so advmod remarkable
VERY compound remarkable
remarkable amod nothing
in prep remarkable
that pobj in
; punct was


In [28]:
# Entities
# Extract the first ten entities and print each one's label followed by
# the text of its tokens joined with single spaces.
entities = list(alice_doc.ents)[0:10]
for entity in entities:
    print(entity.label_, ' '.join(t.orth_ for t in entity))

PERSON Alice
PERSON Alice
PERSON White Rabbit
ORG VERY
PERSON Alice
ORG VERY
PERSON Rabbit
PERSON Rabbit
EVENT WATCH
ORG POCKET


In [29]:
# All of the unique entities spaCy thinks are people.
people = [ent.text for ent in alice_doc.ents if ent.label_ == "PERSON"]
print(set(people))

{'Beau', 'Conqueror', 'Pat', 'Jack', 'Dodo', 'Cat', 'King', 'Soles', 'WILLIAM', 'Last', 'Tillie;', 'Curiouser', 'Longitude', 'Rabbit', 'Latitude', 'Cheshire Puss', 'Pepper', 'ONE', 'Morcar', 'Brandy', 'Tortoise', 'Lizard', 'Queen', 'Father William', 'Pigeon', 'Twinkle', 'Treacle', 'Mabel', 'ALICE', 'Edgar Atheling', 'Mouse', 'Duck', 'Mock Turtle', 'Alice', 'Eaglet', 'Dinn', 'Silence', 'Shakespeare', 'THAT', "W. RABBIT'", 'Ma', 'ALL', 'Footman', 'Seaography', 'Mary Ann', 'Beautiful', 'Majesty', 'Tut', 'Shy', 'THESE', "Mary Ann!'", 'Dormouse', 'THIS', 'Hush', 'Drawling', 'Dinah', 'WHAT', 'Ahem', 'Knave', 'ME', 'Lory', 'Mine', 'Idiot', 'O Mouse', 'Cheshire Cat', 'Gryphon', "Rabbit's--'Pat", 'Bill', 'Mock', 'Hjckrrh', 'Game', 'William', 'Begin', 'Soup', 'Hare', 'Tis', 'VOICE', 'Edwin', 'Prizes', 'then!--Bill', 'Hand', 'White Rabbit'}
