<a href="https://colab.research.google.com/github/Gaukhar-ai/for_my_Thinkful_work/blob/master/alice_in_wonderland_book_token_count_lemma_frequency_headings%2Cspaces.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from collections import Counter
import nltk
import spacy
import re

In [None]:
nltk.download('gutenberg')

#!python -m spacy download en

In [6]:
from nltk.corpus import gutenberg

#grab and process the raw data
print(gutenberg.fileids())

persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

#print the first 100 characters of Alice
print('\nRaw:\n', alice[0:100])

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']

Raw:
 [Alice's Adventures in Wonderland by Lewis Carroll 1865]

CHAPTER I. Down the Rabbit-Hole

Alice was


In [7]:
#pattern matches all text between square brackets

In [12]:
pattern = '[\[].*?[\]]'
persuasion = re.sub(pattern, '', persuasion)
alice = re.sub(pattern, '', alice)

#print the 100 first characters of Alice again:
print('Title removed:', alice[0:100]) 
print('Title removed:', persuasion[0:100]) 

Title removed: 



Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothin
Title removed: 





Sir Walter Elliot, of Kellynch Hall, in Somersetshire, was a man who,
for his own amusement, n


In [13]:
#match and remove chapter headings:
persuasion = re.sub(r'Chapter .*', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)

print('Chapter headings removed:', alice[0:100])
print('Chapter headings removed:', persuasion[0:100])

Chapter headings removed: 



Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothin
Chapter headings removed: 





Sir Walter Elliot, of Kellynch Hall, in Somersetshire, was a man who,
for his own amusement, n


In [16]:
#remove newlines and other extra whitespace by splitting and rejoining
persuasion = ' '.join(persuasion.split())
alice = ' '.join(alice.split())

print('Extra whitespace removed:', alice[0:100])
print('Title removed:', alice[0:100]) 

Extra whitespace removed: Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to
Title removed: Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to


In [17]:

#TOKENIZATION:
#each individual meaningful piece from a text is a token and the process of breaking up the text into these peices is called tokenization

In [18]:
nlp = spacy.load('en')

#processing work
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [19]:
#explore the objects built

In [21]:
print('The alice_doc object is a {} object.'.format(type(alice_doc)))
print('It is {} tokens long'.format(len(alice_doc)))
print("The first three tokens are '{}'".format(alice_doc[:3]))
print('The type of each token is {}'. format(type(alice_doc[0])))

The alice_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 34408 tokens long
The first three tokens are 'Alice was beginning'
The type of each token is <class 'spacy.tokens.token.Token'>


In [22]:
#remove stop words with spacy:

In [23]:
alice_without_stopwords = [token for token in alice_doc if not token.is_stop]
persuasion_without_stopwords = [token for token in persuasion_doc if not token.is_stop]

In [24]:
#UTILITY FUNCTION TO CALCULATE HOW FREQUENTLY WORDS APPEAR IN THE TEXT:

In [25]:
def word_frequencies(text):

#build a list of words, strip out punctuation:
  words = []
  for token in text:
    if not token.is_punct:
      words.append(token.text)
#build and return a 'Counter' object containing word counts 
  return Counter(words)

In [26]:
#instantiate the list of the most common words
alice_word_freq = word_frequencies(alice_without_stopwords).most_common(10)
persuasion_word_freq = word_frequencies(persuasion_without_stopwords).most_common(10)
print('\nAlice:', alice_word_freq)
print('Persuasion:', persuasion_word_freq)


Alice: [('said', 453), ('Alice', 394), ('little', 124), ('like', 84), ('went', 83), ('know', 83), ('thought', 74), ('Queen', 73), ('time', 68), ('King', 61)]
Persuasion: [('Anne', 496), ('Captain', 297), ('Mrs', 291), ('Elliot', 288), ('Mr', 254), ('Wentworth', 217), ('Lady', 191), ('good', 181), ('little', 175), ('Charles', 166)]


In [27]:
#LEMMATIZATION: think, thouth, thinking = think is the stem

In [28]:
#UTILITY FUNCTION TO CALCULATE HOW FREQUENTLY EACH LEMMA APPEARS IN THE TEXT

In [30]:
def lemma_frequencies(text):
  #build a list of lemmas, strip out punctuation
  lemmas = []
  for token in text:
    if not token.is_punct:
      lemmas.append(token.lemma_)

      #build and return a 'Counter' object containing lemma counts 
  return Counter(lemmas)

#instantiate your list of most common lemmas
alice_lemma_freq = lemma_frequencies(alice_without_stopwords).most_common(10)
persuasion_lemma_freq = lemma_frequencies(persuasion_without_stopwords).most_common(10)
print('\nAlice:', alice_lemma_freq)
print('Persuasion:', persuasion_lemma_freq)



Alice: [('say', 476), ('Alice', 394), ('think', 130), ('go', 130), ('little', 124), ('look', 105), ('know', 103), ('come', 96), ('like', 92), ('begin', 91)]
Persuasion: [('Anne', 496), ('Captain', 297), ('Mrs', 291), ('Elliot', 288), ('think', 258), ('Mr', 254), ('know', 252), ('good', 222), ('Wentworth', 215), ('Lady', 191)]


In [31]:
alice_lemma_common = [pair[0] for pair in alice_lemma_freq]
persuasion_lemma_common = [pair[0] for pair in persuasion_lemma_freq]
print('Unique to Alice:', set(alice_lemma_common) - set(persuasion_lemma_common))
print('Unique to Persuasion:', set(persuasion_lemma_common) - set(alice_lemma_freq))

Unique to Alice: {'say', 'little', 'go', 'like', 'look', 'come', 'Alice', 'begin'}
Unique to Persuasion: {'know', 'good', 'Lady', 'Captain', 'Mrs', 'Elliot', 'think', 'Wentworth', 'Mr', 'Anne'}


In [32]:
#initial exploration of sentences
sentences = list(alice_doc.sents)
print('Alice in Wonderland has {} sentences.'.format(len(sentences)))

example_sentence = [2]
print('Here is an example: \n{}\n'.format(example_sentence))



Alice in Wonderland has 1989 sentences.
Here is an example: 
[2]



In [None]:
#doc.sents gives each sentence as a span ojbect. 

In [None]:
#some metrics around the sentence:
example_words = [token for token in example_sentence if not token.is_punct]
unique_words = set([token.text for token in example_words])

print(("There are {} words in this sentence, and {} of them are""unique.").format(len(example_words), len(unique_words)))