In [3]:
import nltk 
from nltk.book import *
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import brown
from nltk.corpus import inaugural
from nltk.stem import PorterStemmer
from collections import Counter
import string

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [4]:
# Assignment 19 -----------------------------------------------------------------------------

def remove(text):

    """
    Function takes a string, strips punctuation and English stopwords, reduces the
    remaining words to their stems and returns them as a list.

    Processing steps, in order:
        1. Extract every maximal run of letters/digits as a token. The underscore is
           treated as punctuation, so a word wrapped in or joined by underscores
           (e.g. "_word_", "foo_bar") contributes "word" / "foo", "bar" instead of
           being discarded as a whole.
        2. Drop tokens whose lower-cased form is in NLTK's English stopword list.
        3. Pass every remaining token through the Porter stemmer.

    Parameters:
            text(String): initial string

    Return:
            cleared(List[String]): stems of the non-stopword tokens, in the order they
                                   occur in the input (duplicates are kept)

    """

    # `[^\W_]+` means "word characters except the underscore". A plain `\w+` would
    # keep underscores inside the token, and such a token is not alphanumeric, so
    # filtering on isalnum() afterwards would lose the entire word.
    tokenizer = nltk.tokenize.RegexpTokenizer(r'[^\W_]+')
    words = tokenizer.tokenize(text)

    # A set gives constant-time membership tests for the stopword filter below.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Stopwords are matched case-insensitively, before stemming.
    cleared = [stemmer.stem(word) for word in words if word.lower() not in stop_words]

    return cleared

# Read the whole book into memory. The encoding is passed explicitly so decoding
# does not depend on the platform's default locale encoding.
# NOTE(review): assumes hound.txt is UTF-8 encoded — confirm against the actual file.
with open('hound.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Count how often each stem returned by remove() occurs in the book.
word_counts = Counter(remove(text))

# Keep the stems seen at least 100 times, in first-occurrence order.
freq_words = [word for word, count in word_counts.items() if count >= 100]
print("Words occuring at least 100 times in book", freq_words)

Words occuring at least 100 times in book ['baskervil', 'may', 'holm', 'sir', 'henri', 'stapleton', 'dr', 'watson', 'upon', 'moor', 'man', 'well', 'know', 'said', 'could', 'one', 'us', 'would', 'come', 'work', 'see']


In [5]:
# The nine sample texts brought in by `from nltk.book import *`,
# collected so that texts[i] is text<i+1>.
texts = [
    text1, text2, text3,
    text4, text5, text6,
    text7, text8, text9,
]

In [6]:
# Assignment 20 -----------------------------------------------------------------------------

# Positions 6 and 8 of `texts` hold text7 and text9 respectively.
count7, count9 = (texts[position].count('Sunday') for position in (6, 8))

print("Occurences of word 'Sunday' in text 7: ", count7)
print("Occurences of word 'Sunday' in text 9: ", count9)

Occurences of word 'Sunday' in text 7:  2
Occurences of word 'Sunday' in text 9:  98


In [10]:
# Assignment 21 -----------------------------------------------------------------------------

# Lower-case every token so the comparison ignores capitalisation.
words_text7 = {token.lower() for token in texts[6]}
words_text9 = {token.lower() for token in texts[8]}

# Set difference: everything seen in text9 that never shows up in text7.
only9 = words_text9.difference(words_text7)
print("Words that appear in text9 but not in text7: ", only9)

Words that appear in text9 but not in text7:  {'proportions', 'tusitala', 'exaggerating', 'surf', 'leaden', 'dreary', 'throb', 'feeble', 'galloping', 'towards', 'solitary', 'twentieth', 'congratulate', 'odours', 'catholics', 'tricks', 'eccentricity', 'bore', 'foaming', 'cruelty', 'melted', 'slung', 'buff', 'funniest', 'frustration', 'passionate', 'beneficent', 'emblazoned', 'embarrassment', 'appearances', 'horribly', 'blackguard', 'remembering', 'conversational', 'crumpled', 'centuries', 'casuistry', 'footsore', ".'", 'earth', 'scope', 'awakened', 'oddity', 'deaf', 'symbolism', 'placidity', 'paces', 'parcel', 'firelight', 'taps', 'moderns', 'plainly', 'steeply', 'mist', 'cart', 'beak', 'organ', 'sarcastically', 'companionable', 'bodies', 'hiding', 'bray', 'breakneck', 'foppish', 'loud', 'antic', 'pleasant', 'presumptuous', 'burning', 'cries', 'flames', 'bloodless', 'dreamers', 'love', 'ice', 'sealed', 'equestrian', 'dies', 'blinked', 'lamps', 'blinking', 'wrapped', 'myself', 'assyrian'

In [11]:
# Assignment 22 -----------------------------------------------------------------------------

# One set of stems per text. Each token sequence is joined back into a single
# string first because remove() takes a string.
words_sets = [set(remove(" ".join(book))) for book in texts]

# Keep only the stems shared by every one of the sets.
common_words = words_sets[0].intersection(*words_sets[1:])
print("Words present in every text (1 to 9) from nltk.book: ", common_words)

Words present in every text (1 to 9) from nltk.book:  {'person', 'call', 'mani', 'open', 'cut', 'like', 'larg', 'true', 'hill', 'talk', 'lot', 'build', 'may', 'good', 'watch', 'seek', 'age', 'well', 'mind', 'east', 'still', 'old', 'dress', 'young', 'lead', 'great', 'meet', 'love', 'life', 'forward', 'pleas', 'one', 'work', 'son', 'abl', 'dark', 'green', 'guard', 'understand', 'south', 'mine', 'speed', 'hous', 'friend', 'must', 'readi', 'would', 'hear', 'heart', 'man', 'home', 'run', 'stand', 'art', 'mother', 'look', 'find', 'live', 'morn', 'countri', 'never', 'woman', 'toward', 'two', 'door', 'thing', 'fair', 'year', 'full', 'head', 'go', 'near', 'food', 'without', 'land', 'time', 'littl', 'least', 'forget', 'walk', 'need', 'away', 'long', 'town', 'high', 'hand', 'think', 'ride', 'let', 'boy', 'could', 'kind', 'togeth', 'night', 'alway', 'beauti', 'send', 'feet', 'enough', 'women', 'get', 'day', 'tower', 'eye', 'answer', 'fine', 'laugh', 'happi', 'give', 'busi'}


In [12]:
# Assignment 23 -----------------------------------------------------------------------------

# texts[1] is text2; its tokens are glued together with single spaces
# before being split into sentences.
sentences = nltk.sent_tokenize(
    ' '.join(texts[1])
)

# "Longest" is measured in characters of the sentence string; on ties the
# first such sentence wins.
longest_sentence = max(sentences, key=lambda sentence: len(sentence))

print("Longest sentence in Jane Austin's 'Sense and sensibility': ", longest_sentence)

Longest sentence in Jane Austin's 'Sense and sensibility':  I am sure you will be glad to hear , as likewise dear Mrs . Jennings , I spent two happy hours with him yesterday afternoon , he would not hear of our parting , though earnestly did I , as I thought my duty required , urge him to it for prudence sake , and would have parted for ever on the spot , would he consent to it ; but he said it should never be , he did not regard his mother ' s anger , while he could have my affections ; our prospects are not very bright , to be sure , but we must wait , and hope for the best ; he will be ordained shortly ; and should it ever be in your power to recommend him to any body that has a living to bestow , am very sure you will not forget us , and dear Mrs . Jennings too , trust she will speak a good word for us to Sir John , or Mr . Palmer , or any friend that may be able to assist us .-- Poor Anne was much to blame for what she did , but she did it for the best , so I say nothing ; hope Mr