In [1]:
import nltk
from nltk.corpus import gutenberg 
moby_raw = gutenberg.raw('melville-moby_dick.txt') 

In [2]:
def example_one():
    from nltk.tokenize import word_tokenize
    return len(word_tokenize(moby_raw)) 

In [3]:
print ('{:,}'.format(example_one()))

255,028


In [4]:
def example_two():    
    return len(set(nltk.word_tokenize(moby_raw)))

In [5]:
print ('{:,}'.format(example_two()))

20,742


In [6]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/maks/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
from nltk.stem import WordNetLemmatizer

def example_three():
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(w,'v') for w in nltk.word_tokenize(moby_raw)]
    return len(set(lemmatized))

In [8]:
print ('{:,}'.format(example_three()))

16,887


In [9]:
def answer_one():
    total_tokens = example_one()
    unique_tokens = example_two()
    lexical_diversity = unique_tokens / total_tokens
    return lexical_diversity

print('Lexical Diversity:', answer_one())


Lexical Diversity: 0.08133224587104161


In [10]:
from nltk.tokenize import word_tokenize

def answer_two():
    tokens = word_tokenize(moby_raw)
    total_tokens = len(tokens)
    whale_count = sum(1 for token in tokens if token.lower() == 'whale')
    Whale_count = sum(1 for token in tokens if token == 'Whale')
    total_whale_count = whale_count + Whale_count
    percentage = (total_whale_count / total_tokens) * 100
    return percentage

In [11]:
answer_two()

0.5317063224430258

In [12]:
from nltk.probability import FreqDist

def answer_three():
    tokens = word_tokenize(moby_raw)
    freq_dist = FreqDist(tokens)
    most_common = freq_dist.most_common(20)
    return most_common

print('Top 20 most frequently occurring tokens:', answer_three())

Top 20 most frequently occurring tokens: [(',', 19204), ('the', 13715), ('.', 7306), ('of', 6513), ('and', 6010), ('a', 4545), ('to', 4515), (';', 4173), ('in', 3908), ('that', 2978), ('his', 2459), ('it', 2196), ('I', 2113), ('!', 1767), ('is', 1722), ('--', 1713), ('with', 1659), ('he', 1658), ('was', 1639), ('as', 1620)]


In [13]:
def answer_four():
    tokens = word_tokenize(moby_raw)
    freq_dist = FreqDist(tokens)
    filtered_tokens = [token for token, freq in freq_dist.items() if len(token) > 5 and freq > 150]
    sorted_filtered_tokens = sorted(filtered_tokens)
    return sorted_filtered_tokens

print('Tokens with length > 5 and frequency > 150:', answer_four())

Tokens with length > 5 and frequency > 150: ['Captain', 'Pequod', 'Queequeg', 'Starbuck', 'almost', 'before', 'himself', 'little', 'seemed', 'should', 'though', 'through', 'whales', 'without']


In [14]:
def answer_five():
    tokens = word_tokenize(moby_raw)
    longest_word = max(tokens, key=len)
    return longest_word, len(longest_word)

print('Longest word and its length:', answer_five())

Longest word and its length: ("twelve-o'clock-at-night", 23)


In [15]:
import re

def answer_six():
    tokens = word_tokenize(moby_raw)
    freq_dist = FreqDist(tokens)
    filtered_words = [(word, freq) for word, freq in freq_dist.items() if freq > 2000 and re.match("^[a-zA-Z0-9]+$", word)]
    sorted_filtered_words = sorted(filtered_words, key=lambda x: x[1], reverse=True)
    return sorted_filtered_words

print('Unique words with frequency > 2000:', answer_six())

Unique words with frequency > 2000: [('the', 13715), ('of', 6513), ('and', 6010), ('a', 4545), ('to', 4515), ('in', 3908), ('that', 2978), ('his', 2459), ('it', 2196), ('I', 2113)]


In [16]:
from nltk.tokenize import sent_tokenize, word_tokenize

def answer_seven():
    sentences = sent_tokenize(moby_raw)
    num_tokens_per_sentence = [len(word_tokenize(sentence)) for sentence in sentences]
    average_tokens_per_sentence = sum(num_tokens_per_sentence) / len(num_tokens_per_sentence)
    return average_tokens_per_sentence

print('Average number of tokens per sentence:', answer_seven())

Average number of tokens per sentence: 25.88591149005278


In [17]:
def answer_eight():
    tokens = nltk.word_tokenize(moby_raw)
    pos_tags = nltk.pos_tag(tokens)
    
    pos_counts = {}
    
    for word, tag in pos_tags:
        if tag in pos_counts:
            pos_counts[tag] += 1
        else:
            pos_counts[tag] = 1
    
    pos_counts_list = [(tag, count) for tag, count in pos_counts.items()]
    
    sorted_pos_counts = sorted(pos_counts_list, key=lambda x: x[1], reverse=True)
    
    return sorted_pos_counts[:5]

print('Top 5 most frequent parts of speech:', answer_eight())

Top 5 most frequent parts of speech: [('NN', 32727), ('IN', 28662), ('DT', 25879), (',', 19204), ('JJ', 17613)]


In [18]:
def answer_nine(default_words=['cormulent', 'incendenece', 'validrate']):
    correct_spellings = nltk.corpus.words.words()
    recommendations = []
    
    for word in default_words:
        candidates = [w for w in correct_spellings if w.startswith(word[0])]
        
        edit_distances = [(w, nltk.edit_distance(word, w, transpositions=True)) for w in candidates]

        recommendation = min(edit_distances, key=lambda x: x[1])[0]
        
        recommendations.append(recommendation)
    
    return [word for word in recommendations]

print(answer_nine(['cormulent', 'incendenece', 'validrate']))

['corpulent', 'intendence', 'validate']
