# Part 1 - Analyzing Moby Dick

In [1]:
from collections import Counter

import nltk
import numpy as np
import pandas as pd

In [2]:
# Read the complete novel into memory as one string.
with open("data/moby.txt", "r") as moby_file:
    moby_raw = moby_file.read()

# Tokenize once up front; text1 wraps that same token list in an nltk.Text.
moby_tokens = nltk.word_tokenize(moby_raw)
text1 = nltk.Text(moby_tokens)

## Examples

### Example 1: How many tokens (words and punctuation symbols) are in text1?

In [3]:
def example_one():
    """Return the total number of tokens (words and punctuation) in text1.

    Reads the module-level ``text1`` that was built when the novel was
    loaded, instead of running ``nltk.word_tokenize`` over the whole raw
    text again on every call.

    Returns:
        int: number of tokens in ``text1``.
    """
    return len(text1)

example_one()

255028

### Example 2: How many unique tokens (unique words and punctuation) does text1 have?

In [4]:
def example_two():
    """Return the number of distinct tokens (words and punctuation) in text1.

    Reads the module-level ``text1`` that was built when the novel was
    loaded, instead of re-tokenizing the whole raw text on every call.
    Tokens are compared case-sensitively.

    Returns:
        int: size of the set of tokens in ``text1``.
    """
    return len(set(text1))

example_two()

20742

### Example 3: After lemmatizing the verbs, how many unique tokens does text1 have?

In [5]:
from nltk.stem import WordNetLemmatizer

def example_three():
    """Count the distinct tokens left in text1 after lemmatizing each token as a verb."""
    verb_lemmatizer = WordNetLemmatizer()
    distinct_lemmas = set()
    for token in text1:
        # "v" is the part-of-speech argument: lemmatize the token as a verb.
        distinct_lemmas.add(verb_lemmatizer.lemmatize(token, "v"))
    return len(distinct_lemmas)

example_three()

16887

## Questions

### Question 1: What is the lexical diversity of the given text input? (i.e. ratio of unique tokens to the total number of tokens)

In [6]:
def answer_one():
    """Return the lexical diversity of the text.

    Lexical diversity is the number of unique tokens (from ``example_two``)
    divided by the total number of tokens (from ``example_one``).
    """
    return example_two() / example_one()

answer_one()

0.08133224587104161

### Question 2: What percentage of tokens is 'whale' or 'Whale'?

In [8]:
def answer_two(tokens=None):
    """Return the share of tokens that are exactly 'whale' or 'Whale'.

    The result is a fraction (matching tokens divided by total tokens), not
    a value scaled to 0-100; multiply by 100 for a percent figure.

    The count is done directly on the token sequence, so nothing needs to be
    imported from ``nltk.book`` (importing that module loads all of the NLTK
    book corpora as a side effect).

    Args:
        tokens: sequence of token strings to analyse. Defaults to the
            module-level ``moby_tokens``.

    Returns:
        float: matching tokens / total tokens.

    Raises:
        ZeroDivisionError: if ``tokens`` is empty.
    """
    if tokens is None:
        tokens = moby_tokens
    whale_count = sum(1 for token in tokens if token in ("whale", "Whale"))
    return whale_count / len(tokens)

answer_two()

0.004125037250811676

### Question 3: What are the 20 most frequently occurring (unique) tokens in the text? What is their frequency?

In [26]:
def answer_three(tokens=None, n=20):
    """Return the ``n`` most frequent tokens together with their counts.

    Tokens with equal counts are returned in the order they first appear in
    ``tokens``, so the result is the same on every run (the previous
    implementation took tie order from ``set`` iteration order).

    Args:
        tokens: iterable of token strings to analyse. Defaults to the
            module-level ``moby_tokens``.
        n: how many entries to return. Defaults to 20.

    Returns:
        list[tuple[str, int]]: ``(token, count)`` pairs, highest count first.
    """
    if tokens is None:
        tokens = moby_tokens
    return Counter(tokens).most_common(n)

answer_three()

[(',', 19204),
 ('the', 13715),
 ('.', 7306),
 ('of', 6513),
 ('and', 6010),
 ('a', 4545),
 ('to', 4515),
 (';', 4173),
 ('in', 3908),
 ('that', 2978),
 ('his', 2459),
 ('it', 2196),
 ('I', 2113),
 ('!', 1767),
 ('is', 1722),
 ('--', 1713),
 ('with', 1659),
 ('he', 1658),
 ('was', 1639),
 ('as', 1620)]

### Question 4: What tokens have a length greater than 5 and a frequency of more than 150?

In [27]:
def answer_four(tokens=None, min_length=5, min_freq=150):
    """Return the tokens that are both long and frequent, sorted alphabetically.

    A token qualifies when its length is strictly greater than ``min_length``
    and it occurs strictly more than ``min_freq`` times.

    The result is sorted on the whole string. The earlier version sorted with
    ``key=lambda tup: tup[0]``, which for a string is only its first
    character, so words sharing an initial letter came out in arbitrary order.

    Args:
        tokens: iterable of token strings to analyse. Defaults to the
            module-level ``moby_tokens``.
        min_length: exclusive lower bound on token length. Defaults to 5.
        min_freq: exclusive lower bound on occurrence count. Defaults to 150.

    Returns:
        list[str]: qualifying tokens in ascending (case-sensitive) string order.
    """
    if tokens is None:
        tokens = moby_tokens
    counts = Counter(tokens)
    return sorted(
        word
        for word, count in counts.items()
        if len(word) > min_length and count > min_freq
    )

answer_four()

['Captain',
 'Pequod',
 'Queequeg',
 'Starbuck',
 'almost',
 'before',
 'himself',
 'little',
 'should',
 'seemed',
 'though',
 'through',
 'whales',
 'without']

### Question 5: Find the longest word in text1 and that word's length.

In [30]:
def answer_five(tokens=None):
    """Return the longest token and its length.

    When several tokens share the maximum length, the one that appears first
    in ``tokens`` is returned, so the answer is the same on every run. A
    single ``max`` pass is used instead of sorting the whole vocabulary.

    Args:
        tokens: sequence of token strings to analyse. Defaults to the
            module-level ``moby_tokens``.

    Returns:
        tuple[str, int]: ``(token, len(token))`` for the longest token.

    Raises:
        IndexError: if ``tokens`` is empty.
    """
    if tokens is None:
        tokens = moby_tokens
    if not tokens:
        # Keep the exception type the sort-and-index version raised on empty input.
        raise IndexError("cannot pick the longest token from an empty token list")
    longest = max(tokens, key=len)
    return (longest, len(longest))

answer_five()

("twelve-o'clock-at-night", 23)

### Question 6: What unique words have a frequency of more than 2000? What is their frequency?

In [42]:
def answer_six(tokens=None, min_freq=2000):
    """Return the purely alphabetic tokens occurring more than ``min_freq`` times.

    Tokens that fail ``str.isalpha()`` (punctuation, numbers, hyphenated
    forms) are excluded. Tokens with equal counts keep the order in which
    they first appear in ``tokens``.

    Args:
        tokens: iterable of token strings to analyse. Defaults to the
            module-level ``moby_tokens``.
        min_freq: exclusive lower bound on occurrence count. Defaults to 2000.

    Returns:
        list[tuple[int, str]]: ``(count, word)`` pairs, highest count first.
    """
    if tokens is None:
        tokens = moby_tokens
    return [
        (count, word)
        for word, count in Counter(tokens).most_common()
        if count > min_freq and word.isalpha()
    ]

answer_six()

[(13715, 'the'),
 (6513, 'of'),
 (6010, 'and'),
 (4545, 'a'),
 (4515, 'to'),
 (3908, 'in'),
 (2978, 'that'),
 (2459, 'his'),
 (2196, 'it'),
 (2113, 'I')]

### Question 7: What is the average number of tokens per sentence?

In [51]:
def answer_seven():
    """Return the average number of tokens per sentence in the raw text.

    The raw text is split into sentences with ``nltk.sent_tokenize``; each
    sentence is then tokenized with ``nltk.word_tokenize`` and the token
    counts are averaged.

    Returns:
        float: mean tokens per sentence, or 0.0 when no sentences are found.
    """
    sentences = nltk.sent_tokenize(moby_raw)
    if not sentences:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    total_tokens = sum(len(nltk.word_tokenize(sentence)) for sentence in sentences)
    return total_tokens / len(sentences)

answer_seven()

25.88591149005278

### Question 8: What are the 5 most frequent parts of speech in this text? What is their frequency?

In [80]:
def answer_eight():
    """Return the five most frequent part-of-speech tags and their counts.

    The raw text is split into sentences, each sentence is tokenized, and the
    tokenized sentences are tagged with ``nltk.pos_tag_sents``. Tags are then
    tallied across the whole text.

    Tags with equal counts are returned in the order they were first seen.

    Returns:
        list[tuple[str, int]]: ``(tag, count)`` pairs, highest count first,
        at most five entries.
    """
    sentences = nltk.sent_tokenize(moby_raw)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

    # Each tagged sentence is a list of (token, tag) pairs; only the tag is counted.
    tag_counts = Counter(
        tag
        for tagged_sentence in nltk.pos_tag_sents(tokenized_sentences)
        for _token, tag in tagged_sentence
    )
    return tag_counts.most_common(5)

answer_eight()

[('NN', 33156), ('IN', 28605), ('DT', 25986), (',', 19204), ('JJ', 17417)]