# Text Mining of Moby Dick

In this project, NLTK is used to explore Herman Melville's novel *Moby Dick*.

In [1]:
import nltk
import pandas as pd
import numpy as np

# Read the whole novel into memory once; everything below derives from it.
# NOTE(review): no explicit encoding is passed, so open() falls back to the
# platform default -- confirm that matches how moby.txt is stored.
with open('moby.txt', mode='r') as moby_file:
    moby_raw = moby_file.read()

# Tokenize once and wrap the tokens in an nltk.Text for later cells.
moby_tokens = nltk.word_tokenize(moby_raw)
text1 = nltk.Text(moby_tokens)

### Total Number of Tokens (words and punctuation symbols) in text


In [2]:
# Counts every token produced by nltk.word_tokenize, punctuation included.
len(moby_tokens) 

254989

### Number of *Unique* Tokens


In [3]:
# set() collapses repeats; the comparison is case-sensitive, so 'Whale' and
# 'whale' are counted as two different tokens.
len(set(moby_tokens)) 


20755

### Total Number of Unique Tokens in the *Lemmatized* Text


In [4]:
from nltk.stem import WordNetLemmatizer

# Lemmatize every token as a verb (pos='v'), then count the distinct lemmas.
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(token, pos='v') for token in text1]
len(set(lemmatized))


16900

### Lexical Diversity

(ratio of unique tokens to the total number of tokens)


In [5]:
# Unique-token count divided by total-token count (true division, so a float).
diversity = len(set(moby_tokens))/len(moby_tokens)
diversity
   

0.08139566804842562

### Occurrences of 'Whale' or 'whale' as a Percentage of All Tokens


In [6]:
import re
    
# Find whole-word 'whale'/'Whale' occurrences in the raw text and express
# their count as a percentage of the total number of tokens.
# NOTE(review): \b also matches next to hyphens and apostrophes, so 'whale'
# inside a hyphenated compound is counted here even if the tokenizer keeps
# that compound as one token -- confirm whether an exact token match
# (token == 'whale' or 'Whale') was intended instead.
match_pattern = re.compile(r'\b[wW]hale\b').findall(moby_raw)
whale = (len(match_pattern) / len(moby_tokens)) * 100
whale


0.4659024506939515

### The 26 Most Frequent (unique) Tokens & Their Frequency


In [7]:
from nltk.probability import FreqDist

# Tally every token (punctuation included) and show the 26 largest counts.
# FD is reused by a later cell, so it is kept as a notebook-level name.
moby_list = list(moby_tokens)
FD = FreqDist(moby_list)
FD.most_common(26)


[(',', 19204),
 ('the', 13715),
 ('.', 7308),
 ('of', 6513),
 ('and', 6010),
 ('a', 4545),
 ('to', 4515),
 (';', 4173),
 ('in', 3908),
 ('that', 2978),
 ('his', 2459),
 ('it', 2196),
 ('I', 2097),
 ('!', 1767),
 ('is', 1722),
 ('--', 1713),
 ('with', 1659),
 ('he', 1658),
 ('was', 1639),
 ('as', 1620),
 ("''", 1615),
 ("'s", 1585),
 ('``', 1456),
 ('all', 1444),
 ('for', 1413),
 ('this', 1280)]

### Tokens with a Length > 5 & Frequency > 160


In [8]:
# Keep tokens longer than 5 characters that occur more than 160 times, then
# list them in ascending string order (not by frequency); uppercase-initial
# tokens therefore sort ahead of lowercase ones.
freq_words = [token for token, count in FD.items() if len(token) > 5 and count > 160]
sorted(freq_words)


['Captain',
 'Pequod',
 'Queequeg',
 'Starbuck',
 'almost',
 'before',
 'himself',
 'little',
 'seemed',
 'should',
 'though',
 'through',
 'whales']

### The Longest Word and Its Length


In [9]:
# Rebuild the Text wrapper and materialize its tokens as a plain list.
text1 = nltk.Text(moby_tokens)
moby_list = [token for token in text1]

# max() with key=len picks the longest token; on a tie it returns the first
# one encountered.
longest_word = max(moby_list, key=len)
longest_word, len(longest_word)


("twelve-o'clock-at-night", 23)

### Unique Alphabetic Tokens with Frequency > 2000


In [10]:
def most_frequent(tokens=None, min_count=2000):
    """Return the alphabetic tokens that occur more than ``min_count`` times.

    Args:
        tokens: Iterable of token strings to count. When omitted, the
            notebook-level ``moby_tokens`` list is used.
        min_count: Exclusive lower bound on a token's frequency.

    Returns:
        A list of ``(frequency, word)`` tuples ordered from most to least
        frequent. Tokens with equal counts keep the order in which they
        first appeared in ``tokens``.
    """
    from collections import Counter

    # Compare against None (not truthiness) so an explicit empty list is
    # counted as-is instead of silently falling back to the whole novel.
    if tokens is None:
        tokens = moby_tokens

    counts = Counter(tokens)

    # str.isalpha() filters out punctuation and mixed tokens such as "'s".
    pairs = [
        (count, word)
        for word, count in counts.items()
        if word.isalpha() and count > min_count
    ]

    # Sort on the count only; sorted() is stable, so ties stay in first-seen
    # order rather than being reordered by the word.
    return sorted(pairs, key=lambda pair: pair[0], reverse=True)

# Display the result for the full novel.
most_frequent()

[(13715, 'the'),
 (6513, 'of'),
 (6010, 'and'),
 (4545, 'a'),
 (4515, 'to'),
 (3908, 'in'),
 (2978, 'that'),
 (2459, 'his'),
 (2196, 'it'),
 (2097, 'I')]

### Average Number of Tokens Per Sentence


In [11]:
# Split the raw text into sentences, then divide the total token count by the
# number of sentences.
moby_sentences = nltk.sent_tokenize(moby_raw)
len(moby_tokens)/len(moby_sentences)
    

25.881952902963864

### The 5 Most Frequent Parts of Speech


In [12]:
def frequent_pos():
    """Return the five most common part-of-speech tags in the novel.

    The raw text is lowercased, tokenized, and tagged with ``nltk.pos_tag``;
    the tags are then tallied.

    Returns:
        A list of ``(part_of_speech, frequency)`` tuples, most frequent first.
    """
    from collections import Counter

    # NOTE(review): the text is lowercased before tagging, which removes
    # capitalization the tagger may rely on -- confirm this is intended.
    lowered_tokens = nltk.word_tokenize(moby_raw.lower())

    tag_counts = Counter()
    for _, tag in nltk.pos_tag(lowered_tokens):
        tag_counts[tag] += 1

    return tag_counts.most_common(5)

# Display the five most common tags for the full novel.
frequent_pos()

[('NN', 39860), ('IN', 28831), ('DT', 26033), ('JJ', 19562), (',', 19204)]