In [1]:
import nltk
from nltk import pos_tag, RegexpParser
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter

# Custom functions
def word_sentence_tokenize(text):
    """Split *text* into sentences and each sentence into word tokens.

    The text is first segmented with ``sent_tokenize``; every resulting
    sentence is then passed to ``word_tokenize``.

    Returns:
        A list with one entry per sentence, each entry being the list of
        tokens produced for that sentence.
    """
    return list(map(word_tokenize, sent_tokenize(text)))

def np_chunk_counter(chunked_text):
    """Tally the noun-phrase chunks found in *chunked_text*.

    Args:
        chunked_text: iterable of chunk trees; each must provide
            ``subtrees(filter=...)``, and each subtree ``label()`` and
            ``leaves()`` yielding ``(word, tag)`` pairs.

    Returns:
        A list of ``(chunk_string, count)`` pairs ordered from most to
        least frequent, where ``chunk_string`` is the chunk's words joined
        by single spaces.
    """
    def is_noun_phrase(node):
        return node.label() == 'NP'

    counts = Counter(
        ' '.join(token for token, _pos in chunk.leaves())
        for sentence_tree in chunked_text
        for chunk in sentence_tree.subtrees(filter=is_noun_phrase)
    )
    return counts.most_common()

def vp_chunk_counter(chunked_text):
    """Tally the verb-phrase chunks found in *chunked_text*.

    Args:
        chunked_text: iterable of chunk trees; each must provide
            ``subtrees(filter=...)``, and each subtree ``label()`` and
            ``leaves()`` yielding ``(word, tag)`` pairs.

    Returns:
        A list of ``(chunk_string, count)`` pairs ordered from most to
        least frequent, where ``chunk_string`` is the chunk's words joined
        by single spaces.
    """
    def is_verb_phrase(node):
        return node.label() == 'VP'

    counts = Counter(
        ' '.join(token for token, _pos in chunk.leaves())
        for sentence_tree in chunked_text
        for chunk in sentence_tree.subtrees(filter=is_verb_phrase)
    )
    return counts.most_common()

# Import text data
file_name = "dorian_gray.txt"  # Change to "the_iliad.txt" for The Iliad
# Read through a context manager so the file handle is closed as soon as the
# contents are in memory. The whole text is lower-cased before tokenizing, so
# every token and chunk string below is lower-case.
with open(file_name, encoding='utf-8') as text_file:
    text = text_file.read().lower()

# Tokenize sentences and words: one list of word tokens per sentence
word_tokenized_text = word_sentence_tokenize(text)

# Index of the sentence printed as a sample below. Indexing raises IndexError
# if the text has fewer than sample_index + 1 sentences.
sample_index = 100

# Print a single word tokenized sentence
single_word_tokenized_sentence = word_tokenized_text[sample_index]
print("Single Word Tokenized Sentence:", single_word_tokenized_sentence)

# Part-of-speech tagging: each sentence becomes a list of (word, tag) pairs
pos_tagged_text = [pos_tag(sentence) for sentence in word_tokenized_text]

# Print a single part-of-speech tagged sentence
single_pos_sentence = pos_tagged_text[sample_index]
print("Single POS Tagged Sentence:", single_pos_sentence)

# Define chunk grammars
# NP: an optional DT, followed by any number of JJ, followed by one NN.
np_chunk_grammar = """
    NP: {<DT>?<JJ>*<NN>}
"""
# VP: a tag starting with VB, followed by any number of NP/PP/CLAUSE nodes.
# NOTE(review): NP, PP and CLAUSE look like chunk labels rather than POS tags,
# and this grammar has no stage that creates them, so the trailing part
# presumably never matches and VP chunks are single verbs - confirm intent.
vp_chunk_grammar = """
    VP: {<VB.*><NP|PP|CLAUSE>*}
"""

# Create chunk parsers
np_chunk_parser = RegexpParser(np_chunk_grammar)
vp_chunk_parser = RegexpParser(vp_chunk_grammar)

# Chunk sentences: each parser is applied independently to the tagged text
np_chunked_text = [np_chunk_parser.parse(sentence) for sentence in pos_tagged_text]
vp_chunked_text = [vp_chunk_parser.parse(sentence) for sentence in pos_tagged_text]

# Analyze and print most common NP and VP chunks
most_common_np_chunks = np_chunk_counter(np_chunked_text)
print("Most Common NP Chunks:", most_common_np_chunks)

most_common_vp_chunks = vp_chunk_counter(vp_chunked_text)
print("Most Common VP Chunks:", most_common_vp_chunks)


Single Word Tokenized Sentence: ['it', 'seems', 'to', 'be', 'the', 'one', 'thing', 'that', 'can', 'make', 'modern', 'life', 'mysterious', 'or', 'marvellous', 'to', 'us', '.']
Single POS Tagged Sentence: [('it', 'PRP'), ('seems', 'VBZ'), ('to', 'TO'), ('be', 'VB'), ('the', 'DT'), ('one', 'CD'), ('thing', 'NN'), ('that', 'WDT'), ('can', 'MD'), ('make', 'VB'), ('modern', 'JJ'), ('life', 'NN'), ('mysterious', 'JJ'), ('or', 'CC'), ('marvellous', 'JJ'), ('to', 'TO'), ('us', 'PRP'), ('.', '.')]
