In [36]:
from nltk.stem import WordNetLemmatizer
import nltk
from collections import Counter, defaultdict
from nltk.corpus import stopwords
from nltk import RegexpParser, pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
 
nltk.download('stopwords')
nltk.download("punkt")
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/magnusde93/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/magnusde93/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/magnusde93/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/magnusde93/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/magnusde93/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
lemmatizer = WordNetLemmatizer()

with open("abigel.txt", "r") as file:
    text = file.read()
    
tokenized_words = word_tokenize(text)
lemmatized_words = []

for i in tokenized_words:
    lemmatized_words.append(lemmatizer.lemmatize(i))
    
print("10 of the most common lemmatized tokens are", Counter(lemmatized_words).most_common(10))

10 of the most common lemmatized tokens are [(',', 15084), ('a', 6353), ('.', 4605), ('az', 2719), ('hogy', 1867), ('nem', 1777), ('és', 1698), ('–', 1491), ('is', 1310), ('volt', 923)]


10 of the most common tokens are [(',', 15085), ('a', 6355), ('.', 4731), ('az', 2719), ('hogy', 1867), ('nem', 1777), ('és', 1698), ('is', 1313), ('volt', 925), ('meg', 707)] from assignment 1.

This just shows that my regex was not as bad as I thought, or that my text didn't have anything complicated in it. The "-" is not a part of my tokens which is something I forgot to accomodate for.

In [25]:
with open("abigel.txt", "r") as file:
    text = file.read()

tokenized_words = word_tokenize(text)
tokens_lower = [token.lower() for token in tokenized_words]
stop_words = set(stopwords.words('hungarian'))

filtered_words = []

for w in tokens_lower:
    if w not in stop_words:
        filtered_words.append(w)


print("10 of the most common tokens are", Counter(tokenized_words).most_common(10))
print("10 of the most common tokens are", Counter(lemmatized_words).most_common(10))

10 of the most common tokens are [(',', 15084), ('a', 6353), ('.', 4605), ('az', 2719), ('hogy', 1867), ('nem', 1777), ('és', 1698), ('–', 1491), ('is', 1310), ('volt', 923)]
10 of the most common tokens are [(',', 15084), ('a', 6353), ('.', 4605), ('az', 2719), ('hogy', 1867), ('nem', 1777), ('és', 1698), ('–', 1491), ('is', 1310), ('volt', 923)]


The tokens without the stop words consist mainly of punctuation marks. There are a few words in there, is (as well), ha (if), gina and zsuzsanna (both names).
When we remove the stop words from the text we can focus on the important words.

### The utility of leematization.
Normalizing text by reducing words to their standard form.

Reducing the vocabulary size.

NLP tasks are more accurate.

### Lemmas are more useful.
When the meaning of words are essential in a task.

When using a search engine.

For topic modeling or document clustering.

### Lemmas are not more useful.
When we have names of people, places

When reducing words to their stems is more appropriate.

When we have languages with complex morphology

In [32]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag

with open('of-mice-and-men.txt', 'r') as file:
    text = file.read()
    

sentences = sent_tokenize(text)

tagged_sentences = []
for sentence in sentences:
    words = word_tokenize(sentence)
    tags = pos_tag(words, lang='eng') 
    tagged_sentences.append(tags)

In [34]:
from nltk.chunk import RegexpParser

def extract_noun_phrases(tagged_sentence):
    grammar = r'NP: {<DT>?<JJ>*<NN.*>+}' 
    chunk_parser = RegexpParser(grammar)
    tree = chunk_parser.parse(tagged_sentence)
    noun_phrases = []
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        noun_phrases.append(' '.join(word for word, tag in subtree.leaves()))
    return noun_phrases

def extract_verb_phrases(tagged_sentence):
    grammar = r'VP: {<VB.*><NP|PP>*}'
    chunk_parser = RegexpParser(grammar)
    tree = chunk_parser.parse(tagged_sentence)
    verb_phrases = []
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'VP'):
        verb_phrases.append(' '.join(word for word, tag in subtree.leaves()))
    return verb_phrases

In [35]:
all_noun_phrases = []
all_verb_phrases = []

for tagged_sentence in tagged_sentences:
    noun_phrases = extract_noun_phrases(tagged_sentence)
    verb_phrases = extract_verb_phrases(tagged_sentence)
    all_noun_phrases.extend(noun_phrases)
    all_verb_phrases.extend(verb_phrases)

noun_phrase_counts = Counter(all_noun_phrases)
verb_phrase_counts = Counter(all_verb_phrases)

top_noun_phrases = noun_phrase_counts.most_common(10)
top_verb_phrases = verb_phrase_counts.most_common(10)

print("Top 10 Noun Phrases:", top_noun_phrases)
print("Top 10 Verb Phrases:", top_verb_phrases)


Top 10 Noun Phrases: [('George', 422), ('Lennie', 336), ('Slim', 115), ('Curley', 109), ('Candy', 85), ('’ t', 67), ('nothing', 67), ('Crooks', 63), ('face', 40), ('hand', 40)]
Top 10 Verb Phrases: [('said', 391), ('was', 312), ('do', 199), ('got', 197), ('ai', 151), ('get', 138), ("'s", 126), ('’', 121), ('looked', 93), ('go', 89)]


In [47]:
text1 = "I saw the man with the telescope."

sentences = sent_tokenize(text1)

tagged_sentences = []
for sentence in sentences:
    words = word_tokenize(sentence)
    tags = pos_tag(words, lang='eng') 
    tagged_sentences.append(tags)

print(tagged_sentences)

[[('I', 'PRP'), ('saw', 'VBD'), ('the', 'DT'), ('man', 'NN'), ('with', 'IN'), ('the', 'DT'), ('telescope', 'NN'), ('.', '.')]]


This could lead to two possible outcomes.

"I saw the man with the telescope" (where "telescope" is the direct object)

"I saw the man who had the telescope" (where "with the telescope" is a prepositional phrase modifying "man")

In [58]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def split_text_into_paragraphs(text):
    # Split the text into sentences
    sentences = sent_tokenize(text)
    
    # Initialize a list to store the paragraphs
    paragraphs = []
    
    # Initialize a variable to store the current paragraph
    current_paragraph = ""
    
    for sentence in sentences:
        # Add the current sentence to the current paragraph
        current_paragraph += sentence + " "
        
        # Check if the current sentence ends a paragraph
        if sentence.endswith("."):
            # Add the current paragraph to the list and reset the current paragraph
            paragraphs.append(current_paragraph)
            current_paragraph = ""
    
    return paragraphs

# Read the large text file
with open('of-mice-and-men.txt', 'r', encoding='utf-8') as file:
    corpus_text = file.read()

# Preprocess the text to create the corpus
corpus = split_text_into_paragraphs(corpus_text)

In [59]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

index = defaultdict(list)

for idx, paragraph in enumerate(corpus):
    
    words = word_tokenize(paragraph.lower())
    words = [lemmatizer.lemmatize(word) for word in words if word.isalnum() and word not in stop_words]
    
    for word in words:
        index[word].append(idx)

In [63]:
def search(query, index, corpus):
    
    query = [lemmatizer.lemmatize(word) for word in word_tokenize(query.lower()) if word.isalnum() and word not in stop_words]
    
    matching_documents = set()
    
    for term in query:
        if term in index:
            matching_documents.update(index[term])
    
    results = [corpus[idx] for idx in sorted(matching_documents)]
    
    return results

George = search('George', index, corpus)
Slim = search('Slim', index, corpus)
print(George)
print(Slim)

['"You drink some, George. ', 'George unslung his bindle and dropped it gently on the bank. ', '"Look, George. ', 'Look what I done." George knelt beside the pool and drank from his hand with quick\nscoops. ', 'Lennie, who had\nbeen watching, imitated George exactly. ', 'He pushed himself back,\ndrew up his knees, embraced them, looked over to George to see\nwhether he had it just right. ', 'He pulled his hat down a little more\nover his eyes, the way George’s hat was. ', 'George stared morosely at the water. ', '"George?" "Yeah, what ya want?" "Where we goin’, George?" The little man jerked down the brim of his hat and scowled over\nat Lennie. ', 'Honest to God I\ndid, George." "O.K.- O.K. ', 'I\nremember about the rabbits, George." "The hell with the rabbits. ', '"Why sure, George. ', 'You remember about us goin’ into\nMurray and Ready’s, and they give us work cards and bus\ntickets?" "Oh, sure, George. ', 'He said gently, "George.... ', 'George looked sharply at him. ', 'What you go

### Parsing
Parsing can help systems like Google Translate with understanding the grammatical structure of a sentence in the source language.

It is crucial for question answering systems like IBM.

It helps with text-to-speech systems.

It helps Google understand the queries from users.

It is crucial for Siri and Alexa and more chatbots and virtual assistants.

### Lemmatization
It can help improve sentiment analysis by reducing inflected words to their base form.

It can be beneficial for keyword extraction by grouping similar words together. It makes it easier to identify the main keywords.

It can aid automatic paraphrasing by simplifying the text and reducing word variations.

### Problems
I did indeed run into a problem when I wanted to parse my Hungarian book. The problem with NLTK is that it only accepts English and Russian which is why I used an english book for that assignment. I might look for a Hungarian parser for the next project if it involves parsing.