<a href="https://colab.research.google.com/github/HeleneFabia/nlp-exploration/blob/main/notebooks/nlp_with_nltk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP with NLTK

In [1]:
import nltk

## Tokenizing

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

# sent_tokenize/word_tokenize rely on the Punkt tokenizer data. Older NLTK
# releases load it from the "punkt" package, whereas newer releases (3.9+)
# look for "punkt_tab" instead and raise LookupError without it. Fetching
# both keeps the notebook runnable on a fresh kernel with either version.
nltk.download("punkt")
nltk.download("punkt_tab")

In [23]:
# Split a short passage into sentences, then break the first sentence into word tokens.
example_string = (
    "Societies construct patterns of behavior by deeming certain "
    "actions or concepts as acceptable or unacceptable. These patterns of "
    "behavior within a given society are known as societal norms. Societies, "
    "and their norms, undergo gradual and perpetual changes."
)

sentences = sent_tokenize(example_string)
words = word_tokenize(sentences[0])

# A single print with a newline separator puts the sentence list and the
# token list on separate lines.
print(sentences, words, sep="\n")

['Societies construct patterns of behavior by deeming certain actions or concepts as acceptable or unacceptable.', 'These patterns of behavior within a given society are known as societal norms.', 'Societies, and their norms, undergo gradual and perpetual changes.']
['Societies', 'construct', 'patterns', 'of', 'behavior', 'by', 'deeming', 'certain', 'actions', 'or', 'concepts', 'as', 'acceptable', 'or', 'unacceptable', '.']


## Filtering Stop Words

In [None]:
from nltk.corpus import stopwords
# Fetch the "stopwords" corpus with the NLTK downloader so that the
# stopwords.words("english") call in the next cell can find its data.
nltk.download("stopwords")

In [20]:
# Tokenize a short sentence and show the raw tokens.
text = "I'm a happy person, at least most of the time."
words = word_tokenize(text)
print(words)

# Keep only the tokens whose case-folded form is not an English stop word;
# casefold() makes the membership test case-insensitive.
stop_words = set(stopwords.words("english"))
filtered_list = list(filter(lambda token: token.casefold() not in stop_words, words))
print(filtered_list)

['I', "'m", 'a', 'happy', 'person', ',', 'at', 'least', 'most', 'of', 'the', 'time', '.']
["'m", 'happy', 'person', ',', 'least', 'time', '.']


## Stemming
(= reducing a word to its stem, which is not necessarily a real word)

In [29]:
from nltk.stem import SnowballStemmer

# Stem every token of the first example sentence with the English Snowball stemmer.
stemmer = SnowballStemmer("english")
words = word_tokenize(sentences[0])
stemmed_words = list(map(stemmer.stem, words))

print("Unstemmed:", words)
print("Stemmed:", stemmed_words)

# Two ways a stemmer can go wrong:
# - understemming (a false negative): related words that should share a stem end up with different stems
# - overstemming (a false positive): unrelated words are reduced to the same stem although they should not be

Unstemmed: ['Societies', 'construct', 'patterns', 'of', 'behavior', 'by', 'deeming', 'certain', 'actions', 'or', 'concepts', 'as', 'acceptable', 'or', 'unacceptable', '.']
Stemmed: ['societi', 'construct', 'pattern', 'of', 'behavior', 'by', 'deem', 'certain', 'action', 'or', 'concept', 'as', 'accept', 'or', 'unaccept', '.']


## Parts of Speech (POS) tagging 
(noun, pronoun, adjective, verb, adverb, preposition, conjunction, interjection, determiner)

In [None]:
# nltk.pos_tag needs the averaged-perceptron tagger model. Older NLTK releases
# load it from "averaged_perceptron_tagger"; newer releases (3.9+) look for
# "averaged_perceptron_tagger_eng" instead and raise LookupError without it.
# Fetching both keeps the tagging cell runnable with either version.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Tag descriptions for nltk.help.upenn_tagset(); newer NLTK releases read the
# JSON variant of this package, so fetch it alongside the original one.
nltk.download('tagsets_json')
nltk.download('tagsets')

In [35]:
# Tag the tokens of the first example sentence. Tokenizing here, instead of
# reading the notebook-wide `words` variable, keeps the result independent of
# which of the cells that reassign `words` happened to run last.
tagged_words = nltk.pos_tag(word_tokenize(sentences[0]))
for w, pos in tagged_words:
    print(f"{w} --- {pos}")

# Run nltk.help.upenn_tagset() to look up what each POS tag means.

Societies --- NNS
construct --- VBP
patterns --- NNS
of --- IN
behavior --- NN
by --- IN
deeming --- VBG
certain --- JJ
actions --- NNS
or --- CC
concepts --- NNS
as --- IN
acceptable --- JJ
or --- CC
unacceptable --- JJ
. --- .


## Lemmatizing
(= reducing a word to its core meaning; unlike stemming, which may return only a fragment, lemmatizing returns an actual word)

lemma = the word form that represents a whole group of related word forms; that group is called a lexeme

e.g. the lemma of "swimming" is "swim", and "swimming" is one of the word forms that make up the lexeme

In [None]:
# Fetch the "wordnet" corpus with the NLTK downloader; presumably this is the
# data WordNetLemmatizer (used in the next cell) looks words up in.
nltk.download('wordnet')

In [66]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Each entry pairs a word with the keyword arguments passed to lemmatize().
# An empty dict means the call relies on the lemmatizer's default part of speech.
lemma_examples = [
    ("beautifully", {}),
    ("boats", {}),
    ("loved", {}),
    ("loved", {"pos": "v"}),  # same word, this time with pos="v"
    ("worst", {}),
    ("worst", {"pos": "a"}),  # same word, this time with pos="a"
]

# Print one lemma per line, in the order listed above.
print(*(lemmatizer.lemmatize(word, **options) for word, options in lemma_examples), sep="\n")

beautifully
boat
loved
love
worst
bad


## Chunking
(= identify phrases)

uses POS tags to group and chunk words

In [4]:
# Chunk grammar: the rule describing how tagged words are grouped.
# A noun phrase (NP) is
#   an optional (?) determiner (DT),
#   followed by any number (*) of adjectives (JJ),
#   and closed by a noun (NN).
chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"
chunk_parser = nltk.RegexpParser(chunk_grammar)

# Tokenize and POS-tag the quote; the tagged tokens are what the parser receives.
quote = "It's a dangerous business, Frodo, going out your door."
words = nltk.word_tokenize(quote)
pos_tags = nltk.pos_tag(words)

# Apply the grammar to the tagged tokens.
tree = chunk_parser.parse(pos_tags)