LIN 373 UT Austin :: Jessy Li
Some examples and wording taken from: https://www.guru99.com/nltk-tutorial.html

# Basic NLTK usages

In [None]:
# nltk.download('all')  # Downloads every NLTK resource (about 3.2 GB); only run this if you have the disk space

In [None]:
import nltk

## Word Tokenize

In [None]:
# Tokenizer models. Older NLTK releases load 'punkt'; NLTK 3.9 and later load
# the 'punkt_tab' resource instead. Requesting both keeps the cell working on
# either version (a resource name unknown to the installed version is
# reported by the downloader as an error message, not raised).
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk import word_tokenize

sentence = "Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29."

# Baseline: plain whitespace splitting leaves punctuation attached to the
# neighbouring word (e.g. "Vinken,").
print(sentence.split())

# NLTK's word tokenizer, for comparison with the whitespace split above.
print(word_tokenize(sentence))

## Sentence Tokenize

In [None]:
# Two-sentence sample text; the abbreviations "Mr." and "N.V." contain
# periods that are not sentence boundaries.
paragraph = (
    "Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group. "
    "Rudolph Agnew, 55 years old and former chairman of Consolidated Gold "
    "Fields PLC, was named a director of this British industrial conglomerate."
)
print(paragraph)

In [None]:
# Naive approach: cut the text at every "." character, including the ones
# inside abbreviations.
naive_pieces = paragraph.split(".")
print(naive_pieces)

In [None]:
from nltk import sent_tokenize

# Split the sample paragraph into sentences with NLTK's sentence tokenizer.
sentence_list = sent_tokenize(paragraph)
print(sentence_list)

In [None]:
## Using word tokenization with sentence tokenization
# Build one token list per sentence. The comprehension keeps its loop
# variable local, so the notebook-level `sentence` string defined in the
# word-tokenize cell is not overwritten.
new_data = [word_tokenize(s) for s in sent_tokenize(paragraph)]

# Print every tokenized sentence rather than indexing [0] and [1], which
# would raise IndexError for a paragraph with fewer than two sentences.
for tokens in new_data:
    print(tokens)

# Lemmatization and Stemming

Stemming is a kind of normalization for words. Normalization is a technique in which the words of a sentence are converted into a standard form so that they are quicker to look up. Words that have the same meaning but vary in form according to the context or sentence are normalized to a common form. You can try a stemmer online here: https://text-processing.com/demo/stem/

There are stemmers for: 
 - Arabic
 - Danish
 - Dutch
 - English
 - Finnish
 - French
 - German
 - Hungarian
 - Italian
 - Norwegian
 - Porter (the original Porter algorithm, for English)
 - Portuguese
 - Romanian
 - Russian
 - Spanish
 - Swedish



In [None]:
import nltk
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()

# Tokenize a short sample and print each token next to its Porter stem.
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)
for token in tokenization:
    stem = porter_stemmer.stem(token)
    print("Stemming for {} is {}".format(token, stem))

Lemmatization usually refers to the morphological analysis of words, which aims to remove inflectional endings. It returns the base or dictionary form of a word, which is known as the lemma. The NLTK lemmatization method is based on WordNet's built-in morphy function. Text preprocessing includes both stemming and lemmatization.

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# The lemmatizer looks words up in the WordNet corpus, so fetch it first.
nltk.download('wordnet')
wordnet_lemmatizer = WordNetLemmatizer()

# Same sample words as the stemming example, for a side-by-side comparison.
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)

# lemmatize() is called without a part-of-speech argument, so NLTK's default
# part of speech applies to every token.
lemmas = [wordnet_lemmatizer.lemmatize(token) for token in tokenization]
for token, lemma in zip(tokenization, lemmas):
    print("Lemma for {} is {}".format(token, lemma))

## POS Tagging

 | Abbreviation | Meaning | 
 | --- | --- | 
 | CC | coordinating conjunction | 
 | CD | cardinal number | 
 | DT | determiner | 
 | EX | existential there | 
 | FW | foreign word | 
 | IN | preposition/subordinating conjunction | 
 | JJ | adjective (large) | 
 | JJR | adjective, comparative (larger) | 
 | JJS | adjective, superlative (largest) | 
 | LS | list marker | 
 | MD | modal (could, will) | 
 | NN | noun, singular (cat, tree) | 
 | NNS | noun plural (desks) | 
 | NNP | proper noun, singular (Sarah) | 
 | NNPS | proper noun, plural (Indians or Americans) | 
 | PDT | predeterminer (all, both, half) | 
 | POS | possessive ending (parent's) | 
 | PRP | personal pronoun (hers, herself, him, himself) | 
 | PRP$ | possessive pronoun (her, his, mine, my, our) | 
 | RB | adverb (occasionally, swiftly) | 
 | RBR | adverb, comparative (greater) | 
 | RBS | adverb, superlative (biggest) | 
 | RP | particle (about) | 
 | TO | infinitive marker (to) | 
 | UH | interjection (goodbye) | 
 | VB | verb (ask) | 
 | VBG | verb gerund (judging) | 
 | VBD | verb past tense (pleaded) | 
 | VBN | verb past participle (reunified) | 
 | VBP | verb, present tense not 3rd person singular (wrap) | 
 | VBZ | verb, present tense with 3rd person singular (bases) | 
 | WDT | wh-determiner (that, what) | 
 | WP | wh- pronoun (who) | 
 | WRB | wh- adverb (how) | 

In [None]:
# Tagger model. Older NLTK releases load 'averaged_perceptron_tagger'; NLTK
# 3.9 and later load 'averaged_perceptron_tagger_eng'. Requesting both keeps
# the cell working on either version.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
from nltk import pos_tag

# Input: the token list of the first sentence of the sample paragraph.
print(new_data[0])

# Output: the tagger's result for those tokens. It is printed so the tags can
# be compared with the abbreviation table; previously the result was computed
# but never shown.
tagged = pos_tag(new_data[0])
print(tagged)

## Named Entity Recognition using chunking

Named entity recognition via chunking is a type of shallow parsing (only two levels) that takes a POS-tagged sentence as input and predicts the named entities in it.


In [None]:
# Resources for the named-entity chunker. NLTK 3.9 and later load
# 'maxent_ne_chunker_tab'; older releases load 'maxent_ne_chunker'.
nltk.download('words')
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')

# Imported here as well so the cell does not depend on earlier cells for
# the tokenizer and tagger names.
from nltk import ne_chunk, pos_tag, word_tokenize

# Pipeline: raw sentence -> word tokens -> POS-tagged tokens -> chunk tree.
# Each stage is printed so the intermediate results are visible.
sent = "Mr Vinken is chairman of the Dutch publishing group."
print(sent)

sent_tokens = word_tokenize(sent)
print(sent_tokens)

sent_tagged = pos_tag(sent_tokens)
print(sent_tagged)

# Chunk the tagged form of `sent` itself; previously the chunker was fed
# `tagged`, the tagged first sentence of the sample paragraph from the POS
# tagging cell, so `sent` was never actually analysed.
tree = ne_chunk(sent_tagged)
print(tree)

## WordNet

In [None]:
from nltk.corpus import wordnet

# Lemma names collected across every synset of "active". Duplicates are
# kept in the lists and removed only when printing.
synonyms = []
antonyms = []

for synset in wordnet.synsets("active"):
    for lemma in synset.lemmas():
        synonyms.append(lemma.name())
        # Record every antonym the lemma reports, not only the first one;
        # an empty antonym list simply adds nothing.
        antonyms.extend(antonym.name() for antonym in lemma.antonyms())

print("Synonyms for active:", set(synonyms))
print("Antonyms for active:", set(antonyms))

# Other Python processing packages
- https://spacy.io/
    - English, German, French, Spanish, Portuguese, Italian, Dutch, Greek, Norwegian Bokmål, Lithuanian
- https://guides.library.upenn.edu/japanesetext
    - Japanese
- https://pypi.org/project/polyglot/
	- Tokenization (165 Languages)
	- Language detection (196 Languages)
	- Named Entity Recognition (40 Languages)
	- Part of Speech Tagging (16 Languages)
	- Sentiment Analysis (136 Languages)
	- Word Embeddings (137 Languages)
	- Morphological analysis (135 Languages)
	- Transliteration (69 Languages)