In [None]:
#==============================================================================
# CellStrat Hub Pack - Natural Language Processing
# Compatible tier : Free Tier or above 
# Kernel : Pytorch 1.9 
#==============================================================================

In [None]:
#==============================================================================
# If any library needs to be installed, install with following command :-
# pip install <library-name>
# This pip command should be in an independent cell with no other code or comments in this cell.
#==============================================================================


In [None]:
#==============================================================================
# Download the English package by running this command in the terminal:-
# python3 -m spacy download en_core_web_sm
#==============================================================================


## Corpora, Tokens, and Types

All NLP methods, be they classic or modern, begin with a text dataset, also called a **corpus** (plural: corpora). A corpus usually contains raw text (in ASCII or UTF-8) and any metadata associated with the text. The raw text is a sequence of characters (bytes), but most times it is useful to group those characters into contiguous units called tokens. 

The process of breaking a text down into tokens is called tokenization. For example, splitting on whitespace gives six tokens for the English sentence “**Mary, don’t slap the green witch**.”

In [None]:
#======================================================================================
# Basic concepts of NLP:
# Corpora, Tokens, and Types
# Text Corpora
# Unigrams, Bigrams, Trigrams, …, N-grams
# Lemmatization
# Stop Words
#======================================================================================


#### Install spacy library

In [None]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [None]:
#==============================================================================
# Import spaCy and load its English pipeline
#==============================================================================

import spacy
# Load the 'en_core_web_sm' English package. It must already be downloaded
# (see the terminal command near the top of this notebook).
nlp = spacy.load('en_core_web_sm')


In [None]:
#==============================================================================
# Example of using NLTK's TweetTokenizer to tokenize a tweet
#==============================================================================

import nltk
from nltk.tokenize import TweetTokenizer
tweet=u"Snow White and the Seven Degrees"
# NOTE(review): the text  #MakeAMovieCold@midnight:-)"  was on a comment line
# here, so it is NOT part of the string above and the tokenizer never sees a
# hashtag, mention or emoticon. It looks like the tweet literal was split
# across two lines by accident -- TODO confirm and restore if so.
tokenizer = TweetTokenizer()
# Lower-case the tweet before tokenizing it
print(tokenizer.tokenize(tweet.lower()))

['snow', 'white', 'and', 'the', 'seven', 'degrees']


In [None]:
#==============================================================================
# Download the Gutenberg corpus and the Punkt tokenizer data
#==============================================================================
nltk.download('gutenberg')
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Text Corpora

NLTK includes a small selection of texts from the Project Gutenberg electronic text archive, which contains some 25,000 free electronic books, hosted at http://www.gutenberg.org/

In [None]:

# The Gutenberg corpus is made up of many separate text files; list their ids
print(nltk.corpus.gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [None]:
#==============================================================================
# Point to the austen-emma.txt file and read it as a sequence of word tokens
#==============================================================================

emma = nltk.corpus.gutenberg.words('austen-emma.txt')

In [None]:
# The austen-emma.txt text comes already tokenized into words and punctuation
emma

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

In [None]:
# Number of tokens in the Emma document (punctuation marks are tokens too)
len(emma)

192427

In [None]:
#==============================================================================
# Tokenized sentences from the shakespeare-macbeth document
# (each sentence is a list of word/punctuation tokens)
#==============================================================================
from nltk.corpus import gutenberg
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
macbeth_sentences

[['[', 'The', 'Tragedie', 'of', 'Macbeth', 'by', 'William', 'Shakespeare', '1603', ']'], ['Actus', 'Primus', '.'], ...]

In [None]:
#==============================================================================
# Show one specific sentence, selected by its index in the sentence list
#==============================================================================
macbeth_sentences[1116]

['Double',
 ',',
 'double',
 ',',
 'toile',
 'and',
 'trouble',
 ';',
 'Fire',
 'burne',
 ',',
 'and',
 'Cauldron',
 'bubble']

### Corpora in Other Languages

In [None]:
#==============================================================================
# Download the 'indian' corpus package (it contains the Hindi file used next)
#==============================================================================

nltk.download('indian')

[nltk_data] Downloading package indian to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/indian.zip.


True

In [None]:
#==============================================================================
# Display the Hindi words from the 'hindi.pos' file
#==============================================================================

nltk.corpus.indian.words('hindi.pos')

['पूर्ण', 'प्रतिबंध', 'हटाओ', ':', 'इराक', 'संयुक्त', ...]

## Unigrams, Bigrams, Trigrams, …, N-grams

In [None]:
#==============================================================================
#Return the unigrams(one word),bigrams(two words),trigrams (three words..)
#==============================================================================

def n_grams(text, n):
    '''
    Return every contiguous n-gram of ``text`` as a list of slices.

    Args:
        text: any sliceable sequence with a length. A list of tokens yields
            lists of tokens; a string yields substrings.
        n: size of each n-gram; must be at least 1.

    Returns:
        The slices ``text[i:i+n]`` in order, ``len(text) - n + 1`` of them.
        The list is empty when ``n`` is larger than ``len(text)``.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    '''
    if n < 1:
        # A zero or negative width would silently yield empty or misaligned
        # slices instead of real n-grams, so reject it explicitly.
        raise ValueError('n must be at least 1, got {!r}'.format(n))
    return [text[i:i + n] for i in range(len(text) - n + 1)]

# Pre-tokenized, lower-cased sample sentence; print its bigrams (n = 2)
cleaned = ['mary', ',', "n't", 'slap', 'green', 'witch', '.']
print(n_grams(cleaned, 2))


[['mary', ','], [',', "n't"], ["n't", 'slap'], ['slap', 'green'], ['green', 'witch'], ['witch', '.']]


## bigrams

In [None]:
#==============================================================================
# Split a given word into its character bigrams (overlapping letter pairs)
#==============================================================================

name = 'methanol'
# Pair every character with its successor; zip stops at the shorter input,
# so a word of length k yields k - 1 bigrams.
[first + second for first, second in zip(name, name[1:])]

['me', 'et', 'th', 'ha', 'an', 'no', 'ol']

#### Install ngrams Library

In [None]:
pip install ngrams

Collecting ngrams
  Downloading ngrams-1.0.3.tar.gz (1.3 kB)
Building wheels for collected packages: ngrams
  Building wheel for ngrams (setup.py) ... [?25ldone
[?25h  Created wheel for ngrams: filename=ngrams-1.0.3-py3-none-any.whl size=1590 sha256=74ed27cf6fd9ce547118f5f3cf2ad4e76c09cf01ebf8c2616986b7bcdcb50d29
  Stored in directory: /home/ubuntu/.cache/pip/wheels/54/4e/39/0d67a8d09e359697785cb9cb2ca6c075b4c9671cad607df2df
Successfully built ngrams
Installing collected packages: ngrams
Successfully installed ngrams-1.0.3
Note: you may need to restart the kernel to use updated packages.


In [None]:
#==============================================================================
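#NLTK's built-in nltk.util.ngrams helper can be used to produce n-grams
#(it is imported from NLTK below, so this cell does not need a separate ngrams package)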
#==============================================================================

import nltk
from nltk.util import ngrams
#==============================================================================
# Function to generate n-grams from sentences.
#==============================================================================

def extract_ngrams(data, num):
    """Return the word n-grams of ``data`` as space-joined strings.

    ``data`` is split into tokens with ``nltk.word_tokenize``; every run of
    ``num`` consecutive tokens is then joined with single spaces.
    """
    tokens = nltk.word_tokenize(data)
    return [' '.join(gram) for gram in ngrams(tokens, num)]
 
data = 'This class is part of a advance NLP course @CellStrat.'

# Print the 1-gram to 4-gram lists of the sample sentence, one line per size.
for size in range(1, 5):
    print('{}-gram: '.format(size), extract_ngrams(data, size))


1-gram:  ['This', 'class', 'is', 'part', 'of', 'a', 'advance', 'NLP', 'course', '@', 'CellStrat', '.']
2-gram:  ['This class', 'class is', 'is part', 'part of', 'of a', 'a advance', 'advance NLP', 'NLP course', 'course @', '@ CellStrat', 'CellStrat .']
3-gram:  ['This class is', 'class is part', 'is part of', 'part of a', 'of a advance', 'a advance NLP', 'advance NLP course', 'NLP course @', 'course @ CellStrat', '@ CellStrat .']
4-gram:  ['This class is part', 'class is part of', 'is part of a', 'part of a advance', 'of a advance NLP', 'a advance NLP course', 'advance NLP course @', 'NLP course @ CellStrat', 'course @ CellStrat .']


# Lemmatization

In [None]:
#==============================================================================
# Lemmatization maps each word to its base (dictionary) form,
# e.g. "was" -> "be" and "running" -> "run"
#==============================================================================
doc = nlp(u"he was running late")
for token in doc:
    # token.lemma_ is the lemma spaCy assigned to this token
    print('{} --> {}'.format(token, token.lemma_))


he --> he
was --> be
running --> run
late --> late


## Lemmatization using NLTK


In [None]:
#==============================================================================
# Download the WordNet package.
# WordNet® is a large lexical database of English. Nouns, verbs, adjectives
# and adverbs are grouped into sets of cognitive synonyms (synsets), each
# expressing a distinct concept.
#==============================================================================

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
text = ['She gripped the armrest as he passed two cars at a time.',
        'Her car was in full view.',
        'A number of cars carried out of state license plates.']

# Lemmatize every whitespace-separated word of every sentence. No POS tag is
# passed here, so each word gets the lemmatizer's default treatment; that is
# why "as" and "was" come out as "a" and "wa" in the recorded output.
output = [" ".join(wnl.lemmatize(word) for word in sentence.split())
          for sentence in text]

for item in output:
    print(item)

# Same lemmatizer, but with an explicit POS tag per word.
print("*" * 10)
for word, pos in [('jumps', 'n'), ('jumping', 'v'), ('jumped', 'v')]:
    print(wnl.lemmatize(word, pos))

# Superlative adjectives, lemmatized with the adjective tag.
print("*" * 10)
for word in ['saddest', 'happiest', 'easiest']:
    print(wnl.lemmatize(word, 'a'))

She gripped the armrest a he passed two car at a time.
Her car wa in full view.
A number of car carried out of state license plates.
**********
jump
jump
jump
**********
sad
happy
easy


# Stop Words

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

text = "This class is part of a advance NLP course @CellStrat."
text_tokens = word_tokenize(text)

# Build the English stop-word set once, before filtering. Calling
# stopwords.words() with no argument inside the comprehension would load the
# lists of every language in the corpus and rebuild them for each token.
stop_words = set(stopwords.words('english'))

# The comparison is case-sensitive, so the capitalised "This" is kept.
tokens_without_sw = [word for word in text_tokens if word not in stop_words]

print(tokens_without_sw)

['This', 'class', 'part', 'advance', 'NLP', 'course', '@', 'CellStrat', '.']


[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Categorizing Words: POS Tagging

## Using NLTK tagger

In [None]:
# Download the averaged-perceptron model package for NLTK's POS tagger
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
#==============================================================================
# Import the libraries and build the English stop-word set
#==============================================================================

import nltk 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize 
# Stored as a set so that the membership tests used for filtering are fast
stop_words = set(stopwords.words('english')) 

In [None]:
# Sample sentence used as input for the POS-tagging example
txt ='I saw a girl with a telescope.'

In [None]:
#==============================================================================
# Tokenize the text into sentences, POS-tag each full sentence, and only
# then drop the stop words
#==============================================================================
tokenized = sent_tokenize(txt)
for i in tokenized:

    # The word tokenizer splits the sentence into words and punctuation
    wordsList = nltk.word_tokenize(i)

    # Tag the complete sentence first. The tagger relies on neighbouring
    # words, so removing stop words (determiners, prepositions) beforehand
    # degrades the tags: the filtered input produced ('girl', 'JJ').
    tagged_all = nltk.pos_tag(wordsList)

    # Remove stop words after tagging, keeping each word's tag
    tagged = [(w, tag) for (w, tag) in tagged_all if w not in stop_words]

    print(tagged)

[('I', 'PRP'), ('saw', 'VBD'), ('girl', 'JJ'), ('telescope', 'NN'), ('.', '.')]


# Parsing

In [None]:
#==============================================================================

# NP stands for Noun Phrase.
# Parse the sentence with spaCy and display only its noun phrases
# (noun chunks), each followed by its label.

#==============================================================================
doc  = nlp(u"I saw a girl with a telescope.")
for chunk in doc.noun_chunks:
    print ('{} - {}'.format(chunk, chunk.label_))


I - NP
a girl - NP
a telescope - NP
