# Natural Language Toolkit (NLTK)

In [1]:
####PLEASE EXECUTE THESE COMMANDS BEFORE PROCEEDING####

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/minor/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/minor/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /Users/minor/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
#Tokenization -- Text into word tokens; Paragraphs into sentences;
from nltk.tokenize import sent_tokenize 
  
text = "Hello everyone. Welcome to Intro to Machine Learning Applications. We are now learning important basics of NLP."
sent_tokenize(text)

['Hello everyone.',
 'Welcome to Intro to Machine Learning Applications.',
 'We are now learning important basics of NLP.']

In [3]:
import nltk.data 
  
german_tokenizer = nltk.data.load('tokenizers/punkt/PY3/german.pickle') 
  
text = 'Wie geht es Ihnen? Mir geht es gut.'
german_tokenizer.tokenize(text)

['Wie geht es Ihnen?', 'Mir geht es gut.']

In [4]:
from nltk.tokenize import word_tokenize 
  
text = "Hello everyone. Welcome to Intro to Machine Learning Applications. We are now learning important basics of NLP."
word_tokenize(text)

['Hello',
 'everyone',
 '.',
 'Welcome',
 'to',
 'Intro',
 'to',
 'Machine',
 'Learning',
 'Applications',
 '.',
 'We',
 'are',
 'now',
 'learning',
 'important',
 'basics',
 'of',
 'NLP',
 '.']

In [5]:
from nltk.tokenize import TreebankWordTokenizer 
  
tokenizer = TreebankWordTokenizer() 
tokenizer.tokenize(text)

['Hello',
 'everyone.',
 'Welcome',
 'to',
 'Intro',
 'to',
 'Machine',
 'Learning',
 'Applications.',
 'We',
 'are',
 'now',
 'learning',
 'important',
 'basics',
 'of',
 'NLP',
 '.']

In [6]:
# n-grams using pure Python

import re

def generate_ngrams(text, n):
    # Convert to lowercases
    text = text.lower()
    
    # Replace all none alphanumeric characters with spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    
    # Break sentence in the token, remove empty tokens
    tokens = [token for token in text.split(" ") if token != ""]
    
    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

text = "Hello everyone. Welcome to Intro to Machine Learning Applications. We are now learning important basics of NLP."
print(text)
generate_ngrams(text, n=2)

Hello everyone. Welcome to Intro to Machine Learning Applications. We are now learning important basics of NLP.


['hello everyone',
 'everyone welcome',
 'welcome to',
 'to intro',
 'intro to',
 'to machine',
 'machine learning',
 'learning applications',
 'applications we',
 'we are',
 'are now',
 'now learning',
 'learning important',
 'important basics',
 'basics of',
 'of nlp']

In [7]:
# n-grams using NLTK

import re
from nltk.util import ngrams

text = text.lower()
text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
tokens = [token for token in text.split(" ") if token != ""]
output = list(ngrams(tokens, 3))
print(output)

[('hello', 'everyone', 'welcome'), ('everyone', 'welcome', 'to'), ('welcome', 'to', 'intro'), ('to', 'intro', 'to'), ('intro', 'to', 'machine'), ('to', 'machine', 'learning'), ('machine', 'learning', 'applications'), ('learning', 'applications', 'we'), ('applications', 'we', 'are'), ('we', 'are', 'now'), ('are', 'now', 'learning'), ('now', 'learning', 'important'), ('learning', 'important', 'basics'), ('important', 'basics', 'of'), ('basics', 'of', 'nlp')]


In [8]:
#Text Normalization

# Lowercasing
text = "Hello everyone. Welcome to Intro to Machine Learning Applications. We are now learning important basics of NLP."
lowert = text.lower()
uppert = text.upper()

print(lowert)
print(uppert)

hello everyone. welcome to intro to machine learning applications. we are now learning important basics of nlp.
HELLO EVERYONE. WELCOME TO INTRO TO MACHINE LEARNING APPLICATIONS. WE ARE NOW LEARNING IMPORTANT BASICS OF NLP.


In [9]:
# Stemming

from nltk.stem import PorterStemmer 

ps = PorterStemmer() 
  
# choose some words to be stemmed 
words = ["hike", "hikes", "hiked", "hiking", "hikers", "hiker"] 
  
for w in words: 
    print(w, " : ", ps.stem(w))

hike  :  hike
hikes  :  hike
hiked  :  hike
hiking  :  hike
hikers  :  hiker
hiker  :  hiker


In [10]:
# Porter stemming

from nltk.stem import PorterStemmer
import re
   
ps = PorterStemmer() 
text = "Hello everyone. Welcome to Intro to Machine Learning Applications. We are now learning important basics of NLP."
print(text)

#Tokenize and stem the words
text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
tokens = [token for token in text.split(" ") if token != ""]

i=0
while i<len(tokens):
  tokens[i]=ps.stem(tokens[i])
  i=i+1

#merge all the tokens to form a long text sequence 
text2 = ' '.join(tokens) 

print(text2)

Hello everyone. Welcome to Intro to Machine Learning Applications. We are now learning important basics of NLP.
hello everyon welcom to intro to machin learn applic we are now learn import basic of nlp


In [11]:
# Snowball stemming

from nltk.stem.snowball import SnowballStemmer
import re
   
ss = SnowballStemmer("english")
text = "Hello everyone. Welcome to Intro to Machine Learning Applications. We are now learning important basics of NLP."
print(text)


#Tokenize and stem the words
text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
tokens = [token for token in text.split(" ") if token != ""]

i=0
while i<len(tokens):
  tokens[i]=ss.stem(tokens[i])
  i=i+1

#merge all the tokens to form a long text sequence 
text2 = ' '.join(tokens) 

print(text2)

Hello everyone. Welcome to Intro to Machine Learning Applications. We are now learning important basics of NLP.
hello everyon welcom to intro to machin learn applic we are now learn import basic of nlp


In [12]:
# Stopword removal

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

text = "Hello everyone. Welcome to Intro to Machine Learning Applications. We are now learning important basics of NLP."

stop_words = set(stopwords.words('english')) 
word_tokens = word_tokenize(text) 
  
filtered_sentence = [w for w in word_tokens if not w in stop_words] 
  
filtered_sentence = [] 
  
for w in word_tokens: 
    if w not in stop_words: 
        filtered_sentence.append(w) 
  
print(word_tokens) 
print(filtered_sentence) 

text2 = ' '.join(filtered_sentence)

['Hello', 'everyone', '.', 'Welcome', 'to', 'Intro', 'to', 'Machine', 'Learning', 'Applications', '.', 'We', 'are', 'now', 'learning', 'important', 'basics', 'of', 'NLP', '.']
['Hello', 'everyone', '.', 'Welcome', 'Intro', 'Machine', 'Learning', 'Applications', '.', 'We', 'learning', 'important', 'basics', 'NLP', '.']


In [16]:
# Part-of-Speech tagging
import nltk

def preprocess(sentence):
    sentence = nltk.word_tokenize(sentence)
    sentence = nltk.pos_tag(sentence)
    return sentence

preprocess('GitHub is a development platform inspired by the way you work. From open source to business, you can host and review code, manage projects, and build software alongside 40 million developers.')

[('GitHub', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('development', 'NN'),
 ('platform', 'NN'),
 ('inspired', 'VBN'),
 ('by', 'IN'),
 ('the', 'DT'),
 ('way', 'NN'),
 ('you', 'PRP'),
 ('work', 'VBP'),
 ('.', '.'),
 ('From', 'IN'),
 ('open', 'JJ'),
 ('source', 'NN'),
 ('to', 'TO'),
 ('business', 'NN'),
 (',', ','),
 ('you', 'PRP'),
 ('can', 'MD'),
 ('host', 'VB'),
 ('and', 'CC'),
 ('review', 'VB'),
 ('code', 'NN'),
 (',', ','),
 ('manage', 'NN'),
 ('projects', 'NNS'),
 (',', ','),
 ('and', 'CC'),
 ('build', 'VB'),
 ('software', 'NN'),
 ('alongside', 'RB'),
 ('40', 'CD'),
 ('million', 'CD'),
 ('developers', 'NNS'),
 ('.', '.')]