<a href="https://colab.research.google.com/github/JampaniDivyaSree/NLP/blob/main/NLP_tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Introduction to Natural Language Processing

In this workbook we will learn, at a high level, about text tokenization; text normalization such as lowercasing and stemming; part-of-speech tagging; and named entity recognition.





In [None]:
####PLEASE EXECUTE THESE COMMANDS BEFORE PROCEEDING####
import nltk  #Natural Language Toolkit

# One-time download of the NLTK resources used by the cells below.
# 'punkt' and 'averaged_perceptron_tagger' are the resource names looked up by
# older NLTK releases; NLTK 3.8.2+ looks up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead. Both generations are fetched so the
# tokenizer and tagger cells run regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
# Stop-word lists used by the stop-word removal cell.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#Tokenization -- Text into word tokens; Paragraphs into sentences;
# This cell demonstrates the paragraph -> sentences case with sent_tokenize.
from nltk.tokenize import sent_tokenize

text = "Hello everyone. Welcome to Intro to Machine Learning Applications. We are now learning the important basics of NLP."
# Last expression of the cell, so the notebook displays the returned list of sentences.
sent_tokenize(text)



['Hello everyone.',
 'Welcome to Intro to Machine Learning Applications.',
 'We are now learning the important basics of NLP.']

In [None]:
# Sentence tokenization for German text: load a language-specific tokenizer
# through nltk.data.load instead of calling sent_tokenize.
import nltk.data

# NOTE(review): this loads a pickled model by resource path, presumably from the
# 'punkt' package downloaded in the setup cell -- confirm the path exists for the
# installed NLTK version, and only load pickles from a trusted source.
german_tokenizer = nltk.data.load('tokenizers/punkt/PY3/german.pickle')

text = 'Wie geht es Ihnen? Mir geht es gut.'
# Last expression of the cell: the notebook displays what tokenize() returns.
german_tokenizer.tokenize(text)


In [None]:
#Tokenization -- Text into word tokens; Paragraphs into words;
# This cell demonstrates the text -> word tokens case with word_tokenize.
from nltk.tokenize import word_tokenize

text = "Hello everyone. Welcome to Intro to Machine Learning Applications. We are now learning important basics of NLP."
# Last expression of the cell, so the notebook displays the returned token list
# (the saved output shows each '.' as its own token).
word_tokenize(text)



['Hello',
 'everyone',
 '.',
 'Welcome',
 'to',
 'Intro',
 'to',
 'Machine',
 'Learning',
 'Applications',
 '.',
 'We',
 'are',
 'now',
 'learning',
 'important',
 'basics',
 'of',
 'NLP',
 '.']

In [None]:
# Word tokenization with an explicitly constructed TreebankWordTokenizer object.
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
# `text` is not assigned in this cell; it holds whatever an earlier cell last
# stored in it, so run the preceding cell first.
tokenizer.tokenize(text)


### n-grams vs tokens

##### n-grams are contiguous sequences of n items in a sentence. N can be 1, 2, or any other positive integer, although we usually do not consider very large N because such n-grams rarely appear in many different places.

##### Tokens do not have any conditions on contiguity

In [None]:
#Using pure python

import re

def generate_ngrams(text, n):
    """Return the word n-grams of ``text`` as space-joined strings.

    The text is lowercased, every character that is not an ASCII letter,
    digit or whitespace is replaced by a space, and the result is split into
    tokens on any run of whitespace (spaces, tabs, newlines).

    Args:
        text: Input string.
        n: Number of consecutive tokens per n-gram.

    Returns:
        A list of strings, one per n-gram, in order of appearance. The list is
        empty when ``n`` is less than 1 or the text has fewer than ``n`` tokens.
    """
    # Convert to lowercase
    text = text.lower()

    # Replace all non-alphanumeric characters with spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

    # Break the text into tokens. split() with no argument splits on any run of
    # whitespace and never yields empty tokens, so tabs and newlines (which the
    # regex above deliberately keeps) separate words just like spaces do.
    tokens = text.split()

    # Zip n progressively shifted views of the token list: the i-th view starts
    # at token i, so each zipped tuple holds n consecutive tokens.
    # Concatenate the tokens of each tuple into an n-gram string and return.
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

# Demo input for generate_ngrams; printed first so the original casing and
# punctuation can be compared with the n-grams.
text = "Hello everyone. Welcome to Intro to Machine Learning Applications. We are now learning important basics of NLP."
print(text)
# Bigrams (n=2). As the last expression of the cell, the returned list is displayed.
generate_ngrams(text, n=2)

In [None]:
# Trigrams built with nltk.util.ngrams instead of a hand-written helper

import re
from nltk.util import ngrams

# `text` carries over from an earlier cell: lowercase it, then blank out every
# character that is not an ASCII letter, digit or whitespace.
text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text.lower())

# Split on single spaces and drop the empty pieces left by consecutive spaces.
tokens = list(filter(None, text.split(" ")))

# Materialise the 3-grams as a list so the tuples can be printed.
output = list(ngrams(tokens, 3))
print(output)

[('hello', 'everyone', 'welcome'), ('everyone', 'welcome', 'to'), ('welcome', 'to', 'intro'), ('to', 'intro', 'to'), ('intro', 'to', 'machine'), ('to', 'machine', 'learning'), ('machine', 'learning', 'applications'), ('learning', 'applications', 'we'), ('applications', 'we', 'are'), ('we', 'are', 'now'), ('are', 'now', 'learning'), ('now', 'learning', 'important'), ('learning', 'important', 'basics'), ('important', 'basics', 'of'), ('basics', 'of', 'nlp')]


In [None]:
# Text normalization: case folding

text = "Hello everyone. Welcome to Intro to Machine Learning Applications. We are now learning important basics of NLP."

# str.lower() / str.upper() return new strings; `text` itself is not modified.
lowert, uppert = text.lower(), text.upper()

# One line per variant: lowercase first, then uppercase.
print(lowert, uppert, sep="\n")

hello everyone. welcome to intro to machine learning applications. we are now learning important basics of nlp.
HELLO EVERYONE. WELCOME TO INTRO TO MACHINE LEARNING APPLICATIONS. WE ARE NOW LEARNING IMPORTANT BASICS OF NLP.


In [None]:
#Text Normalization
#stemming
#Porter stemmer is a famous stemming approach
# This cell stems a handful of related word forms one by one to show which of
# them collapse to the same stem.

from nltk.stem import PorterStemmer
# NOTE: word_tokenize is imported here but not used in this cell.
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

# choose some words to be stemmed
words = ["hike", "hikes", "hiked", "hiking", "hikers", "hiker"]

# Print each word next to its stem, as "<word>  :  <stem>".
for w in words:
    print(w, " : ", ps.stem(w))


hike  :  hike
hikes  :  hike
hiked  :  hike
hiking  :  hike
hikers  :  hiker
hiker  :  hiker


In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re

ps = PorterStemmer()
text = "Hello everyone. Welcome to Intro to Machine Learning Applications. We are now learning important basics of NLP."
print(text)


# Tokenize and stem the words:
# blank out everything that is not an ASCII letter, digit or whitespace,
# split on single spaces, skip the empty pieces, and stem each remaining token.
text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
tokens = [ps.stem(piece) for piece in text.split(" ") if piece]

# Merge the stemmed tokens back into one long text sequence
text2 = ' '.join(tokens)

print(text2)

In [None]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import re

ss = SnowballStemmer("english")
text = "Hello everyone. Welcome to Intro to Machine Learning Applications. We are now learning important basics of NLP."
print(text)


# Tokenize and stem the words:
# blank out everything that is not an ASCII letter, digit or whitespace,
# split on single spaces, skip the empty pieces, and stem each remaining token
# with the English Snowball stemmer.
text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
tokens = [ss.stem(piece) for piece in text.split(" ") if piece]

# Merge the stemmed tokens back into one long text sequence
text2 = ' '.join(tokens)

print(text2)

In [None]:
 #Stopwords removal

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

text = "Hello everyone. Welcome to Intro to Machine Learning Applications. We are now learning important basics of NLP."

# A set gives constant-time membership tests in the filter below.
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(text)

# Keep only the tokens that are not stop words. The stop-word list is
# lowercase, so each token is lowercased for the comparison only; otherwise
# capitalised stop words such as "We" would slip through. The kept tokens
# retain their original casing.
filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]

# Show the tokens before and after filtering.
print(word_tokens)
print(filtered_sentence)

# Re-join the surviving tokens into a single string.
text2 = ' '.join(filtered_sentence)

['Hello', 'everyone', '.', 'Welcome', 'to', 'Intro', 'to', 'Machine', 'Learning', 'Applications', '.', 'We', 'are', 'now', 'learning', 'important', 'basics', 'of', 'NLP', '.']
['Hello', 'everyone', '.', 'Welcome', 'Intro', 'Machine', 'Learning', 'Applications', '.', 'We', 'learning', 'important', 'basics', 'NLP', '.']


In [None]:
# Part-of-speech tagging: label every word token with its grammatical tag

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

text = 'Rudolph Smith bought 1000 shares of tesla inc in May 2022'

def preprocess(sent):
    """Split ``sent`` into word tokens and return NLTK's POS tagging of them.

    The saved output of this cell shows the result as a list of
    ``(token, tag)`` tuples.
    """
    return nltk.pos_tag(nltk.word_tokenize(sent))

sent = preprocess(text)
print(sent)


[('Rudolph', 'NNP'), ('Smith', 'NNP'), ('bought', 'VBD'), ('1000', 'CD'), ('shares', 'NNS'), ('of', 'IN'), ('tesla', 'NN'), ('inc', 'NN'), ('in', 'IN'), ('May', 'NNP'), ('2022', 'CD')]


In [None]:
#Named entity recognition
#spaCy is an NLP framework -- easy to use and able to use neural networks

# The English model package is imported as a module; its load() returns the
# pipeline object that is called on raw text below.
import en_core_web_sm
nlp = en_core_web_sm.load()

text = 'Mr.Rudolph Smith bought 1000 shares of Tesla inc in May 2024'

# Running the pipeline on the string produces a document; its .ents attribute
# holds the entities that were recognised.
doc = nlp(text)
print(doc.ents)
# Pair each entity's text with its label (PERSON, CARDINAL, ORG, DATE in the saved output).
print([(X.text, X.label_) for X in doc.ents])

(Rudolph Smith, 1000, Tesla inc, May 2024)
[('Rudolph Smith', 'PERSON'), ('1000', 'CARDINAL'), ('Tesla inc', 'ORG'), ('May 2024', 'DATE')]


In [None]:
#Sentiment analysis

In [None]:
#Topic modeling

In [None]:
#Word embeddings


# Class exercise

#### 1. Read a file from its URL
#### 2. Extract the text and tokenize it meaningfully into words.
#### 3. Print the entire text combined after tokenization.
#### 4. Perform stemming using both the Porter and Snowball stemmers. Which one works better? Why?
#### 5. Remove stopwords
#### 6. Identify the top-10 unigrams based on their frequency.


In [None]:

#Load the file first
# IPython shell escape: fetches reviews.txt for the class exercise with wget.
# NOTE(review): presumably saved into the notebook's working directory -- confirm before reading it.
!wget https://www.dropbox.com/s/o8lxi6yrezmt5em/reviews.txt


--2021-11-28 13:25:56--  https://www.dropbox.com/s/o8lxi6yrezmt5em/reviews.txt
Resolving www.dropbox.com (www.dropbox.com)... 162.125.3.18, 2620:100:6022:18::a27d:4212
Connecting to www.dropbox.com (www.dropbox.com)|162.125.3.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/o8lxi6yrezmt5em/reviews.txt [following]
--2021-11-28 13:25:57--  https://www.dropbox.com/s/raw/o8lxi6yrezmt5em/reviews.txt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucafc5669651649180f2312c34bd.dl.dropboxusercontent.com/cd/0/inline/Ba0oeZqKZ-brxGhb21ApjGMDrYkS3dPabH3kI2nCZF0WGsw_2Bo0pt46GBn8laTbq6T_3cB6R_K-BQnCJHx8GrvySVoDqJysqyThcNxnB9VtweIJ_ei1Zl1ZxjuB3IuNy1DHll3E4ngfE_kIpsCVNrlQ/file# [following]
--2021-11-28 13:25:57--  https://ucafc5669651649180f2312c34bd.dl.dropboxusercontent.com/cd/0/inline/Ba0oeZqKZ-brxGhb21ApjGMDrYkS3dPabH3kI2nCZF0WGsw_2Bo0pt46GBn8laTbq6T_3cB6R_K-BQnCJHx8GrvySVoD