# **Text Preprocessing for NLP**

**Import example text:**

In [None]:
# Import libraries to read example text

import pandas as pd
import requests

url = 'https://raw.githubusercontent.com/theleadio/datascience_demo/master/grab-spac.txt'

# Download the example article. A timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of silently treating an error page as the article text.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text

In [None]:
# Display the raw article text held in `text`
print(text)

SINGAPORE: Grab Holdings, the largest ride-hailing and food delivery firm in Southeast Asia, clinched a merger on Tuesday with special-purpose acquisition company Altimeter Growth Corp securing a valuation of nearly US$40 billion and paving the way for a coveted U.S. listing.

The merger, the biggest blank-check company deal ever, underscores the frenzy on Wall Street as shell firms have raised $99 billion in the United States so far this year after a record $83 billion in 2020.

As part of Singapore-based Grab's agreement with the SPAC backed by Altimeter Capital, investors such as Temasek Holdings, BlackRock, Fidelity International, Abu Dhabi's Mubadala and Malaysia's Permodalan Nasional Bhd will participate in a $4 billion private investment in public equity offering.

Funds managed by Altimeter Capital will lead the investment with $750 million.

"Institutional investors looking for Asian consumer internet exposure are keen to diversify their allocation beyond a handful of companie

# **Removing punctuation:**

In [None]:
# Removing punctuation

# Characters that are blanked out (a newline plus the listed symbols).
# The backslash is written as an explicit escape; the previous literal relied
# on the invalid escape sequence "\]", which newer Python versions warn about.
# NOTE(review): the quote characters in this set are the typographic ” and ’,
# so the straight quotes " and ' (and "<") are left in the text - confirm
# whether that is intended.
punctuations = '''\n!”#$%&’()*+,-./:;=>?@[\\]^_`{|}~'''

# Replace every listed character with a single space in one pass over the
# text, instead of calling str.replace on the whole string once per
# punctuation occurrence.
text = text.translate(str.maketrans(dict.fromkeys(punctuations, ' ')))

print(text)

SINGAPORE  Grab Holdings  the largest ride hailing and food delivery firm in Southeast Asia  clinched a merger on Tuesday with special purpose acquisition company Altimeter Growth Corp securing a valuation of nearly US 40 billion and paving the way for a coveted U S  listing   The merger  the biggest blank check company deal ever  underscores the frenzy on Wall Street as shell firms have raised  99 billion in the United States so far this year after a record  83 billion in 2020   As part of Singapore based Grab's agreement with the SPAC backed by Altimeter Capital  investors such as Temasek Holdings  BlackRock  Fidelity International  Abu Dhabi's Mubadala and Malaysia's Permodalan Nasional Bhd will participate in a  4 billion private investment in public equity offering   Funds managed by Altimeter Capital will lead the investment with  750 million   "Institutional investors looking for Asian consumer internet exposure are keen to diversify their allocation beyond a handful of companie

# **Convert text to lower case/upper case:**

In [None]:
# convert text to lower case
# (str.lower returns a new string; `text` is rebound to the lower-cased copy)

text = text.lower()
print(text)

NameError: ignored

# **Tokenization:**

In [None]:
# tokenization

# split() with no argument splits on any run of whitespace, so the consecutive
# spaces left behind by punctuation removal no longer produce empty-string
# tokens (split(' ') emitted one '' per extra space).
tokens = text.split()
print(tokens)

['singapore', '', 'grab', 'holdings', '', 'the', 'largest', 'ride', 'hailing', 'and', 'food', 'delivery', 'firm', 'in', 'southeast', 'asia', '', 'clinched', 'a', 'merger', 'on', 'tuesday', 'with', 'special', 'purpose', 'acquisition', 'company', 'altimeter', 'growth', 'corp', 'securing', 'a', 'valuation', 'of', 'nearly', 'us', '40', 'billion', 'and', 'paving', 'the', 'way', 'for', 'a', 'coveted', 'u', 's', '', 'listing', '', '', 'the', 'merger', '', 'the', 'biggest', 'blank', 'check', 'company', 'deal', 'ever', '', 'underscores', 'the', 'frenzy', 'on', 'wall', 'street', 'as', 'shell', 'firms', 'have', 'raised', '', '99', 'billion', 'in', 'the', 'united', 'states', 'so', 'far', 'this', 'year', 'after', 'a', 'record', '', '83', 'billion', 'in', '2020', '', '', 'as', 'part', 'of', 'singapore', 'based', "grab's", 'agreement', 'with', 'the', 'spac', 'backed', 'by', 'altimeter', 'capital', '', 'investors', 'such', 'as', 'temasek', 'holdings', '', 'blackrock', '', 'fidelity', 'international', 

# **Stopwords:** Using own list of words

In [None]:
# remove stop words 

url = 'https://raw.githubusercontent.com/theleadio/datascience_demo/master/stopwords.txt'

# Download the custom stop-word file. The timeout avoids hanging on a stalled
# connection, and raise_for_status() stops on an HTTP error rather than
# turning an error page into "stop words".
response = requests.get(url, timeout=30)
response.raise_for_status()

# One stop word per line of the downloaded file
stopwords = response.text.splitlines()

NameError: ignored

In [None]:
# Show how many stop words were loaded, followed by the list itself
print(len(stopwords))
print(stopwords)

NameError: ignored

In [None]:
# Keep a token only when it is neither a custom stop word nor an empty string
tokens = [token for token in tokens if not (token in stopwords or len(token) == 0)]

In [None]:
# Tokens that remain after the stop-word / empty-string filter
print(tokens)

# **Removing URLs:**

In [None]:
# remove url

text_url = """
This is a random sentence that has some sorta url that looks like these >>> http://google.com & https://www.facebook.com/ etc.
"""

import re
def remove_urls(text):
    """Return `text` with every URL-like substring replaced by a single space.

    A substring is treated as a URL when it starts with ``http://`` or
    ``https://``, or with ``www.``, and it extends up to the next whitespace
    character.
    """
    url_regex = re.compile(r'https?://\S+|www\.\S+')
    return url_regex.sub(' ', text)

# Apply the URL remover to the sample sentence and show the result
update_text_url = remove_urls(text_url)
print(update_text_url)


This is a random sentence that has some sorta url that looks like these >>>   &   etc.



# **Removing numbers:**

In [None]:
# Removing numbers

def remove_numbers(text):
    """Return `text` with each run of consecutive digits replaced by one space."""
    return re.sub(r'\d+', " ", text)

# `text_url` contains no digits, so passing it here demonstrated nothing.
# Use an example sentence that actually contains numbers to strip.
# NOTE(review): the sentence is modelled on the recorded cell output; the
# specific numbers are illustrative.
text_numbers = "This is a random sentence: I bought 10 apples and 20 oranges today."
text_2 = remove_numbers(text_numbers)
print(text_2)



This is a Random Sentence that don't make sense because I bought   apples and   oranges today. More info can be found here >>> http://google.com & https://www.facebook.com/ etc.



# **Stemming:** Using NLTK

**Importing NLTK**

In [None]:
import nltk
nltk.download('popular')

# all-corpora >> All the corpora
# all-nltk >> All packages available on nltk_data gh-pages branch
# all >> All packages
# book >> Everything used in the NLTK Book
# popular >> Popular packages
# test >> Packages for running tests

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk import word_tokenize

def porter_stemmer(text):
    """Tokenize `text`, stem every token, and return the stems joined by spaces.

    Relies on the module-level `stemmer` object being defined before the
    function is called.
    """
    stemmed = [stemmer.stem(token) for token in word_tokenize(text)]
    return ' '.join(stemmed)

stemmer = PorterStemmer() # initialize porter stemmer object (read as a global inside porter_stemmer)
text_ex_stem = "She bought 10 red apples and 10 oranges from the nearby grocer." # example text for stemming technique
stem_result = porter_stemmer(text_ex_stem) # stem every token of the example sentence
print(stem_result)

she bought 10 red appl and 10 orang from the nearbi grocer .


# **Lemmatization:** Using NLTK

In [None]:
from nltk.stem import WordNetLemmatizer

def lemmatization(text):
    """Tokenize `text`, lemmatize every token, and return them joined by spaces.

    Relies on the module-level `lemma` object being defined before the
    function is called.
    """
    lemmas = [lemma.lemmatize(token) for token in word_tokenize(text)]
    return ' '.join(lemmas)

lemma = WordNetLemmatizer() # initialize lemmatizer object (read as a global inside lemmatization)

# Lemmatize the same example sentence that the stemming section used
lemma_result = lemmatization(text_ex_stem)
print(lemma_result)

She bought 10 red apple and 10 orange from the nearby grocer .


# **Stopwords:** Using NLTK (Remove stopwords)

In [None]:
import nltk
# English stop-word list shipped with the NLTK corpora
nltk_stopwords = nltk.corpus.stopwords.words('english')

# Show the number of stop words, followed by the words themselves
print(len(nltk_stopwords))
print(nltk_stopwords)

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [None]:
from nltk.tokenize import word_tokenize
 
ex_sentence = "This is a sample sentence showing off the stop words filtration."

word_tokens = word_tokenize(ex_sentence)

# The NLTK stop words are all lower case, so compare the lower-cased token;
# otherwise capitalised stop words such as 'This' slip through. The earlier
# version built this list and then overwrote it with a case-sensitive loop.
clean_sentence = [w for w in word_tokens if w.lower() not in nltk_stopwords]

print(word_tokens)
print(clean_sentence)

['This', 'is', 'a', 'sample', 'sentence', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', 'showing', 'stop', 'words', 'filtration', '.']


# **Stopwords:** Using NLTK (Adding a single word to stopwords)

In [None]:
# Adding a single word to nltk_stopwords

# Guard the append so that re-running this cell does not add 'grab' twice
if 'grab' not in nltk_stopwords:
    nltk_stopwords.append('grab')

print(len(nltk_stopwords))
print(nltk_stopwords)

180
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

# **Stopwords:** Using NLTK (Removing a single word from stopwords)

In [None]:
# Removing a single word from nltk_stopwords

# list.remove raises ValueError when the item is absent, which is what happens
# when this cell is run a second time; guard it so the cell can be re-run.
if 'i' in nltk_stopwords:
    nltk_stopwords.remove('i')

print(len(nltk_stopwords))
print(nltk_stopwords)

179
['me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 't

# **Stopwords:** Using Spacy (Remove stopwords)

In [None]:
! pip install spacy==2.1.3
! python -m spacy download en_core_web_sm

In [None]:
import spacy 
from spacy.lang.en.stop_words import STOP_WORDS

print(len(STOP_WORDS))
print(STOP_WORDS)

326
{'some', 'fifteen', 'are', 'down', 'first', 'where', 'why', 'ours', 'thereafter', 'this', '’ve', 'am', 'herself', 'himself', 'afterwards', 'back', 'often', 'among', 'those', 'being', 'full', 'even', 'thereby', 'you', 'formerly', 'which', 'before', 'will', "'m", 'themselves', 'to', 'two', 'within', 'becoming', "'ve", 'whereupon', 'another', 'same', 'via', 'part', 'one', 'she', 'whether', 'ca', 'here', 'during', 'anyhow', 'behind', 'been', 'our', 'now', 'other', '‘ll', 'by', "'re", 'see', 'twelve', 'n‘t', '‘re', 'doing', 'as', 'these', 'unless', '‘d', 'further', 'many', 'whoever', 'did', 'across', 'ever', 'because', 'four', 'forty', 'might', 'eight', 'yet', 'well', 'done', 'nowhere', 'noone', 'most', 'too', 'about', 'such', 'least', 'neither', 'please', 'alone', 'in', 'used', 'her', 'amount', 'latter', 'so', 'five', 'whereby', 'except', 'somewhere', 'and', 'through', 'had', 'upon', 'under', 'made', 'several', 'was', 'call', 'becomes', 'whither', 'indeed', 'anywhere', 'while', 'give',

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
# Run the spaCy pipeline on the example sentence
sentence = nlp("This data science bootcamp is so much fun")
print(sentence)

# Tokens that spaCy does not flag as stop words
non_stopwords = [token.text for token in sentence if not token.is_stop]
print(non_stopwords)

# Tokens that spaCy flags as stop words. Stored under its own name so the
# custom `stopwords` list loaded earlier in the notebook is not overwritten.
spacy_stopwords = [token.text for token in sentence if token.is_stop]
print(spacy_stopwords)

This data science bootcamp is so much fun
['data', 'science', 'bootcamp', 'fun']
['This', 'is', 'so', 'much']


# **N-grams:** Using NLTK

In [None]:
import nltk
from nltk.util import ngrams

# Function to generate n-grams from sentences.
def extract_ngrams(text, num):
    """Return the n-grams of `text` as a list of space-joined strings.

    `text` is tokenized with nltk.word_tokenize; `num` is the number of
    tokens per n-gram.
    """
    joined_grams = []
    for gram in ngrams(nltk.word_tokenize(text), num):
        joined_grams.append(' '.join(gram))
    return joined_grams
 
# Print the 1- to 4-grams of `text`, one labelled line per size
for size in (1, 2, 3, 4):
    print(f"{size}-gram: ", extract_ngrams(text, size))

NameError: ignored

# **Removing emoticons :) :/ ;( :** Using Python Package

In [None]:
# Removing emoticons using https://pypi.org/project/emot/

!pip install emot 

In [None]:
from emot.emo_unicode import UNICODE_EMO, EMOTICONS # Function for removing emoticons 

def remove_emoticons(text):
    """Return `text` with every emoticon listed in EMOTICONS removed.

    The entries obtained by iterating EMOTICONS are joined with '|' into one
    alternation group and every match is replaced by the empty string.
    NOTE(review): the entries are used as regex fragments without escaping -
    presumably they are already valid patterns; confirm against the installed
    emot version.
    """
    alternation = u'|'.join(EMOTICONS)
    emoticon_regex = re.compile(u'(' + alternation + u')')
    return emoticon_regex.sub(r'', text)

# Example: strip the text emoticons from a short product review
remove_emoticons("This product ;) is so awesome! :) :) :) :)")

'This product  is so awesome!    '

# **Removing emojis:** 😀😀

In [None]:
# Removing emojis

# Compiled once, instead of on every call.
# The previous character class contained the ranges \U000024C2-\U0001F251 and
# \U00010000-\U0010ffff, which together match almost every code point above
# U+24C2 - including CJK ideographs, kana and Hangul - so ordinary non-Latin
# text was deleted together with the emoji. The class below is restricted to
# symbol / pictograph ranges and the individual code points the old class
# listed explicitly.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F000-\U0001FAFF"  # supplementary-plane pictographs (emoticons, symbols, transport, flags, ...)
                            u"\U00002500-\U00002BEF"  # box drawing through misc symbols and arrows (covers U+2600-U+2B55, U+2702-U+27B0)
                            u"\u24C2"                 # circled M
                            u"\u200d"                 # zero-width joiner (glues emoji sequences)
                            u"\u23cf"
                            u"\u23e9"
                            u"\u231a"
                            u"\ufe0f"                 # variation selector-16 (emoji presentation)
                            u"\u3030"                 # wavy dash
                            u"\u303d"                 # part alternation mark
                            u"\u3297"                 # circled ideograph congratulation
                            u"\u3299"                 # circled ideograph secret
                            "]+", flags=re.UNICODE)


def remove_emoji(text):
    """Return `text` with emoji and pictographic symbols removed.

    Every run of characters matched by `_EMOJI_PATTERN` is replaced by the
    empty string; surrounding whitespace is left untouched. Characters outside
    the listed symbol ranges (for example CJK text) are preserved.
    """
    return _EMOJI_PATTERN.sub(r'', text)

ex_emoji = "😀😀 This bootcamp 🧰 is so awesome!😀😀 " # example text for emoji removing technique

# Strip the emoji from the example and show what is left
emoji_result = remove_emoji(ex_emoji) # calling function
print(emoji_result)

 This bootcamp  is so awesome! 


# **Spelling Check:** Using Python Package > https://pypi.org/project/autocorrect/

In [None]:
!pip install autocorrect

In [None]:
# Correcting spelling mistakes using autocorrect package - Python here: https://pypi.org/project/autocorrect/

from autocorrect import Speller
from nltk import word_tokenize

def spell_autocorrect(text):
    """Return `text` with each token passed through the autocorrect Speller.

    The text is split with word_tokenize, every token is corrected by an
    English ('en') Speller, and the results are joined with single spaces.
    """
    spell_corrector = Speller(lang='en')
    corrected_tokens = [spell_corrector(token) for token in word_tokenize(text)]
    return ' '.join(corrected_tokens)

# Deliberately misspelled input ('anoter', 'exapl') for the correction demo
wrong_spelling = "This is anoter exapl for spell correction"
correct_spelling = spell_autocorrect(wrong_spelling)
print(correct_spelling)

This is another example for spell correction


# **Spelling Check:** Using textblob > https://textblob.readthedocs.io/en/dev/#

In [None]:
!pip install textblob # https://textblob.readthedocs.io/en/dev/#

In [None]:
from textblob import Word
 
# Wrap a misspelled word and ask TextBlob for spelling suggestions; the cell
# output lists each suggestion paired with a score
word = Word('percieve')
word.spellcheck()

[('perceive', 1.0)]

# **Spelling Check:** Using NLTK

In [None]:
# Downloading and importing
# package 'words' from nltk corpus

nltk.download('words')
from nltk.corpus import words
  
correct_words = words.words()

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams

In [None]:
# list of incorrect spellings
# that need to be corrected
incorrect_words = ['happpy', 'azmaing', 'intelliengt']

# For each misspelling, print the dictionary word (sharing its first letter)
# whose character-bigram set has the smallest Jaccard distance to it.
for word in incorrect_words:
    # The misspelling's bigram set is the same for every candidate, so build
    # it once per word instead of once per dictionary entry.
    word_bigrams = set(ngrams(word, 2))
    candidates = ((jaccard_distance(word_bigrams, set(ngrams(w, 2))), w)
                  for w in correct_words if w[0] == word[0])
    # min() returns the first smallest entry - the same one sorted(...)[0]
    # selected - without sorting every candidate.
    print(min(candidates, key=lambda val: val[0])[1])

happy
amazing
intelligent


# **Part of Speech Tagging (POS):** Using NLTK

In [None]:
from nltk import word_tokenize, pos_tag

In [None]:
# Example sentence in which 'refuse' and 'permit' each appear twice
sentence = "They refuse to permit us to obtain a refuse permit"
pos_sentence = word_tokenize(sentence) # Tokenization with NLTK

# Tag each token; the printed result is a list of (token, tag) pairs
pos = pos_tag(pos_sentence)
print(pos)

[('They', 'PRP'), ('refuse', 'VBP'), ('to', 'TO'), ('permit', 'VB'), ('us', 'PRP'), ('to', 'TO'), ('obtain', 'VB'), ('a', 'DT'), ('refuse', 'NN'), ('permit', 'NN')]
