In [11]:
import unidecode
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from bs4 import BeautifulSoup

In [12]:
# Sample input used by every later cell: a small HTML fragment (<p>/<br> tags)
# that also contains contractions ("it's", "isn't"), typographic ligatures and
# a curly apostrophe, so each preprocessing step has something to act on.
text = """<p>Text preprocessing is the process of turning a document’s ﬂow of text into the quantiﬁable chunks needed for analysis. 
The initial step in text preprocessing is text indexing, which is the process of converting a text or texts into a list of words
. <br> Since a text or texts are given as unstructured forms by itself or themselves essentially, it's almost impossible to process 
its raw form directly by using a computer program. <br> In other words, text indexing means the process of segmenting a text which 
consists of sentences into individual words. <br> Note, this isn't exhaustive.</p>"""


### HTML Tags Removal

In [13]:
# parse the raw HTML string with the "html.parser" backend so the markup
# can be stripped in the next step
soup = BeautifulSoup(markup=text, features="html.parser")

In [14]:
# extract only the text content of the parsed document (tags are dropped)
# NOTE(review): the ligatures and the curly apostrophe from the input are still
# present in the result shown below, and `unidecode` is imported but not used in
# the visible cells -- if ASCII normalisation was intended, confirm and add it here.
cleaned_text = soup.get_text()

In [15]:
# inspect the text after the HTML tags were stripped
cleaned_text

"Text preprocessing is the process of turning a document’s ﬂow of text into the quantiﬁable chunks needed for analysis. \nThe initial step in text preprocessing is text indexing, which is the process of converting a text or texts into a list of words\n.  Since a text or texts are given as unstructured forms by itself or themselves essentially, it's almost impossible to process \nits raw form directly by using a computer program.  In other words, text indexing means the process of segmenting a text which \nconsists of sentences into individual words.  Note, this isn't exhaustive."

### Fixing Contractions

In [16]:
import contractions

In [17]:
# expand contractions into their full forms; in the output of the next cell
# "it's" has become "it is" and "isn't" has become "is not"

cleaned_text = contractions.fix(cleaned_text)

In [18]:
# inspect the text after the contractions were expanded
cleaned_text

'Text preprocessing is the process of turning a document’s ﬂow of text into the quantiﬁable chunks needed for analysis. \nThe initial step in text preprocessing is text indexing, which is the process of converting a text or texts into a list of words\n.  Since a text or texts are given as unstructured forms by itself or themselves essentially, it is almost impossible to process \nits raw form directly by using a computer program.  In other words, text indexing means the process of segmenting a text which \nconsists of sentences into individual words.  Note, this is not exhaustive.'

### Lowercasing text

In [19]:
# fold the whole string to lower case
cleaned_text = str.lower(cleaned_text)


In [20]:
# inspect the lower-cased text
cleaned_text

'text preprocessing is the process of turning a document’s ﬂow of text into the quantiﬁable chunks needed for analysis. \nthe initial step in text preprocessing is text indexing, which is the process of converting a text or texts into a list of words\n.  since a text or texts are given as unstructured forms by itself or themselves essentially, it is almost impossible to process \nits raw form directly by using a computer program.  in other words, text indexing means the process of segmenting a text which \nconsists of sentences into individual words.  note, this is not exhaustive.'

### Tokenization

In [24]:
# split the lower-cased string into a list of sentence strings
# (English is NLTK's default language; it is spelled out here for clarity)
cleaned_tokens = sent_tokenize(cleaned_text, language="english")

In [25]:
# inspect the list of sentences
cleaned_tokens

['text preprocessing is the process of turning a document’s ﬂow of text into the quantiﬁable chunks needed for analysis.',
 'the initial step in text preprocessing is text indexing, which is the process of converting a text or texts into a list of words\n.',
 'since a text or texts are given as unstructured forms by itself or themselves essentially, it is almost impossible to process \nits raw form directly by using a computer program.',
 'in other words, text indexing means the process of segmenting a text which \nconsists of sentences into individual words.',
 'note, this is not exhaustive.']

In [28]:
# word-tokenize every sentence separately, giving one token list per sentence
clean_words = list(map(word_tokenize, cleaned_tokens))
clean_words

[['text',
  'preprocessing',
  'is',
  'the',
  'process',
  'of',
  'turning',
  'a',
  'document',
  '’',
  's',
  'ﬂow',
  'of',
  'text',
  'into',
  'the',
  'quantiﬁable',
  'chunks',
  'needed',
  'for',
  'analysis',
  '.'],
 ['the',
  'initial',
  'step',
  'in',
  'text',
  'preprocessing',
  'is',
  'text',
  'indexing',
  ',',
  'which',
  'is',
  'the',
  'process',
  'of',
  'converting',
  'a',
  'text',
  'or',
  'texts',
  'into',
  'a',
  'list',
  'of',
  'words',
  '.'],
 ['since',
  'a',
  'text',
  'or',
  'texts',
  'are',
  'given',
  'as',
  'unstructured',
  'forms',
  'by',
  'itself',
  'or',
  'themselves',
  'essentially',
  ',',
  'it',
  'is',
  'almost',
  'impossible',
  'to',
  'process',
  'its',
  'raw',
  'form',
  'directly',
  'by',
  'using',
  'a',
  'computer',
  'program',
  '.'],
 ['in',
  'other',
  'words',
  ',',
  'text',
  'indexing',
  'means',
  'the',
  'process',
  'of',
  'segmenting',
  'a',
  'text',
  'which',
  'consists',
  'o

In [26]:
# word-tokenize the whole lower-cased string at once, producing a single flat
# list of tokens (no per-sentence grouping)
cleaned_word_tokens = word_tokenize(cleaned_text, language="english")
cleaned_word_tokens

['text',
 'preprocessing',
 'is',
 'the',
 'process',
 'of',
 'turning',
 'a',
 'document',
 '’',
 's',
 'ﬂow',
 'of',
 'text',
 'into',
 'the',
 'quantiﬁable',
 'chunks',
 'needed',
 'for',
 'analysis',
 '.',
 'the',
 'initial',
 'step',
 'in',
 'text',
 'preprocessing',
 'is',
 'text',
 'indexing',
 ',',
 'which',
 'is',
 'the',
 'process',
 'of',
 'converting',
 'a',
 'text',
 'or',
 'texts',
 'into',
 'a',
 'list',
 'of',
 'words',
 '.',
 'since',
 'a',
 'text',
 'or',
 'texts',
 'are',
 'given',
 'as',
 'unstructured',
 'forms',
 'by',
 'itself',
 'or',
 'themselves',
 'essentially',
 ',',
 'it',
 'is',
 'almost',
 'impossible',
 'to',
 'process',
 'its',
 'raw',
 'form',
 'directly',
 'by',
 'using',
 'a',
 'computer',
 'program',
 '.',
 'in',
 'other',
 'words',
 ',',
 'text',
 'indexing',
 'means',
 'the',
 'process',
 'of',
 'segmenting',
 'a',
 'text',
 'which',
 'consists',
 'of',
 'sentences',
 'into',
 'individual',
 'words',
 '.',
 'note',
 ',',
 'this',
 'is',
 'not',
 '

### Removal of Stopwords

In [30]:
# load NLTK's English stopword list; as the output below shows, this is a
# plain list of strings, not an object instance

stop_word=stopwords.words('english')
stop_word

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [33]:
# Flatten the per-sentence token lists into one list and drop every stopword.
# `stop_word` is a list, so it is converted to a set once: each membership
# test in the comprehension is then a hash lookup instead of a linear scan.
# NOTE: this rebinds `cleaned_text` from a string to a list of tokens; the
# tokenization cells above expect a string, so re-run the notebook from the
# top before re-running them.
stop_word_set = set(stop_word)
cleaned_text = [
    word
    for sent in clean_words
    for word in sent
    if word not in stop_word_set
]
cleaned_text

['text',
 'preprocessing',
 'process',
 'turning',
 'document',
 '’',
 'ﬂow',
 'text',
 'quantiﬁable',
 'chunks',
 'needed',
 'analysis',
 '.',
 'initial',
 'step',
 'text',
 'preprocessing',
 'text',
 'indexing',
 ',',
 'process',
 'converting',
 'text',
 'texts',
 'list',
 'words',
 '.',
 'since',
 'text',
 'texts',
 'given',
 'unstructured',
 'forms',
 'essentially',
 ',',
 'almost',
 'impossible',
 'process',
 'raw',
 'form',
 'directly',
 'using',
 'computer',
 'program',
 '.',
 'words',
 ',',
 'text',
 'indexing',
 'means',
 'process',
 'segmenting',
 'text',
 'consists',
 'sentences',
 'individual',
 'words',
 '.',
 'note',
 ',',
 'exhaustive',
 '.']

In [34]:
# Removing punctuation
import string

In [35]:
# show the ASCII punctuation characters that the next cell filters against
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [37]:
# Drop every token that consists solely of ASCII punctuation characters.
# `string.punctuation` is a str, so `word in string.punctuation` would be a
# substring test: a multi-character token such as '...' or '--' is not a
# substring of it and would slip through. Checking each character against a
# set removes those tokens too, and still removes every token the substring
# test removed (including the empty string).
# Non-ASCII marks such as the curly apostrophe are not in `string.punctuation`
# and are therefore kept by this step.
punctuation_chars = set(string.punctuation)
cleaned_text = [
    word
    for word in cleaned_text
    if not all(char in punctuation_chars for char in word)
]
cleaned_text

['text',
 'preprocessing',
 'process',
 'turning',
 'document',
 '’',
 'ﬂow',
 'text',
 'quantiﬁable',
 'chunks',
 'needed',
 'analysis',
 'initial',
 'step',
 'text',
 'preprocessing',
 'text',
 'indexing',
 'process',
 'converting',
 'text',
 'texts',
 'list',
 'words',
 'since',
 'text',
 'texts',
 'given',
 'unstructured',
 'forms',
 'essentially',
 'almost',
 'impossible',
 'process',
 'raw',
 'form',
 'directly',
 'using',
 'computer',
 'program',
 'words',
 'text',
 'indexing',
 'means',
 'process',
 'segmenting',
 'text',
 'consists',
 'sentences',
 'individual',
 'words',
 'note',
 'exhaustive']

In [38]:
# keep only the tokens made up entirely of alphabetic characters; this is the
# step that discards the leftover curly-apostrophe token
cleaned_text = list(filter(str.isalpha, cleaned_text))
cleaned_text

['text',
 'preprocessing',
 'process',
 'turning',
 'document',
 'ﬂow',
 'text',
 'quantiﬁable',
 'chunks',
 'needed',
 'analysis',
 'initial',
 'step',
 'text',
 'preprocessing',
 'text',
 'indexing',
 'process',
 'converting',
 'text',
 'texts',
 'list',
 'words',
 'since',
 'text',
 'texts',
 'given',
 'unstructured',
 'forms',
 'essentially',
 'almost',
 'impossible',
 'process',
 'raw',
 'form',
 'directly',
 'using',
 'computer',
 'program',
 'words',
 'text',
 'indexing',
 'means',
 'process',
 'segmenting',
 'text',
 'consists',
 'sentences',
 'individual',
 'words',
 'note',
 'exhaustive']

### Stemming

In [39]:
# instantiate NLTK's Porter stemmer; the instance is reused for every token below
stem= PorterStemmer()


In [41]:
# reduce every remaining token to its Porter stem; stems need not be real
# words (the next cell's output contains e.g. 'analysi' and 'directli')
clean_stem = list(map(stem.stem, cleaned_text))

In [42]:
# inspect the stemmed tokens
clean_stem

['text',
 'preprocess',
 'process',
 'turn',
 'document',
 'ﬂow',
 'text',
 'quantiﬁ',
 'chunk',
 'need',
 'analysi',
 'initi',
 'step',
 'text',
 'preprocess',
 'text',
 'index',
 'process',
 'convert',
 'text',
 'text',
 'list',
 'word',
 'sinc',
 'text',
 'text',
 'given',
 'unstructur',
 'form',
 'essenti',
 'almost',
 'imposs',
 'process',
 'raw',
 'form',
 'directli',
 'use',
 'comput',
 'program',
 'word',
 'text',
 'index',
 'mean',
 'process',
 'segment',
 'text',
 'consist',
 'sentenc',
 'individu',
 'word',
 'note',
 'exhaust']

### Lemmatization

In [43]:
# instantiate NLTK's WordNet lemmatizer; the instance is reused for every token below
wm=WordNetLemmatizer()

In [44]:
# lemmatize the same token list that was fed to the stemmer (not the stems).
# `lemmatize` is called without a part-of-speech argument: in the output below
# plural nouns are reduced ('chunks' -> 'chunk', 'words' -> 'word') while verb
# forms such as 'turning' and 'needed' come back unchanged.
cleaned_lema = list(map(wm.lemmatize, cleaned_text))
cleaned_lema

['text',
 'preprocessing',
 'process',
 'turning',
 'document',
 'ﬂow',
 'text',
 'quantiﬁable',
 'chunk',
 'needed',
 'analysis',
 'initial',
 'step',
 'text',
 'preprocessing',
 'text',
 'indexing',
 'process',
 'converting',
 'text',
 'text',
 'list',
 'word',
 'since',
 'text',
 'text',
 'given',
 'unstructured',
 'form',
 'essentially',
 'almost',
 'impossible',
 'process',
 'raw',
 'form',
 'directly',
 'using',
 'computer',
 'program',
 'word',
 'text',
 'indexing',
 'mean',
 'process',
 'segmenting',
 'text',
 'consists',
 'sentence',
 'individual',
 'word',
 'note',
 'exhaustive']