# Text Pre-processing 


## 1- Importing libraries

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import nltk

## 2- Basics (Preprocessing)

###  2.1- NLTK Corpora    

In [7]:
from nltk.corpus import product_reviews_1

#Each dataset contains text in text files and to read any file we need to know its name.

product_reviews_1.fileids()


['Apex_AD2600_Progressive_scan_DVD player.txt',
 'Canon_G3.txt',
 'Creative_Labs_Nomad_Jukebox_Zen_Xtra_40GB.txt',
 'Nikon_coolpix_4300.txt',
 'Nokia_6610.txt',
 'README.txt']

In [8]:
# Will read raw text from this file
product_review_raw = product_reviews_1.raw('Apex_AD2600_Progressive_scan_DVD player.txt')
product_review_raw[:750] 
#We are setting upper limit otherwise it will product the big output with lots of scrolling 

'*****************************************************************************\n* Annotated by: Minqing Hu and Bing Liu, 2004.\n*\t\tDepartment of Computer Sicence\n*               University of Illinois at Chicago              \n*\n* Product name: Apex AD2600 Progressive-scan DVD player\n* Review Source: amazon.com\n*\n* See Readme.txt to find the meaning of each symbol. \n*****************************************************************************\n\n[t] troubleshooting ad-2500 and ad-2600 no picture scrolling b/w . \n##repost from january 13 , 2004 with a better fit title . \n##does your apex dvd player only play dvd audio without video ? \n##or does it play audio and video but scrolling in black and white ? \n##before you try to return the player or was'

In [9]:
# Will break down file in sentences
product_review_sents = product_reviews_1.sents('Apex_AD2600_Progressive_scan_DVD player.txt')
product_review_sents

[['repost', 'from', 'january', '13', ',', '2004', 'with', 'a', 'better', 'fit', 'title', '.'], ['does', 'your', 'apex', 'dvd', 'player', 'only', 'play', 'dvd', 'audio', 'without', 'video', '?'], ...]

In [10]:
# Will break down file in words
product_review_words = product_reviews_1.words('Apex_AD2600_Progressive_scan_DVD player.txt')
product_review_words

['repost', 'from', 'january', '13', ',', '2004', ...]

### 2.2- Stopwords

In [11]:
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
print(stoplist)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [14]:
#Let's check the difference between product_reviews_1 length with or without stop_words

print(f'word length with stopwords {len(product_review_words)}')
product_review_wo_stopwords = [word for word in product_review_words if not word in stoplist]
print(f'word length without stopwords {len(product_review_wo_stopwords)}')

word length with stopwords 12593
word length without stopwords 7190


In [15]:
#A 'Token' is a single entity of whole entity we are referreing to.
#We can perform sentence and word split in below way:

from nltk.tokenize import sent_tokenize, word_tokenize

print(f'Word Tokens - \n{sent_tokenize(product_review_raw[750:1250])}\n\n\n')
print(f'Sentence Tokens - \n{word_tokenize(product_review_raw[750:1250])}')


Word Tokens - 
['te hours calling apex tech support , or run the player over with your car , try these simple troubleshooting ideas first .', '##no picture : \n##hopefully you still have the remote control .', '##if you tossed it out the window , you need to fetch it .', '##using the remote control , press the i/p button located on the bottom right corner of the remote .', '##the i/p button switches the tv display between interlace and progressive .', '##if this doesnt bring back the picture , try pressing this button with']



Sentence Tokens - 
['te', 'hours', 'calling', 'apex', 'tech', 'support', ',', 'or', 'run', 'the', 'player', 'over', 'with', 'your', 'car', ',', 'try', 'these', 'simple', 'troubleshooting', 'ideas', 'first', '.', '#', '#', 'no', 'picture', ':', '#', '#', 'hopefully', 'you', 'still', 'have', 'the', 'remote', 'control', '.', '#', '#', 'if', 'you', 'tossed', 'it', 'out', 'the', 'window', ',', 'you', 'need', 'to', 'fetch', 'it', '.', '#', '#', 'using', 'the', 'remote

### 2.4- Stemming & Lemmatization

They both are used for text normalization. Stemming basically removes the redundancy by bringing everything in its simple form for example 'dancing' & 'dancer' becomes 'dance' in this. On the other hand, Lemmatization does the morphological analysis and keeps part of speech into consideration. This can be better understood by examples :

In [16]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()
word_lemmatizer = WordNetLemmatizer()


for w in word_tokenize(product_review_raw[750:1250]):
    print(f'Actual Word - {w}')
    print(f'Stem - {porter_stemmer.stem(w)}')
    print(f'Lemma - {word_lemmatizer.lemmatize(w)}\n')

Actual Word - te
Stem - te
Lemma - te

Actual Word - hours
Stem - hour
Lemma - hour

Actual Word - calling
Stem - call
Lemma - calling

Actual Word - apex
Stem - apex
Lemma - apex

Actual Word - tech
Stem - tech
Lemma - tech

Actual Word - support
Stem - support
Lemma - support

Actual Word - ,
Stem - ,
Lemma - ,

Actual Word - or
Stem - or
Lemma - or

Actual Word - run
Stem - run
Lemma - run

Actual Word - the
Stem - the
Lemma - the

Actual Word - player
Stem - player
Lemma - player

Actual Word - over
Stem - over
Lemma - over

Actual Word - with
Stem - with
Lemma - with

Actual Word - your
Stem - your
Lemma - your

Actual Word - car
Stem - car
Lemma - car

Actual Word - ,
Stem - ,
Lemma - ,

Actual Word - try
Stem - tri
Lemma - try

Actual Word - these
Stem - these
Lemma - these

Actual Word - simple
Stem - simpl
Lemma - simple

Actual Word - troubleshooting
Stem - troubleshoot
Lemma - troubleshooting

Actual Word - ideas
Stem - idea
Lemma - idea

Actual Word - first
Stem - first
Lem

### 2.5- Part of Speech Tagging

Also know as POS Taggin or POST. Why POST requried ? Because same sentence or paragraph can have the same word in different grammatically contexts and it is not a good idea to consider the second occurrence as redundancy, so as a solution we prefer tagging each word with its Part of Speech to make it grammatically unique. Consider below example, here all above words are not grammatically same.

    The heavens are above. (Adverb)

    The moral code of conduct is above the civil code of conduct. (Proposition)

    Our blessings come from above. (Noun)


In [17]:
product_review_raw_word = word_tokenize(product_review_raw)
nltk.pos_tag(product_review_raw_word)

[('*****************************************************************************',
  'JJ'),
 ('*', 'NNP'),
 ('Annotated', 'VBN'),
 ('by', 'IN'),
 (':', ':'),
 ('Minqing', 'NNP'),
 ('Hu', 'NNP'),
 ('and', 'CC'),
 ('Bing', 'NNP'),
 ('Liu', 'NNP'),
 (',', ','),
 ('2004', 'CD'),
 ('.', '.'),
 ('*', 'CC'),
 ('Department', 'NNP'),
 ('of', 'IN'),
 ('Computer', 'NNP'),
 ('Sicence', 'NNP'),
 ('*', 'NNP'),
 ('University', 'NNP'),
 ('of', 'IN'),
 ('Illinois', 'NNP'),
 ('at', 'IN'),
 ('Chicago', 'NNP'),
 ('*', 'NNP'),
 ('*', 'NNP'),
 ('Product', 'NNP'),
 ('name', 'NN'),
 (':', ':'),
 ('Apex', 'NNP'),
 ('AD2600', 'NNP'),
 ('Progressive-scan', 'JJ'),
 ('DVD', 'NNP'),
 ('player', 'NN'),
 ('*', 'NNP'),
 ('Review', 'NNP'),
 ('Source', 'NNP'),
 (':', ':'),
 ('amazon.com', 'NN'),
 ('*', 'VBZ'),
 ('*', 'NNP'),
 ('See', 'NNP'),
 ('Readme.txt', 'NNP'),
 ('to', 'TO'),
 ('find', 'VB'),
 ('the', 'DT'),
 ('meaning', 'NN'),
 ('of', 'IN'),
 ('each', 'DT'),
 ('symbol', 'NN'),
 ('.', '.'),
 ('**********************

## 3-Feature Extraction (Vectorization)

We can not use text directly to train our models. We need to convert it in the form of features, only then it can be used to train any model for desired outcome and we know very well that most of the models respond to the numeric features very well. So we need to bring all these text representations in the form of numbers.

There are two popular approaches to extract features from texts:

    Count the number of occurrece of each word in a document.
    Calculate the frequency of each word occurrence out of all word in a document.

Few most commonly used techniqus to perform feature extraction are:
1. Bag of Words
2. TF-IDF (Term Frequency - Inverse Document Frequency)

### 3.1- Bag of Words

Bag of words is one of the simplest approaches of feature extraction, here we simply keep the frequency count of all unique words and consider it as a feature. Example:

Suppose we have below sentences (also referred as documents):

        Must have a subject and a verb.
        Must express a complete thought.
        Must only have one clause.

Feature extraction we need to perform are:

1. Identify Unique words Unique words from all documents are: must, have, a, subject, and, verb, express, complete, thought, only, one, clause

2. Perform Vectorization we need to find the frequency count of each unique word and if it is not there then we need to put 0. For eg vector for first document can be formed as:

    must - 1
    have - 1
    subject - 1
    and - 1
    verb - 1
    express - 0
    complete - 0
    thought - 0
    only - 0
    one - 0
    clause - 0

So, it will become

        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]

similar way doucument2 and document3 will become:

        [1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0]
        [1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1]



In [18]:
#scikit-learn library provides CountVectorizer class to perform this action

from sklearn.feature_extraction.text import CountVectorizer
# initialize sample document
sample_documents = ['Must have a subject and a verb','Must express a complete thought','Must only have one clause']
# instantiate
vectorizer = CountVectorizer()
vectorizer.fit(sample_documents)
# summarize
print(f':: vector vocabulary - {vectorizer.vocabulary_}\n')
# encode document
vector = vectorizer.transform(sample_documents)
# summarize encoded vector
print(f':: vector shape - {vector.shape}\n')
print(f':: vector list - {vector.toarray()}')


:: vector vocabulary - {'must': 5, 'have': 4, 'subject': 8, 'and': 0, 'verb': 10, 'express': 3, 'complete': 2, 'thought': 9, 'only': 7, 'one': 6, 'clause': 1}

:: vector shape - (3, 11)

:: vector list - [[1 0 0 0 1 1 0 0 1 0 1]
 [0 0 1 1 0 1 0 0 0 1 0]
 [0 1 0 0 1 1 1 1 0 0 0]]


### 3.2- Term Frequency – Inverse Document Frequency (TF – IDF)

In [19]:


from sklearn.feature_extraction.text import TfidfVectorizer
# initialize sample document
sample_documents = ['Must have a subject and a verb','Must express a complete thought','Must only have one clause']
# instantiate
vectorizer = TfidfVectorizer()
vectorizer.fit(sample_documents)
# summarize
print(f':: vector vocabulary - {vectorizer.vocabulary_}\n')
# encode document
vector = vectorizer.transform(sample_documents)
# summarize encoded vector
print(f':: vector shape - {vector.shape}\n')
print(f':: vector list - {vector.toarray()}')



:: vector vocabulary - {'must': 5, 'have': 4, 'subject': 8, 'and': 0, 'verb': 10, 'express': 3, 'complete': 2, 'thought': 9, 'only': 7, 'one': 6, 'clause': 1}

:: vector shape - (3, 11)

:: vector list - [[0.50461134 0.         0.         0.         0.38376993 0.29803159
  0.         0.         0.50461134 0.         0.50461134]
 [0.         0.         0.54645401 0.54645401 0.         0.32274454
  0.         0.         0.         0.54645401 0.        ]
 [0.         0.50461134 0.         0.         0.38376993 0.29803159
  0.50461134 0.50461134 0.         0.         0.        ]]


### 2.3- Tokenization

It is the most popular method to perform feature extraction. To understand better let's understand TF and IDF separately.

Term Frequency: Simply finds out the frequency of a word in document.
Inverse Document Frequency: Assigns a lower weight to the words which appear most frequently. It basically depicts the rarity of the word in all documents.