**Text Analytics**
1. Extract Sample document and apply following document preprocessing methods: Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization.
2. Create representation of document by calculating Term Frequency and Inverse Document Frequency.

# Setup

In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to C:\Users\Kanchan
[nltk_data]     Chintalwar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to C:\Users\Kanchan
[nltk_data]     Chintalwar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Kanchan
[nltk_data]     Chintalwar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to C:\Users\Kanchan
[nltk_data]     Chintalwar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.
[nltk_data] Downloading package omw-1.4 to C:\Users\Kanchan
[nltk_data]     Chintalwar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw-1.4.zip.
[nltk_data] Error with downloaded zip file


False

In [58]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial import distance

In [35]:
with open('./paragraph.txt') as f:
    paragraph = f.read()
    paragraph = paragraph.lower()

In [36]:
paragraph

'part-of-speech (pos) tagging is a popular natural language processing process which refers to categorizing words in a text (corpus) in correspondence with a particular part of speech, depending on the definition of the word and its context.\nlemmatization is the process of grouping together the different inflected forms of a word so they can be analyzed as a single item. lemmatization is similar to stemming but it brings context to the words.\nstemming is the process of reducing a word to its word stem that affixes to suffixes and prefixes or to the roots of words known as a lemma. stemming is important in natural language understanding (nlu) and natural language processing (nlp).'

## Tokenization 

In [37]:
sentences = sent_tokenize(paragraph)

In [38]:
print("Total Number of sentences in paragraph is ", len(sentences))

Total Number of sentences in paragraph is  5


In [39]:
print("Sentences are: - \n", sentences)

Sentences are: - 
 ['part-of-speech (pos) tagging is a popular natural language processing process which refers to categorizing words in a text (corpus) in correspondence with a particular part of speech, depending on the definition of the word and its context.', 'lemmatization is the process of grouping together the different inflected forms of a word so they can be analyzed as a single item.', 'lemmatization is similar to stemming but it brings context to the words.', 'stemming is the process of reducing a word to its word stem that affixes to suffixes and prefixes or to the roots of words known as a lemma.', 'stemming is important in natural language understanding (nlu) and natural language processing (nlp).']


In [40]:
words = word_tokenize(paragraph)
print("Total Number of words in paragraph is ", len(words))
print("Words are: - \n", words)

Total Number of words in paragraph is  127
Words are: - 
 ['part-of-speech', '(', 'pos', ')', 'tagging', 'is', 'a', 'popular', 'natural', 'language', 'processing', 'process', 'which', 'refers', 'to', 'categorizing', 'words', 'in', 'a', 'text', '(', 'corpus', ')', 'in', 'correspondence', 'with', 'a', 'particular', 'part', 'of', 'speech', ',', 'depending', 'on', 'the', 'definition', 'of', 'the', 'word', 'and', 'its', 'context', '.', 'lemmatization', 'is', 'the', 'process', 'of', 'grouping', 'together', 'the', 'different', 'inflected', 'forms', 'of', 'a', 'word', 'so', 'they', 'can', 'be', 'analyzed', 'as', 'a', 'single', 'item', '.', 'lemmatization', 'is', 'similar', 'to', 'stemming', 'but', 'it', 'brings', 'context', 'to', 'the', 'words', '.', 'stemming', 'is', 'the', 'process', 'of', 'reducing', 'a', 'word', 'to', 'its', 'word', 'stem', 'that', 'affixes', 'to', 'suffixes', 'and', 'prefixes', 'or', 'to', 'the', 'roots', 'of', 'words', 'known', 'as', 'a', 'lemma', '.', 'stemming', 'is', 

#### Full forms of all the abrevations used in POS Tagging

## POS Tagging (Parts-of-speech tagging)

In [41]:
tagged = nltk.pos_tag(words)

In [42]:
print('POS Tagged form of words which are tokenize :')
for tag in tagged:
    print(tag)

POS Tagged form of words which are tokenize :
('part-of-speech', 'NN')
('(', '(')
('pos', 'NN')
(')', ')')
('tagging', 'NN')
('is', 'VBZ')
('a', 'DT')
('popular', 'JJ')
('natural', 'JJ')
('language', 'NN')
('processing', 'NN')
('process', 'NN')
('which', 'WDT')
('refers', 'VBZ')
('to', 'TO')
('categorizing', 'VBG')
('words', 'NNS')
('in', 'IN')
('a', 'DT')
('text', 'NN')
('(', '(')
('corpus', 'NN')
(')', ')')
('in', 'IN')
('correspondence', 'NN')
('with', 'IN')
('a', 'DT')
('particular', 'JJ')
('part', 'NN')
('of', 'IN')
('speech', 'NN')
(',', ',')
('depending', 'VBG')
('on', 'IN')
('the', 'DT')
('definition', 'NN')
('of', 'IN')
('the', 'DT')
('word', 'NN')
('and', 'CC')
('its', 'PRP$')
('context', 'NN')
('.', '.')
('lemmatization', 'NN')
('is', 'VBZ')
('the', 'DT')
('process', 'NN')
('of', 'IN')
('grouping', 'VBG')
('together', 'RB')
('the', 'DT')
('different', 'JJ')
('inflected', 'JJ')
('forms', 'NNS')
('of', 'IN')
('a', 'DT')
('word', 'NN')
('so', 'IN')
('they', 'PRP')
('can', 'MD')

## Stop word removal

In [43]:
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [44]:
word_tokens = []
for word in words:
    if word not in stop_words:
        word_tokens.append(word)
print(word_tokens)

['part-of-speech', '(', 'pos', ')', 'tagging', 'popular', 'natural', 'language', 'processing', 'process', 'refers', 'categorizing', 'words', 'text', '(', 'corpus', ')', 'correspondence', 'particular', 'part', 'speech', ',', 'depending', 'definition', 'word', 'context', '.', 'lemmatization', 'process', 'grouping', 'together', 'different', 'inflected', 'forms', 'word', 'analyzed', 'single', 'item', '.', 'lemmatization', 'similar', 'stemming', 'brings', 'context', 'words', '.', 'stemming', 'process', 'reducing', 'word', 'word', 'stem', 'affixes', 'suffixes', 'prefixes', 'roots', 'words', 'known', 'lemma', '.', 'stemming', 'important', 'natural', 'language', 'understanding', '(', 'nlu', ')', 'natural', 'language', 'processing', '(', 'nlp', ')', '.']


## POS Tagging after removal of stop words

In [45]:
tagged = nltk.pos_tag(word_tokens)

In [46]:
print('POS Tagged form of words which are tokenize and from which stop words are removed:')
for tag in tagged:
    print(tag)

POS Tagged form of words which are tokenize and from which stop words are removed:
('part-of-speech', 'NN')
('(', '(')
('pos', 'NN')
(')', ')')
('tagging', 'VBG')
('popular', 'JJ')
('natural', 'JJ')
('language', 'NN')
('processing', 'NN')
('process', 'NN')
('refers', 'NNS')
('categorizing', 'VBG')
('words', 'NNS')
('text', 'NN')
('(', '(')
('corpus', 'NN')
(')', ')')
('correspondence', 'NN')
('particular', 'JJ')
('part', 'NN')
('speech', 'NN')
(',', ',')
('depending', 'VBG')
('definition', 'NN')
('word', 'NN')
('context', 'NN')
('.', '.')
('lemmatization', 'NN')
('process', 'NN')
('grouping', 'VBG')
('together', 'RB')
('different', 'JJ')
('inflected', 'JJ')
('forms', 'NNS')
('word', 'NN')
('analyzed', 'VBN')
('single', 'JJ')
('item', 'NN')
('.', '.')
('lemmatization', 'NN')
('similar', 'JJ')
('stemming', 'NN')
('brings', 'NNS')
('context', 'JJ')
('words', 'NNS')
('.', '.')
('stemming', 'VBG')
('process', 'NN')
('reducing', 'VBG')
('word', 'NN')
('word', 'NN')
('stem', 'NN')
('affixes',

### Stemming

In [47]:
ps = PorterStemmer()

In [48]:
print('Results of Stemming')
stemmed = {word: ps.stem(word) for word in word_tokens}
for pair in stemmed.items():
    print('{0} --> {1}'.format(pair[0], pair[1]))

Results of Stemming
part-of-speech --> part-of-speech
( --> (
pos --> po
) --> )
tagging --> tag
popular --> popular
natural --> natur
language --> languag
processing --> process
process --> process
refers --> refer
categorizing --> categor
words --> word
text --> text
corpus --> corpu
correspondence --> correspond
particular --> particular
part --> part
speech --> speech
, --> ,
depending --> depend
definition --> definit
word --> word
context --> context
. --> .
lemmatization --> lemmat
grouping --> group
together --> togeth
different --> differ
inflected --> inflect
forms --> form
analyzed --> analyz
single --> singl
item --> item
similar --> similar
stemming --> stem
brings --> bring
reducing --> reduc
stem --> stem
affixes --> affix
suffixes --> suffix
prefixes --> prefix
roots --> root
known --> known
lemma --> lemma
important --> import
understanding --> understand
nlu --> nlu
nlp --> nlp


### Lemmatization

In [49]:
lemmatizer = WordNetLemmatizer()

In [50]:
print('Results of Lemmatization')
lemmatized = {word: lemmatizer.lemmatize(word) for word in word_tokens}
for pair in lemmatized.items():
    print('{0} --> {1}'.format(pair[0], pair[1]))

Results of Lemmatization
part-of-speech --> part-of-speech
( --> (
pos --> po
) --> )
tagging --> tagging
popular --> popular
natural --> natural
language --> language
processing --> processing
process --> process
refers --> refers
categorizing --> categorizing
words --> word
text --> text
corpus --> corpus
correspondence --> correspondence
particular --> particular
part --> part
speech --> speech
, --> ,
depending --> depending
definition --> definition
word --> word
context --> context
. --> .
lemmatization --> lemmatization
grouping --> grouping
together --> together
different --> different
inflected --> inflected
forms --> form
analyzed --> analyzed
single --> single
item --> item
similar --> similar
stemming --> stemming
brings --> brings
reducing --> reducing
stem --> stem
affixes --> affix
suffixes --> suffix
prefixes --> prefix
roots --> root
known --> known
lemma --> lemma
important --> important
understanding --> understanding
nlu --> nlu
nlp --> nlp


### Term Frequency and Inverse Term Frequency

In [51]:
def arr_convert_1d(arr):
    arr = np.array(arr)
    arr = np.concatenate( arr, axis=0 )
    arr = np.concatenate( arr, axis=0 )
    return arr

In [52]:
cos = []
def cosine(trans):
    cos.append(cosine_similarity(trans[0], trans[1]))

In [53]:
manhatten = []
def manhatten_distance(trans):
    manhatten.append(pairwise_distances(trans[0], trans[1], metric = 'manhattan'))

In [54]:
euclidean = []
def euclidean_function(vectors):
    euc=euclidean_distances(vectors[0], vectors[1])
    euclidean.append(euc)

In [55]:
def tfidf(str1, str2):
    vect = TfidfVectorizer()
    vect.fit(word_tokens)
    corpus = [str1,str2]
    trans = vect.transform(corpus)
    euclidean_function(trans)
    cosine(trans)
    manhatten_distance(trans)
    return convert()

In [56]:
def convert():
    dataf = pd.DataFrame()
    lis2 = arr_c onvert_1d(manhatten)
    dataf['manhatten'] = lis2
    lis2 = arr_convert_1d(cos)
    dataf['cos_sim'] = lis2
    lis2 = arr_convert_1d(euclidean)
    dataf['euclidean'] = lis2
    return dataf

In [60]:
str1 = 'Stemming'
str2 = 'Lemmatization'
newData = tfidf(str1,str2);
print(newData);

   manhatten  cos_sim  euclidean
0        1.0      0.0   1.000000
1        2.0      0.0   1.414214
