# Text Analytics
1. Extract Sample document and apply following document preprocessing methods:
Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization.
2. Create representation of document by calculating Term Frequency and Inverse Document
Frequency.

In [42]:
"""
Sample data = "Hi, this is text analysis practical ! Hope you will enjoy it."
1. Cleaning = ,.!?#$&""''[]{}()
2. Tokenization = ['Hi', 'this']
3. POS tagging = It return the part of speech
4. Stop word removal = stop words like is, the , a, etc
5. Stemming = playing => play, articles => article , stremming based on some rules, analysis
6. Lemmatization = search in dictionary and get actual word mice => mouse, playing => play
"""

'\nSample data = "Hi, this is text analysis practical ! Hope you will enjoy it."\n1. Cleaning = ,.!?#$&""\'\'[]{}()\n2. Tokenization = [\'Hi\', \'this\']\n3. POS tagging = It return the part of speech\n4. Stop word removal = stop words like is, the , a, etc\n5. Stemming = playing => play, articles => article , stremming based on some rules, analysis\n6. Lemmatization = search in dictionary and get actual word mice => mouse, playing => play\n'

In [43]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
import re
from nltk import word_tokenize, pos_tag, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Akhilesh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Akhilesh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Akhilesh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Akhilesh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Cleaning

In [62]:
sample_data = "Hi, this is text analysis practical ! Hoping you enjoyed it. Do you liked it? playing, gaming, mice, words, running, boring, better, went"

def clean_data(data):
    text = re.sub(r'[^A-Za-z\s]', '', data)
    text = text.lower()
    return text

cleaned_data = clean_data(sample_data)
cleaned_data

'hi this is text analysis practical  hoping you enjoyed it do you liked it playing gaming mice words running boring better went'

### Tokenization

In [63]:
tokens = word_tokenize(cleaned_data)
tokens

['hi',
 'this',
 'is',
 'text',
 'analysis',
 'practical',
 'hoping',
 'you',
 'enjoyed',
 'it',
 'do',
 'you',
 'liked',
 'it',
 'playing',
 'gaming',
 'mice',
 'words',
 'running',
 'boring',
 'better',
 'went']

### POS Tagging (parts of speech)

In [64]:
pos_tags = pos_tag(tokens)
pos_tags

[('hi', 'NN'),
 ('this', 'DT'),
 ('is', 'VBZ'),
 ('text', 'JJ'),
 ('analysis', 'NN'),
 ('practical', 'JJ'),
 ('hoping', 'NN'),
 ('you', 'PRP'),
 ('enjoyed', 'VBD'),
 ('it', 'PRP'),
 ('do', 'VB'),
 ('you', 'PRP'),
 ('liked', 'VB'),
 ('it', 'PRP'),
 ('playing', 'VBG'),
 ('gaming', 'VBG'),
 ('mice', 'JJ'),
 ('words', 'NNS'),
 ('running', 'VBG'),
 ('boring', 'NN'),
 ('better', 'RBR'),
 ('went', 'VBD')]

### Stop words removal

In [65]:
stop_words = set(stopwords.words('english'))
"""
Method 1 : 
def remove_stwd(tokens, stopwords, list):
    for word in tokens:
        if word not in stopwords:
            list.append(word)
    return list
print(remove_stwd(tokens, stop_words, []))
"""

#Method 2

rem_stwords = [word for word in tokens if word not in stop_words]
rem_stwords

['hi',
 'text',
 'analysis',
 'practical',
 'hoping',
 'enjoyed',
 'liked',
 'playing',
 'gaming',
 'mice',
 'words',
 'running',
 'boring',
 'better',
 'went']

### Stemming

In [66]:
stemmer = PorterStemmer()
stemmed_word = [stemmer.stem(word) for word in rem_stwords]
stemmed_word

['hi',
 'text',
 'analysi',
 'practic',
 'hope',
 'enjoy',
 'like',
 'play',
 'game',
 'mice',
 'word',
 'run',
 'bore',
 'better',
 'went']

### Lemmatizing

In [67]:
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in rem_stwords]
lemmatized_words

['hi',
 'text',
 'analysis',
 'practical',
 'hoping',
 'enjoyed',
 'liked',
 'playing',
 'gaming',
 'mouse',
 'word',
 'running',
 'boring',
 'better',
 'went']

### TF-IDF
1. TF = no. of t terms in doc/ total number of terms in doc
2. IDF = log(no. of documents in collection/ no. of documents containing term t)
3. TF-IDF Score = TF*IDF

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_model = TfidfVectorizer()
tfidf_matrix = tfidf_model.fit_transform([cleaned_data])
print(tfidf_matrix)

  (0, 17)	0.19611613513818404
  (0, 1)	0.19611613513818404
  (0, 2)	0.19611613513818404
  (0, 14)	0.19611613513818404
  (0, 18)	0.19611613513818404
  (0, 11)	0.19611613513818404
  (0, 5)	0.19611613513818404
  (0, 12)	0.19611613513818404
  (0, 10)	0.19611613513818404
  (0, 3)	0.19611613513818404
  (0, 9)	0.3922322702763681
  (0, 4)	0.19611613513818404
  (0, 19)	0.3922322702763681
  (0, 7)	0.19611613513818404
  (0, 13)	0.19611613513818404
  (0, 0)	0.19611613513818404
  (0, 15)	0.19611613513818404
  (0, 8)	0.19611613513818404
  (0, 16)	0.19611613513818404
  (0, 6)	0.19611613513818404
