In [None]:
# Text Analytics 
# 1. Extract Sample document and apply following document preprocessing methods: 
# Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization. 
# 2. Create representation of document by calculating Term Frequency and Inverse Document 
# Frequency. 

In [56]:
import nltk as nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import math

In [57]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [58]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [59]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [60]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [61]:
f = open('doc_01.txt')
text = f.read()
print(text)

Between 2016 and 2019, the state forest department under theÂ BJPÂ government had launched â€˜Green Maharashtraâ€™ drive with an aim to plant 50 crore trees across the state in the four-year period. In October 2019, the government had claimed it had surpassed the target by planting 33 crore trees in July-September 2019.Â The Indian ExpressÂ had found that non-forest agencies â€” such as gram panchayats â€” which were tasked with planting trees had not uploaded the mandatory audio-visual proof of the tree plantation drives on the specially created portal.
In Pune Revenue Division, it was claimed the gram panchayats planted 1.7 crore saplings; however, no evidence was uploaded for 87 per cent (1.49 crore) saplings. Also, out of the 59 government agencies involved in the drive as many as 38 had not submitted survival reports about the saplings.
This year, the targets set by the forest department were comparatively modest. For example, Pune Circle â€” which comprises three divisions in Pun

In [62]:
# words separated
tokens = word_tokenize(text)
print(tokens[:5])

['Between', '2016', 'and', '2019', ',']


In [63]:
# gives the list of stoopwords
stopword_corpus = stopwords.words('english')
print(stopword_corpus[:5])

['i', 'me', 'my', 'myself', 'we']


In [64]:
def remove_stopwords(tokens,stopwords):
    filtered_tokens = []
    for i in tokens:
        if (i not in stopwords):
            filtered_tokens.append(i)
    return filtered_tokens

In [65]:
# renoval of the stopwords
tokens_without_stopwords  = remove_stopwords(tokens,stopword_corpus)
print(tokens_without_stopwords[:5])

['Between', '2016', '2019', ',', 'state']


In [66]:
# if this line executes then removal was successfull
assert len(tokens_without_stopwords)<=len(tokens)

In [67]:
# give the types of words
pos_tagged_tokens = nltk.pos_tag(tokens_without_stopwords)
print(pos_tagged_tokens[:5])

[('Between', 'IN'), ('2016', 'CD'), ('2019', 'CD'), (',', ','), ('state', 'NN')]


In [68]:
def stem_tokens(tokens):
    stemmer = PorterStemmer()
    stemmed_tokens = []
    for i in tokens:
        stemmed_token = stemmer.stem(i)
        stemmed_tokens.append(stemmed_token)
        
    return stemmed_tokens

In [69]:
# word into its base form
stemmed_tokens = stem_tokens(tokens_without_stopwords)
print(stemmed_tokens[:5])

['between', '2016', '2019', ',', 'state']


In [70]:
def lemmatize_tokens(tokens):
  lemmatizer = WordNetLemmatizer()
  lemmatized_tokens = []
  for token in tokens:
    lemmatized_token = lemmatizer.lemmatize(token)
    lemmatized_tokens.append(lemmatized_token)
  return lemmatized_tokens

In [71]:
# it ensures the lematized word is valid
lemmatized_tokens = lemmatize_tokens(tokens_without_stopwords)
print(lemmatized_tokens[:5])

['Between', '2016', '2019', ',', 'state']


In [72]:
documents = text.split("\n")

In [73]:
documents

['Between 2016 and 2019, the state forest department under theÂ\xa0BJPÂ\xa0government had launched â€˜Green Maharashtraâ€™ drive with an aim to plant 50 crore trees across the state in the four-year period. In October 2019, the government had claimed it had surpassed the target by planting 33 crore trees in July-September 2019.Â\xa0The Indian ExpressÂ\xa0had found that non-forest agencies â€” such as gram panchayats â€” which were tasked with planting trees had not uploaded the mandatory audio-visual proof of the tree plantation drives on the specially created portal.',
 'In Pune Revenue Division, it was claimed the gram panchayats planted 1.7 crore saplings; however, no evidence was uploaded for 87 per cent (1.49 crore) saplings. Also, out of the 59 government agencies involved in the drive as many as 38 had not submitted survival reports about the saplings.',
 'This year, the targets set by the forest department were comparatively modest. For example, Pune Circle â€” which comprises 

In [74]:
def get_document_tokens(documents):
  document_tokens = []
  for document in documents:
    tokens = word_tokenize(document)
    document_tokens.append(tokens)
  return document_tokens

In [75]:
document_tokens = get_document_tokens(documents)

In [76]:
len(document_tokens[0])

95

In [77]:
def get_term_frequency(document_tokens):
  term_frequencies = []
  for document in document_tokens:
    term_frequency={}
    for token in document:
      try :
        term_frequency[token]+=1
      except:
        term_frequency[token] = 1
    n = len(document)
    unique_tokens = set(document)
    for token in unique_tokens:
      term_frequency[token] /= n
    term_frequencies.append(term_frequency)
  return term_frequencies

In [78]:
term_frequency = get_term_frequency(document_tokens)
term_frequency

[{'Between': 0.010526315789473684,
  '2016': 0.010526315789473684,
  'and': 0.010526315789473684,
  '2019': 0.021052631578947368,
  ',': 0.021052631578947368,
  'the': 0.08421052631578947,
  'state': 0.021052631578947368,
  'forest': 0.010526315789473684,
  'department': 0.010526315789473684,
  'under': 0.010526315789473684,
  'theÂ': 0.010526315789473684,
  'BJPÂ': 0.010526315789473684,
  'government': 0.021052631578947368,
  'had': 0.05263157894736842,
  'launched': 0.010526315789473684,
  'â€˜Green': 0.010526315789473684,
  'Maharashtraâ€™': 0.010526315789473684,
  'drive': 0.010526315789473684,
  'with': 0.021052631578947368,
  'an': 0.010526315789473684,
  'aim': 0.010526315789473684,
  'to': 0.010526315789473684,
  'plant': 0.010526315789473684,
  '50': 0.010526315789473684,
  'crore': 0.021052631578947368,
  'trees': 0.031578947368421054,
  'across': 0.010526315789473684,
  'in': 0.021052631578947368,
  'four-year': 0.010526315789473684,
  'period': 0.010526315789473684,
  '.': 

In [79]:
def get_inverse_document_frequency(term_frequency):
  document_cnt = len(term_frequency)
  inverse_document_frequencies = []
  for document in term_frequency:
    tokens = list(document.keys())
    inverse_document_frequency = {}
    for token in tokens:
      document_freq=1
      for doc in term_frequency:
        if(token in doc.keys()):
          document_freq+=1
      inverse_document_frequency[token] = math.log(document_cnt/document_freq)
    inverse_document_frequencies.append(inverse_document_frequency)
  return inverse_document_frequencies

In [80]:
inverse_document_frequency = get_inverse_document_frequency(term_frequency)