#  Import required libraries

In [79]:
#  Import required libraries
import math
import re
import nltk

#  Download necessary NLTK data
nltk.download('punkt')                       # Tokenizer models
nltk.download('punkt_tab')                   # (Uncommon, usually not needed separately)
nltk.download('stopwords')                   # List of stopwords
nltk.download('averaged_perceptron_tagger')   # POS Tagger for English
nltk.download('averaged_perceptron_tagger_eng') # (Uncommon, might be misnamed or redundant)
nltk.download('wordnet')                     # WordNet corpus for lemmatization

#  Import NLTK components after downloads
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


[nltk_data] Downloading package punkt to /home/jagdish/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/jagdish/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jagdish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jagdish/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jagdish/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to /home/jagdish/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Text Preprocessing

In [None]:

with open('doc_01.txt', 'r') as file:
    doc1 = file.read()

with open('doc_02.txt', 'r') as file:
    doc2 = file.read()

# Tokenization
Tokenization is the process of breaking down a text into individual words or tokens. This is often the first step in natural language processing tasks

In [None]:
word_tokens = nltk.word_tokenize(doc1)
print(word_tokens)

['Between', '2016', 'and', '2019', ',', 'the', 'state', 'forest', 'department', 'under', 'the', 'BJP', 'government', 'had', 'launched', '‘', 'Green', 'Maharashtra', '’', 'drive', 'with', 'an', 'aim', 'to', 'plant', '50', 'crore', 'trees', 'across', 'the', 'state', 'in', 'the', 'four-year', 'period', '.', 'In', 'October', '2019', ',', 'the', 'government', 'had', 'claimed', 'it', 'had', 'surpassed', 'the', 'target', 'by', 'planting', '33', 'crore', 'trees', 'in', 'July-September', '2019', '.', 'The', 'Indian', 'Express', 'had', 'found', 'that', 'non-forest', 'agencies', '—', 'such', 'as', 'gram', 'panchayats', '—', 'which', 'were', 'tasked', 'with', 'planting', 'trees', 'had', 'not', 'uploaded', 'the', 'mandatory', 'audio-visual', 'proof', 'of', 'the', 'tree', 'plantation', 'drives', 'on', 'the', 'specially', 'created', 'portal', '.', 'In', 'Pune', 'Revenue', 'Division', ',', 'it', 'was', 'claimed', 'the', 'gram', 'panchayats', 'planted', '1.7', 'crore', 'saplings', ';', 'however', ',', 

# Stop Words
Stop words are common words like 'the', 'is', 'and', etc., which often do not carry significant meaning in text analysis. Remove these stop words from the text to focus on the more meaningful content.

In [82]:
stop_words = set(stopwords.words('english'))

filtered_tokens = []
for token in word_tokens:
    if token not in stop_words:
        filtered_tokens.append(token)

print(filtered_tokens)

['Between', '2016', '2019', ',', 'state', 'forest', 'department', 'BJP', 'government', 'launched', '‘', 'Green', 'Maharashtra', '’', 'drive', 'aim', 'plant', '50', 'crore', 'trees', 'across', 'state', 'four-year', 'period', '.', 'In', 'October', '2019', ',', 'government', 'claimed', 'surpassed', 'target', 'planting', '33', 'crore', 'trees', 'July-September', '2019', '.', 'The', 'Indian', 'Express', 'found', 'non-forest', 'agencies', '—', 'gram', 'panchayats', '—', 'tasked', 'planting', 'trees', 'uploaded', 'mandatory', 'audio-visual', 'proof', 'tree', 'plantation', 'drives', 'specially', 'created', 'portal', '.', 'In', 'Pune', 'Revenue', 'Division', ',', 'claimed', 'gram', 'panchayats', 'planted', '1.7', 'crore', 'saplings', ';', 'however', ',', 'evidence', 'uploaded', '87', 'per', 'cent', '(', '1.49', 'crore', ')', 'saplings', '.', 'Also', ',', '59', 'government', 'agencies', 'involved', 'drive', 'many', '38', 'submitted', 'survival', 'reports', 'saplings', '.', 'This', 'year', ',', '

# POS Tagging
POS tagging involves labeling each word in a sentence with its corresponding part of speech, such as noun, verb, adjective, etc.

In [83]:
tagged = nltk.pos_tag(word_tokens, lang='eng')
print(tagged)


[('Between', 'JJ'), ('2016', 'CD'), ('and', 'CC'), ('2019', 'CD'), (',', ','), ('the', 'DT'), ('state', 'NN'), ('forest', 'JJS'), ('department', 'NN'), ('under', 'IN'), ('the', 'DT'), ('BJP', 'NNP'), ('government', 'NN'), ('had', 'VBD'), ('launched', 'VBN'), ('‘', 'RB'), ('Green', 'NNP'), ('Maharashtra', 'NNP'), ('’', 'NNP'), ('drive', 'NN'), ('with', 'IN'), ('an', 'DT'), ('aim', 'NN'), ('to', 'TO'), ('plant', 'NN'), ('50', 'CD'), ('crore', 'NN'), ('trees', 'NNS'), ('across', 'IN'), ('the', 'DT'), ('state', 'NN'), ('in', 'IN'), ('the', 'DT'), ('four-year', 'JJ'), ('period', 'NN'), ('.', '.'), ('In', 'IN'), ('October', 'NNP'), ('2019', 'CD'), (',', ','), ('the', 'DT'), ('government', 'NN'), ('had', 'VBD'), ('claimed', 'VBN'), ('it', 'PRP'), ('had', 'VBD'), ('surpassed', 'VBN'), ('the', 'DT'), ('target', 'NN'), ('by', 'IN'), ('planting', 'VBG'), ('33', 'CD'), ('crore', 'NN'), ('trees', 'NNS'), ('in', 'IN'), ('July-September', 'NNP'), ('2019', 'CD'), ('.', '.'), ('The', 'DT'), ('Indian', 

# Stemming

 Stemming means reducing a word to its base or root form. For example, the words "running", "ran", and "runner" can all be reduced to the root word "run". Stemming is often used in information retrieval and natural language processing tasks to improve the performance of algorithms by reducing the number of unique words in a dataset. However, stemming can sometimes result in words that are not actual words in the language, so it is important to use it judiciously.

In [91]:
stemmer = nltk.stem.PorterStemmer()

stemmed_tokens = []
for token in word_tokens:
    stemmed_tokens.append(stemmer.stem(token))

print(stemmed_tokens)


['between', '2016', 'and', '2019', ',', 'the', 'state', 'forest', 'depart', 'under', 'the', 'bjp', 'govern', 'had', 'launch', '‘', 'green', 'maharashtra', '’', 'drive', 'with', 'an', 'aim', 'to', 'plant', '50', 'crore', 'tree', 'across', 'the', 'state', 'in', 'the', 'four-year', 'period', '.', 'in', 'octob', '2019', ',', 'the', 'govern', 'had', 'claim', 'it', 'had', 'surpass', 'the', 'target', 'by', 'plant', '33', 'crore', 'tree', 'in', 'july-septemb', '2019', '.', 'the', 'indian', 'express', 'had', 'found', 'that', 'non-forest', 'agenc', '—', 'such', 'as', 'gram', 'panchayat', '—', 'which', 'were', 'task', 'with', 'plant', 'tree', 'had', 'not', 'upload', 'the', 'mandatori', 'audio-visu', 'proof', 'of', 'the', 'tree', 'plantat', 'drive', 'on', 'the', 'special', 'creat', 'portal', '.', 'in', 'pune', 'revenu', 'divis', ',', 'it', 'wa', 'claim', 'the', 'gram', 'panchayat', 'plant', '1.7', 'crore', 'sapl', ';', 'howev', ',', 'no', 'evid', 'wa', 'upload', 'for', '87', 'per', 'cent', '(', '1

# Lemmatization
Lemmatization means to reduce a word to its base or root form. For example, the word "running" would be lemmatized to "run".

In [98]:
lemmatizer = nltk.stem.WordNetLemmatizer()


lemmatized_tokens = []
for token in word_tokens:
    lemmatized_tokens.append(lemmatizer.lemmatize(token))

print(lemmatized_tokens)


['Between', '2016', 'and', '2019', ',', 'the', 'state', 'forest', 'department', 'under', 'the', 'BJP', 'government', 'had', 'launched', '‘', 'Green', 'Maharashtra', '’', 'drive', 'with', 'an', 'aim', 'to', 'plant', '50', 'crore', 'tree', 'across', 'the', 'state', 'in', 'the', 'four-year', 'period', '.', 'In', 'October', '2019', ',', 'the', 'government', 'had', 'claimed', 'it', 'had', 'surpassed', 'the', 'target', 'by', 'planting', '33', 'crore', 'tree', 'in', 'July-September', '2019', '.', 'The', 'Indian', 'Express', 'had', 'found', 'that', 'non-forest', 'agency', '—', 'such', 'a', 'gram', 'panchayat', '—', 'which', 'were', 'tasked', 'with', 'planting', 'tree', 'had', 'not', 'uploaded', 'the', 'mandatory', 'audio-visual', 'proof', 'of', 'the', 'tree', 'plantation', 'drive', 'on', 'the', 'specially', 'created', 'portal', '.', 'In', 'Pune', 'Revenue', 'Division', ',', 'it', 'wa', 'claimed', 'the', 'gram', 'panchayat', 'planted', '1.7', 'crore', 'sapling', ';', 'however', ',', 'no', 'evid

# Term Frequency 
Trem Frequency means the number of times a word appears in a document 

Term Frequency = (Number of times term t appears in a document) / (Total number of terms in the document)


In [None]:

def term_frequency(doc):
    word_tokens = nltk.word_tokenize(doc)
    tf_dict = dict()
    for word in word_tokens:
        tf_dict[word] = word_tokens.count(word)
        
    tf = dict()
    for word, count in tf_dict.items():
        tf[word] = count/len(tf_dict)
    return tf

tf_doc1 = term_frequency(doc1)
tf_doc2 = term_frequency(doc2)
print("Term Frequency of Doc 1: ", tf_doc1, "\n")
print("Term Frequency of Doc 2: ", tf_doc2)

Term Frequency of Doc 1:  {'Between': 0.0033783783783783786, '2016': 0.010135135135135136, 'and': 0.060810810810810814, '2019': 0.013513513513513514, ',': 0.12837837837837837, 'the': 0.13513513513513514, 'state': 0.006756756756756757, 'forest': 0.02027027027027027, 'department': 0.006756756756756757, 'under': 0.010135135135135136, 'BJP': 0.0033783783783783786, 'government': 0.013513513513513514, 'had': 0.02364864864864865, 'launched': 0.0033783783783783786, '‘': 0.02027027027027027, 'Green': 0.010135135135135136, 'Maharashtra': 0.016891891891891893, '’': 0.02364864864864865, 'drive': 0.013513513513513514, 'with': 0.02364864864864865, 'an': 0.0033783783783783786, 'aim': 0.006756756756756757, 'to': 0.04391891891891892, 'plant': 0.010135135135135136, '50': 0.010135135135135136, 'crore': 0.04391891891891892, 'trees': 0.016891891891891893, 'across': 0.0033783783783783786, 'in': 0.037162162162162164, 'four-year': 0.0033783783783783786, 'period': 0.0033783783783783786, '.': 0.0709459459459459

# Inverse Document Frequency
 IDF means Inverse Document Frequency and it is used to measure how important a word is to a document in a collection or corpus.

 The more documents that contain the word, the less important it is.

 The IDF of a word is calculated as the logarithm of the total number of documents divided by the number of documents that contain the word.
 
 The formula for IDF is:
 IDF(word) = log_e(Total number of documents / Number of documents containing the word)

In [87]:
def inverse_document_frequency(doc1, doc2):
    word_tokens_doc1 = nltk.word_tokenize(doc1)
    word_tokens_doc2 = nltk.word_tokenize(doc2)
    
    # dict to store the idf values
    idf_dict = dict()
    # set to store all unique words
    all_words = set(word_tokens_doc1 + word_tokens_doc2)
    
    for word in all_words:
        count = 0
        if word in word_tokens_doc1:
            count += 1
        if word in word_tokens_doc2:
            count += 1
        idf_dict[word] = math.log(2 / count)
    return idf_dict
idf = inverse_document_frequency(doc1, doc2)
print("Inverse Document Frequency: ", idf, "\n")

Inverse Document Frequency:  {'organisations': 0.6931471805599453, 'There': 0.6931471805599453, 'riverbanks': 0.6931471805599453, '2016': 0.6931471805599453, 'Pune': 0.6931471805599453, 'My': 0.6931471805599453, 'volunteering': 0.6931471805599453, 'maintain': 0.0, 'Maharashtra': 0.6931471805599453, 'proof': 0.6931471805599453, 'meet': 0.6931471805599453, 'which': 0.6931471805599453, 'part': 0.0, 'where': 0.6931471805599453, 'six': 0.6931471805599453, ';': 0.6931471805599453, 'growing': 0.6931471805599453, 'related': 0.6931471805599453, 'period': 0.6931471805599453, 'between': 0.6931471805599453, 'geographical': 0.6931471805599453, 'Colleges': 0.6931471805599453, 'carried': 0.6931471805599453, '‘': 0.6931471805599453, 'than': 0.6931471805599453, 'he': 0.6931471805599453, 'compared': 0.6931471805599453, 'being': 0.6931471805599453, 'www.greenarmy.mahaforest.gov.in': 0.6931471805599453, 'so': 0.6931471805599453, '[': 0.6931471805599453, 'tasked': 0.6931471805599453, 'such': 0.693147180559