# Creating an inverted index

In [1]:
# Create an inverted index using Python and execute boolean queries

In [2]:
# Sample corpus: four short raw sentences that the notebook cleans and indexes.
doc1, doc2, doc3, doc4 = (
    "Today is a beautiful, and a sunny day to start my workout.",
    "I will not be able to come today to meet with him.",
    "Our class meeting starts soon!",
    "My class starts at 6.",
)

In [3]:
# Standard library
import re
import string

# Third-party
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

# Fetch the WordNet corpus (download() logs its status, including when the
# package is already up-to-date).
nltk.download('wordnet')

# Shared helpers used by clean_text: a Treebank word tokenizer and a WordNet lemmatizer.
tokenizer = nltk.tokenize.TreebankWordTokenizer()
lem = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Merin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def clean_text(text):
    """Normalize a raw document into a space-separated string of lemmas.

    Steps, in order: lowercase the text; remove square-bracketed spans and
    angle-bracketed tags; strip every ``string.punctuation`` character;
    turn newlines into spaces; drop any word that contains a digit;
    tokenize with the module-level ``tokenizer``; lemmatize each token with
    the module-level ``lem``.

    Args:
        text: The raw document string.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    text = text.lower()
    # Raw strings keep the regex escapes (\[, \w, \d) away from Python's own
    # string-escape processing, which warns on unrecognized escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A newline separates words, so swap it for a space instead of deleting
    # it; deleting would fuse the last word of one line with the first word
    # of the next.
    text = text.replace('\n', ' ')
    text = re.sub(r'\w*\d\w*', '', text)
    tokens = tokenizer.tokenize(text)
    return " ".join(lem.lemmatize(token) for token in tokens)

In [5]:
# Replace each raw document with its normalized form from clean_text.
doc1, doc2, doc3, doc4 = map(clean_text, (doc1, doc2, doc3, doc4))

In [6]:
# Gather the cleaned documents in order; later cells number them from 1.
Docs = [doc1, doc2, doc3, doc4]
print(Docs)

['today is a beautiful and a sunny day to start my workout', 'i will not be able to come today to meet with him', 'our class meeting start soon', 'my class start at']


In [7]:
# Vocabulary: every distinct word across the four documents, in first-seen order.
# dict.fromkeys de-duplicates with hash lookups while keeping insertion order,
# replacing four copy-pasted loops that each scanned the list for every word.
tokens = list(
    dict.fromkeys(
        word
        for doc in (doc1, doc2, doc3, doc4)
        for word in doc.lower().split()
    )
)

In [8]:
# Maps each term to the set of document numbers that contain it; filled in below.
inverted_index = {}

In [9]:
# Preview the (document number, text) pairs, numbering from 1.
list(enumerate(Docs, start=1))

[(1, 'today is a beautiful and a sunny day to start my workout'),
 (2, 'i will not be able to come today to meet with him'),
 (3, 'our class meeting start soon'),
 (4, 'my class start at')]

In [10]:
# For every term, record the 1-based numbers of the documents it occurs in.
for doc_id, doc_text in enumerate(Docs, start=1):
    for term in doc_text.split():
        # setdefault creates the posting set on a term's first occurrence.
        inverted_index.setdefault(term, set()).add(doc_id)

In [11]:
# Show the finished index: each term mapped to the set of document numbers containing it.
inverted_index

{'today': {1, 2},
 'is': {1},
 'a': {1},
 'beautiful': {1},
 'and': {1},
 'sunny': {1},
 'day': {1},
 'to': {1, 2},
 'start': {1, 3, 4},
 'my': {1, 4},
 'workout': {1},
 'i': {2},
 'will': {2},
 'not': {2},
 'be': {2},
 'able': {2},
 'come': {2},
 'meet': {2},
 'with': {2},
 'him': {2},
 'our': {3},
 'class': {3, 4},
 'meeting': {3},
 'soon': {3},
 'at': {4}}

In [12]:
# Normalize the query the same way the indexed documents were normalized
# (lowercased, punctuation stripped, lemmatized) so that e.g. "Starts"
# matches the indexed term "start".
term = clean_text(input("Enter the word: "))
# .get() yields an empty posting list for an unknown term instead of raising KeyError.
posting_list = inverted_index.get(term, set())
print(posting_list)

Enter the word: class
{3, 4}
