In [1]:
#Implement a simple information retrieval model in Python that are capable of processing boolean queries.

In [2]:
doc1= "Today is a beautiful, and a sunny day to start my workout."
doc2= "I will not be able to come today to meet with him."
doc3= "Our class meeting starts soon!"
doc4= "My class starts at 6."

# Importing the necessary libraries

In [3]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Merin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
import re
import string

# Text data preprocessing 

In [5]:
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()
tokenizer = nltk.tokenize.TreebankWordTokenizer()

In [6]:
def clean_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    tokens = tokenizer.tokenize(text)
    text=" ".join(lem.lemmatize(token) for token in tokens)
    return text

In [7]:
doc1=clean_text(doc1)
doc2=clean_text(doc2)
doc3=clean_text(doc3)
doc4=clean_text(doc4)

In [8]:
doc1

'today is a beautiful and a sunny day to start my workout'

In [9]:
Docs =[doc1,doc2,doc3,doc4]
print(Docs)

['today is a beautiful and a sunny day to start my workout', 'i will not be able to come today to meet with him', 'our class meeting start soon', 'my class start at']


In [10]:
tokens = []
for word in doc1.lower().split():
    if word not in tokens:
        tokens.append(word)
for word in doc2.lower().split():
    if word not in tokens:
        tokens.append(word)
for word in doc3.lower().split():
    if word not in tokens:
        tokens.append(word)
for word in doc4.lower().split():
    if word not in tokens:
        tokens.append(word)   

In [11]:
tokens

['today',
 'is',
 'a',
 'beautiful',
 'and',
 'sunny',
 'day',
 'to',
 'start',
 'my',
 'workout',
 'i',
 'will',
 'not',
 'be',
 'able',
 'come',
 'meet',
 'with',
 'him',
 'our',
 'class',
 'meeting',
 'soon',
 'at']

In [12]:
len(tokens)

25

In [13]:
vector_1 = []
for token in tokens:
    if token in doc1.lower().split():
        vector_1.append(1)
    else:
        vector_1.append(0)
vector_2 = []
for token in tokens:
    if token in doc2.lower().split():
        vector_2.append(1)
    else:
        vector_2.append(0)
vector_3 = []
for token in tokens:
    if token in doc3.lower().split():
        vector_3.append(1)
    else:
        vector_3.append(0)
vector_4 = []
for token in tokens:
    if token in doc4.lower().split():
        vector_4.append(1)
    else:
        vector_4.append(0)

In [14]:
print(vector_1)
print(vector_2)
print(vector_3)
print(vector_4)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1]


In [15]:
import pandas as pd
df = pd.DataFrame(columns=['words','doc1', 'doc2','doc3','doc4'])

In [16]:
df['words'] = tokens
df['doc1'] = vector_1
df['doc2'] = vector_2
df['doc3'] = vector_3
df['doc4'] = vector_4

# Term-Document Incidence Matrix

In [17]:
df

Unnamed: 0,words,doc1,doc2,doc3,doc4
0,today,1,1,0,0
1,is,1,0,0,0
2,a,1,0,0,0
3,beautiful,1,0,0,0
4,and,1,0,0,0
5,sunny,1,0,0,0
6,day,1,0,0,0
7,to,1,1,0,0
8,start,1,0,1,1
9,my,1,0,0,1


# GET THE VECTORS

In [18]:
doc_term_matrix = {}
for term in tokens:
    doc_term_matrix[term] = []  
    for doc in Docs:
        if term in doc:
            doc_term_matrix[term].append(1)
        else: 
            doc_term_matrix[term].append(0)

In [19]:
doc_term_matrix

{'today': [1, 1, 0, 0],
 'is': [1, 0, 0, 0],
 'a': [1, 1, 1, 1],
 'beautiful': [1, 0, 0, 0],
 'and': [1, 0, 0, 0],
 'sunny': [1, 0, 0, 0],
 'day': [1, 1, 0, 0],
 'to': [1, 1, 0, 0],
 'start': [1, 0, 1, 1],
 'my': [1, 0, 0, 1],
 'workout': [1, 0, 0, 0],
 'i': [1, 1, 1, 0],
 'will': [0, 1, 0, 0],
 'not': [0, 1, 0, 0],
 'be': [1, 1, 0, 0],
 'able': [0, 1, 0, 0],
 'come': [0, 1, 0, 0],
 'meet': [0, 1, 1, 0],
 'with': [0, 1, 0, 0],
 'him': [0, 1, 0, 0],
 'our': [0, 0, 1, 0],
 'class': [0, 0, 1, 1],
 'meeting': [0, 0, 1, 0],
 'soon': [0, 0, 1, 0],
 'at': [0, 0, 0, 1]}

In [20]:
doc_term_matrix['start']

[1, 0, 1, 1]

# Boolean operations

# "AND" ,"OR" and "AND NOT" operation

In [72]:
docs_array = np.array(Docs, dtype='object')
docs_array

array(['today is a beautiful and a sunny day to start my workout',
       'i will not be able to come today to meet with him',
       'our class meeting start soon', 'my class start at'], dtype=object)

In [89]:
phrase=input("Enter the phrase:")
split=phrase.split()
a=split[0]
b=split[1]
c=split[2]
v1 = np.array(doc_term_matrix[a])    
v2 = np.array(doc_term_matrix[c])
if (split[1]=='and' and split[2]=='not'):
    d=split[3]
    v2 = np.array(doc_term_matrix[d])
    print(a,":",v1)
    print("operator is '",b,c,"'")
    print(d,":",v2)
    print(v1,"and not",v2)
    v=v1 &~v2
    print(v)
elif split[1]=='or':
    print(a,":",v1)
    print("operator is '",b,"'")
    print(c,":",v2)
    print(v1,"or",v2)
    v=v1 | v2
    print(v)
else:
    print(a,":",v1)
    print("operator is '",b,"'")
    print(c,":",v2)
    print(v1,"and",v2)
    v=v1&v2
    print(v)

Enter the phrase:class or meeting
class : [0 0 1 1]
operator is ' or '
meeting : [0 0 1 0]
[0 0 1 1] or [0 0 1 0]
[0 0 1 1]
