**Boolean Information Retrieval Model**

In [1]:
import os 
import pandas as pd
import string
import nltk
import nltk.corpus
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
stop_words = stopwords.words('english')

In [5]:
# Defining a function for reading and cleaning the text documents
def Document(*doc_path):
  doc_files = []
  for files in doc_path:
    data_files = open(files)
    data_read = data_files.read()

    #Removing the different characters present in the text
    spcl_characters = '''!@#$%^&*-()[]{}.;:\'"/<>?''' 
    for element in data_read:
        if element in spcl_characters:
           data_read = data_read.replace(element, '') 
    
    #Splitting the strings into list in the given text files
    text_tokens = [token for token in data_read.split()]

    # Converting the words in to lower case and removing the stopwords
    clean_text = []
    for text in text_tokens:
      words = text.lower()
      if words not in stop_words:
        clean_text.append(words)
    
    doc_files.append(clean_text)
  return doc_files 

In [6]:
docs = Document(r'/content/Father_Brown/Blue Cross.txt', r'/content/Father_Brown/Queer Feet.txt', r'/content/Father_Brown/Secret Garden.txt',
                r'/content/Father_Brown/Broken Sword.txt',r'/content/Father_Brown/Eye of Apollo.txt', r'/content/Father_Brown/Flying Stars.txt',
                r'/content/Father_Brown/Hammer of God.txt',r'/content/Father_Brown/Invisible Man.txt',
                r'/content/Father_Brown/Israel Crow.txt',r'/content/Father_Brown/Prince Saradine Sins.txt', r'/content/Father_Brown/Wrong Shape.txt')

In [7]:
# Defining a function to represent whether a word is presnt in the given document
def word_pres(word):
  vector = []
  for word_token in docs:
    wrd = word.lower()
    if wrd in word_token:
      vector.append(1)
    else:
      vector.append(0)
  return vector




In [8]:
word_pres("elegantly")

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [9]:
word_pres("learning")

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [10]:
word_pres("proper")

[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]

In [11]:
# Retrieving unique words from the document
unique_words_list = []

for un_wrd in docs:
  unique_words_list += un_wrd
unique_list = []
for unq_wrd in unique_words_list:
  if unq_wrd not in unique_list:
    unique_list.append(unq_wrd)

In [12]:
# Creating Term Document Incidence for each unique word
unique_dict = {}
for w in unique_list:
  unique_dict[w] = word_pres(w)


In [13]:
df = pd.DataFrame(list(unique_dict.items()), columns = ['words', 'Term Document Incidence'])
df

Unnamed: 0,words,Term Document Incidence
0,silver,"[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0]"
1,ribbon,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
2,morning,"[1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0]"
3,green,"[1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1]"
4,glittering,"[1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1]"
...,...,...
9845,wizardry,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
9846,hypnotism,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
9847,document,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
9848,"wanted,”","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"


In [14]:
def process_query(query_text):
  query = word_tokenize(query_text)
  words_connect= []
  words_different = []

  for word in query:
      if word.lower() != "and" and word.lower() != "or" and word.lower() != "not":
        words_different.append(word.lower())
      else:
        words_connect.append(word.lower())

  for word in words_connect:
     words_list1 = unique_dict[words_different[0]]
     words_list2 = unique_dict[words_different[1]]
     if word == "and":
        bitwise_op = [w1 & w2 for (w1,w2) in zip(words_list1, words_list2)]
     elif word == "or":
        bitwise_op = [w1 | w2 for (w1,w2) in zip(words_list1, words_list2)]
     elif word == "not":
        bitwise_op = [not w1 for w1 in words_list2]
        bitwise_op = [int(b == True) for b in bitwise_op]
        bitwise_op = [w1 & w2 for (w1, w2) in zip(words_list1, bitwise_op)]
  return bitwise_op
        


In [15]:
query1 = input("Enter the query: ")
process_query(query1)

Enter the query: elegantly or proper and learning


[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

The above result indicates that the document 1 which corresponds to the text 'Blue Cross' has the either the words 'elegantly' or 'proper' and has the word 'learning'.

In [16]:
query2 = input("Enter the query: ")
process_query(query2) 

Enter the query: proper and not learning


[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]

The above result indicates that the document 6 which corresponds to the text 'Flying Stars'  has the  the word 'proper', but not 'learning'.