# Assignment 03 - Information Retrieval 2021
## Group 06:
### - Karen Pilosyan
### - Nidhi Dhakan
### - Johan Porras

In [8]:
# First, create the index schema.
# Here we use a single field, 'content', in which the entire document content is indexed and stored.
from whoosh.fields import Schema, TEXT
from whoosh.analysis import StemmingAnalyzer
# The 'content' field is analysed with a StemmingAnalyzer that is given an
# explicit stop-word list; the field is created with phrase=True and stored=True.
schema = Schema(
    content=TEXT(
        analyzer=StemmingAnalyzer(
            stoplist=frozenset([
                'and', 'is', 'it', 'an', 'as', 'at', 'have', 'in', 'yet',
                'if', 'from', 'for', 'when', 'by', 'to', 'you', 'be', 'we',
                'that', 'may', 'not', 'with', 'tbd', 'a', 'on', 'your',
                'this', 'of', 'us', 'will', 'can', 'the', 'or', 'are',
            ])
        ),
        phrase=True,
        stored=True,
    )
)

In [9]:
# Create folder structure to store index
import os.path
from whoosh.index import create_in
# Create the index directory if it does not exist yet. exist_ok=True keeps the
# cell safe to re-run and avoids the check-then-create race of
# os.path.exists() followed by os.mkdir().
os.makedirs("index", exist_ok=True)
ix = create_in("index", schema)

In [10]:
# Read the textual documents from file first: every line of the file becomes
# one document. Doing the file I/O before opening the writer means that a
# missing or unreadable file cannot leave an open writer behind.
documents_path = './AssociatedPress.txt'
with open(documents_path, 'r', encoding='utf-8') as doc_f:
    corpus_list = doc_f.readlines()

# Index the documents. Used as a context manager, the writer commits when the
# block finishes normally and cancels if an exception is raised, so a failed
# run does not keep the index write lock and trigger a LockError on the next
# ix.writer() call.
with ix.writer() as writer:
    for x in corpus_list:
        writer.add_document(content=x)

In [11]:
# Search with query parser
from whoosh.qparser import QueryParser

def search_and_print(query, index=None):
    """Run ``query`` against an index and print a short summary of the hits.

    Printed, in order:
      1. the total number of matching documents (``len(results)``),
      2. the stored ``content`` of the top hit, or a notice when nothing matched,
      3. how many hits were scored versus the exact or estimated total.

    Args:
        query: A parsed query object accepted by ``searcher.search``.
        index: Index to search. Defaults to the notebook-level ``ix``.

    Returns:
        None. All information is written to stdout.
    """
    if index is None:
        index = ix  # fall back to the index created earlier in the notebook

    # Hits are only readable while the searcher is open, so all printing
    # happens inside the ``with`` block.
    with index.searcher() as s:
        results = s.search(query)
        print(len(results))

        found = results.scored_length()
        if found:
            # Only the top-ranked hit is shown.
            print("Content :", results[0]["content"])
        else:
            # Indexing results[0] on an empty result set would raise IndexError.
            print("No matching documents found")

        if results.has_exact_length():
            print("Scored", found, "of exactly", len(results), "documents")
        else:
            low = results.estimated_min_length()
            high = results.estimated_length()

            print("Scored", found, "of between", low, "and", high, "documents")


In [12]:
# One parser is enough: every query targets the same 'content' field and schema.
qp = QueryParser("content", schema=ix.schema)

# The three assignment queries, run in order:
#   1. “Michael Dukakis”
#   2. “Dukakis OR Bush”
#   3. “graduate of Syracuse University”
# NOTE(review): none of these is wrapped in double quotes, so presumably the
# parser treats the multi-word ones as separate terms rather than as a phrase;
# confirm this is the intended behaviour for queries 1 and 3.
queries = [
    u"Michael Dukakis",
    u"Dukakis OR Bush",
    u"graduate of Syracuse University",
]
for query_text in queries:
    q = qp.parse(query_text)
    search_and_print(q)

70
Content : Jesse Jackson issued an emotional appeal today for supporters to back the Democratic ticket and told Michael Dukakis and Lloyd Bentsen that he will be keeping up the ``street heat'' to make sure they don't forget his constituents. ``It is in order and it is right to support this ticket because of our access to it and our relaionship with it,'' Jackson told several hundred delegates as Dukakis and Bentsen looked on. ``We have every reason to be hopeful, to be excited, to know that we are close to where we are going, a long way from where we started,'' Jackson said. ``And in our lifetime you and I will be in the White House.'' He introduced Dukakis as ``a man I've come to know, to respect ... a man I've come to ... love because of his own sensitivity to family,'' Jackson added: ``I bring to you a man I plan to watch up close ... when he becomes ... the next president of the United States _ Michael Dukakis.'' The group erupted in applause and shouts of ``Duke, Duke.'' ``We're

# Difficulties 
## Reading the results
- We noticed that the results can only be accessed while the searcher is still open, i.e. inside the `with ix.searcher()` block.
- Also, each hit can only be accessed through the fields specified in the schema, in this case `content`.

## Storing the content

In order to see the content within the results, the `stored` property of the `TEXT` field must be set to `True` when creating the schema.

## Lock Error

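<b> ix.writer() </b> raised a lock error, which surprised us at first; it disappeared after restarting the kernel. The most likely cause is an earlier writer that was never committed or cancelled and therefore still held the index's write lock.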