# Creating a Search Index

The Python library [Whoosh](https://whoosh.readthedocs.io) provides powerful text searching, although its search interface is pretty "close-to-the-metal".  This notebook is a playground to see whether we can make searching with Whoosh easy enough that we can focus on the documents without getting distracted by the library's low-level details.

In [86]:
import os, os.path
import json
from pathlib import Path
import shutil

from IPython.display import HTML

from whoosh import index
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, NUMERIC
from whoosh.analysis import StemmingAnalyzer

# Directory holding the extracted text of each document, one "<id>.txt" file per document.
DOCUMENT_TEXT_PATH = Path('../data/DocumentCloud/text')
# Directory in which the Whoosh index is created (and removed before a full rebuild).
INDEX_DIR = "../whoosh_index"


In [79]:
def create_index(doc_json_array):
    """Create a new Whoosh index in INDEX_DIR and add every given document.

    Args:
        doc_json_array: iterable of document metadata dicts. Each dict must
            provide ``id``, ``title``, ``source``, ``pages``,
            ``canonical_url`` and ``resources['thumbnail']``. The body text
            of each document is read from ``DOCUMENT_TEXT_PATH / "<id>.txt"``.

    Returns:
        None. Progress is printed for every 1000th document.

    Errors raised while opening the writer, reading a text file or adding a
    document are printed rather than propagated; the pending write is
    cancelled so no documents are committed.
    """
    schema = Schema(
        id=ID(stored=True),
        title=TEXT(stored=True),
        source=TEXT(stored=True),
        # Full text is searchable (with stemming) but not stored in the index.
        content=TEXT(analyzer=StemmingAnalyzer(), stored=False),
        pages=NUMERIC,
        url=STORED,
        thumbnail=STORED,
    )

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(INDEX_DIR, exist_ok=True)
    ix = index.create_in(INDEX_DIR, schema)

    # Bound before the try so the except clause can tell whether a writer
    # was ever obtained (ix.writer() itself may raise).
    writer = None
    try:
        writer = ix.writer()

#         # optimization from https://whoosh.readthedocs.io/en/latest/batch.html#stemminganalyzer-cache
#         stem_ana = writer.schema["content"].format.analyzer
#         # Set the cachesize to -1 to indicate unbounded caching
#         stem_ana.cachesize = -1
#         # Reset the analyzer to pick up the changed attribute
#         stem_ana.clear()

        for i, doc in enumerate(doc_json_array):
            body_file = DOCUMENT_TEXT_PATH / f"{doc['id']}.txt"
            # read_text() opens and closes the file, so no handle is leaked
            # per document.
            body = body_file.read_text()
            if i % 1000 == 0:
                print(f"{i} Doc {body_file} len {len(body)}")
            writer.add_document(
                id=doc['id'],
                title=doc['title'],
                source=doc['source'],
                pages=doc['pages'],
                url=doc['canonical_url'],
                content=body,
                thumbnail=doc['resources']['thumbnail']
            )
        writer.commit()
    except Exception as e:
        # Best-effort: discard the partial write and report the problem.
        if writer is not None:
            writer.cancel()
        print(e)

def index_all_documents():
    """Rebuild the whole search index from the filtered document metadata.

    Removes INDEX_DIR if it already exists, loads the document list from
    ``../data/DocumentCloud/filtered_documents.json`` and hands it to
    ``create_index``.

    Returns:
        None.
    """
    if os.path.exists(INDEX_DIR):
        print("Deleting pre-existing index")
        shutil.rmtree(INDEX_DIR)
    # Context manager closes the metadata file as soon as it has been parsed.
    with open('../data/DocumentCloud/filtered_documents.json') as metadata_file:
        docs = json.load(metadata_file)
    create_index(docs)

In [80]:
# This should only need to be run once, unless things change.
# First full run with full text index: 
#   Wall time: 12min 57s
# %time index_all_documents()

0 Doc ../data/DocumentCloud/text/6572198-Committee-on-License-and-Consumer-Protection.txt len 813
1000 Doc ../data/DocumentCloud/text/6453704-ADA-Advisory-Committee-Meeting-2019-10-15.txt len 892
2000 Doc ../data/DocumentCloud/text/6207843-Agency-Involvement-Oversight-Subcommittee-2014.txt len 468
3000 Doc ../data/DocumentCloud/text/6173400-Board-of-Commissioners-2014-06-17-June-17-2014-pdf.txt len 27895
4000 Doc ../data/DocumentCloud/text/6161318-Board-of-Commissioners-2009-12-03-Agenda.txt len 22006
5000 Doc ../data/DocumentCloud/text/6156916-Board-of-Commissioners-2007-12-04-Minutes.txt len 499861
6000 Doc ../data/DocumentCloud/text/6155868-Rules-Committee-2016-03-22-Agenda.txt len 1935
7000 Doc ../data/DocumentCloud/text/6139426-Litigation-Committee-2014-09-08-Minutes.txt len 4117
8000 Doc ../data/DocumentCloud/text/6138371-Board-of-Commissioners-Zoning-and-Building.txt len 6252
9000 Doc ../data/DocumentCloud/text/6137360-Riverboat-Video-Gaming-2014-02-25-Minutes.txt len 5647
10000

In [105]:
# these may end up interesting https://whoosh.readthedocs.io/en/latest/recipes.html
from whoosh.qparser import MultifieldParser

class Searcher:
    """Thin convenience wrapper around a Whoosh index searcher.

    Opens the index in ``index_dir`` and parses queries against the
    ``title`` and ``content`` fields. Results are rendered as HTML links.

    The underlying Whoosh searcher stays open for the lifetime of the
    object; call ``close()`` (or use the instance as a context manager)
    to release it.
    """

    def __init__(self, index_dir='../whoosh_index/'):
        """Open the index at ``index_dir`` and prepare a multi-field parser."""
        ix = index.open_dir(index_dir)
        self.searcher = ix.searcher()
        self.parser = MultifieldParser(["title", "content"], schema=ix.schema)

    def close(self):
        """Close the underlying Whoosh searcher."""
        self.searcher.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Returning False lets any exception from the with-body propagate.
        return False

    @staticmethod
    def _format_hit(hit):
        """Return one ``<p><a href=...>title</a></p>`` fragment for ``hit``.

        ``hit`` only needs to support ``hit['url']`` and ``hit['title']``.
        Both values are HTML-escaped so characters such as ``&``, ``<`` or
        ``"`` in stored data cannot break or inject markup.
        """
        import html  # local import: keeps the block self-contained

        url = html.escape(str(hit['url']), quote=True)
        title = html.escape(str(hit['title']), quote=True)
        return f"""<p><a href="{url}">{title}</a></p>"""

    def search(self, q):
        """Run the query string ``q`` and return the hits as an ``HTML`` object.

        The number of hits is whatever ``searcher.search`` returns by
        default; no explicit limit is passed here.
        """
        query = self.parser.parse(q)
        markup = [self._format_hit(hit) for hit in self.searcher.search(query)]
        return HTML(''.join(markup))


In [106]:
# Open the index at the default location and run a sample query;
# search() returns an IPython HTML object built from the hits.
s = Searcher()
s.search('rahm emanuel')