## Buiding Whoosh Schema

In [1]:
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer

schema = Schema(filename=ID(stored=True),
                line_num=ID(stored=True),
                content=TEXT(analyzer=StemmingAnalyzer())
               )

## Loading Data

In [2]:
import os, os.path
from whoosh import index

# Note, this clears the existing index in the directory
ix = index.create_in("hp", schema)

# Get a writer form the created index in 
writer = ix.writer()

In [3]:
def loadFile(writer, fname):
    '''
    Read file contents, load into database.
    '''
    line_no = 1
    with open(fname, 'r') as infile:
        for line in infile:
            line = line.rstrip('\n')
            line_no += 1
            writer.add_document(filename=fname, line_num=str(line_no),content=line)
    print("Indexed: ", fname)


def processFolder(writer,folder):
    '''
    Process a folder for files and subfolders
    '''
    print('Processing folder: ',folder)
    for root, dirs, files in os.walk(folder):
        print("root = ", root)
        # Process Files
        for file in files:
            if file.endswith(".txt"):
                filename = os.path.join(root, file)
                print('Processing File:',filename)
                loadFile(writer,filename)
            else:
                print("Unhandled File")
        # Recurse into subfolders
        for d in dirs:
            print("recursing into ",d)
            processFolder(writer,d)

# Functions defined,  get the party started:
processFolder(writer,"hp")
writer.commit() # save changes

Processing folder:  hp
root =  hp
Processing File: hp/CHAPTER 1.txt
Indexed:  hp/CHAPTER 1.txt
Processing File: hp/CHAPTER 2.txt
Indexed:  hp/CHAPTER 2.txt
Processing File: hp/CHAPTER 3.txt
Indexed:  hp/CHAPTER 3.txt
Processing File: hp/CHAPTER 4.txt
Indexed:  hp/CHAPTER 4.txt
Processing File: hp/CHAPTER 5.txt
Indexed:  hp/CHAPTER 5.txt
Processing File: hp/CHAPTER 6.txt
Indexed:  hp/CHAPTER 6.txt
Processing File: hp/CHAPTER 7.txt
Indexed:  hp/CHAPTER 7.txt
Processing File: hp/CHAPTER 8.txt
Indexed:  hp/CHAPTER 8.txt
Unhandled File
Unhandled File
Unhandled File
Unhandled File
Unhandled File
recursing into  MAIN.tmp
Processing folder:  MAIN.tmp
root =  hp/MAIN.tmp
Unhandled File


## Executing Queries

In [4]:
from whoosh.qparser import QueryParser

qp = QueryParser("content", schema=ix.schema)
q = qp.parse(u"Harry")

with ix.searcher() as s:
    results = s.search(q)
    for hit in results:
        print(hit)

<Hit {'filename': 'hp/CHAPTER 6.txt', 'line_num': '708'}>
<Hit {'filename': 'hp/CHAPTER 2.txt', 'line_num': '396'}>
<Hit {'filename': 'hp/CHAPTER 1.txt', 'line_num': '97'}>
<Hit {'filename': 'hp/CHAPTER 2.txt', 'line_num': '45'}>
<Hit {'filename': 'hp/CHAPTER 3.txt', 'line_num': '18'}>
<Hit {'filename': 'hp/CHAPTER 3.txt', 'line_num': '339'}>
<Hit {'filename': 'hp/CHAPTER 5.txt', 'line_num': '916'}>
<Hit {'filename': 'hp/CHAPTER 6.txt', 'line_num': '349'}>
<Hit {'filename': 'hp/CHAPTER 6.txt', 'line_num': '424'}>
<Hit {'filename': 'hp/CHAPTER 6.txt', 'line_num': '756'}>


In [5]:
from whoosh.qparser import QueryParser
from whoosh import scoring

qp = QueryParser("content", schema=ix.schema)
q = qp.parse(u"Harry")

with ix.searcher(weighting=scoring.TF_IDF()) as s:
    results = s.search(q)
    for hit in results:
        print(hit)

<Hit {'filename': 'hp/CHAPTER 2.txt', 'line_num': '91'}>
<Hit {'filename': 'hp/CHAPTER 2.txt', 'line_num': '258'}>
<Hit {'filename': 'hp/CHAPTER 2.txt', 'line_num': '396'}>
<Hit {'filename': 'hp/CHAPTER 5.txt', 'line_num': '296'}>
<Hit {'filename': 'hp/CHAPTER 5.txt', 'line_num': '750'}>
<Hit {'filename': 'hp/CHAPTER 5.txt', 'line_num': '809'}>
<Hit {'filename': 'hp/CHAPTER 5.txt', 'line_num': '933'}>
<Hit {'filename': 'hp/CHAPTER 6.txt', 'line_num': '402'}>
<Hit {'filename': 'hp/CHAPTER 6.txt', 'line_num': '708'}>
<Hit {'filename': 'hp/CHAPTER 6.txt', 'line_num': '809'}>


In [8]:
from whoosh.query import *

with ix.searcher(weighting=scoring.TF_IDF()) as s:
    qp = QueryParser("content", ix.schema)
    user_q = qp.parse(u"Harry")

    # Only show documents in the "rendering" chapter
    allow_q = Term("filename", "hp/CHAPTER 6.txt")
    
    results = s.search(user_q, filter=allow_q)
    for hit in results:
        print(hit)

<Hit {'filename': 'hp/CHAPTER 6.txt', 'line_num': '402'}>
<Hit {'filename': 'hp/CHAPTER 6.txt', 'line_num': '708'}>
<Hit {'filename': 'hp/CHAPTER 6.txt', 'line_num': '809'}>
<Hit {'filename': 'hp/CHAPTER 6.txt', 'line_num': '6'}>
<Hit {'filename': 'hp/CHAPTER 6.txt', 'line_num': '7'}>
<Hit {'filename': 'hp/CHAPTER 6.txt', 'line_num': '8'}>
<Hit {'filename': 'hp/CHAPTER 6.txt', 'line_num': '11'}>
<Hit {'filename': 'hp/CHAPTER 6.txt', 'line_num': '14'}>
<Hit {'filename': 'hp/CHAPTER 6.txt', 'line_num': '20'}>
<Hit {'filename': 'hp/CHAPTER 6.txt', 'line_num': '41'}>
