# Lab 3: IR

In [1]:
!pip install whoosh



You are using pip version 9.0.1, however version 18.0 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [2]:
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os.path
from pathlib import Path
import tempfile
import subprocess

In [3]:
DATA_DIR = "lab-data"
DOCUMENTS_DIR = os.path.join(DATA_DIR, "documents")
TOPIC_FILE = os.path.join(DATA_DIR, "air.topics")
QRELS_FILE = os.path.join(DATA_DIR, "air.qrels")

# For windows:
TREC_EVAL = os.path.join("trec_eval", "trec_eval.exe")

#For mac:
#TREC_EVAL = os.path.join("trec_eval", "trec_eval")

## Part 1: Basic Indexing

### Creating the index

In [4]:
def createIndex(schema):
    # Generate a temporary directory for the index
    indexDir = tempfile.mkdtemp()

    # create and return the index
    return index.create_in(indexDir, schema)

In [5]:
# first, define a Schema for the index
mySchema = Schema(file_path = ID(stored=True),
                  file_content = TEXT(analyzer = RegexTokenizer()))

# now, create the index at the path INDEX_DIR based on the new schema
myIndex = createIndex(mySchema)

### Indexing the documents

In [6]:
def addFilesToIndex(indexObj, fileList):
    # open writer
    writer = writing.BufferedWriter(indexObj, period=None, limit=1000)

    try:
        # write each file to index
        for docNum, filePath in enumerate(fileList):
            with open(filePath, "r", encoding="utf-8") as f:
                fileContent = f.read()
                writer.add_document(file_path = filePath,
                                    file_content = fileContent)

                # print status every 1000 documents
                if ((docNum+1) % 5 == 0):
                    print("already indexed:", docNum+1)
        print("done indexing.")

    finally:
        # close the index
        writer.close()

In [7]:
# Build a list of files to index
filesToIndex = [str(filePath) for filePath in Path(DOCUMENTS_DIR).glob("**/*") if filePath.is_file()]

In [8]:
# Check the list
filesToIndex[:5]

['lab-data\\documents\\email01',
 'lab-data\\documents\\email02',
 'lab-data\\documents\\email03',
 'lab-data\\documents\\email04',
 'lab-data\\documents\\email05']

In [9]:
# count files to index
print("number of files:", len(filesToIndex))

number of files: 11


In [10]:
addFilesToIndex(myIndex, filesToIndex)

already indexed: 5
already indexed: 10
done indexing.


### Querying

In [11]:
# define a query parser for the field "file_content" in the index
myQueryParser = QueryParser("file_content", schema=myIndex.schema)
mySearcher = myIndex.searcher()

In [12]:
# run a sample query for the phrase "item"
sampleQuery = myQueryParser.parse("item")
sampleQueryResults = mySearcher.search(sampleQuery, limit=None)

# inspect the result:
# for each document print the rank and the score
for (docnum, result) in enumerate(sampleQueryResults):
    score = sampleQueryResults.score(docnum)
    fileName = os.path.basename(result["file_path"])
    print(fileName, docnum, score)

email01 0 2.6746417187049216


### Evaluation using TREC_EVAL
In order to evaluate our results we will use a topic file - a list of topics we use to evaluate our IR system

In [13]:
# print the topic file
with open(TOPIC_FILE, "r") as f:
    print(f.read())

01 ducks
02 ig nobel prizes
03 mathematics
04 flowing hair
05 music
06 AIR TV



We will compare our evaluate our results with a set of judged results(qrels file) using TREC_EVAL 

In [14]:
# print the first 10 lines in the qrels file
with open(QRELS_FILE, "r") as f:
    qrels10 = f.readlines()[:10]
    print("".join(qrels10))

01 0 email01 0
01 0 email02 0
01 0 email03 0
01 0 email04 1
01 0 email05 1
01 0 email06 1
01 0 email07 0
01 0 email08 0
01 0 email09 0
01 0 email10 0



The follwing function takes a topic file, a qrels file, a query parser and a searcher and use TREC_EVAL to compare our results with the provided qrels file (see assignment PDF for more details)

In [72]:
def trecEval(topicFile, qrelsFile, queryParser, searcher):
    # Load topic file - a list of topics(search phrases) used for evalutation
    with open(topicFile, "r") as tf:
        topics = tf.read().splitlines()

    # create an output file to which we'll write our results
    tempOutputFile = tempfile.mkstemp()[1]
    with open(tempOutputFile, "w") as outputTRECFile:
        # for each evaluated topic:
        # build a query and record the results in the file in TREC_EVAL format
        for topic in topics:
            topic_id, topic_phrase = tuple(topic.split(" ", 1))
            topicQuery = queryParser.parse(topic_phrase)
            topicResults = searcher.search(topicQuery, limit=None)
            for (docnum, result) in enumerate(topicResults):
                score = topicResults.score(docnum)
                outputTRECFile.write("%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))
    
    result = subprocess.run([TREC_EVAL,'-q', qrelsFile, tempOutputFile], stdout=subprocess.PIPE)
    print(result.stdout.decode())

In [73]:
trecEval(TOPIC_FILE, QRELS_FILE, myQueryParser, mySearcher) 

num_ret               	01	1
num_rel               	01	3
num_rel_ret           	01	1
map                   	01	0.3333
Rprec                 	01	0.3333
bpref                 	01	0.3333
recip_rank            	01	1.0000
iprec_at_recall_0.00  	01	1.0000
iprec_at_recall_0.10  	01	1.0000
iprec_at_recall_0.20  	01	1.0000
iprec_at_recall_0.30  	01	1.0000
iprec_at_recall_0.40  	01	0.0000
iprec_at_recall_0.50  	01	0.0000
iprec_at_recall_0.60  	01	0.0000
iprec_at_recall_0.70  	01	0.0000
iprec_at_recall_0.80  	01	0.0000
iprec_at_recall_0.90  	01	0.0000
iprec_at_recall_1.00  	01	0.0000
P_5                   	01	0.2000
P_10                  	01	0.1000
P_15                  	01	0.0667
P_20                  	01	0.0500
P_30                  	01	0.0333
P_100                 	01	0.0100
P_200                 	01	0.0050
P_500                 	01	0.0020
P_1000                	01	0.0010
num_ret               	05	1
num_rel               	05	2
num_rel_ret           	05	0
map                   	05	0.0000
Rprec  

## Part 2: Evaluating different configurations

### Inspecting our index

In [17]:
# Is it empty?
print("Index is empty?", myIndex.is_empty())

# How many files indexed?
print("Number of indexed files:", myIndex.doc_count())

Index is empty? False
Number of indexed files: 11


In [18]:
# define a reader object on the index
myReader = myIndex.reader()

In [19]:
# print first 5 indexed documents
[(docnum, doc_dict) for (docnum, doc_dict) in myReader.iter_docs()][0:5]

[(0, {'file_path': 'lab-data\\documents\\email01'}),
 (1, {'file_path': 'lab-data\\documents\\email02'}),
 (2, {'file_path': 'lab-data\\documents\\email03'}),
 (3, {'file_path': 'lab-data\\documents\\email04'}),
 (4, {'file_path': 'lab-data\\documents\\email05'})]

In [20]:
# list indexed terms for field "file_content"
[term for term in myReader.field_terms("file_content")][1000:1025]

['Care',
 'Carlos',
 'Carmen',
 'Carnivalesque',
 'Carolina',
 'Case',
 'Cat',
 'Catalysis',
 'Catalyst',
 'Catchers',
 'Cater',
 'Caused',
 'Caveat',
 'CbZF1d0021swQuc57kfqHt',
 'Cechetto',
 'Ceder',
 'Celebratory',
 'Center',
 'Cereal',
 'Ceremony',
 'Cerrahi',
 'Certolizumab',
 'Cervical',
 'Chair',
 'Chalfie']

In [21]:
#how many terms do we have?
print(myReader.field_length("file_content"))

29729


In [22]:
# how many documents have the phares "bit", blob"
#   in the field "file_content"?
print("# docs with 'bit'", myReader.doc_frequency("file_content", "bit"))
print("# docs with 'are'", myReader.doc_frequency("file_content", "are"))
print("# docs with 'get'", myReader.doc_frequency("file_content", "get"))

# docs with 'bit' 1
# docs with 'are' 11
# docs with 'get' 6


### Text Analyzers

In [23]:
# we start with basic tokenizer
tokenizer = RegexTokenizer()
[token.text for token in tokenizer("We are going to do Text Analysis with whoosh.analysis")]

['We',
 'are',
 'going',
 'to',
 'do',
 'Text',
 'Analysis',
 'with',
 'whoosh.analysis']

In [24]:
# we might want use stemming:
stmAnalyzer = RegexTokenizer() | StemFilter()
[token.text for token in stmAnalyzer("We are going to do Text Analysis with whoosh.analysis")]

['We', 'ar', 'go', 'to', 'do', 'Text', 'Analysi', 'with', 'whoosh.analysi']

In [25]:
# We probably want to lower-case it
# so we add LowercaseFilter
stmLwrAnalyzer = RegexTokenizer() | LowercaseFilter() | StemFilter()
[token.text for token in stmLwrAnalyzer("We are going to do Text Analysis with whoosh.analysis")]

['we', 'ar', 'go', 'to', 'do', 'text', 'analysi', 'with', 'whoosh.analysi']

In [26]:
# we probably want to ignore words like "we", "are", "with" when we index files
# so we add StopFilter to filter stop words
stmLwrStpAnalyzer = RegexTokenizer() | LowercaseFilter() | StopFilter() | StemFilter()
[token.text for token in stmLwrStpAnalyzer("We are going to do Text Analysis with whoosh.analysis")]

['go', 'do', 'text', 'analysi', 'whoosh.analysi']

In [27]:
# we also probably want to break phrases like "whoosh.analysis" into "whoosh" and "analysis"
# so we add IntraWordFilter
stmLwrStpIntraAnalyzer = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter() | StemFilter()
[token.text for token in stmLwrStpIntraAnalyzer("We are going to do Text Analysis with whoosh.analysis")]

['go', 'do', 'text', 'analysi', 'whoosh', 'analysi']

### Evaluating the new analyzers

In [28]:
# define a Schema with the new analyzer
mySchema2 = Schema(file_path = ID(stored=True),
                   file_content = TEXT(analyzer = stmLwrStpIntraAnalyzer))

# create the index based on the new schema
myIndex2 = createIndex(mySchema2)

In [29]:
addFilesToIndex(myIndex2, filesToIndex)

already indexed: 5
already indexed: 10
done indexing.


In [30]:
# define a query parser for the field "file_content" in the index
myQueryParser2 = QueryParser("file_content", schema=myIndex2.schema)
mySearcher2 = myIndex2.searcher()

In [31]:
trecEval(TOPIC_FILE, QRELS_FILE, myQueryParser2, mySearcher2) 

num_ret               	01	3
num_rel               	01	3
num_rel_ret           	01	3
map                   	01	1.0000
Rprec                 	01	1.0000
bpref                 	01	1.0000
recip_rank            	01	1.0000
iprec_at_recall_0.00  	01	1.0000
iprec_at_recall_0.10  	01	1.0000
iprec_at_recall_0.20  	01	1.0000
iprec_at_recall_0.30  	01	1.0000
iprec_at_recall_0.40  	01	1.0000
iprec_at_recall_0.50  	01	1.0000
iprec_at_recall_0.60  	01	1.0000
iprec_at_recall_0.70  	01	1.0000
iprec_at_recall_0.80  	01	1.0000
iprec_at_recall_0.90  	01	1.0000
iprec_at_recall_1.00  	01	1.0000
P_5                   	01	0.6000
P_10                  	01	0.3000
P_15                  	01	0.2000
P_20                  	01	0.1500
P_30                  	01	0.1000
P_100                 	01	0.0300
P_200                 	01	0.0150
P_500                 	01	0.0060
P_1000                	01	0.0030
num_ret               	02	11
num_rel               	02	8
num_rel_ret           	02	8
map                   	02	0.9207
Rprec 

In [32]:
# let count the same words again
myReader2 = myIndex2.reader()
print("# docs with 'bit'", myReader2.doc_frequency("file_content", "bit"))
print("# docs with 'are'", myReader2.doc_frequency("file_content", "are"))
print("# docs with 'get'", myReader2.doc_frequency("file_content", "get"))

# docs with 'bit' 11
# docs with 'are' 0
# docs with 'get' 7


**Can you explain the differences?**

### Using NLTK's stemmers and lemmatizers

In [33]:
import nltk
from nltk.stem import *

In [34]:
# download required resources
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\natha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [35]:
# we'll compare two stemmers and a lemmatizer
lrStem = LancasterStemmer()
sbStem = SnowballStemmer("english")
wnLemm = WordNetLemmatizer()

In [36]:
# define a list of words to compare the stemmers on
listWords = ["going", "saying", "minimize", "maximum", 
             "meeting", "files", "tries", "is", "are", "beautiful",
             "summarize", "better", "dogs", "phenomena"]

In [37]:
for word in listWords:
    print("%15s %15s %15s %15s" % (lrStem.stem(word),
                                   sbStem.stem(word),
                                   wnLemm.lemmatize(word),
                                   wnLemm.lemmatize(word, 'v')))

          going              go           going              go
            say             say          saying             say
          minim           minim        minimize        minimize
          maxim         maximum         maximum         maximum
           meet            meet         meeting            meet
            fil            file            file            file
            tri             tri             try             try
             is              is              is              be
             ar             are             are              be
         beauty          beauti       beautiful       beautiful
           summ          summar       summarize       summarize
            bet          better          better          better
            dog             dog             dog             dog
       phenomen       phenomena      phenomenon       phenomena


### How to use NLTK stemmers / lemmatizers in Whoosh

In [38]:
# Dont change this! Use it as-is in your code
# This filter will run for both the index and the query
from whoosh.analysis import Filter
class CustomFilter(Filter):
    is_morph = True
    def __init__(self, filterFunc, *args, **kwargs):
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs
    def __eq__(self):
        return (other
                and self.__class__ is other.__class__)
    def __call__(self, tokens):
        for t in tokens:
            if t.mode == 'query': # if called by query parser
                t.text = self.customFilter(t.text, *self.args, **self.kwargs)
                yield t
            else: # == 'index' if called by indexer
                t.text = self.customFilter(t.text, *self.args, **self.kwargs)
                yield t

In [39]:
# Example1: Whoosh filter for NLTK's LancasterStemmer
myFilter1 = RegexTokenizer() | CustomFilter(LancasterStemmer().stem)
[token.text for token in myFilter1("We are going to do Text Analysis with whoosh.analysis")]

['we', 'ar', 'going', 'to', 'do', 'text', 'analys', 'with', 'whoosh.analysis']

In [40]:
# Example2: Whoosh filter for NLTK's WordNetLemmatizer
myFilter2 = RegexTokenizer() | CustomFilter(WordNetLemmatizer().lemmatize)
[token.text for token in myFilter2("We are going to do Text Analysis with whoosh.analysis")]

['We',
 'are',
 'going',
 'to',
 'do',
 'Text',
 'Analysis',
 'with',
 'whoosh.analysis']

In [41]:
# Example3: Whoosh filter for NLTK's WordNetLemmatizer for verbs
myFilter3 = RegexTokenizer() | CustomFilter(WordNetLemmatizer().lemmatize, 'v')
[token.text for token in myFilter3("We are going to do Text Analysis with whoosh.analysis")]

['We', 'be', 'go', 'to', 'do', 'Text', 'Analysis', 'with', 'whoosh.analysis']

You can now use myFilter1/2/3 as part of your Schema

------------
You can find details of other NLTK Stemmers and Lemmatizers here:

http://www.nltk.org/api/nltk.stem.html

# Additional Information

## Search Operators

In [74]:
myQueryParser.parse("University of Toronto")

And([Term('file_content', 'University'), Term('file_content', 'of'), Term('file_content', 'Toronto')])

In [76]:
myQueryParser.parse("University OR of OR Toronto")

Or([Term('file_content', 'University'), Term('file_content', 'of'), Term('file_content', 'Toronto')])

In [79]:
myQueryParser2.parse("University OR of OR Toronto")

Or([Term('file_content', 'univers'), Term('file_content', 'toronto')])

In [78]:
myQueryParser.parse("(University Toronto) OR (University British Columbia)")

Or([And([Term('file_content', 'University'), Term('file_content', 'Toronto')]), And([Term('file_content', 'University'), Term('file_content', 'British'), Term('file_content', 'Columbia')])])

## Scoring and parameter tuning

In [80]:
sampleQueryGo = myQueryParser2.parse("go")
sampleQueryResultsGo = mySearcher2.search(sampleQueryGo, limit=None)

In [81]:
# inspect the result:
# for each document print the rank and the score
for (docnum, result) in enumerate(sampleQueryResultsGo):
    score = sampleQueryResultsGo.score(docnum)
    fileName = os.path.basename(result["file_path"])
    print(fileName, docnum, score)

email04 0 1.532818707775775
email08 1 1.4606112073704993
email09 2 1.4308213268867864
email05 3 1.307224528544008
email03 4 1.2392797101539106
email07 5 1.2392797101539106
email06 6 1.227396210411099
email14 7 0.9206709471157581
email01 8 0.8960772103589507
email02 9 0.8960772103589507
email10 10 0.8712000019125419


In [82]:
from whoosh import scoring
mySearcher2TF = myIndex2.searcher(weighting=scoring.TF_IDF())
sampleQueryResultsGoTF = mySearcher2TF.search(sampleQueryGo, limit=None)

In [83]:
for (docnum, result) in enumerate(sampleQueryResultsGoTF):
    score = sampleQueryResultsGoTF.score(docnum)
    fileName = os.path.basename(result["file_path"])
    print(fileName, docnum, score)

email04 0 3.651954492041481
email08 1 2.7389658690311105
email09 2 2.7389658690311105
email03 3 1.8259772460207404
email05 4 1.8259772460207404
email06 5 1.8259772460207404
email07 6 1.8259772460207404
email01 7 0.9129886230103702
email02 8 0.9129886230103702
email10 9 0.9129886230103702
email14 10 0.9129886230103702


Change BM25F parameters:

In [84]:
mySearcher2BM = myIndex2.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2))
sampleQueryResultsGoBM = mySearcher2BM.search(sampleQueryGo, limit=None)

for (docnum, result) in enumerate(sampleQueryResultsGoBM):
    score = sampleQueryResultsGoBM.score(docnum)
    fileName = os.path.basename(result["file_path"])
    print(fileName, docnum, score)

email04 0 1.532818707775775
email08 1 1.4606112073704993
email09 2 1.4308213268867864
email05 3 1.307224528544008
email03 4 1.2392797101539106
email07 5 1.2392797101539106
email06 6 1.227396210411099
email14 7 0.9206709471157581
email01 8 0.8960772103589507
email02 9 0.8960772103589507
email10 10 0.8712000019125419


### Store additional fields

In [86]:
print(open("lab-data/documents/email02").read())

Received: from exgw1-cbr.nexus.csiro.au ([152.83.3.66]) by EXACTN1-CBR.nexus.csiro.au with Microsoft SMTPSVC(5.0.2195.6713);
	 Fri, 18 Apr 2008 23:24:14 +1000
Received: from act-MTAout6.csiro.au ([150.229.7.43]) by exgw1-cbr.nexus.csiro.au with Microsoft SMTPSVC(5.0.2195.6713);
	 Fri, 18 Apr 2008 23:24:14 +1000
X-SBRS: 6.3
X-IronPort-Anti-Spam-Filtered: true
X-IronPort-Anti-Spam-Result: AtMDABo9CEiAZ2D+bmdsb2JhbACRY5oWPw
X-IronPort-AV: E=Sophos;i="4.25,677,1199624400"; 
   d="scan'208";a="191215222"
Received: from anumail6.anu.edu.au ([130.56.64.140])
  by act-ironport-ldap.csiro.au with ESMTP/TLS/DHE-RSA-AES256-SHA; 18 Apr 2008 23:24:14 +1000
Received: from chem.harvard.edu (chem.harvard.edu [128.103.96.254])
	by anumail6.anu.edu.au (8.13.8/8.13.8) with ESMTP id m3IDO6B8020975
	for <paul.thomas@anu.edu.au>; Fri, 18 Apr 2008 23:24:11 +1000 (EST)
	(envelope-from mini-air-bounces@chem.harvard.edu)
Received: from chem.harvard.edu (chem.harvard.edu [127.0.0.1])
	by chem.harvard.edu (Postfi

In [87]:
extractedDate = re.search("Date: (.*)\n", open("lab-data/documents/email02").read()).groups()[0]
print(extractedDate)

Fri, 18 Apr 2008 09:06:11 -0400


In [88]:
import email
import time
email.utils.parsedate(extractedDate)

(2008, 4, 18, 9, 6, 11, 0, 1, -1)

In [89]:
from whoosh.fields import NUMERIC

# first, define a Schema for the index
mySchemaDate = Schema(file_path = ID(stored=True),
                      file_year = NUMERIC(stored=True),
                      file_content = TEXT(analyzer = RegexTokenizer()))

# now, create the index at the path INDEX_DIR based on the new schema
myIndexDate = createIndex(mySchemaDate)


In [90]:
def addFilesToIndexDate(indexObj, fileList):
    # open writer
    writer = writing.BufferedWriter(indexObj, period=None, limit=1000)

    try:
        # write each file to index
        for docNum, filePath in enumerate(fileList):
            with open(filePath, "r", encoding="utf-8") as f:
                fileContent = f.read()
                parsedYear = email.utils.parsedate(re.search("Date: (.*)\n", fileContent).groups()[0])[0]
                writer.add_document(file_path = filePath,
                                    file_year = int(parsedYear),
                                    file_content = fileContent)

                # print status every 1000 documents
                if (docNum+1 % 1000 == 0):
                    print("already indexed:", docNum+1)
        print("done indexing.")

    finally:
        # close the index
        writer.close()

In [91]:
addFilesToIndexDate(myIndexDate, filesToIndex)

done indexing.


In [92]:
myQueryParserDate = QueryParser("file_content", schema=myIndexDate.schema)
mySearcherDate = myIndexDate.searcher()

In [93]:
sampleQueryDate = myQueryParserDate.parse("go")
sampleQueryResultsDate = mySearcherDate.search(sampleQueryDate, limit=None, groupedby="file_year")

In [94]:
mySearcherDate.stored_fields(8)

{'file_path': 'lab-data\\documents\\email09', 'file_year': 2008}

In [95]:
res = sampleQueryResultsDate.groups()
print(res)

{2009: [5, 0], 2008: [8, 10, 9, 1], 2010: [2, 3], 2007: [6, 4, 7]}


In [96]:
for grp in res.keys():
    print(f"Group {grp}")
    for docnum in res[grp]:
        print(f"\t{docnum}\t{mySearcherDate.stored_fields(docnum)['file_path']}\t{sampleQueryResultsDate.score(docnum)}")

Group 2009
	5	lab-data\documents\email06	0.9515367147003012
	0	lab-data\documents\email01	1.268912431091308
Group 2008
	8	lab-data\documents\email09	0.902826206556481
	10	lab-data\documents\email14	0.8905624586380126
	9	lab-data\documents\email10	0.902826206556481
	1	lab-data\documents\email02	1.257405405496971
Group 2010
	2	lab-data\documents\email03	1.2457191647278192
	3	lab-data\documents\email04	1.2219953566378796
Group 2007
	6	lab-data\documents\email07	0.9273964639464627
	4	lab-data\documents\email05	1.20998345096161
	7	lab-data\documents\email08	0.9151546430082554
