# Assignment 2: IR

## Preparations
* Put all your imports, and path constants in the next cells
* Make sure all your path constants are **relative to** ***DATA_DIR*** and **NOT hard-coded** in your code.

In [1]:
!pip install whoosh
!pip install pytrec_eval
!pip install wget



In [2]:
import wget
wget.download("https://github.com/MIE451-1513-2019/course-datasets/raw/master/government.zip", "government.zip")

'government (8).zip'

In [3]:
!unzip government.zip

Archive:  government.zip
replace government/topics-with-full-descriptions.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [0]:
# imports all necessary libraries
# Put all your imports here
from whoosh import index, writing, qparser, scoring
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os.path
from pathlib import Path
import tempfile
import subprocess
import pytrec_eval
import wget

import nltk
from nltk.stem import *

In [0]:
DATA_DIR = "government"
# Put other path constants here
DOCUMENTS_DIR = os.path.join(DATA_DIR, "documents")
TOPIC_FILE = os.path.join(DATA_DIR, "gov.topics")
QRELS_FILE = os.path.join(DATA_DIR, "gov.qrels")

## Question 1
Provide your text answers in the following two markdown cells

### Q1 (a): Provide answer to Q1 (a) here [markdown cell]
MAP (Mean Average Precision) is appropriate for measuring search system performance for goverment websites.

### Q1 (b): Provide answer to Q1 (b) here [markdown cell]
MAP is the mean of the average precision scores for each query. It is better to measure overall precision of a set of queries, and it has good discrimination and stability. This project is aim to investigate the performance of search engine for a government website, we care more about its overall performance over a set of queries, instead of precision for a specific topic.

## Question 2

### Q2 (a): Write your code below

In [0]:
# Put your code for creating the index here (you can add more cells).
# Make sure you save the final index in the variable INDEX_Q2, your query parser in QP_Q2, and your searcher in SEARCHER_Q2

def createIndex(schema):
    # Generate a temporary directory for the index
    indexDir = tempfile.mkdtemp()

    # create and return the index
    return index.create_in(indexDir, schema)

In [0]:
# first, define a Schema for the index
mySchema2 = Schema(file_path = ID(stored=True),
                  file_content = TEXT(analyzer = RegexTokenizer()))

# now, create the index at the path INDEX_DIR based on the new schema
myIndex2 = createIndex(mySchema2)

In [0]:
def addFilesToIndex(indexObj, fileList):
    # open writer
    writer = writing.BufferedWriter(indexObj, period=None, limit=1000)

    try:
        # write each file to index
        for docNum, filePath in enumerate(fileList):
            with open(filePath, "r", encoding="utf-8") as f:
                fileContent = f.read()
                writer.add_document(file_path = filePath,
                                    file_content = fileContent)

                # print status every 1000 documents
                if (int(docNum+1) % 1000 == 0):
                    print("already indexed:", docNum+1)
        print("done indexing.")

    finally:
        # close the index
        writer.close()

In [0]:
# Build a list of files to index
filesToIndex2 = [str(filePath) for filePath in Path(DOCUMENTS_DIR).glob("**/*") if filePath.is_file()]

In [10]:
addFilesToIndex(myIndex2, filesToIndex2)

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [0]:
# define a query parser for the field "file_content" in the index
myQueryParser2 = QueryParser("file_content", schema=myIndex2.schema)
mySearcher2 = myIndex2.searcher()

In [0]:
INDEX_Q2 = myIndex2 # Replace None with your index for Q2
QP_Q2 = myQueryParser2 # Replace None with your query parser for Q2
SEARCHER_Q2 = mySearcher2 # Replace None with your searcher for Q2

In [0]:
# run a sample query for the phrase "item"
sampleQuery2 = QP_Q2.parse("item")
sampleQueryResults2 = SEARCHER_Q2.search(sampleQuery2, limit=None)

# # inspect the result:
# # for each document print the rank and the score
# for (docnum, result) in enumerate(sampleQueryResults):
#     score = sampleQueryResults.score(docnum)
#     fileName = os.path.basename(result["file_path"])
#     print(fileName, docnum, score)

In [0]:
def pyTrecEval(topicFile, qrelsFile, queryParser, searcher):
    # Load topic file - a list of topics(search phrases) used for evalutation
    with open(topicFile, "r") as tf:
        topics = tf.read().splitlines()

    # create an output file to which we'll write our results
    tempOutputFile = tempfile.mkstemp()[1]
    with open(tempOutputFile, "w") as outputTRECFile:
        # for each evaluated topic:
        # build a query and record the results in the file in TREC_EVAL format
        for topic in topics:
            topic_id, topic_phrase = tuple(topic.split(" ", 1))
            #print(topic_id, topic_phrase)
            topicQuery = queryParser.parse(topic_phrase)
            topicResults = searcher.search(topicQuery, limit=None)
            for (docnum, result) in enumerate(topicResults):
                score = topicResults.score(docnum)
                #print("%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))
                outputTRECFile.write("%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))
    with open(qrelsFile, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(tempOutputFile, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, pytrec_eval.supported_measures)

    results = evaluator.evaluate(run)
    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    for query_id, query_measures in results.items():
        for measure, value in query_measures.items():
            if measure == "runid":
              continue
            print_line(measure, query_id, value)
    for measure in query_measures.keys():
        if measure == "runid":
              continue
        print_line(
            measure,
            'all',
            pytrec_eval.compute_aggregated_measure(
                measure,
                [query_measures[measure]
                 for query_measures in results.values()]))

      

In [15]:
pyTrecEval(TOPIC_FILE, QRELS_FILE, QP_Q2, SEARCHER_Q2)

num_q                    1       1.0000
num_ret                  1       1.0000
num_rel                  1       5.0000
num_rel_ret              1       0.0000
map                      1       0.0000
gm_map                   1       -11.5129
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0000
iprec_at_recall_0.00     1       0.0000
iprec_at_recall_0.10     1       0.0000
iprec_at_recall_0.20     1       0.0000
iprec_at_recall_0.30     1       0.0000
iprec_at_recall_0.40     1       0.0000
iprec_at_recall_0.50     1       0.0000
iprec_at_recall_0.60     1       0.0000
iprec_at_recall_0.70     1       0.0000
iprec_at_recall_0.80     1       0.0000
iprec_at_recall_0.90     1       0.0000
iprec_at_recall_1.00     1       0.0000
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0000
P_30                     1       0.000

### Q2 (b): Provide answer to Q2 (b) here [markdown cell]
MAP for all is 0.1971, indicating the performance is not that good. 

### Q2 (c): Provide answer to Q2(c) here [markdown cell]
The topics 1,2,6,7,9,16,18,28 did very bad, whose MAPs are 0, meaning no relevant documents were retrieved.
While topic 24 did very well with MAP score of 1.0, that implies all relevant documents are returned.
The others did pretty fair, MAP for topic 4 is 0.0312, for topic 10 is 0.1667, for topic 14 is 0.25, for topic 22 is 0.2 and for topic 26 is 0.1111.


## Question 3

In [0]:
def printRelName(topicFile, qrelsFile, queryParser, searcher, id):
  with open(topicFile, "r") as tf:
        topics = tf.read().splitlines()
  for topic in topics:
        topic_id, topic_phrase = tuple(topic.split(" ", 1))
        if topic_id == id:
          print("---------------------------Topic_id and Topic_phrase----------------------------------")
          print(topic_id, topic_phrase)
          topicQuery = queryParser.parse(topic_phrase)
          topicResults = searcher.search(topicQuery, limit=None)
          print("---------------------------Return documents----------------------------------")
          for (docnum, result) in enumerate(topicResults):
              score = topicResults.score(docnum)
              print("%s Q0 %s %d %lf test" % (topic_id, os.path.basename(result["file_path"]), docnum, score))
          print("---------------------------Relevant documents----------------------------------")
          with open(qrelsFile, 'r') as f_qrel:
            qrels = f_qrel.readlines()
            for i in qrels:
              qid, _, doc, rel = i.rstrip().split(" ")
              if qid == id and rel == "1":
                print(i.rstrip())

In [17]:
# Print out all topic_id and topic_phrase
def idList(TOPIC_FILE):
  ids = []
  with open(TOPIC_FILE, "r") as tf:
    topics = tf.read().splitlines()
    for i in range(len(topics)):
      for s in str.split(topics[i]):
        if s.isdigit():
          ids.append(int(s)) 
  return ids

ids = idList(TOPIC_FILE)

# Iterate with id from idList
for id in ids:
  result = printRelName(TOPIC_FILE, QRELS_FILE, QP_Q2, SEARCHER_Q2, str(id))

---------------------------Topic_id and Topic_phrase----------------------------------
1 mining gold silver coal
---------------------------Return documents----------------------------------
1 Q0 G00-90-0342721 0 26.645398 test
---------------------------Relevant documents----------------------------------
1 0 G00-00-1006224 1
1 0 G00-02-0901987 1
1 0 G00-03-1898526 1
1 0 G00-10-3730888 1
1 0 G00-10-3849661 1
---------------------------Topic_id and Topic_phrase----------------------------------
2 juvenile delinquency
---------------------------Return documents----------------------------------
2 Q0 G00-22-3396139 0 17.262139 test
2 Q0 G00-76-0415824 1 10.597055 test
2 Q0 G00-78-1531079 2 8.778648 test
2 Q0 G00-15-1718631 3 8.076860 test
2 Q0 G00-70-2787853 4 6.788751 test
2 Q0 G00-74-1394517 5 3.368380 test
---------------------------Relevant documents----------------------------------
2 0 G00-08-1145623 1
2 0 G00-37-1427392 1
---------------------------Topic_id and Topic_phrase-------

In [0]:
# with open(DOCUMENTS_DIR+'/02/G00-02-0541868', "r", encoding="utf-8") as f:
#   fileContent1 = f.read()
#   fileWords1 = fileContent.split()
  
#   wordsCounter1 = 0
#   for word in fileWords1:
#       # strip non-alphanumeric character, and convert to lower case
#       if word.strip(""" ,.*()[]!@#$%^&*{}?'`"-""").lower() == "and":
#           wordsCounter1 += 1
# print(wordsCounter1) 
# #print(fileContent)

In [0]:
# with open(DOCUMENTS_DIR+'/75/G00-75-2371200','r', encoding="utf-8") as f:
#   fileContent = f.read()
#   fileWords2 = fileContent.split()
  
#   wordsCounter2 = 0
#   for word in fileWords2:
#       # strip non-alphanumeric character, and convert to lower case
#       if word.strip(""" ,.*()[]!@#$%^&*{}?'`"-""").lower() == "and":
#           wordsCounter1 += 1
# print(wordsCounter2) 
# #print(fileContent)

### Q3 (a): Provide answer to Q3 (a) here [markdown cell]
The documents with high rank ususally contain more searching word in the form as it is in the query, so we firstly need to make it lowercase. Alosi, through stemming or lemmatization, we can reduce inflectional forms and sometimes derivationally related forms of a word to a common base form. Besides, high frequencies of apparance of stop-words such as "and", "or", "we" would impact the rank of document relevancey.

Let's pick topic 28 Early Childhood Education as an example.

The top-ranked document is G00-75-2371200 with score of 24.505669, but it is a irrelvant document, which is false positive.

The two relevant documents (G00-02-0541868 and G00-54-2576117) are even not be found and returned, they are false negative.

Therefore, we open and look into the two files, and found that, the relevant document contains many words like "child", which is lowercase and the stem of "childhood". Also, there are many stop words which reduce the document relevancy, such as "and", the occured in the relevant document 10 times, but none in the top-ranked document. This motivated me to remove the stop words.



### Q3 (b): Write your code below

In [20]:
# Download required resources
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
# Put your code for creating the index here (you can add more cells).
# Make sure you save the final index in the variable INDEX_Q3, your query parser in QP_Q3, and your searcher in SEARCHER_Q3

# Dont change this! Use it as-is in your code
# This filter will run for both the index and the query
from whoosh.analysis import Filter
class CustomFilter(Filter):
    is_morph = True
    def __init__(self, filterFunc, *args, **kwargs):
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs
    def __eq__(self):
        return (other
                and self.__class__ is other.__class__)
    def __call__(self, tokens):
        for t in tokens:
            if t.mode == 'query': # if called by query parser
                t.text = self.customFilter(t.text, *self.args, **self.kwargs)
                yield t
            else: # == 'index' if called by indexer
                t.text = self.customFilter(t.text, *self.args, **self.kwargs)
                yield t

In [22]:
# Customize a new analyzer
stmLwrStpIntraAnalyzer = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter() | StemFilter()

# define a Schema with the new analyzer
mySchema3 = Schema(file_path = ID(stored=True),
                   file_content = TEXT(analyzer = stmLwrStpIntraAnalyzer))

# create the index based on the new schema
myIndex3 = createIndex(mySchema3)

# add files to index
filesToIndex3 = [str(filePath) for filePath in Path(DOCUMENTS_DIR).glob("**/*") if filePath.is_file()]
addFilesToIndex(myIndex3, filesToIndex3)

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [0]:
# Define a query parser for the field "file_content" in the index
myQueryParser3 = QueryParser("file_content", schema=myIndex3.schema)
mySearcher3 = myIndex3.searcher()

In [0]:
INDEX_Q3 = myIndex3 # Replace None with your index for Q3
QP_Q3 = myQueryParser3 # Replace None with your query parser for Q3
SEARCHER_Q3 = mySearcher3 # Replace None with your searcher for Q3

In [25]:
pyTrecEval(TOPIC_FILE, QRELS_FILE, QP_Q3, SEARCHER_Q3)

num_q                    1       1.0000
num_ret                  1       3.0000
num_rel                  1       5.0000
num_rel_ret              1       0.0000
map                      1       0.0000
gm_map                   1       -11.5129
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0000
iprec_at_recall_0.00     1       0.0000
iprec_at_recall_0.10     1       0.0000
iprec_at_recall_0.20     1       0.0000
iprec_at_recall_0.30     1       0.0000
iprec_at_recall_0.40     1       0.0000
iprec_at_recall_0.50     1       0.0000
iprec_at_recall_0.60     1       0.0000
iprec_at_recall_0.70     1       0.0000
iprec_at_recall_0.80     1       0.0000
iprec_at_recall_0.90     1       0.0000
iprec_at_recall_1.00     1       0.0000
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0000
P_30                     1       0.000

### Q3 (c): Provide answer to Q3 (c) here [markdown cell]

In this part, we modified the Analyzer, adding LowerCaserFilter, IntraWordFilter, StopFilter and StemFilter, all modifications would find more key words which match the search quries. So the overall performance is improved, the current MAP is 0.3366, better than previous one.

Using function "printRelName" to show the ground true relevant files and the files the systme return, found that all relevant files are returned, that indicates the performance of false negative improved a lot. But the highest-ranked document is still G00-75-2371200 (false positive), no improvment.

In [26]:
printRelName(TOPIC_FILE, QRELS_FILE, QP_Q3, SEARCHER_Q3, "28")

---------------------------Topic_id and Topic_phrase----------------------------------
28 Early Childhood Education
---------------------------Return documents----------------------------------
28 Q0 G00-75-2371200 0 19.649658 test
28 Q0 G00-93-3702508 1 17.522582 test
28 Q0 G00-93-4160214 2 17.337695 test
28 Q0 G00-48-1527977 3 17.337695 test
28 Q0 G00-99-2279811 4 17.240218 test
28 Q0 G00-61-3894960 5 17.215261 test
28 Q0 G00-31-0429249 6 16.852799 test
28 Q0 G00-30-2788847 7 16.811836 test
28 Q0 G00-78-2978026 8 16.734738 test
28 Q0 G00-28-3705847 9 16.719697 test
28 Q0 G00-50-3231467 10 16.661427 test
28 Q0 G00-74-2972556 11 16.341956 test
28 Q0 G00-93-1203370 12 16.022081 test
28 Q0 G00-91-3997333 13 15.976799 test
28 Q0 G00-77-3295130 14 15.920519 test
28 Q0 G00-49-2602614 15 15.752977 test
28 Q0 G00-02-0541868 16 15.437172 test
28 Q0 G00-04-3016417 17 15.070530 test
28 Q0 G00-16-2494170 18 14.951547 test
28 Q0 G00-82-0211909 19 14.947712 test
28 Q0 G00-78-0877232 20 14.848352 te

### Q3 (d): Provide answer to Q3 (d) here [markdown cell]

Yes, for now, the MAP is 0.3366, performance is got much better by around 71%.



### Q3 (e): Provide answer to Q3 (e) here [markdown cell]

Yes, topic 2,4,9,10,14,18,28 all got better, while topic 22 and 26 got even worse. 

Topic 1,6,7,16 remain bad and topic 24 remain good.


### Q3 (f): Provide answer to Q3 (f) here [markdown cell]

Yes, it's good, at least this means can return more relevant documents, and the overall performance got better, and a lot of individual topic also got better performance accoridng to their MAP score. But meantime we retrieved more false positive canses.

## Question 4

In [27]:
# Put your code for creating the index here (you can add more cells).
# Make sure you save the final index in the variable INDEX_Q4, your query parser in QP_Q4, and your searcher in SEARCHER_Q4

# Customize a new filter
myFilter = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter() | StemFilter() | CustomFilter(LancasterStemmer().stem)  

# define a Schema with the new analyzer
mySchema4 = Schema(file_path = ID(stored=True),
                   file_content = TEXT(analyzer = myFilter))

# create the index based on the new schema
myIndex4 = createIndex(mySchema4)

filesToIndex4 = [str(filePath) for filePath in Path(DOCUMENTS_DIR).glob("**/*") if filePath.is_file()]

addFilesToIndex(myIndex4, filesToIndex4)


already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [0]:

# Define a query parser for the field "file_content" in the index
# By default, the parser treats the words of query as if they were connected by "AND", but usually people expect documents contain more of words they searched for score higher
# So using parser of "OR" instead, any document containing any of query word will be presented
# The "factory()" is a scalling factor on the bonus (between 0 and 1)
myQueryParser4 = QueryParser("file_content", schema=myIndex4.schema, group=qparser.OrGroup.factory(0.9))

#myQueryParser4 = myQueryParser.add_plugin(qparser.FuzzyTermPlugin())
# From reference website, the optimal parameter of b is between 0.3 and 0.9, and k1 is typically evaluated in the range of 0 to 3, and the optimal will be from 0.5 to 2.0, 
# but after tuning found B=0.54 and K1=2.5 performed best
mySearcher4 = myIndex4.searcher(weighting=scoring.BM25F(B=0.54, K1=2.5))

### Please answer the following questions here
(a) A clear list of all final modifications made. 

1. Added a CustomFilter(LancasterStemmer()) to analyzer
2. Change the parser to "OR" instead of default "AND"
3. Changed scoring method to BM25F (BestMatching) whihc is a ranking function to estimate the relevance of documents to a given search query



(b)  Why each modification was made – how did it help? 



1. CustomFilter(LancasterStemmer) would make the words "cleaner", which would including more relevant words 
2. People usually expect documents contain more of words they searched for score higher, thus using "OR" would return any document containing any of search query compared to "AND" in the parser
3. BM25F is a weighting scoring method based on the relevance. It is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document, regardless of their proximity within the document.
Through hyperparameter tuning, B=0.54 and K1=2.5 performed best.


(c)  The  final  MAP  performance  that  these  modifications  attained.

The final overall MAP score is 0.4110, improved by 22.1%(MAP is 0.3366 for Q3)





In [29]:
INDEX_Q4 = myIndex4 # Replace None with your index for Q4
QP_Q4 = myQueryParser4 # Replace None with your query parser for Q4
SEARCHER_Q4 = mySearcher4 # Replace None with your searcher for Q4

pyTrecEval(TOPIC_FILE, QRELS_FILE, QP_Q4, SEARCHER_Q4)

num_q                    1       1.0000
num_ret                  1       469.0000
num_rel                  1       5.0000
num_rel_ret              1       5.0000
map                      1       0.0659
gm_map                   1       -2.7203
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0526
iprec_at_recall_0.00     1       0.1034
iprec_at_recall_0.10     1       0.1034
iprec_at_recall_0.20     1       0.1034
iprec_at_recall_0.30     1       0.1034
iprec_at_recall_0.40     1       0.1034
iprec_at_recall_0.50     1       0.1034
iprec_at_recall_0.60     1       0.1034
iprec_at_recall_0.70     1       0.0519
iprec_at_recall_0.80     1       0.0519
iprec_at_recall_0.90     1       0.0472
iprec_at_recall_1.00     1       0.0472
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0500
P_30                     1       0.10

In [30]:
printRelName(TOPIC_FILE, QRELS_FILE, QP_Q4, SEARCHER_Q4, "28")

---------------------------Topic_id and Topic_phrase----------------------------------
28 Early Childhood Education
---------------------------Return documents----------------------------------
28 Q0 G00-75-2371200 0 18.014298 test
28 Q0 G00-11-3066108 1 17.921780 test
28 Q0 G00-93-3702508 2 17.320336 test
28 Q0 G00-93-1203370 3 15.628318 test
28 Q0 G00-54-2576117 4 15.109240 test
28 Q0 G00-77-3295130 5 14.682202 test
28 Q0 G00-02-0541868 6 13.827817 test
28 Q0 G00-27-2159399 7 13.659349 test
28 Q0 G00-78-1531079 8 13.080391 test
28 Q0 G00-82-0211909 9 13.013776 test
28 Q0 G00-74-1394517 10 12.915128 test
28 Q0 G00-78-0877232 11 12.690096 test
28 Q0 G00-69-0204239 12 12.577726 test
28 Q0 G00-99-2279811 13 12.023816 test
28 Q0 G00-78-2978026 14 11.989399 test
28 Q0 G00-28-3705847 15 11.864604 test
28 Q0 G00-93-4160214 16 11.850933 test
28 Q0 G00-48-1527977 17 11.850933 test
28 Q0 G00-61-3894960 18 11.781422 test
28 Q0 G00-50-3231467 19 11.738381 test
28 Q0 G00-74-2972556 20 11.653364 te

## Validation

In [0]:
# Run the following cells to make sure your code returns the correct value types

In [0]:
from whoosh.index import FileIndex
from whoosh.qparser import QueryParser
from whoosh.searching import Searcher
import os.path

### Q2 Validation

In [33]:
assert(isinstance(INDEX_Q2, FileIndex)), "Index Type"
assert(isinstance(QP_Q2, QueryParser)), "Query Parser Type"
assert(isinstance(SEARCHER_Q2, Searcher)), "Searcher Type"
print("Q2 Types Validated")

Q2 Types Validated


### Q3 Validation

In [34]:
assert(isinstance(INDEX_Q3, FileIndex)), "Index Type"
assert(isinstance(QP_Q3, QueryParser)), "Query Parser Type"
assert(isinstance(SEARCHER_Q3, Searcher)), "Searcher Type"
print("Q3 Types Validated")

Q3 Types Validated


### Q4 Validation (Graduate Students)

In [35]:
assert(isinstance(INDEX_Q4, FileIndex)), "Index Type"
assert(isinstance(QP_Q4, QueryParser)), "Query Parser Type"
assert(isinstance(SEARCHER_Q4, Searcher)), "Searcher Type"
print("Q4 Types Validated")

Q4 Types Validated
