# Information Retrieval

In [3]:
import ParserCACM
import TextRepresenter
import indexation
import modeles
from query import QueryParserCACM
import evaluation
import os
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import operator
# Auto reload the imported modules when running cells
%load_ext autoreload 
%autoreload 2

# Constants:
srcFolder = "cacm/" 
srcFile = "cacm.txt"
qryFile = "cacm.qry"
relFile = "cacm.rel"
gendata = "gendata" #output folder
indexName = "cacm"


cacm_txt = os.path.join(srcFolder, srcFile)
cacm_qry = os.path.join(srcFolder, qryFile)
cacm_rel = os.path.join(srcFolder, relFile)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Testing ParserCACM:

In [11]:
print("##### Testing ParserCACM #####")
parser = ParserCACM.ParserCACM()
parser.initFile(cacm_txt)
doc = parser.nextDocument()
noErrors = True
nbDocs = 3205
i = 1
while doc is not None:
    docId = int(doc.getId())
    docTxt = doc.getText()
    if docId != i:
        noErrors = False
        print("Error, doc at position %d is #%d" % (i, docId))
    if docId == 1:
        if ("Preliminary Report-International Algebraic Language" not in docTxt and
        "Perlis" not in docTxt and 
        "Samelson" not in docTxt):
            print("Error, the document #1 is not as expected.")
        else:
            print("Doc #1 is as expected")
        
    elif docId == 178:
        if ("ROOTFINDER" not in docTxt and
        "Thacher" not in docTxt):
            print("Error, the document #178 is not as expected.")
        else:
            print("Doc #178 is as expected")        
            
    elif docId == 3204:
        if ("An On-Line Program for Non-Numerical Algebra" not in docTxt and
        "The goal of this program is" not in docTxt and
        "console at Stanford University." not in docTxt):
            print("Error, the document #3204 is not as expected.")
        else:
            print("Doc #3204 is as expected")        
        
    i += 1
    doc = parser.nextDocument()
    
if i == nbDocs:
    print("Success: there are %d documents as expected" % nbDocs)
else:
    print("Error, found %d docs, should contain %d docs" % (i, nbDocs))
    noErrors = False

if noErrors:
    print("##### Test: success #####")

##### Testing ParserCACM #####
Doc #1 is as expected
Doc #178 is as expected
Doc #3204 is as expected
Success: there are 3205 documents as expected
##### Test: success #####


## Testing PorterStemmer:

In [5]:
print("##### Testing PorterStemmer start    #####")
stemmer = TextRepresenter.PorterStemmer()
txtRepr = (stemmer.getTextRepresentation("Information retrieval (IR) is the activity of \
    obtaining information resources relevant to an \
    information need from a collection of information resources"))
expectDic = {'resourc':2, 'inform':4, 'relev':1,
             'retriev':1, 'activ': 1, 'obtain':1,
             'collect':1, 'ir':1}
if txtRepr != expectDic:
    print("Error, stemmer does not work as expected")
else:
    print("Stemmer works as expected")
print("##### Testing PorterStemmer finished #####")

##### Testing PorterStemmer start    #####
Stemmer works as expected
##### Testing PorterStemmer finished #####


## Testing the indexation:

In [6]:
# Construct the index:
idx = indexation.Index(indexName, gendata)
%time idx.indexation(cacm_txt, parser, stemmer, verbose=True)

Performing the indexation...
1st pass: build the index...
2nd pass: build the inverted index...
Finished.
CPU times: user 5 s, sys: 636 ms, total: 5.63 s
Wall time: 6.77 s


In [7]:
print("\n###### Testing the index: ###### ")

words = ["logic", "nation", "test"]
noErrors = True
for word in words:
    idxDocContain = set(idx.getTfsForStem(word).keys())

    parser.initFile(cacm_txt)
    doc = parser.nextDocument()
    parserDocContain = set()
    while doc is not None:
        docText = doc.getText()
        docStems = stemmer.getTextRepresentation(docText).keys()
        docId = int(doc.getId())
        if word in docStems:
            parserDocContain.add(docId)
            if docId not in idxDocContain:
                print("Doc #%d contains %s: “%s”" % (docId, word, docText))
        elif docId in idxDocContain:
            print("Doc #%d should contain %s: “%s”" % (docId, word, docText))

        doc = parser.nextDocument()

    if len(idxDocContain - parserDocContain) > 0 or len(parserDocContain - idxDocContain) > 0:
        print("Step failed for word '%s'" % word)
        break
    else:
        print("Step succeeded for word '%s'." % word)
    
if noErrors:
    print("##### Test: succes #####")
else:
    print("##### Test: fail #####")


###### Testing the index: ###### 
Step succeeded for word 'logic'.
Step succeeded for word 'nation'.
Step succeeded for word 'test'.
##### Test: succes #####


In [8]:
print("Links from the 1st article:", sorted(idx.getSuccNodes('1')))
print("(Should be [43, 53, 91, ..., 1883, 1982, 3184])")
print("\nLinks from the 2nd article:", idx.getSuccNodes('2'))
print("(Should be empty)")

print("\nLinks to the 1st article:", sorted(idx.getPrevNodes('1')))
print("(Should be [43, 53, 91, ..., 1883, 1982, 3184])")
print("\nLinks to the 43rd article:", sorted(idx.getPrevNodes('43')))
print("(Should be [1, 205])")


Links from the 1st article: [43, 53, 91, 100, 123, 164, 165, 196, 205, 210, 214, 324, 398, 410, 642, 669, 1273, 1883, 1982, 3184]
(Should be [43, 53, 91, ..., 1883, 1982, 3184])

Links from the 2nd article: []
(Should be empty)

Links to the 1st article: [43, 53, 91, 100, 123, 164, 165, 196, 205, 210, 214, 324, 398, 410, 642, 669, 1273, 1883, 1982, 3184]
(Should be [43, 53, 91, ..., 1883, 1982, 3184])

Links to the 43rd article: [1, 205]
(Should be [1, 205])


In [9]:
# Choosing what to test:
test_binaryweighter = True
test_tfidfweighter = True
test_queryparser = True
test_PRrecallmeasure = True
test_averageprecision = True
test_eval_ir_model = True
test_unigram_model = True
test_okapi = True
gridsearch_language = True

## Testing the weighters:

In [13]:
query = stemmer.getTextRepresentation(" Parallel languages; languages for parallel computation")

if test_binaryweighter:
    print("\n#####Testing BinaryWeighter: #####")

    bw = modeles.BinaryWeighter(idx)
    print("bw.getDocWeightsForDoc(",docId,"):", 
        bw.getDocWeightsForDoc(docId))
    print("\nbw.getDocWeightsForStem('logic'):", 
        bw.getDocWeightsForStem("logic"))
    print("\nbw.getWeightsForQuery(' Parallel languages; languages for parallel computation'):", 
        bw.getWeightsForQuery(query))

    print("\n#####Testing Vectoriel with BinaryWeighter: #####")
    vect = modeles.Vectoriel(idx, bw)
    print("Top 10 documents for the previous query:")
    print("""Should include some of the following docs: 1043, 1188, 1306, 
    1358, 1396, 1491, 1923, 2246, 2316, 2527, 2699, 2710, 2715, 2716, 
    2906, 2923, 2956, 3073, 3150, """)
    print(vect.getRanking(query)[:10])
else:
    print("\n##### Skipping the test of BinaryWeighter #####")
    


#####Testing BinaryWeighter: #####
bw.getDocWeightsForDoc( 3204 ): {'1': 1, 'te': 1, 'assist': 1, 'santa': 1, 'access': 1, '5': 1, 'compil': 1, 'chosen': 1, 'easi': 1, 'written': 1, 'requir': 1, 'time': 2, 'debugg': 1, 'stanford': 1, 'comput': 2, 'numer': 1, 'obtain': 1, 'short': 1, 'design': 1, 'korsvold': 1, 'program': 4, 'monica': 1, 'consol': 1, 'programm': 1, 'line': 1, 'lisp': 1, 'develop': 1, 'teletyp': 1, 'automat': 1, 'goal': 1, 'california': 1, 'corpor': 1, '32': 1, 'univers': 1, 'remot': 1, 'step': 1, 'share': 1, 'result': 1, 'mathemat': 1, 'algebra': 1}
Error with the representation. Exit


NameError: name 'exit' is not defined

In [None]:
tfidfWeighter = modeles.TfidfWeighter(idx)
vect = modeles.Vectoriel(idx, tfidfWeighter)

if test_tfidfweighter:
    print("\n###### Testing TfidfWeighter: ###### ")
    print("tfidfWeighter.getDocWeightsForDoc(", docId, "):",
         tfidfWeighter.getDocWeightsForDoc(docId))
    print("\ntfidfWeighter.getDocWeightsForStem(\"logic\"):",
         tfidfWeighter.getDocWeightsForStem(wordTest))
    print("\ntfidfWeighter.getWeightsForQuery(query):",
         tfidfWeighter.getWeightsForQuery(query))
    print("\n###### Testing Vectoriel with TfidfWeighter: ###### ")
    print("Top 10 documents for the query:")
    print(vect.getRanking(query)[:10])
else:
    print("\n##### Skipping the test of TfidfWeighter #####")

## Testing the QueryParser

In [None]:
if test_queryparser:
    print("\n###### Testing QueryParserCACM: ###### ")
    qp = QueryParserCACM(cacm_qry, cacm_rel)
    query = qp.nextQuery()
    print("Searching for query #%d:" % queryId)
    while query is not None and query.getID() != str(queryId):
        #print(query)
        #print(20*'-')
        query = qp.nextQuery()

    print("Query:", query)
    queryTxt = stemmer.getTextRepresentation(query.getText())
    print("\nCompute scores:")
    scores = vect.getRanking(queryTxt)
    print(scores[:10])
    print("Done.")
else:
    print("\n##### Skipping the test of QueryParserCACM #####")

## Testing precision/recall measure:


In [None]:
if test_PRrecallmeasure:
    print("\n###### Testing evaluation.PrecisionRecallMeasure: ###### ")
    queryChosen = np.random.randint(1, 50, size=10)
    queryChosen =[7, 10, 25]#, 14, 26, 27, 42, 43] # lots of relevant results
    qp = QueryParserCACM(cacm_qry, cacm_rel)
    query = qp.nextQuery()
    print("Searching for query #", queryChosen)
    while query is not None :
        if int(query.getID()) in queryChosen:
            print("Query:", query)
            queryTxt = stemmer.getTextRepresentation(query.getText())
            print("Retrieve scores...")
            %time ranking = vect.getRanking(queryTxt)
            print("Create PrecisionRecallMeasure object")
            irlist = evaluation.IRList(query, ranking)
            precisRecall = evaluation.PrecisionRecallMeasure(irlist)
            print("Evaluate the scores.")
            pr = precisRecall.eval(verbose=True, nbLevel=100) # (recall, precision)
            precision = [p for r,p in pr]
            recall = [r for r,p in pr]
            plt.plot(recall, precision)
            plt.title("Precision-Recall for query #"+query.getID())
            plt.show()
            print(20*'-')


        query = qp.nextQuery()
    print("Done.")
else:
    print("\n##### Skipping the test of PrecisionRecallMeasure #####")

## Testing average precision:

In [None]:
if test_averageprecision:
    print("\n###### Testing evaluation.AveragePrecision: ###### ")
    queryChosen = np.random.randint(1, 50, size=10)
    #queryChosen =[7, 10, 14, 29]#, 25, 26, 27, 42, 43] # lots of relevant results
    qp = QueryParserCACM(cacm_qry, cacm_rel)
    query = qp.nextQuery()
    print("Searching for query #",queryChosen, "\n")
    while query is not None :
        if int(query.getID()) in queryChosen:
            print("Query:", query)
            queryTxt = stemmer.getTextRepresentation(query.getText())
            print("Retrieve scores...")
            scores = vect.getRanking(queryTxt)
            irlist = evaluation.IRList(query, scores)
            average_measure = evaluation.AveragePrecision(irlist)
            print("Evaluate the scores.")
            average_prec = average_measure.eval(verbose=True)
            print("Average precision: %f" % average_prec)
            print(20*'-')

        query = qp.nextQuery()
    print("Done.")
else:
    print("\n##### Skipping the test of AveragePrecision #####")

## Testing EvalIRModel:

In [None]:
if test_eval_ir_model:
    print("\n###### Testing evaluation.EvalIRModel: ###### ")
    queryChosen = np.random.randint(1, 50, size=50)
    #queryChosen =[7, 10, 14]#, 25, 26, 27, 42, 43] # lots of relevant results
    queries = []
    qp = QueryParserCACM(cacm_qry, cacm_rel)
    query = qp.nextQuery()
    print("Searching for queries #",queryChosen, "...")
    while query is not None :
        if int(query.getID()) in queryChosen:
            queries.append(query)
        query = qp.nextQuery()
    print("Found queries")
    irmodels = {"vectoriel": modeles.Vectoriel(idx, tfidfWeighter)}
    measures = {"averagePrecision": evaluation.AveragePrecision}
    eval_model = evaluation.EvalIRModel(queries, irmodels, measures)
    print("Calling eval()...")
    print(eval_model.eval(verbose=False))
    print("Done.")
else:
    print("\n##### Skipping the test of EvalIRModel #####")

## Testing the unigram language model:

In [None]:
if test_unigram_model:
    print("\n###### Testing Unigram language: ###### ")
    queryChosen = np.random.randint(1, 50, size=3)
    #queryChosen =[7, 10, 14]#, 25, 26, 27, 42, 43] # lots of relevant results
    queries = []
    relevants = {} #dict of {query id : list of relevant doc id}
    qp = QueryParserCACM(cacm_qry, cacm_rel)
    query = qp.nextQuery()
    print("Searching for queries #",queryChosen, "...")
    while query is not None :
        if int(query.getID()) in queryChosen:
            queries.append(query)
            relevants[query.getID()] = list(query.getRelevants().keys())
            #print(query)
        query = qp.nextQuery()
    print("Found queries")
    model = modeles.UnigramLanguage(idx, 0.8)
    for q, (q_id, relev) in zip(queries, relevants.items()):
        print(q)
        print("Scores for 3 relevant docs:")
        for doc_id in np.random.choice(relev, size=3):
            print(doc_id, model.score(stemmer.getTextRepresentation(q.getText()), doc_id))
        print("Scores for 3 random docs:")
        for doc_id in np.random.choice(idx.getDocsID(), size=3):
            if doc_id in relev:
                print("Doc #", doc_id, "Relevant doc")
            else:
                stems = ','.join(idx.getTfsForDoc(doc_id).keys())
                print("Irrelevant doc #", doc_id, stems)
                print("Score for this doc:", 
                      model.score(stemmer.getTextRepresentation(q.getText()), doc_id))
        print(20*'-')
    print("Done.")
else:
    print("\n##### Skipping the test of LanguageModel #####")

##  Testing Okapi language model:

In [None]:
if test_okapi:
    print("\n###### Testing Okapi: ###### ")
    queryChosen = np.random.randint(1, 50, size=3)
    #queryChosen =[7, 10, 14]#, 25, 26, 27, 42, 43] # lots of relevant results
    queries = []
    relevants = {} #dict of {query id : list of relevant doc id}
    qp = QueryParserCACM(cacm_qry, cacm_rel)
    query = qp.nextQuery()
    print("Searching for queries #",queryChosen, "...")
    while query is not None :
        if int(query.getID()) in queryChosen:
            queries.append(query)
            relevants[query.getID()] = list(query.getRelevants().keys())
            #print(query)
        query = qp.nextQuery()
    print("Found queries")
    
    okapi = modeles.Okapi(idx, k=1, b=1)
    for q, (q_id, relev) in zip(queries, relevants.items()):
        print(q)
        print("Scores for 3 relevant docs:")
        for doc_id in np.random.choice(relev, size=3):
            stems = ','.join(idx.getTfsForDoc(doc_id).keys())
            print("Relevant doc #", doc_id, stems)
            print("Score:", 
                  okapi.score(stemmer.getTextRepresentation(q.getText()), doc_id, verbose=True))
        print("Scores for 3 random docs:")
        for doc_id in np.random.choice(idx.getDocsID(), size=3):
            if doc_id in relev:
                print("Doc #", doc_id, "Relevant doc")
            else:
                stems = ','.join(idx.getTfsForDoc(doc_id).keys())
                print("Irrelevant doc #", doc_id, stems)
                print("Score for this doc:", 
                      okapi.score(stemmer.getTextRepresentation(q.getText()), doc_id, verbose=True))
        print(20*'-')
    print("Done.")
else:
    print("\n##### Skipping the test of Okapi #####")

In [None]:
print("\n###### Testing Okapi & AveragePrecision: ###### ")
for query in queries:
    print("Query:", query)
    queryTxt = stemmer.getTextRepresentation(query.getText())
    print("Retrieve scores...")
    scores = okapi.getRanking(queryTxt)
    irlist = evaluation.IRList(query, scores)
    average_measure = evaluation.AveragePrecision(irlist)
    print("Evaluate the scores.")
    average_prec = average_measure.eval(verbose=True)
    print("Average precision: %f" % average_prec)
    print(20*'-')

print("Done.")

## Comparison of language models:

In [None]:
from sklearn.model_selection import train_test_split
import itertools 

# Searching queries:
queries = []
qp = QueryParserCACM(cacm_qry, cacm_rel)
query = qp.nextQuery()
print("Searching for queries #",queryChosen, "...")
while query is not None :
    if int(query.getID()) in queryChosen:
        queries.append(query)
    query = qp.nextQuery()
print("Found all queries")

q_train, q_test = train_test_split(queries)

# Train models, find best parameters.

def dict_combinations(dic):
    keys = dic.keys()
    #print(keys)
    values = [dic[key] for key in keys]
    #print("values:", list(values), ".")
    #for combination in itertools.product(*values):
        #print(combination)
    combinations = [dict(zip(keys, combination)) for combination in itertools.product(*values)]
    return combinations

def gridsearch(model_class, param_grid, queries, measure_object, verbose=False):
    """
    :param model_class: modeles.Vectoriel for instance (the class, not an instance)
    :param param_grid: dict of {string:iterable}
    :param queries: list of Query objects
    :param measure_class: evaluation.AveragePrecision() for instance
    """
    params = []
    irmodels = {}
    for i, comb in enumerate(dict_combinations(param_grid)):
        params.append(comb)
        irmodels[i] = model_class(**comb)
    eval_models = evaluation.EvalIRModel(queries, irmodels, {'measure':measure_object})
    if verbose:
        print("Calling eval()")
        scores = eval_models.eval(verbose=verbose)
        for k,v in scores.items():
            print(params[k[0]])
            print("--->", v[0])
    else:
        scores = eval_models.eval()
    best_irmodel = max(scores.keys(), key=(lambda key: scores[key][0]))[0]
    return params[best_irmodel]

if gridsearch_language:
    unigram_params_grid = {'index':[idx], 'regularization':np.linspace(0,1, 20)}

    #c = dict_combinations(params_grid)
    #print("comb: ",c)

    best_unigram_params = gridsearch(modeles.UnigramLanguage, unigram_params_grid, 
                     q_train, evaluation.AveragePrecision, verbose=1)
else:
    best_unigram_params = {'index': idx, 'regularization': 0.36842105263157893}



In [None]:
if gridsearch_language:
    okapi_params_grid = {'index':[idx], 'k':np.linspace(1, 2, 10), 'b':np.linspace(0,2,10)}
    best_okapi_params = gridsearch(modeles.Okapi, okapi_params_grid, 
                 q_train, evaluation.AveragePrecision, verbose=1)

else:
    best_okapi_params = {'index': idx, 'k': 2.0, 'b': 0.5}

### Execution on the test set:

In [None]:
print("best unigram params:", best_unigram_params)
print("best okapi params:", best_okapi_params)

irmodels = {'unigram':modeles.UnigramLanguage(**best_unigram_params),
            'okapi':modeles.Okapi(**best_okapi_params)}

eval_models = evaluation.EvalIRModel(q_test, irmodels, {'measure':evaluation.AveragePrecision})
scores = eval_models.eval(verbose=True)
print(scores)