This notebook is used for SI 650 Information Retrieval class. You should implement retrieval functions and report corresponding results in your submission on Canvas. 


In [100]:
# install metapy, it may take several minutes.
!pip install metapy
import metapy



In [101]:
# Reading Data
!wget -nc https://raw.githubusercontent.com/meta-toolkit/meta/master/data/lemur-stopwords.txt
!wget -N https://meta-toolkit.org/data/2016-11-10/cranfield.tar.gz
!tar xf cranfield.tar.gz
!wget -N http://www-personal.umich.edu/~shiyansi/cacm.tar.gz
!tar xf cacm.tar.gz

File ‘lemur-stopwords.txt’ already there; not retrieving.

--2018-10-18 22:55:48--  https://meta-toolkit.org/data/2016-11-10/cranfield.tar.gz
Resolving meta-toolkit.org (meta-toolkit.org)... 50.116.41.177, 2600:3c02::f03c:91ff:feae:b777
Connecting to meta-toolkit.org (meta-toolkit.org)|50.116.41.177|:443... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘cranfield.tar.gz’ not modified on server. Omitting download.

--2018-10-18 22:55:51--  http://www-personal.umich.edu/~shiyansi/cacm.tar.gz
Resolving www-personal.umich.edu (www-personal.umich.edu)... 141.211.243.103
Connecting to www-personal.umich.edu (www-personal.umich.edu)|141.211.243.103|:80... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘cacm.tar.gz’ not modified on server. Omitting download.



In [0]:
# Setting cranfield dataset
with open('cranfield/tutorial.toml', 'w') as f:
    f.write('type = "line-corpus"\n')
    f.write('store-full-text = true\n')

config = """prefix = "." # tells MeTA where to search for datasets

dataset = "cranfield" # a subfolder under the prefix directory
corpus = "tutorial.toml" # a configuration file for the corpus specifying its format & additional args

index = "cranfield-idx" # subfolder of the current working directory to place index files

query-judgements = "cranfield/cranfield-qrels.txt" # file containing the relevance judgments for this dataset

stop-words = "lemur-stopwords.txt"

[[analyzers]]
method = "ngram-word"
ngram = 1
filter = "default-unigram-chain"
"""
with open('cranfield-config.toml', 'w') as f:
    f.write(config)

In [0]:
# Setting cacm dataset
with open('cacm/tutorial.toml', 'w') as f:
    f.write('type = "line-corpus"\n')
    f.write('store-full-text = true\n')

config = """prefix = "." # tells MeTA where to search for datasets

dataset = "cacm" # a subfolder under the prefix directory
corpus = "tutorial.toml" # a configuration file for the corpus specifying its format & additional args

index = "cacm-idx" # subfolder of the current working directory to place index files

query-judgements = "cacm/cacm-qrels.txt" # file containing the relevance judgments for this dataset

stop-words = "lemur-stopwords.txt"

[[analyzers]]
method = "ngram-word"
ngram = 1
filter = "default-unigram-chain"
"""
with open('cacm-config.toml', 'w') as f:
    f.write(config)

In [0]:
# Make sure you have installed metapy package and downloaded the data before running the following code

In [0]:
# Build the index for dataset.
inv_idx_cran = metapy.index.make_inverted_index('cranfield-config.toml')
inv_idx_cacm = metapy.index.make_inverted_index('cacm-config.toml')


#** 3 Define New Retrieval Function**

**Please write your own retrieval function in the cell below**

In [0]:
import math
class  NewRF (metapy.index.RankingFunction):                                                                                                           
    def __init__(self, k1 = 1.2, b = 0.9, k3 = 500):                                             
        self.k1 = k1
        self.b = b
        self.k3 = k3
        # You *must* invoke the base class __init__() here!
        super(NewRF, self).__init__()                                        
                                                                                 
    def score_one(self, sd):
        """
        You need to override this function to return a score for a single term.
        
        You may want to call some of the following variables when implementing your retrieval function:
        1sd.avg_dl: average document length of the collection1
        1sd.num_docs: total number of documents in the index1
        v sd.total_terms: total number of terms in the index
        sd.query_length: the total length of the current query (sum of all term weights)
        1sd.query_term_weight: query term count (or weight in case of feedback)
        1sd.doc_count: number of documents that a term t_id appears in
        sd.corpus_term_count: number of times a term t_id appears in the collection
        1sd.doc_term_count: number of times the term appears in the current document
        1sd.doc_size: total number of terms in the current document
        sd.doc_unique_terms: number of unique terms in the current document
        """
        #Write your answer here
        
        k1 = self.k1
        b = self.b
        k3 = self.k3
        
        #Fill your answer here
        # modified ES 
        TF = sd.doc_term_count/(sd.doc_term_count + b * math.sqrt(sd.doc_size/sd.avg_dl))
        IDF = ((sd.corpus_term_count ** 3) * sd.num_docs / (sd.doc_count**4)) ** k1 
        QTF = (k3 + 1)* sd.query_term_weight / (k3 + sd.query_term_weight)
        return IDF * TF * QTF


In [179]:
b_list = [0.96,1.0,1.04,1.08,1.12]

k1_list = [0.3,0.4, 0.5, 0.6]
k3_list = [500,1000]

for k1_t in k1_list:
    for b_t in b_list:
        for k3_t in k3_list:
            ranker = NewRF(k1 = k1_t, b = b_t, k3 = k3_t)
            ev = metapy.index.IREval('cranfield-config.toml')
            num_results = 30
            precision_list = []
            with open('cranfield/cranfield-queries.txt') as query_file:
                for query_num, line in enumerate(query_file):
                    query = metapy.index.Document()
                    query.content(line.strip())
                    results = ranker.score(inv_idx_cran, query, num_results)                            
                    avg_p = ev.avg_p(results, query_num + 1, num_results)
                    precision_list.append(ev.precision(results,query_num+1,num_results))
            print("settings: "+str(k1_t)+' , '+str(b_t)+' , '+str(k3_t))
            print "cranfield-MAP", ev.map()
            
            ranker = NewRF(k1 = k1_t, b = b_t, k3 = k3_t)
            with open('cacm/cacm-queries.txt') as query_file:
                ev2 = metapy.index.IREval('cacm-config.toml')
                num_results = 30
                for query_num, line in enumerate(query_file):
                    query = metapy.index.Document()
                    query.content(line.strip())
                    results = ranker.score(inv_idx_cacm, query, num_results)                            
                    avg_p = ev2.avg_p(results, query_num + 1, num_results)
                    precision_list.append(ev2.precision(results, query_num+1, num_results))
            print "cacm_MAP", ev2.map()
  

settings: 0.3 , 0.96 , 500
cranfield-MAP 0.292336835835
cacm_MAP 0.26924280565
settings: 0.3 , 0.96 , 1000
cranfield-MAP 0.292336835835
cacm_MAP 0.269216500937
settings: 0.3 , 1.0 , 500
cranfield-MAP 0.293928691701
cacm_MAP 0.269349695609
settings: 0.3 , 1.0 , 1000
cranfield-MAP 0.29392443457
cacm_MAP 0.269349695609
settings: 0.3 , 1.04 , 500
cranfield-MAP 0.294830091579
cacm_MAP 0.268378913493
settings: 0.3 , 1.04 , 1000
cranfield-MAP 0.294843101227
cacm_MAP 0.268378913493
settings: 0.3 , 1.08 , 500
cranfield-MAP 0.295102982963
cacm_MAP 0.268274177184
settings: 0.3 , 1.08 , 1000
cranfield-MAP 0.295098880399
cacm_MAP 0.268269206176
settings: 0.3 , 1.12 , 500
cranfield-MAP 0.295642596757
cacm_MAP 0.269115250524
settings: 0.3 , 1.12 , 1000
cranfield-MAP 0.295601444494
cacm_MAP 0.268464208858
settings: 0.4 , 0.96 , 500
cranfield-MAP 0.300461703809
cacm_MAP 0.26980488726
settings: 0.4 , 0.96 , 1000
cranfield-MAP 0.300585160599
cacm_MAP 0.269787105491
settings: 0.4 , 1.0 , 500
cranfield-MAP

# Testing Search Results for a Single Query

In [0]:
query = metapy.index.Document()
query.content("ibm")
top_docs = ranker.score(inv_idx_cacm, query, num_results=5)

In [178]:
for num, (d_id, _) in enumerate(top_docs):
    content = inv_idx_cacm.metadata(d_id).get('content')
    print("{}. {}...\n".format(num + 1, content))

1. ibm 704 code nundrums...

2. character scanning on the ibm 7070...

3. counting ones on the ibm 7090...

4. starting approximations for square root calculation on ibm system 360 several starting approximations for square root calculation by newton method are presented in a form to facilitate their use in ibm system 360 square root routines these approximations include several for the range 1 16 1 which is the interval of primary interest on ibm system 360...

5. statistical programs for the ibm 650 part i a collection is given of brief descriptions of statistical programs now in use in university computing centers which have ibm 650...



**Please submit your code for  NewRF class to canvas. We need your code to verify your results.**