# Project Part 2

## Importing / Setting up Pyterrier

In [1]:
# Getting the post reader
from post_parser_record import PostParserRecord
post_reader = PostParserRecord("SPosts.xml")

In [2]:
# Making sure correct version of pyterrier is installed
!pip install python-terrier
!pip install --upgrade git+https://github.com/terrier-org/pyterrier.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/terrier-org/pyterrier.git
  Cloning https://github.com/terrier-org/pyterrier.git to /tmp/pip-req-build-hu_q_74_
  Running command git clone -q https://github.com/terrier-org/pyterrier.git /tmp/pip-req-build-hu_q_74_
Building wheels for collected packages: python-terrier
  Building wheel for python-terrier (setup.py) ... [?25l[?25hdone
  Created wheel for python-terrier: filename=python_terrier-0.9.1-py3-none-any.whl size=111552 sha256=0ca3ac90f69893147d935f72b3f05a53ce5dfd1b3c4744c4bca65a3b85baa69b
  Stored in directory: /tmp/pip-ephem-wheel-cache-no0ga69l/wheels/61/12/f7/d3c3d17f72ab9ad1c5d510a0d6bd1612023e01fa0e07f01059
Successfully built python-terrier
Installing collected packages: python-terrier
  Attempting uninstall: python-terrier
    Found existin

In [3]:
# Importing and starting pyterrier
import pandas as pd
import pyterrier as pt
if not pt.started():
  pt.init(boot_packages=['com.github.terrierteam:terrier-prf:-SNAPSHOT'])

PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7



In [4]:
# Installing ranx and importing necessary libraries
!pip install -U ranx
from ranx import Qrels, Run, evaluate, compare, fuse

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Indexing

In [5]:
# reading questions into a list
questions = []
for question_id in post_reader.map_questions:
  question = post_reader.map_questions[question_id]
  questions.append({'docno':str(question_id), 'title': question.title, 'body': question.body})

In [6]:
# reading answers into a list
answers = []
for answer_id in post_reader.map_just_answers:
  answer = post_reader.map_just_answers[answer_id]
  answers.append({'docno':str(answer_id), 'title': "", 'body': answer.body})

In [7]:
# adding answers list to questions list
alldocs = questions + answers

In [8]:
# creating the index which is a pandas dataframe
iter_indexer = pt.IterDictIndexer("./index", meta={'docno': 20, 'title': 10000, 'body':20000},
overwrite=True)
iter_indexer.setProperty("tokeniser", "UTFTokeniser")
RETRIEVAL_FIELDS = ['body', 'title']
indexref1 = iter_indexer.index(alldocs, fields=RETRIEVAL_FIELDS)

### Setting up Queries

In [9]:
# Dataframe of all 20 queries with their query id
queries = pd.DataFrame([["1", "What was the purpose of what happened to Trip in the last episode?"], 
                        ["2", "Sometimes zombies are flammable, while other times they are not at all (since they are a big squishy wet blob). Have the zombies in The Walking Dead been shown to be more or less flammable as a living human?"],
                        ["3", "What is the Positronic brain coined by Asimov?"],
                        ["4", "Was Howland Reed the Knight of the Laughing Tree?"],
                        ["5", "Why didn't the basilisk bite in Chamber of Secrets destroy the Horcrux?"],
                        ["6", "What influenced the far-future setting of Viriconium?"],
                        ["7", "Why don't zombies eat each other?"],
                        ["8", "When did Dumbledore realize that Voldemort was not 100% dead?"],
                        ["9", "Why Did Mrs. Crouch Die While Taking Polyjuice Potion?"],
                        ["10", "How come the Federation did not routinely use projectile weapons against the Borg?"],
                        ["11", "Why are there no non-humanoid Borg?"],
                        ["12", "Is the first season of Heroes self-contained?"],
                        ["13", "Can someone explain the scene where Tony Stark first meets Brandt?"],
                        ["14", "How did Snape find Harry and Hermione in the Forest of Dean?"],
                        ["15", "Does River Song ever get jealous of the Doctor’s female companions?"],
                        ["16", "How fast does an alien grow in the Alien movies?"],
                        ["17", "Which was the first story featuring a psychic being unable to control their own powers and destroying their surroundings with them?"],
                        ["18", "Short story where a man discovers he's living in a fake world"],
                        ["19", "Are the shields and hull of the USS Enterprise-D capable of withstanding a direct nuclear strike?"],
                        ["20", "Is The Spirit of Carnan possibly an Entwife?"],
                        ],columns=['qid','query'])

In [10]:
# Tokenizer for the queries
tokenizer = pt.autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()
# Function to tokenize
def strip_markup(text):
    return " ".join(tokenizer.getTokens(text))

In [11]:
# Tokenizes queries and updates the dataframe
queries = pt.apply.query(lambda r: strip_markup(r.query))(queries)
queries = queries.drop(columns = ["query_0"])
queries

Unnamed: 0,qid,query
0,1,what was the purpose of what happened to trip ...
1,2,sometimes zombies are flammable while other ti...
2,3,what is the positronic brain coined by asimov
3,4,was howland reed the knight of the laughing tree
4,5,why didn t the basilisk bite in chamber of sec...
5,6,what influenced the far future setting of viri...
6,7,why don t zombies eat each other
7,8,when did dumbledore realize that voldemort was...
8,9,why did mrs crouch die while taking polyjuice ...
9,10,how come the federation did not routinely use ...


## Step 1. Retrieval with PyTerrier 

### TF-IDF

In [12]:
# TF-IDF model of retrieval. Saves results into a trec formatted tsv
tfidf = pt.BatchRetrieve(indexref1, num_results = 1000, wmodel="TF_IDF").transform(queries)
pt.io.write_results(tfidf, "tfidf_ranked.tsv", format='trec')
tfidf

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,3492,13530,0,17.106307,what was the purpose of what happened to trip ...
1,1,13813,57008,1,14.329058,what was the purpose of what happened to trip ...
2,1,108313,84005,2,12.191978,what was the purpose of what happened to trip ...
3,1,157909,185939,3,11.848492,what was the purpose of what happened to trip ...
4,1,75796,14084,4,11.269100,what was the purpose of what happened to trip ...
...,...,...,...,...,...,...
19995,20,96298,55996,995,4.214785,is the spirit of carnan possibly an entwife
19996,20,100228,66011,996,4.214785,is the spirit of carnan possibly an entwife
19997,20,53313,208926,997,4.212141,is the spirit of carnan possibly an entwife
19998,20,79026,20060,998,4.212141,is the spirit of carnan possibly an entwife


### BM25

In [13]:
# BM25 model of retrieval. Saves results into a trec formatted tsv
bm25 = pt.BatchRetrieve(indexref1, num_results = 1000, wmodel="BM25").transform(queries)
pt.io.write_results(bm25, "bm25_ranked.tsv", format='trec')
bm25

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,3492,13530,0,30.054742,what was the purpose of what happened to trip ...
1,1,13813,57008,1,25.273587,what was the purpose of what happened to trip ...
2,1,108313,84005,2,21.356063,what was the purpose of what happened to trip ...
3,1,157909,185939,3,20.706735,what was the purpose of what happened to trip ...
4,1,75796,14084,4,19.852036,what was the purpose of what happened to trip ...
...,...,...,...,...,...,...
19995,20,138390,146875,995,7.630147,is the spirit of carnan possibly an entwife
19996,20,179936,235621,996,7.630147,is the spirit of carnan possibly an entwife
19997,20,240,1725,997,7.626413,is the spirit of carnan possibly an entwife
19998,20,141570,152842,998,7.618700,is the spirit of carnan possibly an entwife


### PL2

In [14]:
# PL2 model of retrieval. Saves results into a trec formatted tsv
pl2 = pt.BatchRetrieve(indexref1, num_results = 1000, wmodel="PL2").transform(queries)
pt.io.write_results(pl2, "pl2_ranked.tsv", format='trec')
pl2

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,3492,13530,0,15.371058,what was the purpose of what happened to trip ...
1,1,13813,57008,1,13.103093,what was the purpose of what happened to trip ...
2,1,108313,84005,2,11.739899,what was the purpose of what happened to trip ...
3,1,158314,186749,3,10.651128,what was the purpose of what happened to trip ...
4,1,75796,14084,4,10.598365,what was the purpose of what happened to trip ...
...,...,...,...,...,...,...
19995,20,46112,182826,995,3.502263,is the spirit of carnan possibly an entwife
19996,20,148614,167616,996,3.493930,is the spirit of carnan possibly an entwife
19997,20,38000,155443,997,3.491454,is the spirit of carnan possibly an entwife
19998,20,61061,238025,998,3.491454,is the spirit of carnan possibly an entwife


### Reranking BM25

In [15]:
# Reranking BM25 results with TF-IDF. Saves result in tsv
tf_idf = pt.BatchRetrieve(indexref1, wmodel="TF_IDF")
bm25 = pt.BatchRetrieve(indexref1, num_results = 100, wmodel="BM25")
pipeline = (bm25 % 100) >> tf_idf
result = pipeline.transform(queries)
pt.io.write_results(result, "res_reranked.tsv", format='trec')
result

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,3492,13530,0,17.106307,what was the purpose of what happened to trip ...
1,1,13813,57008,1,14.329058,what was the purpose of what happened to trip ...
2,1,108313,84005,2,12.191978,what was the purpose of what happened to trip ...
3,1,157909,185939,3,11.848492,what was the purpose of what happened to trip ...
4,1,75796,14084,4,11.269100,what was the purpose of what happened to trip ...
...,...,...,...,...,...,...
1995,20,63420,245785,95,6.396089,is the spirit of carnan possibly an entwife
1996,20,142149,153931,96,6.379810,is the spirit of carnan possibly an entwife
1997,20,162162,194511,97,6.374303,is the spirit of carnan possibly an entwife
1998,20,57198,224373,98,6.359151,is the spirit of carnan possibly an entwife


## Step 2. Retrieval Result Fusion

In [16]:
# Fusion between TF-IDF and BM25 models. Saves results in tsv
run_1 = Run.from_file("tfidf_ranked.tsv", kind="trec")
run_2 = Run.from_file("bm25_ranked.tsv", kind="trec")

fused = fuse(
runs=[run_1, run_2],
norm="min-max", # Default normalization strategy
method="mnz",
)

fused.save("fused.tsv", kind="trec")

## Step 3. (Extra) Weighted Combination

In [17]:
# Weighted combination between BM25 results and TF-IDF results, giving BM25 a weighting of 0.75 and TF-IDF a weighting of 0.25. Saves results in tsv
bm25 = pt.BatchRetrieve(indexref1, wmodel="BM25") >> pt.pipelines.PerQueryMaxMinScoreTransformer()
tfidf = pt.BatchRetrieve(indexref1, wmodel="TF_IDF") >> pt.pipelines.PerQueryMaxMinScoreTransformer()

linear = 0.75 * bm25 + 0.25 * tfidf

comb_res = linear.transform(queries)

pt.io.write_results(comb_res, "weightedComb.tsv", format='trec')

## Step 4: Query Expansion 

### RM3 Expansion

In [18]:
# Query expansion using RM3 using the BM25 model
pipe = (pt.BatchRetrieve(indexref1, num_results = 1000, wmodel="BM25").transform(queries) >> 
    pt.rewrite.RM3(indexref1) >> 
    pt.BatchRetrieve(indexref1, num_results = 1000, wmodel="BM25").transform(queries))

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [19]:
# Pipe that is used for retrieval
pipe

ComposedPipeline(ComposedPipeline(      qid   docid   docno  rank      score  \
0       1    3492   13530     0  30.054742   
1       1   13813   57008     1  25.273587   
2       1  108313   84005     2  21.356063   
3       1  157909  185939     3  20.706735   
4       1   75796   14084     4  19.852036   
...    ..     ...     ...   ...        ...   
19995  20  138390  146875   995   7.630147   
19996  20  179936  235621   996   7.630147   
19997  20     240    1725   997   7.626413   
19998  20  141570  152842   998   7.618700   
19999  20  184188  245186   999   7.616779   

                                                   query  
0      what was the purpose of what happened to trip ...  
1      what was the purpose of what happened to trip ...  
2      what was the purpose of what happened to trip ...  
3      what was the purpose of what happened to trip ...  
4      what was the purpose of what happened to trip ...  
...                                                  ...  


In [20]:
# Gets results for queries and saves it in tsv
pipe = pipe.transform(queries)
pt.io.write_results(pipe, "queryexpansion.tsv", format='trec')
print(pipe)

      qid                                            query_0   docid   docno  \
0       1  what was the purpose of what happened to trip ...    3492   13530   
1       1  what was the purpose of what happened to trip ...   13813   57008   
2       1  what was the purpose of what happened to trip ...  108313   84005   
3       1  what was the purpose of what happened to trip ...  157909  185939   
4       1  what was the purpose of what happened to trip ...   75796   14084   
...    ..                                                ...     ...     ...   
19995   9  why did mrs crouch die while taking polyjuice ...   63250  245232   
19996   9  why did mrs crouch die while taking polyjuice ...  179853  235448   
19997   9  why did mrs crouch die while taking polyjuice ...  123496  115066   
19998   9  why did mrs crouch die while taking polyjuice ...  130420  129471   
19999   9  why did mrs crouch die while taking polyjuice ...  141401  152521   

       rank      score                 

## Step 5: Testing

In [21]:
# The qrel file that will be used for evaluation
qrel = Qrels.from_file("qrel.tsv", kind="trec")

### Evaluation

#### TF-IDF

In [22]:
run1 = Run.from_file("tfidf_ranked.tsv", kind="trec")
evaluate(qrel, run1, ["precision@5", "ndcg@5"])

{'precision@5': 0.7500000000000001, 'ndcg@5': 0.9217616932194987}

#### BM25

In [23]:
run2 = Run.from_file("bm25_ranked.tsv", kind="trec")
evaluate(qrel, run2, ["precision@5", "ndcg@5"])

{'precision@5': 0.7500000000000001, 'ndcg@5': 0.9225649558842776}

#### PL2

In [24]:
run3 = Run.from_file("pl2_ranked.tsv", kind="trec")
evaluate(qrel, run3, ["precision@5", "ndcg@5"])

{'precision@5': 0.72, 'ndcg@5': 0.9010484503746203}

#### Reranked BM25

In [25]:
run4 = Run.from_file("res_reranked.tsv", kind="trec")
evaluate(qrel, run4, ["precision@5", "ndcg@5"])

{'precision@5': 0.7500000000000001, 'ndcg@5': 0.9217616932194987}

#### Fusion of TF-IDF and BM25

In [26]:
run5 = Run.from_file("fused.tsv", kind="trec")
evaluate(qrel, run5, ["precision@5", "ndcg@5"])

{'precision@5': 0.7500000000000001, 'ndcg@5': 0.9232648079136764}

#### Weighted Combination of BM25 and TF-IDF

In [27]:
run6 = Run.from_file("weightedComb.tsv", kind="trec")
evaluate(qrel, run6, ["precision@5", "ndcg@5"])

{'precision@5': 0.7500000000000001, 'ndcg@5': 0.9232648079136764}

#### Query Expansion on BM25

In [28]:
run7 = Run.from_file("queryexpansion.tsv", kind="trec")
evaluate(qrel, run7, ["precision@5", "ndcg@5"])

{'precision@5': 0.7500000000000001, 'ndcg@5': 0.9225649558842776}

### Per query precision for PL2 and CombMNZ fusion

In [29]:
# PL2
evaluate(qrel, run3, ["precision@5", "ndcg@5"], return_mean = False)

{'precision@5': array([0.4, 1. , 1. , 0.2, 0.6, 0.6, 1. , 0.6, 0.4, 0.4, 0.8, 0.8, 0.2,
        1. , 1. , 1. , 0.6, 1. , 0.8, 1. ]),
 'ndcg@5': array([0.65082052, 0.90925698, 1.        , 1.        , 0.90822094,
        0.93604034, 1.        , 0.879078  , 0.77634337, 0.91972079,
        0.79576075, 1.        , 1.        , 0.84217577, 0.9920466 ,
        0.97946536, 0.87028764, 0.84960741, 0.77774708, 0.93439746])}

In [30]:
# CombMNZ
evaluate(qrel, run5, ["precision@5", "ndcg@5"], return_mean = False)

{'precision@5': array([0.6, 1. , 1. , 0.2, 0.8, 0.6, 1. , 0.8, 0.6, 0.4, 0.8, 0.8, 0.2,
        1. , 1. , 1. , 0.8, 0.8, 0.6, 1. ]),
 'ndcg@5': array([0.85292787, 1.        , 1.        , 1.        , 1.        ,
        0.93604034, 1.        , 1.        , 0.97785851, 0.91972079,
        0.79576075, 1.        , 1.        , 0.74261428, 1.        ,
        0.88549505, 1.        , 0.67701174, 0.74346937, 0.93439746])}

### Significance Test for PL2 and fusion of TFIDF and BM25

In [31]:
# Compares PL2 and fused runs and performs statistical tests
report = compare(
    qrels = qrel,
    runs = [run3,run5],
    metrics=["precision@5", "ndcg@5", "map@100"],
    max_p=0.01,  # P-value threshold
    rounding_digits=3,
)
print(report)

#    Model        P@5    NDCG@5    MAP@100
---  ---------  -----  --------  ---------
a    pyterrier   0.72     0.901      0.875
b    comb_mnz    0.75     0.923      0.902
