# Assignment 1 Setup Index

In [1]:
import os
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
import json
import numpy as np
import math
from elasticsearch7 import Elasticsearch
from tqdm import tqdm

In [4]:
# Path to the directory containing files
DIRECTORY_PATH = 'ap89_collection/'
QUERY_PATH = 'AP_DATA/query_desc.51-100.short.txt'
INDEX_NAME = 'ap89_collection_v1'
STOP_WORDS_PATH = 'AP_DATA/stoplist.txt'
DEFAULT_HEADER = {'Content-Type': 'application/json'}
ES_URL = "http://localhost:9200/"

es = Elasticsearch(ES_URL)

# Task 2 : Document Indexing

Create an index of the downloaded corpus. The documents are found within the ap89_collection folder in the data .zip file. You will need to write a program to parse the documents and send them to your elasticsearch instance.

The corpus files are in a standard format used by TREC. Each file contains multiple documents. The format is similar to XML, but standard XML and HTML parsers will not work correctly. Instead, read the file one line at a time with the following rules:

1. Each document begins with a line containing <DOC> and ends with a line containing </DOC>.
2. The first several lines of a document’s record contain various metadata. You should read the <DOCNO> field and use it as the ID of the document.
3. The document contents are between lines containing <TEXT> and </TEXT>.
4. All other file contents can be ignored.

Use elasticsearch API to retrieve values like term frequency and term positions within a document. You will need such values to score documents using the retrieval models listed below.

In [402]:
import os
import pandas as pd
import re


data = []

for filename in os.listdir(DIRECTORY_PATH):
    file_path = os.path.join(DIRECTORY_PATH, filename)

    with open(file_path, 'r', encoding='ISO-8859-1', errors='ignore') as file:
        content = file.read()

    # Use regular expression to extract each document
    docs = re.findall(r'<DOC>.*?</DOC>', content, flags=re.DOTALL)

    # Extract metadata and text for each document
    for doc in docs:
        docno_match = re.search(r'<DOCNO>(.*?)</DOCNO>', doc)
        fileid_match = re.search(r'<FILEID>(.*?)</FILEID>', doc)
        first_match = re.search(r'<FIRST>(.*?)</FIRST>', doc)
        second_match = re.search(r'<SECOND>(.*?)</SECOND>', doc)
        head_matches = re.findall(r'<HEAD>(.*?)</HEAD>', doc)
        byline_match = re.search(r'<BYLINE>(.*?)</BYLINE>', doc)
        dateline_match = re.search(r'<DATELINE>(.*?)</DATELINE>', doc)
        text_match = re.search(r'<TEXT>(.*?)</TEXT>', doc, flags=re.DOTALL)

        # Create a dictionary for each document
        doc_data = {
            'DOCNO': docno_match.group(1) if docno_match else None,
            'FILEID': fileid_match.group(1) if fileid_match else None,
            'FIRST': first_match.group(1) if first_match else None,
            'SECOND': second_match.group(1) if second_match else None,
            'HEAD': head_matches if head_matches else None,
            'BYLINE': byline_match.group(1) if byline_match else None,
            'DATELINE': dateline_match.group(1) if dateline_match else None,
            'TEXT': text_match.group(1) if text_match else None
        }
        data.append(doc_data)


df = pd.DataFrame(data)
df


Unnamed: 0,DOCNO,FILEID,FIRST,SECOND,HEAD,BYLINE,DATELINE,TEXT
0,AP891220-0001,AP-NR-12-20-89 2343EST,u i BC-Panama-Resistance 12-20 0723,"BC-Panama-Resistance,0746","[Defense Forces, Noriega Resist; Loyalty Remai...",By ELOY O. AGUILAR,"PANAMA CITY, Panama (AP)",\n Instead of collapsing when the United\nSt...
1,AP891220-0002,AP-NR-12-20-89 0031EST,r a AM-RollingStones 12-20 0372,"AM-Rolling Stones,0385",[Stones Pay Per View Concert Tops North Americ...,By HENRY STERN,"ATLANTIC CITY, N.J. (AP)",\n The Rolling Stones capped the\nnext-to-la...
2,AP891220-0003,AP-NR-12-20-89 0038EST,r a PM-Christmas-Jesus'KinI 12-20 1240,"PM-Christmas-Jesus' Kin I,1277",[Part I: Jesus Descended From Checkered Ancest...,By GEORGE W. CORNELL,,\n This first installment of a three-part Ch...
3,AP891220-0004,AP-NR-12-20-89 0039EST,r i PM-SAfrica-CoupTrial 12-20 0176,"PM-SAfrica-Coup Trial,0178",[Leader Of Failed Homeland Coup Gets 18-Year P...,,"JOHANNESBURG, South Africa (AP)",\n A soldier who led a failed\ncoup attempt ...
4,AP891220-0005,AP-NR-12-20-89 0039EST,r a PM-RobinsonProfile 12-20 0558,"PM-Robinson Profile,0576","[Savannah Leaders Praise Slain Alderman, Eds: ...",By LAURAN NEERGAARD,"SAVANNAH, Ga. (AP)","\n Robert Robinson, the alderman and lawyer\..."
...,...,...,...,...,...,...,...,...
84674,AP890726-0308,AP-NR-07-26-89 1933EDT,u f AM-Psychiatrist-InsiderTrading 07-26 0312,"AM-Psychiatrist-Insider Trading,0322",[Psychiatrist Charged With Using Inside Inform...,By VERA HALLER,NEW YORK (AP),\n A psychiatrist was indicted Wednesday on\...
84675,AP890726-0309,AP-NR-07-26-89 1937EDT,u f AM-ComputerVirus 2ndLd-Writethru f0258 07-...,"AM-Computer Virus, 2nd Ld-Writethru, f0258,0664",[Grand Jury Indicts Cornell Student on Compute...,By JAMES ROWLEY,WASHINGTON (AP),\n A Cornell University graduate student was...
84676,AP890726-0310,AP-NR-07-26-89 2131EDT,u f AM-Japan-Markets 07-26 0275,"AM-Japan-Markets,0285","[Dollar Falls, Stock Prices Move For New High ...",,TOKYO (AP),\n The U.S. dollar fell in early trading aga...
84677,AP890726-0311,AP-NR-07-26-89 2223EDT,r f AM-InteriorBill 3rdLd-Writethru f0329 07-2...,"AM-Interior Bill, 3rd Ld-Writethru, f0329,0762",[Senate Mulls Spending Bill Easing Ocean Drill...,By ALAN FRAM,WASHINGTON (AP),\n The Senate approved legislation on Wednes...


In [6]:
import string
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

# Open stopwords file
with open(STOP_WORDS_PATH, 'r') as file:
    stoplist = file.read().splitlines()

# Preprocess the text
def preprocess_text(text, stoplist):
    # Remove punctuation from text
    text = re.sub(r'[^\w\s]', ' ', text)
    tokens = word_tokenize(text)
    tokens = [token.strip().lower() for token in tokens]
    tokens = [token for token in tokens if token not in stoplist]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    text = ' '.join(tokens)
    return text

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [403]:
def token_lengths(text):
    tokens = word_tokenize(text)
    return len(tokens)

df['PROCESSED_TEXT'] = df['TEXT'].parallel_apply(preprocess_text, args=(stoplist,))
df['DOCNO'] = df['DOCNO'].str.strip()
df.head()

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=8468), Label(value='0 / 8468'))), …

Unnamed: 0,DOCNO,FILEID,FIRST,SECOND,HEAD,BYLINE,DATELINE,TEXT,PROCESSED_TEXT
0,AP891220-0001,AP-NR-12-20-89 2343EST,u i BC-Panama-Resistance 12-20 0723,"BC-Panama-Resistance,0746","[Defense Forces, Noriega Resist; Loyalty Remai...",By ELOY O. AGUILAR,"PANAMA CITY, Panama (AP)",\n Instead of collapsing when the United\nSt...,collaps unit state threw militari gen manuel a...
1,AP891220-0002,AP-NR-12-20-89 0031EST,r a AM-RollingStones 12-20 0372,"AM-Rolling Stones,0385",[Stones Pay Per View Concert Tops North Americ...,By HENRY STERN,"ATLANTIC CITY, N.J. (AP)",\n The Rolling Stones capped the\nnext-to-la...,roll stone cap show 32 citi north american tou...
2,AP891220-0003,AP-NR-12-20-89 0038EST,r a PM-Christmas-Jesus'KinI 12-20 1240,"PM-Christmas-Jesus' Kin I,1277",[Part I: Jesus Descended From Checkered Ancest...,By GEORGE W. CORNELL,,\n This first installment of a three-part Ch...,instal three part christma seri rel jesu deal ...
3,AP891220-0004,AP-NR-12-20-89 0039EST,r i PM-SAfrica-CoupTrial 12-20 0176,"PM-SAfrica-Coup Trial,0178",[Leader Of Failed Homeland Coup Gets 18-Year P...,,"JOHANNESBURG, South Africa (AP)",\n A soldier who led a failed\ncoup attempt ...,soldier led fail coup attempt south africa s n...
4,AP891220-0005,AP-NR-12-20-89 0039EST,r a PM-RobinsonProfile 12-20 0558,"PM-Robinson Profile,0576","[Savannah Leaders Praise Slain Alderman, Eds: ...",By LAURAN NEERGAARD,"SAVANNAH, Ga. (AP)","\n Robert Robinson, the alderman and lawyer\...",robert robinson alderman lawyer assassin mail ...


In [5]:
es.ping()

True

In [406]:
configurations = {
    "settings" : {
        "number_of_shards": 1,
        "number_of_replicas": 1,
        "analysis": {
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords_path": "my_stoplist.txt"
                }
            },
            "analyzer": {
                "stopped": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "english_stop"
                    ]
                }
            }
      }
    },
    "mappings": {
        "properties": {
            "content": {
                "type": "text",
                "fielddata": True,
                "analyzer": "stopped",
                "index_options": "positions"
            }
        }
    }
}

es.indices.create(index=INDEX_NAME, body=configurations)

def add_data(id, text):
    es.index(index=INDEX_NAME, id=id, document={'content': text})

# Loop through all documents
for index, row in df.iterrows():
    # Add document to the index
    add_data(row['DOCNO'], row['PROCESSED_TEXT'])


  es.indices.create(index=INDEX_NAME, body=configurations)


In [None]:
# es.indices.delete(index=INDEX_NAME)

{'acknowledged': True}

# Task3: Query execution
Write a program to run the queries in the file query_desc.51-100.short.txt, included in the data .zip file. You should run all queries (omitting the leading number) using each of the retrieval models listed below, and output the top 1000 results for each query to an output file. If a particular query has fewer than 1000 documents with a nonzero matching score, then just list whichever documents have nonzero scores.

You should write precisely one output file per retrieval model. Each line of an output file should specify one retrieved document, in the following format:

In [7]:
def read_queries(path):
    queries = {}
    with open(QUERY_PATH, 'r') as file:
        content = file.readlines()
        # Extract query id and query text
        for line in content:
            query_id, query_text = line.split('.', 1)
            queries[query_id] = {
                'original_query': query_text,
                'processed_query': word_tokenize(preprocess_text(query_text, stoplist))
            }
    return queries

queries = read_queries(QUERY_PATH)
queries

{'85': {'original_query': '   Document will discuss allegations, or measures being taken against, corrupt public officials of any governmental jurisdiction worldwide.   \n',
  'processed_query': ['alleg', 'corrupt', 'public', 'offici']},
 '59': {'original_query': '   Document will report a type of weather event which has directly caused at least one fatality in some location.   \n',
  'processed_query': ['weather', 'least', 'fatal', 'locat']},
 '56': {'original_query': '   Document will include a prediction about the prime lending rate, or will report an actual prime rate move.   \n',
  'processed_query': ['prime', 'lend', 'rate', 'prime', 'rate']},
 '71': {'original_query': '   Document will report incursions by land, air, or water into the border area of one country by military forces of a second country or a guerrilla group based in a second country.   \n',
  'processed_query': ['incurs',
   'border',
   'area',
   'militari',
   'forc',
   'second',
   'guerrilla',
   'second']},
 

In [9]:
def fetch_all_document_ids(path="AP_DATA/doclist_new_0609.txt"):
    with open(path, 'r') as file:
        all_document_ids = []
        for line in file:
            all_document_ids.append(line.strip().split(' ')[2])
    return all_document_ids

def fetch_all_term_vectors(document_ids):
    document_to_vectors = {}
    document_to_length = {}
    term_document_frequency = {}
    term_total_term_frequency = {}
    vocabulary_words = set()

    def fetch_term_vectors(id_batch):

        vectors = es.mtermvectors(index=INDEX_NAME, term_statistics = True, ids=id_batch, fields='content')

        for tv in vectors['docs']:
            document_id = tv['_id']

            if 'content' not in tv['term_vectors']:
                document_to_vectors[document_id] = {}
                document_to_length[document_id] = 0
            else:
                terms = tv['term_vectors']['content']['terms']

                for term in terms:
                    term_document_frequency[term] = terms[term]['doc_freq']
                    term_total_term_frequency[term] = terms[term]['ttf']

                document_to_vectors[document_id] = terms
                document_to_length[document_id] = sum([terms[x]['term_freq'] for x in terms])
                for term in terms:
                    vocabulary_words.add(term)

    batch_size = 250
    for i in tqdm(range(0, len(document_ids), batch_size)):
        id_batch = document_ids[i:i + batch_size]
        fetch_term_vectors(id_batch)

    return document_to_vectors, document_to_length, term_document_frequency, term_total_term_frequency, vocabulary_words

all_document_ids = fetch_all_document_ids()
document_to_vectors, document_to_length, term_document_frequency, term_total_term_frequency, vocabulary_words = fetch_all_term_vectors(all_document_ids)

number_of_documents = len(all_document_ids)
average_document_length = sum([document_to_length[d] for d in all_document_ids]) / number_of_documents
total_document_length = sum([document_to_length[d] for d in all_document_ids])
vocabulary_size = len(vocabulary_words)


100%|██████████| 339/339 [05:23<00:00,  1.05it/s]


In [10]:
def get_doc_freq(term):
    if term in term_document_frequency:
        return term_document_frequency[term]
    else:
        return 0

def get_term_freq(term, document_id):
    if document_id in document_to_vectors and term in document_to_vectors[document_id]:
        return document_to_vectors[document_id][term]['term_freq']
    else:
        return 0

def get_word_freq_in_query(term, query):
    return query.count(term)

### ES built-in 
Use ES query with the API "match"{"body_text":"query keywords"}. This should be somewhat similar to BM25 scoring.

In [9]:
es.ping()

True

In [10]:
def es_built_in_search(queries, size=1000):
    scores = {}

    for query_id, query in queries.items():
        query = query['processed_query']

        res = es.search(index=INDEX_NAME, body={
            "query": {
                "match": {
                    "content": " ".join(query)
                }
            },
            "size": size
        })

        hits = res['hits']['hits']
        scores[query_id] = []
        for hit in hits:
            scores[query_id].append((hit['_id'], hit['_score']))

    return scores

es_built_in_search(queries)

  res = es.search(index=INDEX_NAME, body={


{'85': [('AP890220-0143', 14.12233),
  ('AP890108-0030', 14.116497),
  ('AP890107-0129', 13.96368),
  ('AP890516-0072', 13.818281),
  ('AP890517-0182', 13.717943),
  ('AP890125-0007', 13.654452),
  ('AP890516-0158', 13.648869),
  ('AP890518-0050', 13.523772),
  ('AP890122-0062', 13.10289),
  ('AP891111-0095', 13.092963),
  ('AP890131-0078', 12.932965),
  ('AP890503-0199', 12.922917),
  ('AP890223-0032', 12.826558),
  ('AP890819-0056', 12.751603),
  ('AP890622-0096', 12.732622),
  ('AP890112-0180', 12.686827),
  ('AP891013-0120', 12.65733),
  ('AP890522-0129', 12.646246),
  ('AP890915-0075', 12.636365),
  ('AP890414-0097', 12.627297),
  ('AP890525-0042', 12.622875),
  ('AP890615-0143', 12.608652),
  ('AP890417-0030', 12.60383),
  ('AP890704-0150', 12.483531),
  ('AP890710-0011', 12.483531),
  ('AP891023-0145', 12.422465),
  ('AP891002-0245', 12.37619),
  ('AP890620-0122', 12.353137),
  ('AP890315-0010', 12.339736),
  ('AP891208-0002', 12.313621),
  ('AP890315-0167', 12.206216),
  ('AP89

### Okapi TF
This is a vector space model using a slightly modified version of TF to score documents.

In [11]:
def okapi_tf(tf_wd, curr_doc_len, avg_doc_len):
    return tf_wd / (tf_wd + 0.5 + (1.5 * (curr_doc_len / avg_doc_len)))

def okapi_score(document, query):
    curr_doc_len = document_to_length[document]
    score = 0
    for term in query:
        tf_wd = get_term_freq(term, document)
        score += okapi_tf(tf_wd, curr_doc_len, average_document_length)
    return score

### TF-IDF

This is the second vector space model

In [12]:
def tf_idf_score(document_id, query):
    curr_doc_len = document_to_length[document_id]
    tf_idf_score = 0
    for word in query:
        tf_wd = get_term_freq(word, document_id)
        doc_freq = get_doc_freq(word)
        if doc_freq == 0:
            continue
        tf_idf_score += okapi_tf(tf_wd, curr_doc_len, average_document_length) * math.log(number_of_documents / doc_freq)

    return tf_idf_score

### Okapi BM25
BM25 is a language model based on a binary independence model.

In [13]:
def bm25_score(document_id, query):
    k1 = 1.2
    k2 = 100
    b = 0.75
    score = 0
    curr_doc_len = document_to_length[document_id]
    for word in query:
        tf_wd = get_term_freq(word, document_id)
        doc_freq = get_doc_freq(word)
        tf_wq = get_word_freq_in_query(word, query)
        
        subscore = np.log((number_of_documents + 0.5) / (doc_freq + 0.5))
        subscore *= (tf_wd + k1 * tf_wd) / (tf_wd + k1 * ((1 - b) + b * (curr_doc_len / average_document_length)))
        subscore *= (tf_wq + k2 * tf_wq) / (tf_wq + k2)
        score += subscore
        
    return score

### Unigram LM with Laplace smoothing
This is a language model with Laplace (“add-one”) smoothing. We will use maximum likelihood estimates of the query based on a multinomial model “trained” on the document.

In [38]:
def p_laplace(word, document, tf_wd):
    return (tf_wd + 1) / (document_to_length[document] + vocabulary_size)

def lm_laplace(document, query):
    score = 0
    for word in query:
        tf_wd = get_term_freq(word, document)
        if tf_wd == 0:
            score -= 100
        else:
            score += np.log(p_laplace(word, document, tf_wd))
    return score

## Unigram LM with Jelinek-Mercer smoothing

This is a similar language model, except that here we smooth a foreground document language model with a background model from the entire corpus.



In [39]:
def p_jm(word, document, tf_wd, lambda_jm=0.98):
    p_mle = tf_wd / document_to_length[document]
    ttf = term_total_term_frequency[word]
    return lambda_jm * p_mle + (1 - lambda_jm) * ttf / total_document_length

def lm_jm(document, query):
    score = 0
    for word in query:
        tf_wd = get_term_freq(word, document)
        if tf_wd == 0:
            score -= 100
        else:
            score += np.log(p_jm(word, document, tf_wd))
    return score

## Task4: Evaluation

A) Compare manually the top 10 docs returned by ESBuilt-In, TFIDF, BM25, LMLaplace, for 5 queries specified by TAs. Explain or speculate on the reasons for differences in the rankings

In [40]:
retrieval_models = ['ES', 'OKAPI', 'TFIDF', 'BM25', 'LMLAPLACE', 'LMJM' ]

def master_score(document, query, model):
    if model == 'OKAPI':
        return okapi_score(document, query)
    elif model == 'TFIDF':
        return tf_idf_score(document, query)
    elif model == 'BM25':
        return bm25_score(document, query)
    elif model == 'LMLAPLACE':
        return lm_laplace(document, query)
    elif model == 'LMJM':
        return lm_jm(document, query)

def calculate_scores(queries, models=retrieval_models):
    scores = {}
    for model in models:
        print('Calculating scores for', model)
        if model == 'ES':
            scores[model] = es_built_in_search(queries)
        else:
            scores[model] = {}
            for query_id, query in queries.items():
                query = query['processed_query']
                scores[model][query_id] = []
                for document in all_document_ids:
                    if document_to_length[document] == 0:
                        continue
                    scores[model][query_id].append((document, master_score(document, query, model)))
    
            for query_id, query in queries.items():
                scores[model][query_id].sort(key=lambda x: x[1], reverse=True)

    return scores

scores = calculate_scores(queries)

Calculating scores for ES


  res = es.search(index=INDEX_NAME, body={


Calculating scores for OKAPI
Calculating scores for TFIDF
Calculating scores for BM25
Calculating scores for LMLAPLACE
Calculating scores for LMJM


In [41]:
for model in retrieval_models:
    with open('results_' + model + '.txt', 'w') as file:
        for query_id, query_scores in scores[model].items():
            for i, (document, score) in enumerate(query_scores):
                file.write(query_id + ' Q0 ' + document + ' ' + str(i + 1) + ' ' + str(score) + ' ' + 'Exp' + '\n')
                if i == 1000:
                    break

In [42]:
print("ES Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_ES.txt

print("--------------------------------------------")
print("Okapi Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_OKAPI.txt

print("--------------------------------------------")
print("TFIDF Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_TFIDF.txt

print("--------------------------------------------")
print("BM25 Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_BM25.txt

print("--------------------------------------------")
print("LMLAPLACE Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_LMLAPLACE.txt

print("--------------------------------------------")
print("LMJM Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_LMJM.txt

ES Retrieval Model
Error due to 25

Queryid (Num):       25
Total number of documents over all queries
    Retrieved:    25000
    Relevant:      1832
    Rel_ret:       1319
Interpolated Recall - Precision Averages:
    at 0.00       0.7125
    at 0.10       0.5369
    at 0.20       0.4596
    at 0.30       0.3748
    at 0.40       0.3240
    at 0.50       0.2838
    at 0.60       0.2426
    at 0.70       0.2161
    at 0.80       0.1694
    at 0.90       0.0766
    at 1.00       0.0249
Average precision (non-interpolated) for all rel docs(averaged over queries)
                  0.2930
Precision:
  At    5 docs:   0.4480
  At   10 docs:   0.4280
  At   15 docs:   0.3947
  At   20 docs:   0.3800
  At   30 docs:   0.3627
  At  100 docs:   0.2424
  At  200 docs:   0.1704
  At  500 docs:   0.0933
  At 1000 docs:   0.0528
R-Precision (precision after R (= num_rel for a query) docs retrieved):
    Exact:        0.3186
--------------------------------------------
Okapi Retrieval Model
Error 

In [31]:
def get_top_n_docs(model, query_id, n=10):
    print("The given model is: ", model)
    print(f"Query {query_id} : {queries[query_id]['processed_query']}")
    print(f"Top {n} documents are: ")

    for i in range(n):
        print(f"\tRank {i+1}. {scores[model][query_id][i][0]} with score {scores[model][query_id][i][1]}")
        # get the document
        doc = es.get(index=INDEX_NAME, id=scores[model][query_id][i][0])
        print(doc['_source']['content'])

In [32]:
for model in retrieval_models:
    get_top_n_docs(model, '85')

The given model is:  ES
Query 85 : ['alleg', 'corrupt', 'public', 'offici']
Top 10 documents are: 
	Rank 1. AP890220-0143 with score 14.12233
bomb explod offic interior justic minist enriqu ortez colindr monday order polic occup prison offici accus corrupt injuri report four stick dynamit explod 7 10 p m damag build shatter window honduran telecommun co build street polic sourc immedi claim responsibl polic sunday occupi nation s largest prison govern accus prison offici corrupt drug traffick tri foment riot 1 500 inmat offici monday public secur forc statement detect danger escal sedit riot occur today monday alleg particip offici central penitentiari tegucigalpa capit prison offici highli danger delinqu plan riot massiv escap prison detriment nation secur tranquil ortez colindr told associ press temporarili suspend prison director marco tulio mendieta ortez colindr name maj david mendoza garcia school prison administr argentina facil s act director polic statement investig detect ele

# Task5: Pseudo-relevance Feedback (MS students only)
Pseudo-relevance Feedback
Implement pseudo-relevance feedback. The general algorithm is:

- Retrieve the top k documents using one of the above retrieval models.
- Identify terms in these documents which are distinctive to the documents.
- Add the terms to the query, and re-run the retrieval process. Return the final results.
- It is up to you to devise a reasonable way to choose terms to add to the query. It doesn’t have to be complicated, but you should be prepared to explain and justify your approach.

Evaluate your results using trec_eval and include similar metrics with your submission.

In [44]:
def get_top_n_terms(document_id, n=10):
    # calculate the tfidf to get the distinctive terms
    tfidf = {}
    for term in document_to_vectors[document_id]:
        tf = get_term_freq(term, document_id)
        idf = math.log(number_of_documents / get_doc_freq(term))
        tfidf[term] = tf * idf
    return sorted(tfidf.items(), key=lambda x: x[1], reverse=True)[:n]
    
def pseudo_relevance_feedback(model, k=5, n=1):
    print('Running pseudo relevance feedback for', model)
    new_queries = {}
    for query_id, query in queries.items():
        query = query['processed_query']
        top_k_docs = [x[0] for x in scores[model][query_id][:k]]
        new_query = query.copy()
        for doc in top_k_docs:
            doc_terms = document_to_vectors[doc]
            for i, term in enumerate(doc_terms):
                # Get the top n terms from the document
                top_n_terms = get_top_n_terms(doc, n)
                for term in top_n_terms:
                    if term[0] not in new_query:
                        new_query.append(term[0])
        new_queries[query_id] = {"processed_query" : new_query}
    return new_queries


retrieval_models = ['ES', 'OKAPI', 'TFIDF', 'BM25', 'LMLAPLACE', 'LMJM' ]


new_queries_pr = pseudo_relevance_feedback('OKAPI')
new_scores_pr = calculate_scores(new_queries_pr, retrieval_models)

for model in retrieval_models:
    new_queries_pr[model] = pseudo_relevance_feedback(model)
    new_scores_pr[model] = calculate_scores(new_queries_pr[model], [model])
    with open('results_relevance_' + model + '.txt', 'w') as file:
        for query_id, query_scores in new_scores_pr[model][model].items():
            for i, (document, score) in enumerate(query_scores):
                file.write(query_id + ' Q0 ' + document + ' ' + str(i + 1) + ' ' + str(score) + ' ' + 'Exp' + '\n')
                if i == 1000:
                    break

Running pseudo relevance feedback for OKAPI
Calculating scores for ES


  res = es.search(index=INDEX_NAME, body={


Calculating scores for OKAPI
Calculating scores for TFIDF
Calculating scores for BM25
Calculating scores for LMLAPLACE
Calculating scores for LMJM
Running pseudo relevance feedback for ES
Calculating scores for ES
Running pseudo relevance feedback for OKAPI
Calculating scores for OKAPI
Running pseudo relevance feedback for TFIDF
Calculating scores for TFIDF
Running pseudo relevance feedback for BM25
Calculating scores for BM25
Running pseudo relevance feedback for LMLAPLACE
Calculating scores for LMLAPLACE
Running pseudo relevance feedback for LMJM
Calculating scores for LMJM


In [None]:
print("ES Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_relevance_ES.txt

print("--------------------------------------------")
print("Okapi Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_relevance_OKAPI.txt

print("--------------------------------------------")
print("TFIDF Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_relevance_TFIDF.txt

print("--------------------------------------------")
print("BM25 Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_relevance_BM25.txt

print("--------------------------------------------")
print("LMLAPLACE Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_relevance_LMLAPLACE.txt

print("--------------------------------------------")
print("LMJM Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_relevance_LMJM.txt

ES Retrieval Model
Error due to 25

Queryid (Num):       25
Total number of documents over all queries
    Retrieved:    25000
    Relevant:      1832
    Rel_ret:       1335
Interpolated Recall - Precision Averages:
    at 0.00       0.6593
    at 0.10       0.5203
    at 0.20       0.3784
    at 0.30       0.3270
    at 0.40       0.2968
    at 0.50       0.2527
    at 0.60       0.2136
    at 0.70       0.1834
    at 0.80       0.1438
    at 0.90       0.0643
    at 1.00       0.0170
Average precision (non-interpolated) for all rel docs(averaged over queries)
                  0.2618
Precision:
  At    5 docs:   0.3920
  At   10 docs:   0.3800
  At   15 docs:   0.3520
  At   20 docs:   0.3180
  At   30 docs:   0.3067
  At  100 docs:   0.2212
  At  200 docs:   0.1562
  At  500 docs:   0.0886
  At 1000 docs:   0.0534
R-Precision (precision after R (= num_rel for a query) docs retrieved):
    Exact:        0.2826
--------------------------------------------
Okapi Retrieval Model
Error 

## Pseudo-relevance Feedback using ElasticSearch aggs "significant terms"
Use ES API "significat terms" separately on each query term (stem root) to get a list of related words. The words you want to add to the query are:
    - related to more than one query term
    - not stopwords
    - high IDF
    - other tricks you might need in order to only get interesting words
Add few of these words to the query and rerun your models.

Below is an example of this API in Sense for query term "atom":
GET /ap_dataset/document/_search
{
    "query" : {
        "terms" : {"TEXT" : [ "atom" ]}
    },
    "aggregations" : {
        "significantCrimeTypes" : {
            "significant_terms" : {
              "field" : "TEXT"             
            }
        }
    },
    "size": 0
}

In [47]:
def get_significant_terms(term):
#     - related to more than one query term
#     - not stopwords
#     - high IDF
#     - other tricks you might need in order to only get interesting words
    res = es.search(index=INDEX_NAME, body={
        "query" : {
            "terms" : {"content" : [ term ]}
        },
        "aggregations" : {
            "significantCrimeTypes" : {
                "significant_terms" : {
                  "field" : "content"             
                }
            }
        },
        "size": 0
    })
    return res['aggregations']['significantCrimeTypes']['buckets']

def get_all_related_terms(query, n=5):
    related_terms = []
    for term in query:
        related_terms.extend(get_significant_terms(term))
    # Sort realted terms by score
    related_terms = sorted(related_terms, key=lambda x: x['score'], reverse=True)
    related_terms = [x for x in related_terms if x['key'] not in query]
    # add idf and sort by idf
    for term in related_terms:
        term['idf'] = math.log(number_of_documents / get_doc_freq(term['key']))
    related_terms = sorted(related_terms, key=lambda x: x['idf'], reverse=True)

    return related_terms[:n]

def pseudo_relevance_feedback_es(queries, n=3):
    new_queries = {}
    for query_id, query in queries.items():
        query = query['processed_query']
        new_query = query.copy()
        related_terms = get_all_related_terms(query, n)
        for term in related_terms:
            new_query.append(term['key'])
        new_queries[query_id] = {"processed_query" : new_query}
    return new_queries

new_queries_es = pseudo_relevance_feedback_es(queries)
new_scores_es = calculate_scores(new_queries_es, retrieval_models)

for model in retrieval_models:
    with open('results_relevance_es_aggs_' + model + '.txt', 'w') as file:
        for query_id, query_scores in new_scores_es[model].items():
            for i, (document, score) in enumerate(query_scores):
                file.write(query_id + ' Q0 ' + document + ' ' + str(i + 1) + ' ' + str(score) + ' ' + 'Exp' + '\n')
                if i == 1000:
                    break

  res = es.search(index=INDEX_NAME, body={


Calculating scores for ES


  res = es.search(index=INDEX_NAME, body={


Calculating scores for OKAPI
Calculating scores for TFIDF
Calculating scores for BM25
Calculating scores for LMLAPLACE
Calculating scores for LMJM


In [48]:
print("ES Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_relevance_es_aggs_ES.txt

print("--------------------------------------------")
print("Okapi Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_relevance_es_aggs_OKAPI.txt

print("--------------------------------------------")
print("TFIDF Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_relevance_es_aggs_TFIDF.txt

print("--------------------------------------------")
print("BM25 Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_relevance_es_aggs_BM25.txt

print("--------------------------------------------")
print("LMLAPLACE Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_relevance_es_aggs_LMLAPLACE.txt

print("--------------------------------------------")
print("LMJM Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_relevance_es_aggs_LMJM.txt

ES Retrieval Model
Error due to 25

Queryid (Num):       25
Total number of documents over all queries
    Retrieved:    25000
    Relevant:      1832
    Rel_ret:       1314
Interpolated Recall - Precision Averages:
    at 0.00       0.4991
    at 0.10       0.4097
    at 0.20       0.3649
    at 0.30       0.3166
    at 0.40       0.2764
    at 0.50       0.2387
    at 0.60       0.1985
    at 0.70       0.1749
    at 0.80       0.1354
    at 0.90       0.0714
    at 1.00       0.0235
Average precision (non-interpolated) for all rel docs(averaged over queries)
                  0.2284
Precision:
  At    5 docs:   0.3120
  At   10 docs:   0.3240
  At   15 docs:   0.3253
  At   20 docs:   0.3140
  At   30 docs:   0.3053
  At  100 docs:   0.1964
  At  200 docs:   0.1376
  At  500 docs:   0.0854
  At 1000 docs:   0.0526
R-Precision (precision after R (= num_rel for a query) docs retrieved):
    Exact:        0.2447
--------------------------------------------
Okapi Retrieval Model
Error 

In [53]:
for query in queries:
    print("ID: ", query)
    print("Query : ", queries[query]['original_query'])
    print("Processed Query: ", " ".join(queries[query]['processed_query']))
    print("New Query using pseudo relevance: ", " ".join(new_queries_pr[query]['processed_query']))
    print("New Query using pseudo relevance with ES aggs: ", " ".join(new_queries_es[query]['processed_query']))
    print("=============================================")

ID:  85
Query :     Document will discuss allegations, or measures being taken against, corrupt public officials of any governmental jurisdiction worldwide.   

Processed Query:  alleg corrupt public offici
New Query using pseudo relevance:  alleg corrupt public offici ortez ligachev connaughton
New Query using pseudo relevance with ES aggs:  alleg corrupt public offici ziyang zhao xiaop
ID:  59
Query :     Document will report a type of weather event which has directly caused at least one fatality in some location.   

Processed Query:  weather least fatal locat
New Query using pseudo relevance:  weather least fatal locat thunderstorm tornado connel
New Query using pseudo relevance with ES aggs:  weather least fatal locat overtown longitud thunderstorm
ID:  56
Query :     Document will include a prediction about the prime lending rate, or will report an actual prime rate move.   

Processed Query:  prime lend rate prime rate
New Query using pseudo relevance:  prime lend rate prime rat

# Task EC

In [15]:
def calculate_scores_bg(queries, models):
    scores = {}
    for model in models:
        print('Calculating scores for', model)
        if model == 'ES':
            scores[model] = es_built_in_search(queries)
        else:
            scores[model] = {}
            for query_id, query_data in tqdm(queries.items(), desc="Processing queries for model " + model):
                query = query_data['processed_query']
                scores[model][query_id] = []
                for document in all_document_ids:
                    if document_to_length[document] == 0:
                        continue
                    if model == 'BIGRAM':
                        score = bigram_score(document, ' '.join(query))
                    if score is not None:  
                        scores[model][query_id].append((document, score))
    
            for query_id, query in queries.items():
                scores[model][query_id].sort(key=lambda x: x[1], reverse=True)

    return scores

def get_bigrams(text):
    words = text.split()
    return [words[i] + ' ' + words[i + 1] for i in range(len(words) - 1)]

def bigram_score(document, query):
    doc_terms = document_to_vectors[document]
    query_bigrams = get_bigrams(query)
    
    score = 0
    if not query_bigrams:
        return 0
    
    for bigram in query_bigrams:
        term1, term2 = bigram.split()
        if term1 in doc_terms and term2 in doc_terms:
            term1_positions = [pos for pos, term in enumerate(doc_terms.keys()) if term == term1]
            term2_positions = [pos for pos, term in enumerate(doc_terms.keys()) if term == term2]
            distances = [abs(pos2 - pos1) for pos1 in term1_positions for pos2 in term2_positions]
            if distances:
                score += sum([1 / (1 + d) for d in distances])
    return score


# Example usage:
scores = calculate_scores_bg(queries,  ['BIGRAM'])

with open('results_BIGRAM' + '.txt', 'w') as file:
    for query_id, query_scores in scores['BIGRAM'].items():
        for i, (document, score) in enumerate(query_scores):
            file.write(query_id + ' Q0 ' + document + ' ' + str(i + 1) + ' ' + str(score) + ' ' + 'Exp' + '\n')
            if i == 1000:
                break

print("--------------------------------------------")
print("Bigram Retrieval Model")
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_BIGRAM.txt


Calculating scores for BIGRAM


Processing queries for model BIGRAM: 100%|██████████| 25/25 [00:49<00:00,  1.96s/it]


--------------------------------------------
Bigram Retrieval Model
Error due to 25

Queryid (Num):       25
Total number of documents over all queries
    Retrieved:    25000
    Relevant:      1832
    Rel_ret:        797
Interpolated Recall - Precision Averages:
    at 0.00       0.3251
    at 0.10       0.1895
    at 0.20       0.1172
    at 0.30       0.0915
    at 0.40       0.0744
    at 0.50       0.0630
    at 0.60       0.0611
    at 0.70       0.0523
    at 0.80       0.0472
    at 0.90       0.0291
    at 1.00       0.0087
Average precision (non-interpolated) for all rel docs(averaged over queries)
                  0.0797
Precision:
  At    5 docs:   0.1680
  At   10 docs:   0.1240
  At   15 docs:   0.1040
  At   20 docs:   0.1060
  At   30 docs:   0.0960
  At  100 docs:   0.0672
  At  200 docs:   0.0572
  At  500 docs:   0.0398
  At 1000 docs:   0.0319
R-Precision (precision after R (= num_rel for a query) docs retrieved):
    Exact:        0.0971
