# Assignment 1 Setup Index

In [1]:
import os
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
import json
import numpy as np
import math
from elasticsearch7 import Elasticsearch

In [2]:
# Path to the directory containing files
DIRECTORY_PATH = 'ap89_collection/'
QUERY_PATH = 'AP_DATA/query_desc.51-100.short.txt'
INDEX_NAME = 'ap89_collection'
STOP_WORDS_PATH = 'AP_DATA/stoplist.txt'
DEFAULT_HEADER = {'Content-Type': 'application/json'}
ES_URL = "http://localhost:9200/"

es = Elasticsearch(ES_URL)

In [None]:
# Open stopwords file
with open(STOP_WORDS_PATH, 'r') as file:
    stoplist = file.read().splitlines()

def preprocess_text(text, stoplist):
    # Remove punctuation from text
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [token.strip().lower() for token in tokens]
    tokens = [token for token in tokens if token not in stoplist]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    text = ' '.join(tokens)
    return text


In [3]:
es.ping()

True

# Task3: Query execution
Write a program to run the queries in the file query_desc.51-100.short.txt, included in the data .zip file. You should run all queries (omitting the leading number) using each of the retrieval models listed below, and output the top 1000 results for each query to an output file. If a particular query has fewer than 1000 documents with a nonzero matching score, then just list whichever documents have nonzero scores.

You should write precisely one output file per retrieval model. Each line of an output file should specify one retrieved document, in the following format:

In [52]:
def read_queries(path):
    queries = {}
    with open(QUERY_PATH, 'r') as file:
        content = file.readlines()
        # 62.   Document will report a military coup d'etat, either attempted or successful, in any country.  
        # Extract query id and query text
        for line in content:
            query_id, query_text = line.split('.', 1)
            queries[query_id] = {
                'original_query': query_text,
                'processed_query': preprocess_text(query_text, stoplist) 
            }
    return queries

queries = read_queries(QUERY_PATH)
queries

{'85': {'original_query': '   Document will discuss allegations, or measures being taken against, corrupt public officials of any governmental jurisdiction worldwide.   \n',
  'processed_query': 'alleg corrupt public offici'},
 '59': {'original_query': '   Document will report a type of weather event which has directly caused at least one fatality in some location.   \n',
  'processed_query': 'weather least fatal locat'},
 '56': {'original_query': '   Document will include a prediction about the prime lending rate, or will report an actual prime rate move.   \n',
  'processed_query': 'prime lend rate prime rate'},
 '71': {'original_query': '   Document will report incursions by land, air, or water into the border area of one country by military forces of a second country or a guerrilla group based in a second country.   \n',
  'processed_query': 'incurs border area militari forc second guerrilla second'},
 '64': {'original_query': '   Document will report an event or result of politica

In [5]:
def fetch_all_document_ids(path="AP_DATA/doclist_new_0609.txt"):
    with open(path, 'r') as file:
        all_document_ids = []
        for line in file:
            all_document_ids.append(line.strip().split(' ')[2])
    return all_document_ids

def fetch_all_term_vectors(document_ids):
    document_to_vectors = {}
    document_to_length = {}
    term_document_frequency = {}
    term_total_term_frequency = {}
    vocabulary_words = set()

    def fetch_term_vectors(id_batch):
        vectors = es.mtermvectors(index=INDEX_NAME, term_statistics = True, ids=id_batch, fields='content')

        for tv in vectors['docs']:
            document_id = tv['_id']

            if 'content' not in tv['term_vectors']:
                document_to_vectors[document_id] = {}
                document_to_length[document_id] = 0
            else:
                terms = tv['term_vectors']['content']['terms']

                for term in terms:
                    term_document_frequency[term] = terms[term]['doc_freq']
                    term_total_term_frequency[term] = terms[term]['ttf']

                document_to_vectors[document_id] = terms
                document_to_length[document_id] = sum([terms[x]['term_freq'] for x in terms])
                for term in terms:
                    vocabulary_words.add(term)

    batch_size = 250
    for i in range(0, len(document_ids), batch_size):
        print('retrieved term vectors for', i)
        id_batch = document_ids[i:i + batch_size]
        fetch_term_vectors(id_batch)

    return document_to_vectors, document_to_length, term_document_frequency, term_total_term_frequency, vocabulary_words

all_document_ids = fetch_all_document_ids()
document_to_vectors, document_to_length, term_document_frequency, term_total_term_frequency, vocabulary_words = fetch_all_term_vectors(all_document_ids)

number_of_documents = len(all_document_ids)
average_document_length = sum([document_to_length[d] for d in all_document_ids]) / number_of_documents
total_document_length = sum([document_to_length[d] for d in document_to_length])
vocabulary_size = len(vocabulary_words)


retrieved term vectors for 0
retrieved term vectors for 250
retrieved term vectors for 500
retrieved term vectors for 750
retrieved term vectors for 1000
retrieved term vectors for 1250
retrieved term vectors for 1500
retrieved term vectors for 1750
retrieved term vectors for 2000
retrieved term vectors for 2250
retrieved term vectors for 2500
retrieved term vectors for 2750
retrieved term vectors for 3000
retrieved term vectors for 3250
retrieved term vectors for 3500
retrieved term vectors for 3750
retrieved term vectors for 4000
retrieved term vectors for 4250
retrieved term vectors for 4500
retrieved term vectors for 4750
retrieved term vectors for 5000
retrieved term vectors for 5250
retrieved term vectors for 5500
retrieved term vectors for 5750
retrieved term vectors for 6000
retrieved term vectors for 6250
retrieved term vectors for 6500
retrieved term vectors for 6750
retrieved term vectors for 7000
retrieved term vectors for 7250
retrieved term vectors for 7500
retrieved term

In [6]:
def get_doc_freq(term):
    if term in term_document_frequency:
        return term_document_frequency[term]
    else:
        return 0

def get_term_freq(term, document_id):
    if document_id in document_to_vectors and term in document_to_vectors[document_id]:
        return document_to_vectors[document_id][term]['term_freq']
    else:
        return 0

def get_word_freq_in_query(term, query):
    return query.count(term)



### ES built-in 
Use ES query with the API "match"{"body_text":"query keywords"}. This should be somewhat similar to BM25 scoring.

In [7]:
es.ping()

True

In [143]:
def es_built_in_search(queries, size=1000):
    scores = {}

    for query_id, query in queries.items():
        query = query['processed_query']

        res = es.search(index=INDEX_NAME, body={
            "query": {
                "match": {
                    "content": query
                }
            },
            "size": size
        })

        hits = res['hits']['hits']
        scores[query_id] = []
        for hit in hits:
            scores[query_id].append((hit['_id'], hit['_score']))

    return scores

es_built_in_search(queries)

  res = es.search(index=INDEX_NAME, body={


{'85': [('AP890108-0030', 14.200768),
  ('AP890107-0129', 13.997027),
  ('AP890220-0143', 13.993785),
  ('AP890516-0072', 13.891917),
  ('AP890125-0007', 13.832297),
  ('AP890516-0158', 13.808294),
  ('AP890517-0182', 13.564507),
  ('AP890518-0050', 13.559555),
  ('AP891111-0095', 13.006072),
  ('AP890503-0199', 12.997587),
  ('AP890122-0062', 12.941163),
  ('AP890622-0096', 12.915293),
  ('AP890414-0097', 12.858449),
  ('AP890131-0078', 12.818568),
  ('AP891013-0120', 12.759994),
  ('AP890112-0180', 12.700686),
  ('AP890525-0042', 12.668644),
  ('AP890819-0056', 12.558189),
  ('AP890704-0150', 12.530955),
  ('AP890710-0011', 12.530955),
  ('AP890223-0032', 12.521022),
  ('AP891023-0145', 12.516952),
  ('AP890615-0143', 12.486102),
  ('AP890522-0129', 12.484653),
  ('AP891002-0245', 12.469807),
  ('AP890915-0075', 12.461873),
  ('AP890620-0122', 12.454208),
  ('AP890417-0030', 12.446884),
  ('AP890315-0167', 12.302612),
  ('AP890315-0010', 12.264419),
  ('AP890131-0031', 12.22847),
  (

### Okapi TF
This is a vector space model using a slightly modified version of TF to score documents.

In [144]:
def okapi_tf(tf_wd, curr_doc_len, avg_doc_len):
    return tf_wd / (tf_wd + 0.5 + (1.5 * (curr_doc_len / avg_doc_len)))

def okapi_score(document, query):
    curr_doc_len = document_to_length[document]
    score = 0
    for term in query:
        tf_wd = get_term_freq(term, document)
        score += okapi_tf(tf_wd, curr_doc_len, average_document_length)
    return score

### TF-IDF

This is the second vector space model

In [145]:
def tf_idf_score(document_id, query):
    curr_doc_len = document_to_length[document_id]
    tf_idf_score = 0
    for word in query:
        tf_wd = get_term_freq(word, document_id)
        doc_freq = get_doc_freq(word)
        if doc_freq == 0:
            continue
        tf_idf_score += okapi_tf(tf_wd, curr_doc_len, average_document_length) * math.log(number_of_documents / doc_freq)

    return tf_idf_score

### Okapi BM25
BM25 is a language model based on a binary independence model.

In [146]:
def bm25_score(document_id, query):
    k1 = 1.2
    k2 = 100
    b = 0.75
    score = 0
    curr_doc_len = document_to_length[document_id]
    for word in query:
        tf_wd = get_term_freq(word, document_id)
        doc_freq = get_doc_freq(word)
        tf_wq = get_word_freq_in_query(word, " ".join(query))
        
        subscore = np.log((number_of_documents + 0.5) / (doc_freq + 0.5))
        subscore *= (tf_wd + k1 * tf_wd) / (tf_wd + k1 * ((1 - b) + b * (curr_doc_len / average_document_length)))
        subscore *= (tf_wq + k2 * tf_wq) / (tf_wq + k2)
        score += subscore
        
    return score

### Unigram LM with Laplace smoothing
This is a language model with Laplace (“add-one”) smoothing. We will use maximum likelihood estimates of the query based on a multinomial model “trained” on the document.

In [147]:
def p_laplace(word, document, tf_wd):
    return (tf_wd + 1) / (document_to_length[document] + vocabulary_size)

def lm_laplace(document, query):
    score = 0
    for word in query:
        tf_wd = get_term_freq(word, document)
        if tf_wd == 0:
            continue
        else:
            score += np.log(p_laplace(word, document, tf_wd))
    return score

## Unigram LM with Jelinek-Mercer smoothing

This is a similar language model, except that here we smooth a foreground document language model with a background model from the entire corpus.



In [148]:
def p_jm(word, document, tf_wd, lambda_jm=0.9):
    p_mle = tf_wd / document_to_length[document]
    ttf = term_total_term_frequency[word]
    return lambda_jm * p_mle + (1 - lambda_jm) * ttf / total_document_length

def lm_jm(document, query):
    score = 0
    for word in query:
        tf_wd = get_term_freq(word, document)
        if tf_wd == 0:
            continue
        else:
            score += np.log(p_jm(word, document, tf_wd))
    return score

## Task4: Evaluation

A) Compare manually the top 10 docs returned by ESBuilt-In, TFIDF, BM25, LMLaplace, for 5 queries specified by TAs. Explain or speculate on the reasons for differences in the rankings

In [149]:
query_ids = list(queries.keys())
queries[query_ids[0]]['processed_query']

'alleg corrupt public offici'

In [158]:
retrieval_models = ['ES', 'OKAPI', 'TFIDF', 'BM25', 'LMLAPLACE', 'LMJM' ]

def master_score(document, query, model):
    if model == 'OKAPI':
        return okapi_score(document, query)
    elif model == 'TFIDF':
        return tf_idf_score(document, query)
    elif model == 'BM25':
        return bm25_score(document, query)
    elif model == 'LMLAPLACE':
        return lm_laplace(document, query)
    elif model == 'LMJM':
        return lm_jm(document, query)

def calculate_scores():
    scores = {}
    for model in retrieval_models:
        print('Calculating scores for', model)
        if model == 'ES':
            scores[model] = es_built_in_search(queries)
        else:
            scores[model] = {}
            for query_id, query in queries.items():
                query = query['processed_query'].split()
                scores[model][query_id] = []
                for document in all_document_ids:
                    if document_to_length[document] == 0:
                        continue
                    scores[model][query_id].append((document, master_score(document, query, model)))
    
            for query_id, query in queries.items():
                scores[model][query_id].sort(key=lambda x: x[1], reverse=True)

    return scores

scores = calculate_scores()

Calculating scores for ES


  res = es.search(index=INDEX_NAME, body={


Calculating scores for OKAPI
Calculating scores for TFIDF
Calculating scores for BM25
Calculating scores for LMLAPLACE
Calculating scores for LMJM


In [159]:
for model in retrieval_models:
    with open('results_' + model + '.txt', 'w') as file:
        for query_id, query_scores in scores[model].items():
            for i, (document, score) in enumerate(query_scores):
                file.write(query_id + ' Q0 ' + document + ' ' + str(i + 1) + ' ' + str(score) + ' ' + 'Exp' + '\n')
                if i == 1000:
                    break

In [160]:
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_ES.txt

Error due to 25

Queryid (Num):       25
Total number of documents over all queries
    Retrieved:    25000
    Relevant:      1832
    Rel_ret:       1203
Interpolated Recall - Precision Averages:
    at 0.00       0.6843
    at 0.10       0.4970
    at 0.20       0.4157
    at 0.30       0.3335
    at 0.40       0.2926
    at 0.50       0.2547
    at 0.60       0.2090
    at 0.70       0.1832
    at 0.80       0.1281
    at 0.90       0.0518
    at 1.00       0.0133
Average precision (non-interpolated) for all rel docs(averaged over queries)
                  0.2638
Precision:
  At    5 docs:   0.4160
  At   10 docs:   0.3880
  At   15 docs:   0.3947
  At   20 docs:   0.3780
  At   30 docs:   0.3573
  At  100 docs:   0.2308
  At  200 docs:   0.1598
  At  500 docs:   0.0846
  At 1000 docs:   0.0481
R-Precision (precision after R (= num_rel for a query) docs retrieved):
    Exact:        0.2987


In [161]:
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_OKAPI.txt

Error due to 25

Queryid (Num):       25
Total number of documents over all queries
    Retrieved:    25000
    Relevant:      1832
    Rel_ret:       1093
Interpolated Recall - Precision Averages:
    at 0.00       0.5989
    at 0.10       0.4361
    at 0.20       0.2995
    at 0.30       0.2624
    at 0.40       0.2097
    at 0.50       0.1738
    at 0.60       0.1371
    at 0.70       0.1055
    at 0.80       0.0780
    at 0.90       0.0487
    at 1.00       0.0036
Average precision (non-interpolated) for all rel docs(averaged over queries)
                  0.1969
Precision:
  At    5 docs:   0.3920
  At   10 docs:   0.3560
  At   15 docs:   0.3253
  At   20 docs:   0.3180
  At   30 docs:   0.2853
  At  100 docs:   0.1852
  At  200 docs:   0.1354
  At  500 docs:   0.0737
  At 1000 docs:   0.0437
R-Precision (precision after R (= num_rel for a query) docs retrieved):
    Exact:        0.2428


In [162]:
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_TFIDF.txt

Error due to 25

Queryid (Num):       25
Total number of documents over all queries
    Retrieved:    25000
    Relevant:      1832
    Rel_ret:       1194
Interpolated Recall - Precision Averages:
    at 0.00       0.6314
    at 0.10       0.5165
    at 0.20       0.3798
    at 0.30       0.3292
    at 0.40       0.2869
    at 0.50       0.2487
    at 0.60       0.2086
    at 0.70       0.1779
    at 0.80       0.1122
    at 0.90       0.0533
    at 1.00       0.0123
Average precision (non-interpolated) for all rel docs(averaged over queries)
                  0.2554
Precision:
  At    5 docs:   0.4240
  At   10 docs:   0.3760
  At   15 docs:   0.3840
  At   20 docs:   0.3700
  At   30 docs:   0.3493
  At  100 docs:   0.2232
  At  200 docs:   0.1588
  At  500 docs:   0.0830
  At 1000 docs:   0.0478
R-Precision (precision after R (= num_rel for a query) docs retrieved):
    Exact:        0.2886


In [163]:
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_BM25.txt

Error due to 25

Queryid (Num):       25
Total number of documents over all queries
    Retrieved:    25000
    Relevant:      1832
    Rel_ret:       1173
Interpolated Recall - Precision Averages:
    at 0.00       0.6720
    at 0.10       0.4880
    at 0.20       0.3800
    at 0.30       0.3255
    at 0.40       0.2777
    at 0.50       0.2450
    at 0.60       0.1958
    at 0.70       0.1697
    at 0.80       0.1202
    at 0.90       0.0580
    at 1.00       0.0132
Average precision (non-interpolated) for all rel docs(averaged over queries)
                  0.2494
Precision:
  At    5 docs:   0.3840
  At   10 docs:   0.3680
  At   15 docs:   0.3787
  At   20 docs:   0.3640
  At   30 docs:   0.3467
  At  100 docs:   0.2224
  At  200 docs:   0.1538
  At  500 docs:   0.0826
  At 1000 docs:   0.0469
R-Precision (precision after R (= num_rel for a query) docs retrieved):
    Exact:        0.2788


In [164]:
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_LMLAPLACE.txt

Error due to 25

Queryid (Num):       25
Total number of documents over all queries
    Retrieved:    25000
    Relevant:      1832
    Rel_ret:       1096
Interpolated Recall - Precision Averages:
    at 0.00       0.5930
    at 0.10       0.4818
    at 0.20       0.3995
    at 0.30       0.3148
    at 0.40       0.2408
    at 0.50       0.1894
    at 0.60       0.1463
    at 0.70       0.1090
    at 0.80       0.0665
    at 0.90       0.0301
    at 1.00       0.0000
Average precision (non-interpolated) for all rel docs(averaged over queries)
                  0.2162
Precision:
  At    5 docs:   0.3920
  At   10 docs:   0.4000
  At   15 docs:   0.3520
  At   20 docs:   0.3320
  At   30 docs:   0.3067
  At  100 docs:   0.2064
  At  200 docs:   0.1438
  At  500 docs:   0.0744
  At 1000 docs:   0.0438
R-Precision (precision after R (= num_rel for a query) docs retrieved):
    Exact:        0.2618


In [165]:
!perl ./trec_eval/trec_eval.pl AP_DATA/qrels.adhoc.51-100.AP89.txt results_LMJM.txt

Error due to 25

Queryid (Num):       25
Total number of documents over all queries
    Retrieved:    25000
    Relevant:      1832
    Rel_ret:       1079
Interpolated Recall - Precision Averages:
    at 0.00       0.5906
    at 0.10       0.4227
    at 0.20       0.3469
    at 0.30       0.2719
    at 0.40       0.2318
    at 0.50       0.1866
    at 0.60       0.1433
    at 0.70       0.1163
    at 0.80       0.0767
    at 0.90       0.0392
    at 1.00       0.0073
Average precision (non-interpolated) for all rel docs(averaged over queries)
                  0.2019
Precision:
  At    5 docs:   0.3280
  At   10 docs:   0.3360
  At   15 docs:   0.3253
  At   20 docs:   0.3000
  At   30 docs:   0.2787
  At  100 docs:   0.1988
  At  200 docs:   0.1350
  At  500 docs:   0.0714
  At 1000 docs:   0.0432
R-Precision (precision after R (= num_rel for a query) docs retrieved):
    Exact:        0.2408


In [122]:
def get_top_n_docs(model, query_id, n=10):
    print("The given model is: ", model)
    print(f"Query {query_id} : {queries[query_id]['processed_query']}")
    print(f"Top {n} documents are: ")

    for i in range(n):
        print(f"\tRank {i+1}. {scores[model][query_id][i][0]} with score {scores[model][query_id][i][1]}")
        # get the document
        doc = es.get(index=INDEX_NAME, id=scores[model][query_id][i][0])
        print(doc['_source']['content'])

In [139]:
for model in retrieval_models:
    get_top_n_docs(model, '85')

The given model is:  ES
Query 85 : alleg corrupt public offici
Top 10 documents are: 
	Rank 1. AP890108-0030 with score 14.200768
three former drug agent accus deal cocain money launder possibl seduc glamor hollywood lifestyl larg sum money made southern california grow drug trade offici alleg conspiraci three men part 150agent drug enforc administr conting assign lo angel area worri offici case multipleag corrupt rare appar isol despit increas temptat region assist attorney joyc karlin three bad appl forc embarrass karlin offer dea agent ive work 1 million 10 million sell laugh face probe alleg corrupt trio agent charg tax evas widen alleg heroin cash theft dea vault cocain traffick karlin former agent john jackson darnel garcia wayn countryman indict tax fraud alleg conspiraci launder 608000 swiss bank face new charg cocain traffick end januari karlin offici given detail men accus intern revenu servic agent john anderson evid three involv money launder narcot traffick follow employ t