In [34]:
from pyserini.index.lucene import LuceneIndexReader

# Open the index identified by 'IndexStopwordsKrovetzTrecFormat'; every later
# cell queries it through `index_reader`.
# NOTE(review): the name suggests an index built with stopword handling and
# Krovetz stemming over TREC-format documents — confirm against the indexing run.
index_reader = LuceneIndexReader('IndexStopwordsKrovetzTrecFormat')

Find and save to file the posting list of the term 'predictions'

In [35]:
# Surface form of the query term; it is analyzed (stemmed) in a later cell
# before being looked up in the index.
term = 'predictions'

In [36]:
from pyserini.analysis import Analyzer, get_lucene_analyzer

# Analyzer configured with the Krovetz stemmer, used to normalize `term` before lookup.
# NOTE(review): presumably this must match the analysis applied at indexing time
# (the index name mentions Krovetz) — confirm.
analyzer = Analyzer(get_lucene_analyzer(stemmer='krovetz'))

In [37]:
# Analyze the term to match how it was indexed
analyzed_term = analyzer.analyze(term)[0]
print(f'Original term: {term}')
print(f'Analyzed term: {analyzed_term}')

# Get posting list for the analyzed term (analyzer=None: the term is passed
# through as-is, since it has already been analyzed above).
# The lookup yields None when the term does not occur in the index; fall back
# to an empty list so the loop and len() below do not raise TypeError.
posting_list = index_reader.get_postings_list(analyzed_term, analyzer=None) or []

# Save to file, one posting per line: "<collection docid>\t<term frequency>".
# The encoding is explicit so the output does not depend on the platform default.
with open('posting_list.txt', 'w', encoding='utf-8') as f:
    for posting in posting_list:
        # posting.docid is the internal Lucene docid (integer)
        # Convert to collection docid (string like "AP880630-0057")
        collection_docid = index_reader.convert_internal_docid_to_collection_docid(posting.docid)
        f.write(f'{collection_docid}\t{posting.tf}\n')

print(f'Saved {len(posting_list)} postings to posting_list.txt')

Original term: predictions
Analyzed term: prediction
Saved 1572 postings to posting_list.txt


Find the line referring to document "AP880630-0057"

In [38]:
# Find the posting for document "AP880630-0057".
# Each posting carries an internal Lucene docid, so it is converted to the
# collection docid before comparing. `or []` tolerates a missing posting list.
for posting in posting_list or []:
    collection_docid = index_reader.convert_internal_docid_to_collection_docid(posting.docid)
    if collection_docid == "AP880630-0057":
        print(f'Document: {collection_docid}')
        print(f'Internal Lucene docid: {posting.docid}')
        print(f'Term frequency: {posting.tf}')
        break
else:
    # The loop ended without `break`: no posting refers to the document.
    # Report it instead of finishing silently.
    print('Document "AP880630-0057" not found in the posting list')

Document: AP880630-0057
Internal Lucene docid: 130859
Term frequency: 3


How many times does “predictions” appear in the document “AP880630-0057” and in which positions?

In [39]:
# Look up, for document "AP880630-0057", the mapping from each term to the
# positions at which it occurs, then report the entry for the analyzed term.
term_positions_map = index_reader.get_term_positions("AP880630-0057")

# Guard first: either no mapping came back or the analyzed term is not in it.
if not term_positions_map or analyzed_term not in term_positions_map:
    print(f'Term "{analyzed_term}" not found in document "AP880630-0057"')
else:
    positions = term_positions_map[analyzed_term]
    print(f'Term "{term}" (analyzed: "{analyzed_term}")')
    print(f'Appears {len(positions)} times in document "AP880630-0057"')
    print(f'Positions: {list(positions)}')

Term "predictions" (analyzed: "prediction")
Appears 3 times in document "AP880630-0057"
Positions: [305, 370, 564]


Find and save to file the text of the document “AP880630-0057”

In [40]:
# Get document by collection docid
document = index_reader.doc("AP880630-0057")

if document:
    raw_text = document.raw()
    if raw_text is None:
        # NOTE(review): raw() can presumably be None when the index was built
        # without stored raw text — verify against the indexing options.
        print('Document has no stored raw text')
    else:
        # Save raw document contents to file. The encoding is explicit so that
        # non-ASCII characters in the text do not depend on the platform default.
        with open('AP880630-0057.txt', 'w', encoding='utf-8') as f:
            f.write(raw_text)
        print('Document saved to AP880630-0057.txt')
else:
    print('Document not found')

Document saved to AP880630-0057.txt


How many times does the term “predictions” appear in the document? Is there any difference from the inverted list? Why?

In [41]:
# Compare term occurrences from inverted index vs document vector
# NOTE(review): both figures here are per-document lookups on the index reader;
# the posting-list tf printed in an earlier cell is the inverted-list value.
# Confirm which comparison the question intends (e.g. against the raw text).

# Stays None when no positions are available, so the comparison below never
# reads an unbound name.
positions_count = None
if term_positions_map and analyzed_term in term_positions_map:
    positions_count = len(term_positions_map[analyzed_term])
    print(f'From inverted index positions: {positions_count} occurrences')

# Get document vector (term frequencies)
doc_vector = index_reader.get_document_vector("AP880630-0057")
if doc_vector and analyzed_term in doc_vector:
    tf_from_vector = doc_vector[analyzed_term]
    print(f'From document vector: {tf_from_vector} occurrences')

    if positions_count is None:
        print('\nNo positions available to compare against the document vector.')
    elif positions_count == tf_from_vector:
        print('\nNo difference - both methods return the same count.')
        print('This is expected: term frequency = number of positions.')
    else:
        # Surface a mismatch explicitly instead of printing nothing.
        print(f'\nDifference: {positions_count} positions vs term frequency {tf_from_vector}.')
else:
    print(f'Term "{analyzed_term}" not found in document vector')

From inverted index positions: 3 occurrences
From document vector: 3 occurrences

No difference - both methods return the same count.
This is expected: term frequency = number of positions.
