In [2]:
import os
from bs4 import BeautifulSoup
import json
from tqdm import tqdm
import shutil

# Define output directory for jsonl files
output_dir = "/teamspace/studios/this_studio/jsonl_docs"
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir)

# Function to process and save documents as JSONL
def save_as_jsonl(docno, title, text, output_file):
    doc_dict = {
        "id": docno,
        "title": title,
        "contents": text
    }
    with open(output_file, 'a', encoding='utf-8') as f:
        f.write(json.dumps(doc_dict) + '\n')

# Process documents
doccount = 0
jsonl_file = os.path.join(output_dir, "docs.jsonl")

# Delete the file if it already exists
if os.path.exists(jsonl_file):
    os.remove(jsonl_file)

for filename in os.listdir("/teamspace/studios/this_studio/trec678rb/trec678rb/documents"):
    print(f"Processing file: {filename}")
    with open("/teamspace/studios/this_studio/trec678rb/trec678rb/documents/" + filename, 'r', encoding="ISO-8859-1") as fp:
        soup = BeautifulSoup(fp, 'html.parser')
        
        if filename.startswith('fb'):
            doc = soup.find("doc")
            while doc is not None:
                docno = doc.findChildren("docno")[0].get_text().strip()
                title = doc.findChildren("ti")
                text = doc.findChildren("text")
                if len(text) == 0:
                    doc = doc.find_next("doc")
                    continue
                text = text[0].get_text().strip()
                title = "" if len(title) == 0 else title[0].get_text().strip()
                # print(f'{doccount} -- {docno} -> {title}')
                save_as_jsonl(docno, title, text, jsonl_file)
                doc = doc.find_next("doc")
                doccount += 1
                
        elif filename.startswith('ft'):
            doc = soup.find("doc")
            while doc is not None:
                docno = doc.findChildren("docno")[0].get_text().strip()
                title = doc.findChildren("headline")
                text = doc.findChildren("text")
                if len(text) == 0:
                    doc = doc.find_next("doc")
                    continue
                text = text[0].get_text().strip()
                title = "" if len(title) == 0 else title[0].get_text().strip()
                # print(f'{doccount} -- {docno} -> {title}')
                save_as_jsonl(docno, title, text, jsonl_file)
                doc = doc.find_next("doc")
                doccount += 1
                
        elif filename.startswith('la'):
            doc = soup.find("doc")
            while doc is not None:
                docno = doc.findChildren("docno")[0].get_text().strip()
                title = doc.findChildren("headline")
                text = doc.findChildren("text")
                if len(text) == 0:
                    doc = doc.find_next("doc")
                    continue
                text = text[0].get_text().strip()
                title = "" if len(title) == 0 else title[0].get_text().strip()
                # print(f'{doccount} -- {docno} -> {title}')
                save_as_jsonl(docno, title, text, jsonl_file)
                doc = doc.find_next("doc")
                doccount += 1
#There is  4 type of file  for my easy work i take only three.................
print(f"Total documents processed: {doccount}")
print(f"JSONL file created at: {jsonl_file}")

# Now run Pyserini indexing
print("Starting Pyserini indexing...")


Processing file: fb396001
Processing file: fb396002
Processing file: fb396003
Processing file: fb396004
Processing file: fb396005
Processing file: fb396006
Processing file: fb396007


Processing file: fb396008
Processing file: fb396009
Processing file: fb396010
Processing file: fb396011
Processing file: fb396012
Processing file: fb396013
Processing file: fb396014
Processing file: fb396015
Processing file: fb396016
Processing file: fb396017
Processing file: fb396018
Processing file: fb396019
Processing file: fb396020
Processing file: fb396021
Processing file: fb396022
Processing file: fb396023
Processing file: fb396024
Processing file: fb396025
Processing file: fb396026
Processing file: fb396027
Processing file: fb396028
Processing file: fb396029
Processing file: fb396030
Processing file: fb396031
Processing file: fb396032
Processing file: fb396033
Processing file: fb396034
Processing file: fb396035
Processing file: fb396036
Processing file: fb396037
Processing file: fb396038
Processing file: fb396039
Processing file: fb396040
Processing file: fb396041
Processing file: fb396042
Processing file: fb396043
Processing file: fb396044
Processing file: fb396045
Processing f

In [3]:
import subprocess
output_dir='/teamspace/studios/this_studio/jsonl_docs'
index_path = "/teamspace/studios/this_studio/index-dir"
cmd = [
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", output_dir,
    "--index", index_path,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "8",
    "--storePositions", "--storeDocvectors", "--storeRaw"
]

print(f"Running command: {' '.join(cmd)}")
process = subprocess.run(cmd, capture_output=True, text=True)

if process.returncode == 0:
    print("Indexing completed successfully!")
    print(process.stdout)
else:
    print("Indexing failed with error:")
    print(process.stderr)

Running command: python -m pyserini.index.lucene --collection JsonCollection --input /teamspace/studios/this_studio/jsonl_docs --index /teamspace/studios/this_studio/index-dir --generator DefaultLuceneDocumentGenerator --threads 8 --storePositions --storeDocvectors --storeRaw
Indexing completed successfully!
2025-04-14 07:47:46,375 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:205) - Setting log level to INFO
2025-04-14 07:47:46,378 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:209) - AbstractIndexer settings:
2025-04-14 07:47:46,379 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:210) -  + DocumentCollection path: /teamspace/studios/this_studio/jsonl_docs
2025-04-14 07:47:46,379 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:211) -  + CollectionClass: JsonCollection
2025-04-14 07:47:46,379 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:212) -  + Index path: /teamspace/studios/this_studio/index-dir
2025-04-14 07:47:46,379 INFO  [main] 

In [3]:
from pyserini.search.lucene import LuceneSearcher
index_path='/teamspace/studios/this_studio/index-dir'
searcher = LuceneSearcher(index_path)
print(f"Index contains {searcher.num_docs} documents")  # Should match your JSONL line count
# print(f"Index stats: {searcher.stats()}")



Index contains 468189 documents


Apr 14, 2025 8:19:21 AM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


In [2]:
import torchvision,torch,transformers
print(f"PyTorch: {torch.__version__}")
print(f"TorchVision: {torchvision.__version__}")
print(f"Transformers: {transformers.__version__}")
print(f"PyTorch: {torch.__version__}")
print(f"TorchVision: {torchvision.__version__}")
print(f"Transformers: {transformers.__version__}")

#Main problem about to pyserini is , it may not work in version mismatch

PyTorch: 2.2.1+cu121
TorchVision: 0.17.1+cu121
Transformers: 4.37.2
PyTorch: 2.2.1+cu121
TorchVision: 0.17.1+cu121
Transformers: 4.37.2


In [1]:
from pyserini.search.lucene import LuceneSearcher
import xml.etree.ElementTree as ET

# Load the searcher from the index directory
searcher = LuceneSearcher('/teamspace/studios/this_studio/index-dir')
searcher.set_bm25()  # You can also set custom parameters: set_bm25(k1=0.9, b=0.4)

# Parse topics from TREC-style XML
robust_topics = ET.parse("/teamspace/studios/this_studio/trec678rb/trec678rb/topics/robust.xml").getroot()

# Output results to a file
with open("robust_bm25_", "a") as outfile:
    for top in robust_topics:
        query_num = top.find('num').text.strip().split()[-1]  # e.g., "Number: 303" -> "303"
        title = top.find('title').text.strip()
        print(query_num, title)

        # Run the search
        hits = searcher.search(title, k=1000)

        # Write results in TREC format
        for rank, hit in enumerate(hits, start=1):
            outfile.write(f"{query_num}\tQ0\t{hit.docid}\t{rank}\t{hit.score}\tcs2307\n")


Apr 18, 2025 1:24:07 PM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


601 Turkey Iraq water
602 Czech, Slovak sovereignty
603 Tobacco cigarette lawsuit
604 Lyme disease arthritis
605 Great Britain health care
606 leg traps ban
607 human genetic code
608 taxing social security
609 per capita alcohol consumption
610 minimum wage adverse impact
611 Kurds Germany violence
612 Tibet protesters
613 Berlin wall disposal
614 Flavr Savr tomato
615 timber exports Asia
616 Volkswagen Mexico
617 Russia Cuba economy
618 Ayatollah Khomeini death
619 Winnie Mandela scandal
620 France nuclear testing
621 women ordained Church of England
622 price fixing
623 toxic chemical weapon
624 SDI Star Wars
625 arrests bombing WTC
626 human stampede
627 Russian food crisis
628 U.S. invasion of Panama
629 abortion clinic attack
630 Gulf War Syndrome
631 Mandela South Africa President
632 southeast Asia tin mining
633 Welsh devolution
634 L-tryptophan deaths
635 doctor assisted suicides
636 jury duty exemptions
637 human growth hormone (HGH)
638 wrongful convictions
639 consumer on-