In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; every dataset, index and
# output path used in this notebook lives below this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


Install the Pyserini library (used here for sparse passage retrieval) and FAISS

In [5]:
%%capture
!pip install pyserini==0.22.0
!pip install faiss-cpu

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

Read the PQuAD dataset (train, validation and test splits)

In [None]:
import json

# Directory on the mounted Drive that holds the three PQuAD split files.
PQUAD_DATASET_DIR = '/content/drive/MyDrive/PQuAD-main/Dataset'


def load_pquad_split(file_name):
  """Load one PQuAD split file and return the parsed JSON object.

  The file is opened explicitly as UTF-8 so that reading does not depend on
  the platform's default encoding (the output helper in this notebook also
  writes UTF-8 explicitly).
  """
  with open(PQUAD_DATASET_DIR + '/' + file_name, 'r', encoding='utf-8') as f:
    return json.load(f)


train_data = load_pquad_split('Train.json')
dev_data = load_pquad_split('Validation.json')
test_data = load_pquad_split('Test.json')


A helper function that writes documents in JSON Lines format (one JSON object per line)

In [6]:
def dump_jsonl(data, output_path, append=False):
    """
    Write an iterable of JSON-serialisable objects to a JSON lines file.

    Args:
        data: iterable of objects; each one is written as a single JSON line.
            Any iterable works (list, tuple, generator, ...).
        output_path: path of the file to write.
        append: if True, add the records to the end of an existing file
            instead of overwriting it.

    Non-ASCII characters are written as-is (UTF-8), not as \\uXXXX escapes.
    """
    mode = 'a' if append else 'w'
    written = 0
    with open(output_path, mode, encoding='utf-8') as f:
        for record in data:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
            # Count while writing so that iterables without len() are supported.
            written += 1
    print('Wrote {} records to {}'.format(written, output_path))

Index the passages of the PQuAD dataset

In [15]:
# Turn every paragraph of the train, validation and test splits into one
# retrieval document. Ids are consecutive integers assigned in that order
# (train first, then validation, then test).
# NOTE(review): the indexing command in this notebook reads its collection from
# /content/drive/MyDrive/tests/resources/pquad_collection_jsonl, which is not
# the location written below — confirm the file is placed there before indexing.
documents = []

for split in (train_data, dev_data, test_data):
  for article in split['data']:
    for paragraph in article['paragraphs']:
      documents.append({
          'id': len(documents),
          'contents': paragraph['context'],
          'title': article['title'],
      })

output_path = '/content/drive/MyDrive/pquad_documents.jsonl'
dump_jsonl(documents, output_path)

Wrote 11141 records to /content/drive/MyDrive/pquad_documents.jsonl


In [16]:
# Build a Lucene index with Pyserini from the JSON collection found under --input
# and write it to the --index directory, which the searcher cells open later.
# --language fa: index the text as Persian (the searcher is set to 'fa' as well).
# --storeRaw: keep each document's original JSON in the index; the later cells
#   read 'contents' and 'title' back from it via doc.raw().
! python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input /content/drive/MyDrive/tests/resources/pquad_collection_jsonl \
  --language fa \
  --index /content/drive/MyDrive/indexes/pquad_collection_jsonl \
  --generator DefaultLuceneDocumentGenerator \
  --threads 1 \
  --storePositions --storeDocvectors --storeRaw

2024-01-22 22:45:03,645 INFO  [main] index.IndexCollection (IndexCollection.java:380) - Setting log level to INFO
2024-01-22 22:45:03,647 INFO  [main] index.IndexCollection (IndexCollection.java:383) - Starting indexer...
2024-01-22 22:45:03,648 INFO  [main] index.IndexCollection (IndexCollection.java:385) - DocumentCollection path: /content/drive/MyDrive/tests/resources/pquad_collection_jsonl
2024-01-22 22:45:03,648 INFO  [main] index.IndexCollection (IndexCollection.java:386) - CollectionClass: JsonCollection
2024-01-22 22:45:03,649 INFO  [main] index.IndexCollection (IndexCollection.java:387) - Generator: DefaultLuceneDocumentGenerator
2024-01-22 22:45:03,649 INFO  [main] index.IndexCollection (IndexCollection.java:388) - Threads: 1
2024-01-22 22:45:03,650 INFO  [main] index.IndexCollection (IndexCollection.java:389) - Language: fa
2024-01-22 22:45:03,650 INFO  [main] index.IndexCollection (IndexCollection.java:390) - Stemmer: porter
2024-01-22 22:45:03,650 INFO  [main] index.IndexC

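Build the DPR training set: each answerable PQuAD training question is paired with its gold passage and five hard negatives retrieved from the Lucene index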

In [18]:
import random
from pyserini.search.lucene import LuceneSearcher


# Number of hard negatives every example must have; examples for which fewer
# can be found among the retrieved hits are skipped.
NUM_HARD_NEGATIVES = 5
# Fixed seed so that the shuffled output file is reproducible across runs.
SHUFFLE_SEED = 42

searcher = LuceneSearcher('/content/drive/MyDrive/indexes/pquad_collection_jsonl')
searcher.set_language('fa')

dpr_data_list = []

for data in train_data['data']:
  for paragraph in data['paragraphs']:
    for qa in paragraph['qas']:
      # Keep only answerable questions with exactly one reference answer.
      if qa['is_impossible'] or len(qa['answers']) != 1:
        continue

      answer_text = qa['answers'][0]['text']
      hard_negatives = []

      # Hits come back ranked by retrieval score (search() is called with its
      # default number of hits), so the first ones that pass the filter are kept.
      for hit in searcher.search(qa['question']):
        # Load the stored JSON of *this* hit. The previous code always loaded
        # hits[0], so every negative carried the top hit's text and title.
        doc_dict = json.loads(searcher.doc(hit.docid).raw())
        negative_content = doc_dict['contents']
        negative_title = doc_dict['title']

        # A hard negative must not contain the answer string and must come
        # from a different article than the positive passage.
        if answer_text in negative_content or negative_title == data['title']:
          continue

        hard_negatives.append({
            'passage_id': hit.docid,
            'text': negative_content,
            'title': negative_title,
            'score': hit.score,
            'title_score': 0,
        })
        if len(hard_negatives) == NUM_HARD_NEGATIVES:
          break

      if len(hard_negatives) < NUM_HARD_NEGATIVES:
        continue

      dpr_data_list.append({
          'dataset': 'PQuAD',
          'question': qa['question'],
          'answers': qa['answers'],
          # NOTE(review): passage_id of the positive context is the question
          # id, not the docid used in the index — confirm this is intended.
          'positive_ctxs': [{'title': data['title'], 'text': paragraph['context'],
                             'score': 1000, 'title_score': 1, 'passage_id': qa['id']}],
          'negative_ctxs': [],
          'hard_negative_ctxs': hard_negatives,
      })

random.Random(SHUFFLE_SEED).shuffle(dpr_data_list)
# ensure_ascii=False writes the text unescaped, so open the file explicitly as UTF-8.
with open('/content/drive/MyDrive/PQuAD_DPR_train.json', 'w', encoding='utf-8') as fout:
  json.dump(dpr_data_list, fout, ensure_ascii = False)

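Build the DPR development set from the PQuAD validation and test splits, using the same procedure as for the training set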

In [21]:
import random
from pyserini.search.lucene import LuceneSearcher


# Number of hard negatives every example must have; examples for which fewer
# can be found among the retrieved hits are skipped.
NUM_HARD_NEGATIVES = 5
# Fixed seed so that the shuffled output file is reproducible across runs.
SHUFFLE_SEED = 42

searcher = LuceneSearcher('/content/drive/MyDrive/indexes/pquad_collection_jsonl')
searcher.set_language('fa')

dpr_data_list = []

# The development file is built from the validation and test articles together.
my_data = dev_data['data'] + test_data['data']

for data in my_data:
  for paragraph in data['paragraphs']:
    for qa in paragraph['qas']:
      # Keep only answerable questions with exactly one reference answer.
      if qa['is_impossible'] or len(qa['answers']) != 1:
        continue

      answer_text = qa['answers'][0]['text']
      hard_negatives = []

      # Hits come back ranked by retrieval score (search() is called with its
      # default number of hits), so the first ones that pass the filter are kept.
      for hit in searcher.search(qa['question']):
        # Load the stored JSON of *this* hit. The previous code always loaded
        # hits[0], so every negative carried the top hit's text and title.
        doc_dict = json.loads(searcher.doc(hit.docid).raw())
        negative_content = doc_dict['contents']
        negative_title = doc_dict['title']

        # A hard negative must not contain the answer string and must come
        # from a different article than the positive passage.
        if answer_text in negative_content or negative_title == data['title']:
          continue

        hard_negatives.append({
            'passage_id': hit.docid,
            'text': negative_content,
            'title': negative_title,
            'score': hit.score,
            'title_score': 0,
        })
        if len(hard_negatives) == NUM_HARD_NEGATIVES:
          break

      if len(hard_negatives) < NUM_HARD_NEGATIVES:
        continue

      dpr_data_list.append({
          'dataset': 'PQuAD',
          'question': qa['question'],
          'answers': qa['answers'],
          # NOTE(review): passage_id of the positive context is the question
          # id, not the docid used in the index — confirm this is intended.
          'positive_ctxs': [{'title': data['title'], 'text': paragraph['context'],
                             'score': 1000, 'title_score': 1, 'passage_id': qa['id']}],
          'negative_ctxs': [],
          'hard_negative_ctxs': hard_negatives,
      })

random.Random(SHUFFLE_SEED).shuffle(dpr_data_list)
# ensure_ascii=False writes the text unescaped, so open the file explicitly as UTF-8.
with open('/content/drive/MyDrive/PQuAD_DPR_dev.json', 'w', encoding='utf-8') as fout:
  json.dump(dpr_data_list, fout, ensure_ascii = False)