In [1]:
import json
import re
import string
import pandas as pd
from collections import Counter
from operator import itemgetter
from bs4 import BeautifulSoup

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Functions for reading datasets in JSON Lines (JSONL) format

In [3]:
class Answer:
    """Answer record holding the ``text`` and ``start_char`` fields of a dict."""

    def __init__(self, dictionary) -> None:
        # Both keys are required; a missing key raises KeyError.
        text, start_char = dictionary["text"], dictionary["start_char"]
        self.text = text
        self.start_char = start_char

    def to_dict(self) -> dict:
        """Return the answer as a plain dict with keys ``text`` and ``start_char``."""
        return dict(text=self.text, start_char=self.start_char)


class PassageQuestion:
    """One passage/question record together with its list of ``Answer`` objects."""

    def __init__(self, dictionary) -> None:
        self.answers = []
        # Scalar fields are copied unchanged from the source record.
        for field in ("pq_id", "passage", "surah", "verses", "question"):
            setattr(self, field, dictionary[field])
        # Every raw answer dict is wrapped in an Answer instance.
        self.answers.extend(Answer(raw_answer) for raw_answer in dictionary["answers"])

    def to_dict(self, include_answers=True) -> dict:
        """Return the record as a plain dict.

        Args:
            include_answers: When true (the default) an ``"answers"`` key with
                the serialised answers is added to the result.
        """
        record = {
            "pq_id": self.pq_id,
            "surah": self.surah,
            "verses": self.verses,
            "passage": self.passage,
            "question": self.question,
        }
        if not include_answers:
            return record

        record["answers"] = [answer.to_dict() for answer in self.answers]
        return record


In [4]:
def load_jsonl(input_path) -> list:
    """
    Read a list of objects from a JSON Lines file.

    Args:
        input_path: Path of a UTF-8 encoded file with one JSON document per line.

    Returns:
        list: The decoded objects, in file order. Blank lines are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Strip whitespace only (str.strip takes a character set, not a
            # pattern), so stray non-whitespace characters are never dropped.
            line = line.strip()
            if not line:
                # A trailing newline or an empty separator line is not a record.
                continue
            data.append(json.loads(line))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

def read_JSONL_file(file_path) -> list:
    """Load a JSON Lines file and wrap every record in a ``PassageQuestion``.

    Args:
        file_path: Path handed to ``load_jsonl``.

    Returns:
        list: One ``PassageQuestion`` per record, in file order.
    """
    passage_question_objects = [
        PassageQuestion(record) for record in load_jsonl(file_path)
    ]
    print(f"Collected {len(passage_question_objects)} Object from {file_path}")
    return passage_question_objects

A function to write a dataset in JSONL format

In [5]:
def dump_jsonl(data, output_path, append=False):
    """
    Write an iterable of objects to a JSON Lines file.

    Args:
        data: Iterable of JSON-serialisable objects. Any iterable is accepted,
            including generators, because records are counted while writing.
        output_path: Destination file path (written as UTF-8).
        append: When true, records are appended to an existing file instead of
            overwriting it.
    """
    mode = 'a' if append else 'w'
    written = 0
    with open(output_path, mode, encoding='utf-8') as f:
        for record in data:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
            written += 1
    print('Wrote {} records to {}'.format(written, output_path))

# combining AQQAC and QRCD datasets

In [6]:
def flatten_passage_questions(passage_question_objects) -> list:
    """Expand PassageQuestion objects into one flat record per answer.

    Args:
        passage_question_objects: Iterable of objects exposing ``pq_id``,
            ``question``, ``passage`` and ``answers`` (each answer exposing
            ``text``).

    Returns:
        list: Dicts with keys ``pq_id``, ``question``, ``context`` and
        ``answers`` (the text of a single answer). An object with several
        answers yields several records; one with no answers yields none.
    """
    return [
        {
            "pq_id": passage_question_object.pq_id,
            "question": passage_question_object.question,
            "context": passage_question_object.passage,
            "answers": answer.text,
        }
        for passage_question_object in passage_question_objects
        for answer in passage_question_object.answers
    ]


train_passage_question_objects = read_JSONL_file('/content/drive/MyDrive/qrcd_v1.1_train.jsonl')
test_passage_question_objects = read_JSONL_file('/content/drive/MyDrive/qrcd_v1.1_test.jsonl')
dev_passage_question_objects = read_JSONL_file('/content/drive/MyDrive/qrcd_v1.1_dev.jsonl')
aqqac_passage_question_objects = read_JSONL_file('/content/drive/MyDrive/ARCDrefined.jsonl')

train_data = flatten_passage_questions(train_passage_question_objects)
test_data = flatten_passage_questions(test_passage_question_objects)
dev_data = flatten_passage_questions(dev_passage_question_objects)
aqqac_data = flatten_passage_questions(aqqac_passage_question_objects)

# All splits are pooled into a single list of flat question/answer records.
data = train_data + test_data + dev_data + aqqac_data

Loaded 710 records from /content/drive/MyDrive/qrcd_v1.1_train.jsonl
Collected 710 Object from /content/drive/MyDrive/qrcd_v1.1_train.jsonl
Loaded 274 records from /content/drive/MyDrive/qrcd_v1.1_test.jsonl
Collected 274 Object from /content/drive/MyDrive/qrcd_v1.1_test.jsonl
Loaded 109 records from /content/drive/MyDrive/qrcd_v1.1_dev.jsonl
Collected 109 Object from /content/drive/MyDrive/qrcd_v1.1_dev.jsonl
Loaded 732 records from /content/drive/MyDrive/ARCDrefined.jsonl
Collected 732 Object from /content/drive/MyDrive/ARCDrefined.jsonl


# Separating factoid data from non-factoid data

In [7]:
# Substrings whose presence in a question classifies it as non-factoid.
# NOTE(review): the first entry looks like it is spelled with the Persian yeh
# (U+06CC) while 'الدليل' uses the Arabic one — confirm this is intended.
marks = [
    'دلیل',
    'هل ',
    'كيف ',
    'لماذا',
    'ما هو سبب',
    'فلماذا',
    'ما هي الدلائل',
    'ما الدلائل',
    'الدلائل',
    'الدليل',
    'ما سبب',
]

non_factoid = []
factoid = []

for each in data:
    # These two substrings are removed before matching — presumably so that
    # words containing them do not trigger the 'هل ' marker; TODO confirm.
    question = each['question'].replace('هلك', '').replace('أهل', '')
    constraint = any(ele in question for ele in marks)
    # Route the untouched record to exactly one of the two lists.
    (non_factoid if constraint else factoid).append(each)

In [None]:
# Save the non-factoid records as a JSON Lines file on the mounted Drive.
output_path = '/content/drive/MyDrive/nonfactoid.jsonl'
dump_jsonl(data=non_factoid, output_path=output_path)

In [None]:
# Save the factoid records as a JSON Lines file on the mounted Drive.
output_path = '/content/drive/MyDrive/factoid.jsonl'
dump_jsonl(data=factoid, output_path=output_path)

Installing deep-translator, a Python package that provides access to Google Translate

In [8]:
!pip install deep-translator

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting deep-translator
  Downloading deep_translator-1.10.1-py3-none-any.whl (35 kB)
Installing collected packages: deep-translator
Successfully installed deep-translator-1.10.1


In [None]:
from deep_translator import GoogleTranslator
translator = GoogleTranslator(source='ar',target='fa')

# XML file whose elements are looked up by ids of the form 's<chapter>.<verse>'.
# NOTE(review): presumably a Persian translation of the verses — confirm.
with open('/content/drive/MyDrive/makarem.xml', 'r', encoding="utf8") as f:
    content = f.read()

soup = BeautifulSoup(content, 'xml')


def _parse_pq_id(pq_id):
    """Split a pq_id shaped like '<chapter>:<start>-<end>_<suffix>'.

    Returns (chapter, start_verse, end_verse). When the verse part holds no
    '-', the single verse is used as both start and end.
    """
    chapter = pq_id.split(":", 1)[0]
    verse_bounds = pq_id.split(":", 1)[1].split('_')[0].split('-')
    start_verse = int(verse_bounds[0])
    end_verse = int(verse_bounds[1]) if len(verse_bounds) > 1 else start_verse
    return chapter, start_verse, end_verse


def _build_farsi_passage(chapter, start_verse, end_verse):
    """Concatenate the first child of every verse element, each followed by a space."""
    pieces = []
    for i in range(start_verse, end_verse + 1):
        verse_id = 's' + str(chapter) + '.' + str(i)
        verse = soup.find(id=verse_id)
        if verse is None or not verse.contents:
            # Fail with the offending id instead of an opaque error on None.
            raise LookupError(f"verse id {verse_id!r} not found in makarem.xml")
        pieces.append(str(verse.contents[0]) + ' ')
    return ''.join(pieces)


# Records are one-per-answer, so passages and questions repeat; both caches
# avoid rebuilding the same passage and re-sending the same question.
passage_cache = {}
question_cache = {}

not_complete_list = []

complete_list = []

for qa in factoid:
  print(qa['pq_id'])
  # Parse first so a malformed id fails before any translation request is made.
  chapter, start_verse, end_verse = _parse_pq_id(qa['pq_id'])

  passage_key = (chapter, start_verse, end_verse)
  if passage_key not in passage_cache:
    passage_cache[passage_key] = _build_farsi_passage(chapter, start_verse, end_verse)
  farsi_passage = passage_cache[passage_key]

  if qa['question'] not in question_cache:
    question_cache[qa['question']] = translator.translate(qa['question'])
  farsi_question = question_cache[qa['question']]
  farsi_answer = translator.translate(qa['answers'])

  new_data = {}
  new_data['context'] = farsi_passage
  new_data['pq_id'] = qa['pq_id']
  new_data['question'] = farsi_question

  # An empty or missing translation can never be a usable answer span, so it
  # is treated like an answer that was not found in the passage.
  if farsi_answer and farsi_answer in farsi_passage:
    new_data['answers'] = farsi_answer
    complete_list.append(new_data)

  else:
    # The translated answer is not a literal substring of the passage: keep
    # the original answer text so the record can be completed later.
    new_data['answers'] = qa['answers']
    not_complete_list.append(new_data)