In [None]:
import json
import re
import string
import pandas as pd
from collections import Counter
from operator import itemgetter
from bs4 import BeautifulSoup

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the data files read
# and written below all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Preprocessing Arabic text

In [None]:
# Characters deleted by remove_punctuations(): a hand-picked set of Arabic and
# typographic marks, concatenated with every ASCII punctuation character from
# string.punctuation.
arabic_punctuations = '''`÷×؛۩<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

# Compiled alternation of single marks that remove_diacritics() deletes.
# re.VERBOSE makes the regex engine ignore the layout whitespace and treat the
# trailing "# ..." labels inside the pattern as comments, so each line
# contributes exactly one character to the alternation.
arabic_diacritics = re.compile("""
                             ّ    | # Tashdid
                             ّ    | # shadda
                             ٰ    | #qasr
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ۚ    | # waqf1
                             ۗ    | ##
                             ۖ    | ##
                             ۘ    | ##
                             ۙ    |##
                             ۛ    |##
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
def remove_digits(text):
    """Return ``text`` with every run of ASCII or Arabic-Indic digits deleted.

    Only the twenty digit characters listed in the pattern are removed; any
    other numeral characters are left untouched.
    """
    digit_run = re.compile(r"[1234567890١٢٣٤٥٦٧٨٩٠]+")
    return digit_run.sub("", text)

def remove_english_characters(text):
    """Return ``text`` with all ASCII Latin letters (a-z, A-Z) deleted.

    Spaces, digits, punctuation and non-ASCII letters are kept as they are.
    """
    latin_run = re.compile(r'[a-zA-Z]+')
    return latin_run.sub('', text)
    

def remove_diacritics(text):
    """Return ``text`` with every match of ``arabic_diacritics`` deleted."""
    return arabic_diacritics.sub('', text)


def remove_punctuations(text):
    """Return ``text`` with every character of ``punctuations_list`` deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(punctuations_list))
    return text.translate(deletion_table)

def Arabic_normalization(sentence):
    """Normalize ``sentence`` so that verses and passages can be compared.

    Steps, in order: drop ASCII Latin letters, digits, diacritics and
    punctuation; replace 'ي' with 'ی', 'ة' with 'ه' and 'آ' with 'ا'; trim
    surrounding whitespace; collapse runs of spaces into a single space.

    Returns the normalized string.
    """
    cleaned = remove_english_characters(sentence)
    for cleanup in (remove_digits, remove_diacritics, remove_punctuations):
        cleaned = cleanup(cleaned)

    # Unify letter variants so spelling differences do not block a match.
    for variant, canonical in (('ي', 'ی'), ('ة', 'ه'), ('آ', 'ا')):
        cleaned = cleaned.replace(variant, canonical)

    return re.sub(' +', ' ', cleaned.strip())

In [None]:
# Read the pipe-separated verse table from Drive, then add an 'ayah' column
# holding the normalized form of each raw 'verse'.
verses_csv_path = "/content/drive/MyDrive/Arabic-Original.csv"
df = pd.read_csv(verses_csv_path, sep='|')
df['ayah'] = df['verse'].apply(Arabic_normalization)
df.head()

Unnamed: 0,surah,verse number,verse,ayah
0,1,1,بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ,بسم الله الرحمن الرحیم
1,1,2,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ,الحمد لله رب العالمین
2,1,3,الرَّحْمَٰنِ الرَّحِيمِ,الرحمن الرحیم
3,1,4,مَالِكِ يَوْمِ الدِّينِ,مالك یوم الدین
4,1,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ,إیاك نعبد وإیاك نستعین


In [None]:
def most_frequent(List):
    """Return the element that occurs most often in ``List``.

    Raises:
        IndexError: if ``List`` is empty (there is no most common element).
    """
    ranking = Counter(List).most_common(1)
    top_item, _top_count = ranking[0]
    return top_item

Adding Quranic addresses to the AQQAC dataset

In [None]:
# A detected range spanning more verses than this is treated as unreliable and
# the record is set aside in `exceptions2` for manual correction.
MAX_VERSE_SPAN = 20
# Normalized verses of this length or shorter are skipped when matching.
MAX_IGNORED_VERSE_LENGTH = 3

with open('/content/drive/MyDrive/AQQAC.jsonl', 'r', encoding="utf8") as json_file:
    # Skip blank lines (e.g. a trailing newline), which json.loads would reject.
    json_list = [line for line in json_file if line.strip()]

# Normalize every verse once up front instead of once per passage.
# Each entry pairs the text used for matching with the verse's address record.
verse_index = []
for _, row in df.iterrows():
    normalized_verse = Arabic_normalization(row['ayah'])
    if len(normalized_verse) > MAX_IGNORED_VERSE_LENGTH:
        verse_index.append((
            normalized_verse,
            {
                'verse_number': row['verse number'],
                'chapter': row['surah'],
                'text': row['ayah'],
            },
        ))

exceptions = []    # passages in which no verse was found
exceptions2 = []   # passages whose detected verse range is implausibly wide

dict_data_list = []  # records successfully annotated with a Quranic address

for json_str in json_list:
    result = json.loads(json_str)
    text = Arabic_normalization(result['passage'])

    # Plain substring test: the verse is literal text, not a regex pattern.
    candidate_list = [
        verse_info
        for normalized_verse, verse_info in verse_index
        if normalized_verse in text
    ]

    if not candidate_list:
        exceptions.append(result)
        continue

    # The passage is attributed to the surah that contributes the most matches.
    chapter = most_frequent([candidate['chapter'] for candidate in candidate_list])

    # Build a new list rather than removing items while iterating, which
    # skips elements and lets verses of other surahs leak into the range.
    candidate_list = [
        candidate for candidate in candidate_list
        if candidate['chapter'] == chapter
    ]

    verse_numbers = [candidate['verse_number'] for candidate in candidate_list]
    starting_verse = min(verse_numbers)
    ending_verse = max(verse_numbers)

    if ending_verse - starting_verse > MAX_VERSE_SPAN:
        exceptions2.append(result)
    else:
        result['surah'] = chapter
        result['verses'] = str(starting_verse) + '-' + str(ending_verse)
        result['pq_id'] = str(chapter) + ':' + str(starting_verse) + '-' + str(ending_verse)
        dict_data_list.append(result)

Fixing the exceptional records

In [None]:
# Hand-curated addresses for the passages in which no verse was matched.
# Each rule is (passage prefix, surah, verse range); the first matching prefix wins.
EXCEPTION_ADDRESSES = [
    ('قال ما خطبكن', 12, '51-51'),
    ('يا بني آدم قد أنزلنا عليكم لباسا', 7, '26-26'),
    ('وذا ٱلنون إذ ذهب مغضبا', 21, '87-87'),
    ('يا بني آدم لا يفتننكم الشيطان', 7, '27-27'),
    ('يريدون ليطفؤوا نور', 61, '8-8'),
    ('إذ قالت امرأة عمران', 3, '35-35'),
    ('أو كالذي مر على قرية', 2, '259-259'),
    ('وكذلك جعلناكم أمة', 2, '143-143'),
    ('قال الملأ الذين كفروا', 11, '27-27'),
    ('وتحسبهم أيقاظا وهم رقود', 18, '18-18'),
    ('الذين يتبعون الرسول النبي', 7, '157-157'),
    ('الزانية والزاني فاجلدوا', 24, '2-2'),
]

for each in exceptions:
    each['passage'] = each['passage'].strip()

    for prefix, surah, verses in EXCEPTION_ADDRESSES:
        if each['passage'].startswith(prefix):
            # Annotate the record being processed (`each`), not a variable
            # left over from an earlier cell.
            each['surah'] = surah
            each['verses'] = verses
            each['pq_id'] = '{}:{}'.format(surah, verses)
            break
    else:
        # Keep the record, but make the missing address visible.
        print('No manual address for passage starting with: {!r}'.format(each['passage'][:40]))

    dict_data_list.append(each)

In [None]:
# Hand-curated addresses for the passages whose detected range was too wide.
# Each rule is (passage prefix, required suffix or None, surah, verse range);
# the first matching rule wins. The suffix distinguishes passages that share
# the same opening words but end at different verses.
EXCEPTION2_ADDRESSES = [
    ('إذ قال ربك للملائكة', None, 38, '71-79'),
    ('إذ قال لهم أخوهم صالح ألا تتقون', None, 26, '142-155'),
    ('كذب أصحاب الأيكة المرسلين', 'مفسدين', 26, '176-183'),
    ('فعقروها فأصبحوا نادمين', None, 26, '157-158'),
    ('فأوحينا إلى موسى أن اضرب بعصاك البحر', None, 26, '63-68'),
    ('وقال موسى يا فرعون إني رسول من رب العالمين', None, 7, '104-108'),
    ('إن عبادي ليس لك عليهم سلطان إلا من اتبعك من الغاوين', None, 15, '42-48'),
    ('قد أفلح المؤمنون', None, 23, '1-11'),
    ('إن الإنسان خلق هلوعا', None, 70, '19-35'),
    ('فإذا انشقت السماء', None, 55, '37-44'),
    ('كذبت قوم لوط المرسلين', None, 26, '160-174'),
    ('كذب أصحاب الأيكة المرسلين', 'عظيم', 26, '176-189'),
    ('ويقولون متى هذا الوعد إن كنتم صادقين', None, 67, '25-27'),
    ('فأرادوا به كيدا فجعلناهم الأسفلين', None, 37, '98-105'),
    ('قالت يا أيها الملأ', None, 27, '29-44'),
]

for each in exceptions2:
    each['passage'] = each['passage'].strip()

    for prefix, suffix, surah, verses in EXCEPTION2_ADDRESSES:
        if not each['passage'].startswith(prefix):
            continue
        if suffix is not None and not each['passage'].endswith(suffix):
            continue
        # Annotate the record being processed (`each`), not a variable left
        # over from an earlier cell. Deriving pq_id from surah and verses
        # keeps the three fields consistent with one another.
        each['surah'] = surah
        each['verses'] = verses
        each['pq_id'] = '{}:{}'.format(surah, verses)
        break
    else:
        # Keep the record, but make the missing address visible.
        print('No manual address for passage starting with: {!r}'.format(each['passage'][:40]))

    dict_data_list.append(each)

# Save the new dataset

In [None]:
def dump_jsonl(data, output_path, append=False):
    """
    Serialize each item of ``data`` as one JSON document per line.

    Args:
        data: sized collection of JSON-serializable objects.
        output_path: path of the JSON Lines file to write.
        append: when true, add to the end of an existing file instead of
            overwriting it.

    Non-ASCII characters are written as-is (UTF-8), not as ``\\u`` escapes.
    A one-line summary is printed when writing is done.
    """
    if append:
        mode = 'a+'
    else:
        mode = 'w'

    with open(output_path, mode, encoding='utf-8') as out_file:
        out_file.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in data
        )

    print('Wrote {} records to {}'.format(len(data), output_path))

In [None]:
# Write every annotated record (automatic matches plus the manually fixed
# exceptions) to Drive as a JSON Lines file, overwriting any previous export.
output_path = '/content/drive/MyDrive/ARCDrefined.jsonl'
dump_jsonl(dict_data_list, output_path)