In [132]:
import requests
import numpy as np

# Fetch the description of chapter 11 from version 3 of the api and display the decoded json.
url = "https://api.quran.com/api/v3/chapters/11"
data = requests.get(url).json()
data

{'chapter': {'id': 11,
  'revelation_place': 'makkah',
  'revelation_order': 52,
  'bismillah_pre': True,
  'name_simple': 'Hud',
  'name_complex': 'Hūd',
  'name_arabic': 'هود',
  'verses_count': 123,
  'pages': [221, 235],
  'chapter_number': 11,
  'translated_name': {'language_name': 'english', 'name': 'Hud'}}}

In [2]:
# Same api version, this time for one verse: the url addresses verse "11:2" inside chapter 11.
# Note that `url` and `data` from the previous cell are overwritten here.
url = "https://api.quran.com/api/v3/chapters/11/verses/11:2"
data = requests.get(url).json()
data

{'verse': {'id': 1475,
  'verse_number': 2,
  'chapter_id': 11,
  'verse_key': '11:2',
  'text_indopak': 'اَلَّا تَعۡبُدُوۡۤا اِلَّا اللّٰهَ\u200b ؕ اِنَّنِىۡ لَـكُمۡ مِّنۡهُ نَذِيۡرٌ وَّبَشِيۡرٌ ۙ\u200f',
  'juz_number': 11,
  'hizb_number': 22,
  'rub_el_hizb_number': 88,
  'sajdah_number': None,
  'page_number': 221,
  'sajdah': None,
  'text_madani': 'أَلَّا تَعْبُدُوٓا۟ إِلَّا ٱللَّهَ ۚ إِنَّنِى لَكُم مِّنْهُ نَذِيرٌ وَبَشِيرٌ',
  'words': [{'id': 1749,
    'position': 1,
    'text_indopak': 'اَلَّا',
    'verse_key': '11:2',
    'line_number': 10,
    'page_number': 221,
    'code': '&#xfba0;',
    'class_name': 'p221',
    'text_madani': 'أَلَّا',
    'char_type': 'word',
    'transliteration': {'text': 'allā', 'language_name': 'english'},
    'translation': {'language_name': 'english', 'text': 'That "Not'},
    'audio': {'url': 'wbw/011_002_001.mp3'}},
   {'id': 1750,
    'position': 2,
    'text_indopak': 'تَعۡبُدُوۡۤا',
    'verse_key': '11:2',
    'line_number': 10,
    'pag

## Using version 3 of the API (first attempt, left commented out)

In [3]:
# class verse():
#     def __init__(self, chapter, verse, base_url="https://api.quran.com/api/v3/chapters/"):
#         # -------------------------------- Definitions ------------------------------- #
#         self.chapter = chapter
#         self.verse = verse
#         self.base_url = base_url
#         self.url = self.base_url + "{}/verses/{}:{}".format(chapter, chapter, verse)
#         # ------------------------------- Download data ------------------------------ #
#         self.download()
#         # ------------------------------ important data ------------------------------ #
#         self.text = self.data["verse"]["text_madani"]
#         self.verse_id = self.data["verse"]["id"]
        
#     def download(self):
        # resp = requests.get(self.url)
        # assert resp.status_code == 200, "Status != 200 when downloading from {}, found status {}".format(self.url, resp.status_code)
        # self.data = resp.json()

In [4]:
# verse(11, 2).verse

In [5]:
# QURAN_CHAPTERS = 114

# class Quran():
#     def __init__(self,
#                 base_link = "https://api.quran.com/api/v3/chapters/"):
#         self.base_link = base_link
#         self.chapters = []
            
#     def download(self):

## Using Version 4

In [6]:
import asyncio
import aiohttp
import time
import json
import requests
from tqdm import tqdm

def get_tafsir(tafsir_id, key, timeout=30):
    """
    description: download one tafsir of one verse from version 4 of the api
    tafsir_id:   id of the tafsir resource, inserted in the url
    key:         could be verse key or a verse id
    timeout:     seconds to wait for the server, forwarded to requests.get
    returns:     the decoded json body of the response
    raises:      AssertionError when the status code of the response is not 200
    """
    url = "https://api.quran.com/api/v4/tafsirs/{}/by_ayah/{}".format(tafsir_id, key)
    # without a timeout a stalled connection would block the notebook forever
    resp = requests.get(url, timeout=timeout)
    # explicit raise instead of an `assert` statement: asserts are skipped when python runs with -O.
    # AssertionError is kept so that existing callers see the same exception type.
    if resp.status_code != 200:
        raise AssertionError("Status != 200 when downloading from {}, found status {}".format(url, resp.status_code))
    return resp.json()

def get_all_tafsirs_info(timeout=30):
    """
    description: download the list of the tafsirs offered by version 4 of the api
    timeout:     seconds to wait for the server, forwarded to requests.get
    returns:     the value stored under "tafsirs" in the decoded json body
    raises:      AssertionError when the status code of the response is not 200
    """
    url = "https://api.quran.com/api/v4/resources/tafsirs"
    # without a timeout a stalled connection would block the notebook forever
    resp = requests.get(url, timeout=timeout)
    # explicit raise instead of an `assert` statement: asserts are skipped when python runs with -O.
    # AssertionError is kept so that existing callers see the same exception type.
    if resp.status_code != 200:
        raise AssertionError("Status != 200 when downloading from {}, found status {}".format(url, resp.status_code))
    return resp.json()["tafsirs"]

def get_quran(text_type="imlaei", timeout=30):
    """
    description: download all the verses of the quran from version 4 of the api
    text_type:   name of the text variant, inserted as the last part of the url
    timeout:     seconds to wait for the server, forwarded to requests.get
    returns:     the value stored under "verses" in the decoded json body
    raises:      AssertionError when the status code of the response is not 200
    """
    url = "https://api.quran.com/api/v4/quran/verses/{}".format(text_type)
    # without a timeout a stalled connection would block the notebook forever
    resp = requests.get(url, timeout=timeout)
    # explicit raise instead of an `assert` statement: asserts are skipped when python runs with -O.
    # AssertionError is kept so that existing callers see the same exception type.
    if resp.status_code != 200:
        raise AssertionError("Status != 200 when downloading from {}, found status {}".format(url, resp.status_code))
    return resp.json()["verses"]


async def get(url, session):
    """
    description: read the body of `url` through an already opened session
    url:         address to download
    session:     object whose `get(url=...)` gives an async context manager around the response
    returns:     whatever `response.read()` produced, or None when anything went wrong
                 (the failure is only printed, it is never raised)
    """
    try:
        async with session.get(url=url) as response:
            return await response.read()
    except Exception as error:
        # best effort: report the kind of failure and let the caller deal with the missing body
        print("Unable to get url {} due to {}.".format(url, error.__class__))
    return None


async def get_tafsirs(tafsir_id, keys, tafsirs):
    """
    description: download one tafsir for many verses concurrently
    tafsir_id:   id of the tafsir resource, inserted in every url
    keys:        verse keys or verse ids, one request is made per key
    tafsirs:     list that is filled in place with the "tafsir" entry of every
                 response, in the same order as `keys`
    returns:     nothing, the result is delivered through `tafsirs`
    raises:      the decoding error (TypeError, ValueError or KeyError) of the first
                 response that is not a json document holding a "tafsir" entry, e.g.
                 when `get` gave None because the download failed
    """
    # -------------------------------- build urls -------------------------------- #
    urls = ["https://api.quran.com/api/v4/tafsirs/{}/by_ayah/{}".format(tafsir_id, key) for key in keys]
    # --------------------------------- get urls --------------------------------- #
    async with aiohttp.ClientSession() as session:
        # gather gives the results in the order of the awaitables, i.e. the order of `keys`
        responses = await asyncio.gather(*[get(url, session) for url in urls])
    # ------------------------------- decode jsons ------------------------------- #
    # the bodies are already fully read, so the session is not needed any more
    for position, resp in enumerate(tqdm(responses, "creating jsons from reqponses for {}".format(tafsir_id))):
        try:
            tafsir = json.loads(resp)["tafsir"]
        except (TypeError, ValueError, KeyError):
            # say which download is unusable before giving up, then let the original error through
            print("Could not decode the response of {}".format(urls[position]))
            print(resp)
            raise
        tafsirs.append(tafsir)

In [7]:
# Quick manual check of get_tafsir: tafsir 92, addressed with a verse id (4821) instead of a verse key.
get_tafsir(92,4821)

{'tafsir': {'verses': {'53:37': {'id': 4821}},
  'resource_id': 92,
  'resource_name': 'Arabic Tanweer Tafseer',
  'language_id': 9,
  'slug': 'ar-tafseer-tanwir-al-miqbas',
  'translated_name': {'name': 'Tafseer Tanwir al-Miqbas',
   'language_name': 'english'},
  'text': 'وَإِبْرَاهِيمَ الَّذِي وَفَّى (37('}}

In [8]:
QURAN_CHAPTERS = 114  # kept for reference, the class below does not read it

class Quran():
    """
    Container for the verses of the quran and, after `download`, their tafsirs.

    The data sources are passed in as callables:
        tafsirs_getter(tafsir_id, keys, tafsirs): coroutine function that appends one
                              tafsir per key to the list `tafsirs`
        all_tafsirs_getter(): gives the available tafsirs, each one a mapping with at
                              least the entries "id" and "language_name"
        quran_getter():       gives the verses, each one a mapping with at least "id"
    language: only the tafsirs whose "language_name" equals it are downloaded
    """
    def __init__(self,
                tafsirs_getter = get_tafsirs,
                all_tafsirs_getter = get_all_tafsirs_info,
                quran_getter = get_quran,
                language = "arabic"
                ):
        self.tafsirs_getter     = tafsirs_getter
        self.quran_getter       = quran_getter
        self.all_tafsirs_getter = all_tafsirs_getter
        self.language           = language
        # nothing is downloaded here: `download` is a coroutine and has to be awaited by the caller

    async def download(self):
        """
        Fetch the verses, then attach to every verse the tafsirs written in `self.language`.

        After the call:
            self.quran:       the verses, each with a new entry "tafsirs" (a list holding one
                              item per selected tafsir, in the order the tafsirs were listed)
            self.all_tafsirs: everything `all_tafsirs_getter` returned
            self.tafsirs:     the tafsirs of the last tafsir id that was processed

        raises: ValueError when `tafsirs_getter` delivers a number of tafsirs different from
                the number of verses, because they are matched to the verses by position.
        """
        # ------------------------- Get quran and tafsir list ------------------------ #
        # these two getters are plain (synchronous) calls
        self.quran = self.quran_getter()
        self.all_tafsirs = self.all_tafsirs_getter()

        # ------------------ Only get tafsirs of language 'language' ----------------- #
        tafsir_ids = [tafsir["id"] for tafsir in self.all_tafsirs if tafsir["language_name"] == self.language]
        # --------------------------------- verse keys ------------------------------- #
        keys = [verse["id"] for verse in self.quran]
        # --------------------------- download the tafsirs -------------------------- #
        for verse in self.quran: # just build the list that will contain the tafsirs
            verse["tafsirs"] = []
        for tafsir_id in tafsir_ids:
            tafsirs = []
            await self.tafsirs_getter(tafsir_id, keys, tafsirs)
            self.tafsirs = tafsirs
            # zip would silently stop at the shorter list and leave verses without this tafsir
            if len(tafsirs) != len(keys):
                raise ValueError("tafsir {}: got {} tafsirs for {} verses".format(tafsir_id, len(tafsirs), len(keys)))
            for verse, tafsir in zip(self.quran, tafsirs):
                verse["tafsirs"].append(tafsir)
            

In [31]:
# Build the container with its default getters, then run the download.
# `download` is a coroutine, hence the `await`.
quran = Quran()
await quran.download()

creating jsons from reqponses for 14: 100%|██████████| 6236/6236 [00:00<00:00, 77044.94it/s]
creating jsons from reqponses for 16: 100%|██████████| 6236/6236 [00:00<00:00, 145911.62it/s]
creating jsons from reqponses for 92: 100%|██████████| 6236/6236 [00:00<00:00, 55481.11it/s]
creating jsons from reqponses for 93: 100%|██████████| 6236/6236 [00:00<00:00, 69282.16it/s]
creating jsons from reqponses for 15: 100%|██████████| 6236/6236 [00:00<00:00, 44550.03it/s]
creating jsons from reqponses for 90: 100%|██████████| 6236/6236 [00:00<00:00, 68638.19it/s]
creating jsons from reqponses for 91: 100%|██████████| 6236/6236 [00:00<00:00, 112015.28it/s]
creating jsons from reqponses for 94: 100%|██████████| 6236/6236 [00:00<00:00, 94468.49it/s]


In [26]:
# Save the downloaded verses (each one carrying its list of tafsirs) as json in the data folder.
import json
with open("../data/quran_tafsirs.json", "w") as f:
    json.dump(quran.quran, f)

In [9]:
# Read the saved verses back into `q`; the cells below work on `q`.
import json
with open("../data/quran_tafsirs.json", "r") as f:
    q = json.load(f)

Test whether every tafsir is attached to the right verse

In [10]:
# Spot check: verse id that the first tafsir of the first verse stores under the key "1:1".
q[0]["tafsirs"][0]["verses"]["1:1"]["id"]

1

In [43]:
# Every tafsir attached to a verse must list the key of that verse and store the same verse id for it.
# NOTE(review): the output recorded for this cell is an AssertionError for tafsir 92 at key 2:59
# (that entry lists 2:49 to 2:58), so this check does not pass on the saved data.
for verse in q:
    verse_id = verse["id"]  # not called `id`: that would shadow the builtin in every later cell
    key = verse["verse_key"]
    for tafsir in verse["tafsirs"]:
        assert key in tafsir["verses"], "key {} not in tafsir {}, for tafsir {}".format(key, tafsir["verses"], tafsir["resource_id"])
        assert verse_id == tafsir["verses"][key]["id"], "verse id {} != tafsir id {} for tafsir {}".format(verse_id, tafsir["verses"][key]["id"], tafsir["resource_id"])

print("All done")

AssertionError: key 2:59 not in tafsir {'2:49': {'id': 56}, '2:50': {'id': 57}, '2:51': {'id': 58}, '2:52': {'id': 59}, '2:53': {'id': 60}, '2:54': {'id': 61}, '2:55': {'id': 62}, '2:56': {'id': 63}, '2:57': {'id': 64}, '2:58': {'id': 65}}, for tafsir 92

Are the verse ids correctly ordered?

In [11]:
# The verse at position n of the list (counting from 1) is expected to carry the id n.
for expected_id, verse in enumerate(q, start=1):
    # the message reports the id that was expected, i.e. the value the verse id is compared with
    assert expected_id == verse["id"], "verse id {} != {}".format(expected_id, verse["id"])

In [12]:
# Number of tafsirs attached to the first verse.
len(q[0]["tafsirs"])

8

## Build continuous text

In [13]:
# Concatenate the text of every tafsir, verse after verse, without any separator.
# A single join builds the string in one pass instead of growing it with `+=` on every tafsir.
full_text = "".join(tafsir["text"] for verse in q for tafsir in verse["tafsirs"])

In [18]:
# Size of the concatenated text in characters.
len(full_text)

74355991

In [17]:
# Number of whitespace separated tokens in the concatenated text.
len(full_text.split())

14658433

In [23]:
# Preview of the first 100 characters.
full_text[:100]

'بسم الله الرحمن الرحيم سورة الفاتحة . يقال لها الفاتحة أي فاتحة الكتاب خطا وبها تفتح القراءة في الصل'

In [53]:
# Write the concatenated text to the data folder, encoded as utf-8.
with open("../data/tafsirs_concat.txt", "w", encoding='utf-8') as f:
    f.write(full_text)

## Build as CSV

In [158]:
import pandas as pd

# Flatten `q` into a table with one row per (verse, tafsir) pair and save it as csv.
tafsirs_pd = pd.DataFrame()

# three parallel lists, filled in the same order, that become the columns of the table
tafsirs_list = []
resource_name_list = []
verses_keys = []

for verse in q :
    for tafsir in verse["tafsirs"]:
        tafsirs_list.append(tafsir["text"].replace("\r", "\n"))       # There is one tafsir containing \r instead of \n => breaks the csv
        resource_name_list.append(tafsir["resource_name"])
        # "verses" can hold more than one verse key: all of them are kept, joined with " + "
        verses_keys.append(" + ".join(list(tafsir["verses"].keys())))

tafsirs_pd["text"] = tafsirs_list
tafsirs_pd["resource_name"] = resource_name_list
tafsirs_pd["verses_keys"] = verses_keys

# index=False: the row numbers are not written to the file
tafsirs_pd.to_csv("../data/arabic_tafsirs.csv", index=False, encoding='utf-8')
tafsirs_pd

Unnamed: 0,text,resource_name,verses_keys
0,بسم الله الرحمن الرحيم سورة الفاتحة . يقال لها...,ابن كثير,1:1
1,سورة الفاتحة سميت هذه السورة بالفاتحة؛ لأنه يف...,المیسر,1:1
2,البسملة اسم لكلمة باسم الله، صيغ هذا الاسم على...,Arabic Tanweer Tafseer,1:1
3,{ بسم ٱلله الرَّحْمٰنِ الرَّحِيـمِ }الاسم: الل...,Arabic Waseet Tafseer,1:1
4,بسمالقول في تأويل { بسم} قال أبو جعفر : إن الل...,الطبري,1:1
...,...,...,...
49883,وقوله : ( مِنَ الجنة والناس ) زيادة بيان للذى ...,Arabic Waseet Tafseer,114:6
49884,وقوله: ( الَّذِي يُوَسْوِسُ فِي صُدُورِ النَّا...,الطبري,114:6
49885,قوله تعالى : من الجنة والناس أخبر أن الموسوس ق...,Arabic Qurtubi Tafseer,114:6
49886,وهذه السورة مشتملة على الاستعاذة برب الناس وما...,Arabic Saddi Tafseer,114:6


Verify that the CSV was saved correctly

In [170]:
# Read the csv back and check that it holds exactly the table that was written.
verif = pd.read_csv("../data/arabic_tafsirs.csv")
# a row by row loop over `tafsirs_pd` alone would not notice extra rows in the file
assert len(verif) == len(tafsirs_pd), 'Row count differs, got {} and {}'.format(len(verif), len(tafsirs_pd))
# compare the two tables in one element-wise operation instead of one `iloc` lookup per row
row_differs = (verif != tafsirs_pd).any(axis=1).to_numpy()
if row_differs.any():
    i = int(np.flatnonzero(row_differs)[0])  # first row that does not match
    raise AssertionError('No match for i={}, got {} and {}'.format(i, verif.iloc[i], tafsirs_pd.iloc[i]))
print("saved correctly 👍")

saved correctly 👍
