In [88]:
import os
import json
import glob
import numpy as np
from pathlib import Path
import duckdb
import pandas as pd
import pyarrow as pa

import tqdm

from natsort import natsorted, index_natsorted




In [115]:

# Collect every JSON file below each nikaya's folder (sub-folders included),
# keyed by the nikaya's short name.
files = {
    name: glob.glob(f"pali_cannon/sutta/{name}/**/*.json", recursive=True)
    for name in ("an", "sn", "dn", "mn", "kn")
}

# Expose the individual lists under their short names as well; these are the
# very same list objects stored in `files`.
an, sn, dn, mn, kn = (files[name] for name in ("an", "sn", "dn", "mn", "kn"))



In [117]:
### JSON to DataFrame
# Flatten every sutta JSON file (a mapping of verse id -> text) into one long
# table, tagging each row with the nikaya it was loaded from.
rows = []

for nikaya, file_list in files.items():
    for file in tqdm.tqdm(file_list, desc=f"Loading {nikaya} JSON files"):
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)

        # The file handle is closed here; only the parsed mapping is needed.
        rows.extend((verse_id, text, nikaya) for verse_id, text in data.items())

# Build the frame once from all rows. Compared with concatenating one small
# frame per file this is cheaper, and it yields an empty frame with the right
# columns (instead of raising) when the glob patterns matched no files.
df = pd.DataFrame(rows, columns=["verse_id", "text", "nikaya"])

df.head()

Loading an JSON files: 100%|██████████| 1408/1408 [00:00<00:00, 4664.26it/s]
Loading sn JSON files: 100%|██████████| 1819/1819 [00:00<00:00, 3902.13it/s]
Loading dn JSON files: 100%|██████████| 34/34 [00:00<00:00, 1118.06it/s]
Loading mn JSON files: 100%|██████████| 152/152 [00:00<00:00, 2189.37it/s]
Loading kn JSON files: 100%|██████████| 2351/2351 [00:00<00:00, 3536.87it/s]


Unnamed: 0,verse_id,text,nikaya
0,an6.75:0.1,Aṅguttara Nikāya 6.75,an
1,an6.75:0.2,8. Arahattavagga,an
2,an6.75:0.3,Dukkhasutta,an
3,an6.75:1.1,"“Chahi, bhikkhave, dhammehi samannāgato bhikkh...",an
4,an6.75:1.2,Katamehi chahi?,an


In [114]:
# NOTE(review): within the visible part of this notebook, `data_df` is first
# assigned in a later cell, so this preview raises NameError on a fresh
# "Restart Kernel -> Run All" and only works with leftover kernel state.
# Consider moving it below the cell that builds `data_df`.
data_df.head()

Unnamed: 0,nikaya,vagga,sutta_id,text
an1.1:0.1,an,an1,an1.1,Aṅguttara Nikāya 1
an1.1:0.2,an,an1,an1.1,1. Rūpādivagga
an1.1:1.0,an,an1,an1.1,1
an1.1:1.1,an,an1,an1.1,Evaṁ me sutaṁ—
an1.1:1.2,an,an1,an1.1,ekaṁ samayaṁ bhagavā sāvatthiyaṁ viharati jeta...


In [None]:
# Derive the `sutta_id` and `vagga` columns from each verse id, index the table
# by verse id and put the rows into natural (human) order.
data_df = df.copy()

# Clean the id first so that every column derived from it is built from the
# stripped value.
data_df["verse_id"] = data_df["verse_id"].str.strip()

# "an1.1:0.1" -> "an1.1": the part before the colon identifies the sutta.
data_df["sutta_id"] = data_df["verse_id"].str.split(":").str[0]

# Default vagga: the part of the sutta id before its first dot ("an1.1" -> "an1").
# This is taken from `sutta_id`, not from the full verse id, because ids such as
# "vv85:13.1" have no dot before the colon and would otherwise yield "vv85:13".
default_vagga = data_df["sutta_id"].str.split(".").str[0]

# For dn and mn, strip the characters m/n/d/s/a and spaces from both ends of
# the sutta id instead, so e.g. "dn1" becomes "1".
data_df["vagga"] = np.where(
    data_df["nikaya"].isin(["dn", "mn"]),
    data_df["sutta_id"].str.strip("mndsa "),
    default_vagga,
)

data_df = (
    data_df[["nikaya", "vagga", "sutta_id", "verse_id", "text"]]
    .sort_values(by=["nikaya", "vagga"])
    .set_index("verse_id")
)
data_df.index.name = None

# Natural sort on the verse id so that "an1.2" comes before "an1.10".
data_df = data_df.iloc[index_natsorted(data_df.index)]


#data_df.to_csv("pali_canon.csv", index=True, index_label='index')


# `info()` prints its summary itself and returns None, so it is called directly
# rather than wrapped in `display(...)`, which would render a stray "None".
data_df.info()

data_df.head()

<class 'pandas.DataFrame'>
Index: 284708 entries, an1.1:0.1 to vv85:13.1
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   nikaya    284708 non-null  str   
 1   vagga     284708 non-null  str   
 2   sutta_id  284708 non-null  object
 3   text      284708 non-null  str   
dtypes: object(1), str(3)
memory usage: 32.0+ MB


None

Unnamed: 0,nikaya,vagga,sutta_id,text
an1.1:0.1,an,an1,an1.1,Aṅguttara Nikāya 1
an1.1:0.2,an,an1,an1.1,1. Rūpādivagga
an1.1:1.0,an,an1,an1.1,1
an1.1:1.1,an,an1,an1.1,Evaṁ me sutaṁ—
an1.1:1.2,an,an1,an1.1,ekaṁ samayaṁ bhagavā sāvatthiyaṁ viharati jeta...


In [112]:
# Show only the rows whose nikaya label is "sn".
data_df.loc[data_df["nikaya"].eq("sn")]

Unnamed: 0,nikaya,vagga,sutta_id,text
sn1.1:0.1,sn,sn1,sn1.1,Saṁyutta Nikāya 1.1
sn1.1:0.2,sn,sn1,sn1.1,1. Naḷavagga
sn1.1:0.3,sn,sn1,sn1.1,Oghataraṇasutta
sn1.1:1.1,sn,sn1,sn1.1,Evaṁ me sutaṁ—
sn1.1:1.2,sn,sn1,sn1.1,ekaṁ samayaṁ bhagavā sāvatthiyaṁ viharati jeta...
...,...,...,...,...
sn56.131:5.4,sn,sn56,sn56.131,jhānānāpānasaṁyutaṁ;
sn56.131:5.5,sn,sn56,sn56.131,"Sotāpatti saccañcāti,"
sn56.131:5.6,sn,sn56,sn56.131,mahāvaggoti vuccatīti.
sn56.131:5.7,sn,sn56,sn56.131,Mahāvaggasaṁyuttapāḷi niṭṭhitā.
