In [None]:
import json
import os
from collections import Counter
from zhconv import convert # traditional Chinese to simplified Chinese
# convert('欧陽炯', 'zh-cn')

# Default location of the JSON config file describing the datasets;
# used as the default `config_path` of PlainDataLoader.
DATAS_CONFIG = "./datas.json"

def count_elements(lst:list) -> dict:
    """Count how many times each element occurs in ``lst``.

    Args:
        lst: Hashable elements to tally.

    Returns:
        A plain ``dict`` mapping each distinct element to its number of
        occurrences, keyed in order of first appearance.
    """
    return dict(Counter(lst))

def count_and_sort_elements(lst:list, sort_up:bool=True) -> dict:
    """Count the elements of ``lst`` and order the result by frequency.

    Args:
        lst: Hashable elements to tally.
        sort_up: Passed straight through as ``reverse`` to ``sorted``, so
            ``True`` (the default) puts the MOST frequent element first and
            ``False`` puts the least frequent first.

    Returns:
        A ``dict`` of element -> count whose insertion order follows the
        requested frequency ordering; ties keep first-appearance order.
    """
    tally = Counter(lst)
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=sort_up)
    return dict(ranked)

class PlainDataLoader():
    """Load text content from the JSON datasets described by a config file.

    The config file is a JSON object that is read for two keys:

    * ``"cp_path"``  - top-level directory that dataset paths are joined to.
    * ``"datasets"`` - mapping of dataset name -> settings dict. Each settings
      dict is read for ``"id"`` and ``"path"``, an optional ``"excludes"``
      list of file names to skip, and the field names that the
      ``extractor_tag`` / ``extr_tag`` arguments refer to
      (for example ``"paragraphs"``).
    """

    def __init__(self, config_path: str=DATAS_CONFIG) -> None:
        '''Initialise the loader from the given JSON config file.

        Args:
            config_path: Path of the JSON config file.

        Raises:
            OSError: If the config file cannot be opened.
            KeyError: If ``"cp_path"``, ``"datasets"`` or a dataset's ``"id"``
                is missing from the config.
        '''
        self._path = config_path
        with open(config_path, 'r', encoding='utf-8') as config:
            data = json.load(config)
        self.top_level_path:str = data["cp_path"]
        self.datasets:dict = data["datasets"]
        # Reverse lookup: dataset id -> dataset name.
        self.id_table = {
            v["id"]: k for (k, v) in self.datasets.items()
        }

    @staticmethod
    def _read_json(path: str):
        '''Parse and return the JSON document stored at ``path`` (UTF-8).'''
        with open(path, mode='r', encoding='utf-8') as file:
            return json.load(file)

    def _dataset_files(self, full_path: str, configs: dict, announce_skips: bool = False):
        '''Yield the path of every non-excluded file inside ``full_path``.

        File names are visited in sorted order because ``os.listdir`` returns
        them in arbitrary order, which would make results differ between runs
        and machines. A missing or null ``"excludes"`` entry means nothing is
        skipped.
        '''
        excludes = configs.get("excludes") or ()
        for filename in sorted(os.listdir(full_path)):
            if filename in excludes:
                if announce_skips:
                    print(f"neglect {filename} as it is excluded in jason config")
                continue
            yield os.path.join(full_path, filename)

    def target_path(self, target: str) -> str:
        '''Return the path of dataset ``target``.

        The path is ``top_level_path`` joined with the dataset's ``"path"``
        setting. If ``target`` is not a known dataset a message is printed and
        ``None`` is returned instead.
        '''
        if target not in self.datasets:
            print(f"{target} is not included in datas.json")
            return None
        return os.path.join(self.top_level_path, self.datasets[target]["path"])

    def body_extractor(self, target: str, extractor_tag: str = 'paragraphs') -> list:
        '''Extract and concatenate one field from every record of a dataset.

        Args:
            target: Dataset name (a key of ``self.datasets``).
            extractor_tag: Name of the dataset setting that holds the record
                field to extract.

        Returns:
            One flat list built by extending with ``record[field]`` for every
            record of every file read (the list can get large). A dataset that
            is a single file is read directly; a directory is read file by
            file, skipping (and reporting) the excluded names.

        Raises:
            KeyError: If ``target`` or ``extractor_tag`` is unknown.
        '''
        full_path = self.target_path(target)
        configs = self.datasets[target]
        tag = configs[extractor_tag]

        if os.path.isfile(full_path):
            sources = [full_path]
        else:
            sources = self._dataset_files(full_path, configs, announce_skips=True)

        body = []
        for source in sources:
            for item in self._read_json(source):
                body += item[tag]
        return body

    def contents_from_multiple(self, target: str, extr_tag: str, conv2: str = 'zh-cn') -> list:
        '''Collect one converted field from every file of a directory dataset.

        Args:
            target: Dataset name (a key of ``self.datasets``).
            extr_tag: Name of the dataset setting that holds the record field
                to extract.
            conv2: Target locale handed to ``convert``:
                'zh-tw' / 'zh-hans' / 'zh-cn'.

        Returns:
            A list with ``convert(record[field], conv2)`` for every record of
            every file that is not listed in the dataset's ``"excludes"``.
        '''
        full_path = self.target_path(target)
        configs   = self.datasets[target]
        tag       = configs[extr_tag]

        body = []
        for source in self._dataset_files(full_path, configs):
            body.extend(convert(item[tag], conv2) for item in self._read_json(source))
        return body

    def contents_from_single(self, target: str, single_file_n: str, extr_tag: str, conv2: str = 'zh-cn') -> list:
        '''Collect one converted field from a single JSON file of a dataset.

        Args:
            target: Dataset name (a key of ``self.datasets``).
            single_file_n: File name to read inside the dataset directory.
            extr_tag: Name of the dataset setting that holds the record field
                to extract.
            conv2: Target locale handed to ``convert``:
                'zh-tw' / 'zh-hans' / 'zh-cn'.

        Returns:
            A list with ``convert(record[field], conv2)`` for every record of
            the file, or an empty list (after printing a message) when the
            file is not present in the dataset directory.
        '''
        full_path = self.target_path(target)
        configs   = self.datasets[target]
        tag       = configs[extr_tag]
        body      = []

        subpaths = os.listdir(full_path)
        if single_file_n not in subpaths:
            print(f"{single_file_n} can not be found in {subpaths}!")
            return body

        for item in self._read_json(os.path.join(full_path, single_file_n)):
            body.append(convert(item[tag], conv2))
        return body

    def extract_from_json(self, target: str, single_file_n: str, extr_tag: str, conv2: str = 'zh-cn') -> list:
        '''Extract a converted field from one file or from the whole dataset.

        Args:
            target: Dataset name (a key of ``self.datasets``).
            single_file_n: File name to read, or ``None`` to read every
                non-excluded file of the dataset.
            extr_tag: Name of the dataset setting that holds the record field
                to extract.
            conv2: Target locale handed to ``convert``:
                'zh-tw' / 'zh-hans' / 'zh-cn'.

        Returns:
            The list produced by ``contents_from_single`` or
            ``contents_from_multiple`` respectively.
        '''
        # Forward the caller's ``extr_tag``; the sub-methods resolve it
        # against the dataset settings themselves.
        if single_file_n is not None:
            return self.contents_from_single(target, single_file_n, extr_tag, conv2)
        return self.contents_from_multiple(target, extr_tag, conv2)

    def extract_from_multiple(self, targets: list) -> list:
        '''Concatenate ``body_extractor`` results for several dataset names.'''
        results = []
        for target in targets:
            results += self.body_extractor(target)
        return results

    def extract_with_ids(self, ids: list) -> list:
        '''Concatenate ``body_extractor`` results for several dataset ids.

        Each id is translated to its dataset name through ``self.id_table``;
        an unknown id raises ``KeyError``.
        '''
        results = []
        for dataset_id in ids:
            results += self.body_extractor(self.id_table[dataset_id])
        return results

In [None]:
# Build the loader from the default config file.
cfg_path = DATAS_CONFIG
loader = PlainDataLoader(cfg_path)

In [None]:
# Show the mapping from each dataset's "id" to its name in the config.
print(loader.id_table)
# print(loader.datasets)

In [None]:
# Quick check: print the last entry extracted from the "wudai-huajianji" dataset.
# The commented-out lines are alternative checks (several datasets / lookup by id).
# loader.body_extractor("wudai-huajianji")
print(    loader.body_extractor("wudai-huajianji")[-1] )
# print(    len(loader.extract_from_multiple(["wudai-huajianji", "wudai-nantang"])))
# print(    loader.extract_with_ids([0]) )

---

In [None]:
# From every non-excluded file of "tangsong", collect the record field named by
# the dataset's 'author' setting, converted to zh-cn.
author_list_muli = loader.contents_from_multiple("tangsong", 'author', 'zh-cn')

In [None]:
# From the single file 'authors.tang.json' of "tangsong", collect the record field
# named by the dataset's 'author_name' setting, converted to zh-cn.
author_list = loader.contents_from_single("tangsong", 'authors.tang.json', 'author_name', 'zh-cn')