In [None]:
import json
import os
from collections import Counter

# Default location of the JSON config file; used as the default `config_path`
# of PlainDataLoader. The path is relative to the current working directory.
DATAS_CONFIG = "./datas.json"

def count_elements(lst):
    """Count how often each distinct element occurs in `lst`.

    Returns a plain ``dict`` mapping element -> number of occurrences, with
    keys in the order the elements were first seen.
    """
    return dict(Counter(lst))

def count_and_sort_elements(lst:list, sort_up:bool=True) -> dict:
    """Count the elements of `lst` and return the tally ordered by count.

    Args:
        lst: elements to count.
        sort_up: forwarded to the sort's ``reverse`` flag, so ``True`` (the
            default) puts the most frequent element first and ``False`` puts
            the least frequent element first.

    Returns:
        A ``dict`` of element -> count in the requested order. Elements with
        equal counts keep their first-seen order.
    """
    ranked = list(Counter(lst).items())
    ranked.sort(key=lambda pair: pair[1], reverse=sort_up)
    return dict(ranked)


class PlainDataLoader():
    """Load fields from JSON datasets that are described by a JSON config file.

    The config file must provide:

    * ``cp_path``  - top-level directory that every dataset path is joined onto;
    * ``datasets`` - mapping of dataset name -> settings. The settings read by
      this class are ``id``, ``path``, an optional ``excludes`` collection of
      file names to skip, and one entry per extractor tag (e.g. ``paragraphs``,
      ``author``) whose value names the record field to extract.

    A dataset path may point at a single JSON file or at a directory of JSON
    files.
    """

    def __init__(self, config_path: str=DATAS_CONFIG) -> None:
        '''Initialise the loader from the given JSON config file.'''
        self._path = config_path
        with open(config_path, 'r', encoding='utf-8') as config:
            data = json.load(config)
        self.top_level_path:str = data["cp_path"]
        self.datasets:dict = data["datasets"]
        # Reverse lookup: dataset id -> dataset name.
        self.id_table = {
            v["id"]: k for (k, v) in self.datasets.items()
        }

    def target_path(self, target: str) -> str:
        '''Return the full path of dataset `target`.

        Prints a hint and returns ``None`` when `target` is not a configured
        dataset.
        '''
        if target not in self.datasets:
            print(f"{target} is not included in datas.json as a dataset")
            return None
        return os.path.join(self.top_level_path, self.datasets[target]["path"])

    def _resolve(self, target: str) -> tuple:
        '''Return ``(full_path, configs)`` for `target`; raise KeyError if unknown.'''
        full_path = self.target_path(target)  # prints the hint for unknown targets
        if full_path is None:
            # Same exception type the plain dict lookup used to raise.
            raise KeyError(target)
        return full_path, self.datasets[target]

    @staticmethod
    def _read_json(path: str):
        '''Parse and return the JSON document stored at `path`.'''
        with open(path, mode='r', encoding='utf-8') as file:
            return json.load(file)

    def _iter_items(self, full_path: str, configs: dict):
        '''Yield every record found at `full_path` (one JSON file or a directory of them).'''
        # Single-file dataset.
        if os.path.isfile(full_path):
            yield from self._read_json(full_path)
            return

        # Directory dataset, optionally with a skip list.
        excluded = configs.get("excludes", ())
        # os.listdir() returns names in arbitrary order; sort so results
        # (and anything indexing into them) are reproducible across runs.
        for filename in sorted(os.listdir(full_path)):
            if filename in excluded:
                print(f"neglect {filename} as it is excluded in jason config")
                continue
            yield from self._read_json(os.path.join(full_path, filename))

    def body_extractor(self, target: str, extractor_tag: str = 'paragraphs') -> list:
        '''Concatenate one field of every record in dataset `target`.

        Args:
            target: dataset name as listed in the config.
            extractor_tag: key in the dataset settings whose value names the
                record field to extract.

        Returns:
            A flat list built by extending with each record's field value.

        Raises:
            KeyError: unknown `target`, missing `extractor_tag` setting, or a
                record without the field.
        '''
        full_path, configs = self._resolve(target)
        tag = configs[extractor_tag]
        body = []  # may get large for big datasets
        for item in self._iter_items(full_path, configs):
            # NOTE(review): assumes the field holds a list; a plain string
            # would be split into characters here - confirm against the data.
            body.extend(item[tag])
        return body

    def author_from_poem(self, target: str) -> list:
        '''Collect the author field of every record in dataset `target`.

        Returns one entry per record (duplicates are kept). The record field
        name is taken from the dataset's ``author`` setting.

        Raises:
            KeyError: unknown `target`, missing ``author`` setting, or a record
                without the field.
        '''
        full_path, configs = self._resolve(target)
        tag = configs['author']
        return [item[tag] for item in self._iter_items(full_path, configs)]

    def extract_from_multiple(self, targets: list) -> list:
        '''Run `body_extractor` on each dataset name in `targets` and concatenate the results.'''
        results = []
        for target in targets:
            results.extend(self.body_extractor(target))
        return results

    def extract_with_ids(self, ids: list) -> list:
        '''Like `extract_from_multiple`, but datasets are selected by their config ``id``.'''
        results = []
        for dataset_id in ids:
            results.extend(self.body_extractor(self.id_table[dataset_id]))
        return results

In [None]:
# Reuse the module-level default so the config location is defined in one place.
cfg_path = DATAS_CONFIG
loader = PlainDataLoader(cfg_path)

In [None]:
# Show the dataset id -> dataset name lookup built from the config.
print(loader.id_table)
# print(loader.datasets)

In [None]:
# Smoke check: print the last entry extracted from the "wudai-huajianji" dataset.
# Alternative calls kept for manual experimentation:
# loader.body_extractor("wudai-huajianji")
print(loader.body_extractor("wudai-huajianji")[-1])
# print(len(loader.extract_from_multiple(["wudai-huajianji", "wudai-nantang"])))
# print(loader.extract_with_ids([0]))

---

In [None]:
# Smoke check: print the last author value collected from "wudai-huajianji".
print(loader.author_from_poem("wudai-huajianji")[-1])

In [None]:
# Collect one author value per record of the "tangsong" dataset (duplicates kept).
author_list = loader.author_from_poem("tangsong")

In [None]:
# Tally how many times each author value occurs, in first-seen order.
print(count_elements(author_list))

In [None]:
# Same tally ordered by count; the default sort_up=True lists the most frequent first.
print(count_and_sort_elements(author_list))