In [1]:
import os
import fitz
import pandas as pd
import re
import numpy as np
from packages.Exception_handling import get_exception
import datetime


Retrieve from b_sheet

In [2]:
class GRIPointers_B:
    def __init__(self, csr_report_path: str, gri_pointers_csv_name: str):
        self.csr_report_path = csr_report_path
        self._files_list = os.listdir(self.csr_report_path)
        self.gri_pointers_csv_name = gri_pointers_csv_name
        self.csv_file = ''
        self.current_gri_pointer_number = 0
        self.reveal_number = 0
        self.pattern = ''

    #每間公司
    def catch_gri_pointers(self, csr_report_path: str, search_term: str):
        """
        catch_gri_pointers [summary]
            Detect gri pointers in each page including search_term 
        Args:
            csr_report_path (str): [description]  csr reports path
            search_term(str): search for the specified word in each page in each csr report file like "GRI 準則揭露項目"
        Returns:
            [type]: [description] completed csv file with b sheets
        """

        #init the requirment for the method
        current_company_number = 0  # to avoid the index in the first row

        try:
            for file in self.get_files_list():

                print(f'Now processing {file}')

                self.__fill_into_GRI_csv(
                    file=file,
                    pdf_document=fitz.open(os.path.join(csr_report_path,
                                                        file)),
                    current_company_number=current_company_number,
                    search_term=search_term)

                current_company_number = self.__shift_to_next_company(
                    current_company_number=current_company_number)

        except Exception as e:
            get_exception(e, file)

    #每間公司報告的每頁
    def __fill_into_GRI_csv(self, pdf_document, file, current_company_number,
                            search_term):
        """
        __fill_into_GRI_csv [summary] First initialize all the corporate name into
        csv files, then check each GRI pointers for each corporate. If ends, then do nothing.
        Args:
            pdf_document ([type]): [description]
            file ([type]): [description]
            current_company_number ([type]): [description]
            search_term ([type]): [description]
        """

        # First inserting all the corporates name into csv file.
        self.__fill_corporate_name(
            file=file, current_company_number=current_company_number)
        self.__shift_to_next_gri_pointer()
        #從這裡開始，所有的gri_pointer都從1開始

        #邏輯為，每一頁抓到Search term後，利用Regular expression存入list，一一比照dataframe的column與list內部項目
        #若list無比對成功者，該指標填0，換到下一個指標。
        #若list比對成功者，該指標填1
        ##########################################
        # Crawl into each page of current csr, if catch gri keywords then insert it into csv files
        for current_page in range(len(pdf_document)):
            self.__reset_gri_pointer()
            # Every page should traversal all the gri pointer
            page = pdf_document.loadPage(current_page)

            # 抓到每篇CSR報告附錄的GRI指標對照表
            if page.searchFor(search_term) or page.searchFor(
                        "指標") or page.searchFor("揭露項目"):
                self.__fill_into_single_csv(current_company_number, page)

        #抓到已揭露指標的數目
        for temp in range(1, 136):
            if self.csv_file.iat[current_company_number, temp] == 1:
                self.reveal_number += 1
            else:
                continue

        #每間公司結束之後，將該公司的揭露指標數與未揭露指標數填入dataframe
        self.__fill_in_each_reports_reveal_and_unreveal_numbers(
            current_company_number)
        self.reveal_number = 0

    #每間公司報告內部抓到的每頁揭露指標與column進行比對
    def __fill_into_single_csv(self, current_company_number, page):
        #Using normal expression to filter words caught.
        gri_pointers_disclosed_in_this_page = self.__gri_text_filter(
            re.findall(self.pattern, page.getText("text")))
        print(gri_pointers_disclosed_in_this_page)
        ##########################################
        for column in self.csv_file.columns:
            ####################################
            #處理是否揭露的判斷式
            if (column in gri_pointers_disclosed_in_this_page):
                self.csv_file.at[current_company_number, column] = 1
                if(column == '102-53'):
                    print('true')

        for column in self.csv_file.columns:
            ####################################
            #處理是否揭露的判斷式
            if self.csv_file.at[current_company_number, column] == '':
                self.csv_file.at[current_company_number, column] = 0

    ################################################################
    #basic functions

    def init_gri_pointers_csv_file(self, csv_name):
        """
        init_gri_pointers_csv_file [summary] Initailizing the gri pointers csv file with following task:
        1. remove unnamed column
        2. remove all nan column being regarded as float type, u
        nabling to process with str type

        Args:
            csv_name ([type]): [description] the unprocessed initial csv name 

        Returns:
            [type]: [description] the processed csv file
        """

        self.csv_file = pd.read_csv(f'{csv_name}.csv')
        self.csv_file = self.csv_file.loc[:, ~self.csv_file.columns.str.
                                          contains('^Unnamed')]
        self.csv_file = self.csv_file.replace(np.nan, '',
                                              regex=True)  # All data frame
        return self.csv_file

    def __is_contain_hyphen(self, text) -> bool:
        is_hyphen = False
        for single_char in text:
            if single_char == "-":
                is_hyphen = True
        return not is_hyphen

    def __fill_in_each_reports_reveal_and_unreveal_numbers(
            self, current_company_number):
        self.csv_file.iat[current_company_number, -2] = self.reveal_number
        self.csv_file.iat[current_company_number,
                          -1] = 136 - self.reveal_number

    def __fill_corporate_name(self, file, current_company_number):
        self.csv_file.iat[current_company_number, 0] = file

    def __shift_to_next_gri_pointer(self):
        self.current_gri_pointer_number += 1

    def __reset_gri_pointer(self):
        self.current_gri_pointer_number = 1

    def __shift_to_next_company(self, current_company_number: int) -> int:
        """
        __shift_to_next_company [summary] shift to process next company's gri pointers
        
        Args:
            current_company_number ([type]): [description] 
        """
        next_company_number = current_company_number + 1
        return next_company_number

    # handle full hyphen exception
    def str_dash_full_to_half(self, in_str: str) -> str:
        half_text = ''
        for character in in_str:
            if chr(45 + 65248).encode("utf-8") == character:
                character += "-"
            else:
                half_text += character
        return half_text

    def check_hyphen_exception(self, splited_text: list, index: int) -> str:
        splited_text[index] = " ".join(splited_text[index].split())
        splited_text[index] = splited_text[index].strip()
        #replace一a些在欄位中比較特別的符號
        splited_text[index] = splited_text[index].replace("–", "-")
        splited_text[index] = splited_text[index].replace(" - ", "-")
        splited_text[index] = splited_text[index].replace("－", "-")
        splited_text[index] = splited_text[index].replace(" ", "-")
        splited_text[index] = splited_text[index].replace("\t", "")

        return splited_text[index]

    def __gri_text_filter(self, gri_list_in_rex):
        """
        __gri_text_filter [summary] 
        filter the term from re.findall() (['1','0','2','-','1'])
        to more easier way like ['102-1','102-2'...]

        Args:
            gri_list_in_rex ([type]): [description] the term from using re.findall()

        Returns:
            [type]: [description] return the term like ['102-1','102-2'...]
        """
        splited_text = self.__get_gri_plain_text(
            gri_list_in_rex=gri_list_in_rex)

        return self.__get_numbers_part_from_gri_plain_text(splited_text)

    def __get_gri_plain_text(self, gri_list_in_rex):
        """
        __get_gri_plain_text [summary]
            get plain text splited with line (\n)W
        Args:
            gri_list_in_rex ([type]): [description]  the term from using re.findall()
        Returns:
            [type]: [description]
        """
        plain_text = ""
        empty_list = list()
        # turn the list of .findall function into more cleaner view
        for temp in range(len(gri_list_in_rex) - 1):
            plain_text = plain_text + (gri_list_in_rex[temp])
        splited_text = plain_text.splitlines()

        #remove the redundant part in the list
        for temp in range(len(splited_text) - 1):
            if splited_text[temp] == '':
                empty_list.append(temp)
        splited_text = list(
            set([i for i in splited_text if i not in empty_list]))

        no_hyphen_col = list()
        #replace the hyphens which is not offcial format
        for temp in range(len(splited_text) - 1):
            splited_text[temp] = self.check_hyphen_exception(
                splited_text, temp)

            if self.__is_contain_hyphen(splited_text[temp]):
                no_hyphen_col.append(splited_text[temp])

        with_hyphen_text = list(
            set([i for i in splited_text if i not in no_hyphen_col]))

        for temp in range(len(with_hyphen_text)-1):
            with_hyphen_text[temp] = with_hyphen_text[temp].strip()

        return with_hyphen_text

    def __get_non_numbers_part_from_gri_plain_text(self, splited_text):
        del_list = list()
        # delete some redundant data in the splited_text list
        # all we need is like 'xxx-x' term
        for temp in range(len(splited_text) - 1):
            if "-" not in splited_text[temp]:
                del_list.append(splited_text[temp])
            if "--" in splited_text[temp]:
                del_list.append(splited_text[temp])
        return del_list

    def __get_numbers_part_from_gri_plain_text(self, splited_text):
        del_list = self.__get_non_numbers_part_from_gri_plain_text(
            splited_text=splited_text)
        gri_pointers = set([i for i in splited_text if i not in del_list])
        return gri_pointers

    def output_B_pointers(self):
        today = datetime.date.today()
        self.csv_file.to_csv(f'.\\csv_file\\{today}_gri_pointers_b.csv',
                             encoding='utf-8-sig')

    def get_gri_pointers_csv_name(self):
        return self.gri_pointers_csv_name

    def get_files_list(self):
        return self._files_list

    def get_csr_report_path(self):
        return self.csr_report_path

    def set_pattern(self, pattern):
        self.pattern = pattern


In [3]:
b_sheets_process = GRIPointers_B(csr_report_path='C:\\Users\\user\\Desktop\\CSR_project\\csr_reports\\csr_reports_2020',
                                 gri_pointers_csv_name=".\\csv_file\\gri_pointers_b_frame")
b_sheets_process.set_pattern(pattern = r"[0-9-－–\s]")
b_sheets_process.init_gri_pointers_csv_file(
    b_sheets_process.gri_pointers_csv_name)
b_sheets_process.catch_gri_pointers(
    csr_report_path=b_sheets_process.get_csr_report_path(), search_term='GRI')
print(b_sheets_process.csv_file)
b_sheets_process.output_B_pointers()


Now processing 1101_台泥_2019(v1).pdf
{'1-7', '1-1', '-', '1-0', '8-1', '4-1', '2-1', '7-1', '1-3', '1-4', '11 '}
{'-34', '-25', '-24', '-', '2019-2018', ' 2019'}


mupdf: invalid page object


{'-', '2013-', '027719-7721', '022531-6897'}
{'-', '0008-2', '-2194', '-13715', '4-480'}


mupdf: invalid page object
mupdf: invalid page object


{'-02', '11-37', '-03', '-05', '12-41', '-04', '-01'}
{'3-5-1-2', '359'}
set()
{'-', '22-53', '24-66', '21-50', '23-63', '1'}
set()
{'331-504', '-'}
{'-', ' 2            '}


mupdf: invalid page object


{'43-98', '-', '41-90', '14001-50001-14064', '42-92'}
{'14064-14046-8001', '32018-2019', '2019-14046', '14001-50001', '1-2019', '3-', '42  '}
{'-', '2-2019-5239637', '14000-14064', '1-2019-2019', '2-0-1-9'}
{'-', '53-109', '51-102'}
{'12'}


mupdf: invalid page object
mupdf: invalid page object


{'62-122', '45001-15506', '-', '61-117'}
{'2019-965', '2019-876', '2018-795066', '2018-852', '2019-981532', '2018-826', '2018-963', '2019-1017'}
{'102-11', '102-14', '102-10', '102-53', '206-2016', '102-46', '102-45', '102-51', '103-2', '102-16', '102-49', '102-44', '102-13', '302-2016', '102-1', '102-40', '102-56', '102-3', '102-41', '301-1', '102-2', '201-1', '102-5', '205-3', '102-6', '102-42', '102-43', '102-12', '102-48', '102-8', '102-52', '102-54', '102-50', '102-55', '302-3', '302-1', '103-1', '102-7', '102-4', '301-2016', '301-2', '102-18', '206-1', '102-9', '103-3'}
true
{'306-2', '103-2', '419-1', '401-1', '304-1', '403-2', '401-3', '306-1', '401-2', '307-1', '412-2', '305-7', '305-1', '405-1', '404-1', '305-4', '103-1', '305-2', '303-1', '103-3', '116'}
{'2'}
{'100-200-300-400', '22-2020', '26000-20121-50001-8000', '-', '102-40-102-47', '1000-2008', '101-2016', '1000-2008-1', '5008-2005', '103-3'}
Now processing 1102_亞泥_2019(v1).pdf
{'36--', '32-40', '53-26000'}
{'207-3031'

mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object


{'-', '2020-03'}
{'06'}
{'2007-12', '2019-6', '2019-1-1-12-31'}
{'3000-5', '14064-114001-18001', '110-3', '02-8759-9018', '2020-11', ' 02 8725-8253'}
{'2019-3', '2'}
{'102-11', '-', '2020-27', '205-3'}
{'-', '11-12', '2020-12-2019', '12'}
{'2020-37', '2019 '}
{'7-0045-2020-8', '2-2', '11-0028', '2020-57', '1-1-1', '0003'}
{'126-2019', '17672019-62', '2019-15836', ' 24 2019 '}
{'2020-105', '61-2018'}
{'2020-139', '- '}
{'102-14', '102-10', '102-46', '102-45', '102-16', '14-2019', '102-44', '102-13', '102-1', '102-40', '102-3', '102-41', '102-2', '102-5', '102-6', '102-42', '102-43', '102-12', '102-8', '-', '102-4', '102-7', '102-18', '102-11', '102-9', '11 '}
{'102-53', '102-51', '306-2', '103-2', '14-2019', '102-49', '2020-141', '102-56', '306-1', '203-1', '205-3', '102-48', '305-1', '102-52', '102-54', '102-50', '102-47', '102-55', '-', '103-1', '305-2', '103-3'}
true
{'403-3', '201-2', '305-5', '305-4', '418-1', '103-1', '305-7', '304-2', '103-2', '201-1', '403-1', '103-3'}
{'302-3',

mupdf: invalid page object


{'1-20', '2013-7', '20190101-20191231', '2019-70', '2021-6', '5-6-1-2019', '2020-6', '2019-6', '12019-4'}
{'12-7', '2-7', '2019-1000-2015'}
{'2014-27001', '2017-6'}
{'42'}
{'2030-50', '-', ' -   '}
{'397-160-40-9', '2019-2901-2018', '30-75', '2015-2019-47-462019'}
{'1 '}
{'5-7', '2009-2019', '-'}
{'102-47', '102-40', '-', '103-1', '100-200-300400', '101-2016-1000', '5008-4'}
{'102-14', '102-10', '102-27', '102-16', '102-30', '102-13', '102-1', '102-3', '102-15', '102-28', '102-19', '102-17', '102-2', '102-23', '102-5', '102-29', '102-6', '102-25', '102-26', '102-12', '102-22', '102-8', '102-20', '102-21', '102-7', '102-4', '102-24', '102-18', '102-11', '102-9', '222440'}
{'102-53', '102-46', '102-45', '102-38', '102-51', '103-2', '102-33', '102-36', '102-31', '102-49', '201-2', '102-44', '102-40', '102-56', '102-34', '102-41', '201-3', '102-42', '102-43', '102-35', '102-48', '102-52', '102-54', '102-50', '102-37', '102-32', '102-47', '102-55', '102-39', '103-1', '201-1', '103-3', '2224

mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find Pattern resource 'R218'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find Pattern resource 'R218'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find Pattern resource 'R218'


{'6-1', '6-2', '   79'}
{'2010'}
{'20196-20206-20216', '1000-2018', '14001-18001', '0225071234150-0225071664', '20614     '}
{'           '}
{'-', '1 '}
{'247'}
{'15-2030', '2014-9-17', '17-169', '31'}


mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find ColorSpace resource 'R1299'
mupdf: cannot find Pattern resource 'R1300'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find ColorSpace resource 'R1299'
mupdf: cannot find Pattern resource 'R1315'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find ColorSpace resource 'R1299'
mupdf: cannot find Pattern resource 'R1300'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find ColorSpace resource 'R1299'
mupdf: cannot find Pattern resource 'R1315'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find ColorSpace resource 'R1299'
mupdf: cannot find Pattern resource 'R1300'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find ColorSpace resource 'R1299'
mupdf: cannot find Pattern resource 'R1315'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find ColorSpace resource 'R221'
mupdf: cannot find ColorSpace resource 'R1374'
mupdf: cannot find Pattern resource 'R1373'
mu

{'2-0-1-5-2-0-1-9'}
{'67746579-8203', '31570-32103'}
{'102-14', '102-10', '102-16', '102-13', '102-1', '102-3', '102-15', '102-19', '102-2', '102-5', '102-6', '102-25', '102-12', '102-8', '102-21', '102-4', '102-7', '102-18', '102-11', '102-9', '24'}
{'102-53', '102-46', '102-45', '102-51', '103-2', '102-36', '102-49', '102-44', '102-56', '102-40', '102-41', '201-3', '102-42', '102-43', '102-26', '102-48', '102-52', '102-54', '102-50', '102-47', '102-55', '103-1', '201-1', '103-3'}
true
{'302-3', '302-4', '302-1', '103-1', '103-2', '204-1', '205-2', '301-1', '103-3'}
{'306-3', '303-2', '306-1', '413-2', '103-1', '305-2', '306-2', '305-7', '103-2', '305-1', '303-1', '307-1', '306-5', '306-4', '103-3'}
{'308-2', '403-2', '401-3', '402-1', '103-1', '401-2', '103-2', '403-4', '401-1', '308-1', '403-1', '103-3'}
{'417-2', '413-2', '103-1', '418-1', '404-3', '103-2', '419-1', '406-1', '404-1', '414-2', '103-3'}
Now processing 1710_東聯_2019(v1).pdf
{'2017-2018', '-', '1000-2008', '2021-6', '20

mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find Pattern resource 'R2567'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find Pattern resource 'R2567'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find Pattern resource 'R2567'


{'102-48-102-49-102-50-102-54', '102-53', '775-6', '886-2-2706-6006-125', '102-50-102-51-102-52-102-56', '886-3-386-8081 618', '886-2-2706-6006-190'}
true
{'102-48-102-49-102-50-102-54', '102-53', '775-6', '886-2-2706-6006-125', '886-3-386-8081-618', '102-50-102-51-102-52-102-56', '886-2-2706-6006-190'}
true
{'102-14', '2018-12-417-39', '2019-2018-6-1486', '40-70', '793-2-2019', '67-80-2', '   10002008  2018 '}
{'12-2018-2019-417', '793-2-3', '-', '2008-2018', '2018-1486', '2019-6', '40-70-67-80'}
{'2018-2019-2018', '2019-2020'}
{'2020-7', '102-21-102-40-102-43-102-44'}
{'102-46-102-47'}
{'2-2019-2020-2020', '102-46-103-1-103-3', '1-2019-2018-2018'}
{'201-1', '102-15', '103-2-103-3', '1'}
{'306-2', '303-1', '302-1  302-4', '305-1-305-2-305-7', '307-1'}
{'416-1-416-2', '418-1', '401-2-401-3-402-1', '1'}
{'271-03386-8081', '399-03473-7366', '102-1', '1384-102-8', '2019-72', '102-3', '102-4', '55-102-7', '77-56-022706-6006', '937-03483-7682', '12-03483-8088', '102-5'}
{'-', '  102-2 '}
{'

mupdf: invalid page object
mupdf: invalid page object


{'765000-2019', '-', '765000  '}
{'16-8-24', '60 '}
{'2019-1', '3 '}
{'102-14', '102-10', '7-9', '37-41', '102-13', '102-1', '22-23', '102-40', '102-3', '102-15', '102-41', '102-2', '102-5', '102-6', '102-12', '102-8', '44-49', '-', '102-4', '102-7', '102-18', '102-11', '102-9', '58-59', '102-16 '}
{'102-53', '102-46', '102-45', '102-51', '7-9', '103-2', '102-49', '102-44', '102-56', '205-3', '102-42', '10-11', '102-43', '102-48', '102-52', '102-50', '102-54', '102-47', '102-55', '-', '103-1', '103-3'}
true
{'102-11', '20-21', '10-11', '102-15', '103-1', '7-9', '103-2', '419-1', '206-1', '25-26', '307-1', '201-1', '103-3'}
{'305-5', '7-9', '103-2', '403-4', '204-1', '403-3', '61-62', '403-2', '403-9', '305-6', '65-66', '73-74', '532-2019', '302-4', '10-11', '71-72', '303-3', '305-1', '403-5', '65-676', '403-7', '60-61', '61-63', '302-3', '302-1', '103-1', '305-2', '403-6', '61-64', '303-1', '70-71', '403-1', '103-3', '60 '}
{'55-58', '401-3', '306-1', '401-2', '306-2', '47-49', '406-1'

mupdf: invalid page object


{'11-3'}
{'1-0-0-0', '2019-11', '2019-2'}
{'2019    '}
{'90-14001', '2020-450012018', '5-3', ' 95'}
{'403-3', '305-4-305-5', '302-1-302-3', '307-1-419-1', '401-1-401-2', '401-3-404-1', '418-1', '404-2-405-1', '306-2', '403-1-403-2', '305-1-305-3', '302-4-302-5', '204-1', '305-7-306-1', '201-1', '416-2', '205-3'}
{'2020-7', '-', ' 182   '}
{'2-0800-15-2019', '15-5', '2-112-126', '2019    '}
{'98 '}
{'2017-2019', '2-2020', '2019-2020', '6-973', '1000-800', '963'}
{'413-2019', '6-2'}
{'168'}
{'102-14', '102-10', '102-53', '102-46', '102-45', '102-51', '102-16', '102-49', '102-44', '102-13', '102-1', '102-40', '102-3', '102-41', '102-2', '102-5', '102-6', '102-42', '102-43', '102-12', '102-48', '102-8', '102-52', '102-50', '102-47', '102-4', '102-7', '101-2016', '102-18', '102-11', '102-9', '102 2016'}
true
{'102-55', '102-56', '418-1', '103-1', '103-2', '416-2', '204-1', '419-1', '102-54', '307-1', '103-3', '205-3'}
{'305-5', '306-2', '103-2', '305-3', '401-1', '403-3', '403-2', '401-3', 

mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object


{'24'}
{'2-627', '4-100', '1217-4512', '2-100', '5-100', '1-1440', '3-100', '1 '}
{'102-14', '102-10', '102-46', '102-45', '102-16', '102-49', '102-44', '102-13', '102-1', '102-40', '102-3', '102-41', '102-2', '102-5', '102-6', '102-42', '102-43', '102-12', '102-48', '102-8', '102-50', '102-47', '102-4', '102-7', '102-18', '102-11', '102-9', '12'}
{'203-1', '102-55', '102-56', '202-1', '201-4', '102-53', '-', '103-1', '102-51', '103-2', '204-1', '203-2', '102-52', '102-54', '202-2', '201-1', '201-2', '201-3'}
true
{'304-2', '205-1', '302-2', '304-1', '304-3', '301-1', '205-3', '303-2', '302-4', '303-3', '205-2', '302-5', '302-3', '302-1', '-', '301-2', '303-1', '304-4', '206-1', '301-3'}
{'305-5', '306-2', '305-3', '401-1', '306-3', '401-3', '306-1', '401-2', '305-6', '308-1', '307-1', '308-2', '305-7', '305-1', '306-5', '306-4', '305-4', '402-1', '-', '305-2'}
{'407-1', '405-2', '403-4', '403-3', '403-2', '408-1', '411-1', '404-2', '412-1', '412-2', '409-1', '404-3', '405-1', '404-1',

mupdf: expected object number


{'2019-1-1-2019-12-31', '300-7', '886-3-620-6789', '886-3-623-1111', '1 '}
{'--------------------------32'}
{'       '}
{'-', '12 '}
{'2019-100', '2019-988', '2019  20 '}
{'102-14', '102-10', '102-46', '102-45', '102-16', '102-49', '102-44', '102-13', '102-1', '102-40', '102-3', '102-41', '102-2', '102-5', '102-6', '102-42', '102-43', '102-12', '102-48', '102-8', '102-47', '102-4', '102-7', '102-18', '102-11', '102-9', '11 '}
{'102-55', '201-1', '102-56', '102-53', '103-1', '102-51', '103-2', '204-1', '102-18', '102-52', '102-54', '308-1', '102-50', '201-3', '103-3'}
true
{'403-3', '403-2', '202-1', '302-4', '402-1', '103-1', '401-2', '413-1', '304-2', '103-2', '404-1', '404-2', '103-3', '302 '}
{'103-1', '306-2', '103-2', '308-1', '307-1', '103-3'}
Now processing 3522_御頂_2019(v1).pdf
{'60 '}
{'2019-1-1', '2019-12-31', '-', '9-2', '2021-9', '351-23', ' 2018 7 ', '02-2226-6277'}
{'                          359 '}
{'41  '}
{'2019 '}
{'30-50', '2-3'}
{'2019 '}
{'102-14', '102-29-14', '102

mupdf: invalid page object


{'22-5', '5-5', '126-93', '5-75', '2019-1-2018', '2019-98', '5-40', '---'}
{'-', '307-2020', '2025-2020', '73-2020', '-  -'}
{' 1 ', '-', '2019-1'}
{'    '}
{'-3', '2016-1447', '-131', '156-5', '-1576', '-5', '-428', '-15', '-1', '-553', '-48', '  2016  108  '}
{'156-5', '-5', '-15', '-1', '-553', '4-5-5', '5-5-5', '2016-1447', '5-5', '-428', '-131', '4-9996', '4-5', '5-5-100', '-48', '-3', '5-35-5', '-1576', '-', '23-5', '3-5-5'}
{'2030-12', '1-5353', '-', '2019-6', '7-3', '2016-203-3-2016-172', '  12994 '}
{'5      '}
{' 5 '}
{'2019-2019', '2019-6-28', '2019-2020', '2019-1-22', ' 2020  2  21 2019  2020 '}
{'99-1-15-0990002770', '2020-4-21', '2019-1641', '2019-12-31-3828', '3-23-7-13'}
{'2019-160-15-2', '1-7-918-12-908', '2612-749', '30-8', '966-10-93', '912-10', '2019-768-56', ' 2019  242 '}
{'2019-100-15000-18000', '2019-12-9-1', '5-2019-5', ' 15 '}
{'1-2-5-477-3-12-10-966'}
{'1-7-475', '32-2400-15-64', '1-7-91', '8-12-912', '80-5', '10002-18295', '-', '2019-10', '1200-5-15-64', '8-

mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object


{'1000-2015', ' 8 '}
set()
{'14 '}
{'50'}
{'2-2', '1-2020-600-10', '4-2020', '2-100', '2-2020-16', '62019-100', '2-107', '1-2020-3', '2-2019-91', '3-2', '6-625', '1-2020', '5-2020', '2-1', '1-45001-450012018', '3-5', '135-1035', '42019-4', '2025-500', '2-18001-45001', '1-2018', '4-2019', '1-2019-1715', '5-3', '1-2019-5', '1-30', '2-5', '2-108', '4-10352020', '2-3', '2-2020', '1-2019-10', '2-2019', '32019-9928', '600-30', '1-2019-2023-2019', '3-2018', '3-2019-3277', '9-10-5', '12019-9433'}
{'1-45001-45001', '102-87982000', '102-87982000-546', '106-698-9014', ' 02-87982000 515'}
{'2019-12', '2017-8'}
{'37'}
{'80-2019', '2019-12-31-2', '3-3-2019', '2019-12', '1530-50-3-50-10'}
{'2019-12-16-2020', '2019-72', '2020-12', '86-24', '42-2019-12', '2018-1035', '213 '}
{'5-2019-12-8180-20', '19262-47094', '2019-9928-15', '19300-8532', '2019-3', '19262 5887'}
{'86-2', '2019-9', '2017-2018-8', '1'}
{'2019--', '-', '7-875', '2019-494045-50', '45001-450012018'}
{'2019-12-31', '-', '1-2019-12-31', '11