In [1]:
import os
import fitz
import pandas as pd
import re
import numpy as np
from packages.Exception_handling import get_exception
import datetime


Retrieve from b_sheet

In [2]:
class GRIPointers_B:
    def __init__(self, csr_report_path: str, gri_pointers_csv_name: str):
        self.csr_report_path = csr_report_path
        self._files_list = os.listdir(self.csr_report_path)
        self.gri_pointers_csv_name = gri_pointers_csv_name
        self.csv_file = ''
        self.current_gri_pointer_number = 0
        self.reveal_number = 0
        self.pattern = ''

    #每間公司
    def catch_gri_pointers(self, csr_report_path: str, search_term: str):
        """
        catch_gri_pointers [summary]
            Detect gri pointers in each page including search_term 
        Args:
            csr_report_path (str): [description]  csr reports path
            search_term(str): search for the specified word in each page in each csr report file like "GRI 準則揭露項目"
        Returns:
            [type]: [description] completed csv file with b sheets
        """

        #init the requirment for the method
        current_company_number = 0  # to avoid the index in the first row

        try:
            for file in self.get_files_list():

                print(f'Now processing {file}')

                self.__fill_into_GRI_csv(
                    file=file,
                    pdf_document=fitz.open(os.path.join(csr_report_path,
                                                        file)),
                    current_company_number=current_company_number,
                    search_term=search_term)

                current_company_number = self.__shift_to_next_company(
                    current_company_number=current_company_number)

        except Exception as e:
            get_exception(e, file)

    #每間公司報告的每頁
    def __fill_into_GRI_csv(self, pdf_document, file, current_company_number,
                            search_term):
        """
        __fill_into_GRI_csv [summary] First initialize all the corporate name into
        csv files, then check each GRI pointers for each corporate. If ends, then do nothing.
        Args:
            pdf_document ([type]): [description]
            file ([type]): [description]
            current_company_number ([type]): [description]
            search_term ([type]): [description]
        """

        # First inserting all the corporates name into csv file.
        self.__fill_corporate_name(
            file=file, current_company_number=current_company_number)
        self.__shift_to_next_gri_pointer()
        #從這裡開始，所有的gri_pointer都從1開始

        #邏輯為，每一頁抓到Search term後，利用Regular expression存入list，一一比照dataframe的column與list內部項目
        #若list無比對成功者，該指標填0，換到下一個指標。
        #若list比對成功者，該指標填1
        ##########################################
        # Crawl into each page of current csr, if catch gri keywords then insert it into csv files
        for current_page in range(len(pdf_document)):
            self.__reset_gri_pointer()
            # Every page should traversal all the gri pointer
            page = pdf_document.loadPage(current_page)

            # 抓到每篇CSR報告附錄的GRI指標對照表
            if page.searchFor('GRI') or page.searchFor(
                        "指標") or page.searchFor("揭露") or page.searchFor("附錄"):
                print(page)
                self.__fill_into_single_csv(current_company_number, page)

        #抓到已揭露指標的數目
        for temp in range(1, 136):
            if self.csv_file.iat[current_company_number, temp] == 1:
                self.reveal_number += 1
            else:
                continue

        #每間公司結束之後，將該公司的揭露指標數與未揭露指標數填入dataframe
        self.__fill_in_each_reports_reveal_and_unreveal_numbers(
            current_company_number)
        self.reveal_number = 0

    #每間公司報告內部抓到的每頁揭露指標與column進行比對
    def __fill_into_single_csv(self, current_company_number, page):
        #Using normal expression to filter words caught.
        gri_pointers_disclosed_in_this_page = self.__gri_text_filter(
            re.findall(self.pattern, page.getText("text")))
        print(gri_pointers_disclosed_in_this_page)
        ##########################################
        for column in self.csv_file.columns:
            ####################################
            #處理是否揭露的判斷式
            if (column in gri_pointers_disclosed_in_this_page):
                self.csv_file.at[current_company_number, column] = 1
                
        for column in self.csv_file.columns:
            ####################################
            #處理是否揭露的判斷式
            if self.csv_file.at[current_company_number, column] == '':
                self.csv_file.at[current_company_number, column] = 0

    ################################################################
    #basic functions

    def init_gri_pointers_csv_file(self, csv_name):
        """
        init_gri_pointers_csv_file [summary] Initailizing the gri pointers csv file with following task:
        1. remove unnamed column
        2. remove all nan column being regarded as float type, u
        nabling to process with str type

        Args:
            csv_name ([type]): [description] the unprocessed initial csv name 

        Returns:
            [type]: [description] the processed csv file
        """

        self.csv_file = pd.read_csv(f'{csv_name}.csv')
        self.csv_file = self.csv_file.loc[:, ~self.csv_file.columns.str.
                                          contains('^Unnamed')]
        self.csv_file = self.csv_file.replace(np.nan, '',
                                              regex=True)  # All data frame
        return self.csv_file

    def __is_contain_hyphen(self, text) -> bool:
        is_hyphen = False
        for single_char in text:
            if single_char == "-":
                is_hyphen = True
        return not is_hyphen

    def __fill_in_each_reports_reveal_and_unreveal_numbers(
            self, current_company_number):
        self.csv_file.iat[current_company_number, -2] = self.reveal_number
        self.csv_file.iat[current_company_number,
                          -1] = 136 - self.reveal_number

    def __fill_corporate_name(self, file, current_company_number):
        self.csv_file.iat[current_company_number, 0] = file

    def __shift_to_next_gri_pointer(self):
        self.current_gri_pointer_number += 1

    def __reset_gri_pointer(self):
        self.current_gri_pointer_number = 1

    def __shift_to_next_company(self, current_company_number: int) -> int:
        """
        __shift_to_next_company [summary] shift to process next company's gri pointers
        
        Args:
            current_company_number ([type]): [description] 
        """
        next_company_number = current_company_number + 1
        return next_company_number

    # handle full hyphen exception
    def str_dash_full_to_half(self, in_str: str) -> str:
        half_text = ''
        for character in in_str:
            if chr(45 + 65248).encode("utf-8") == character:
                character += "-"
            else:
                half_text += character
        return half_text

    def check_hyphen_exception(self, splited_text: list, index: int) -> str:
        splited_text[index] = " ".join(splited_text[index].split())
        splited_text[index] = splited_text[index].strip()
        #replace一a些在欄位中比較特別的符號
        splited_text[index] = splited_text[index].replace("–", "-")
        splited_text[index] = splited_text[index].replace(" - ", "-")
        splited_text[index] = splited_text[index].replace("－", "-")
        splited_text[index] = splited_text[index].replace(" ", "-")
        splited_text[index] = splited_text[index].replace("\t", "")

        return splited_text[index]

    def __gri_text_filter(self, gri_list_in_rex):
        """
        __gri_text_filter [summary] 
        filter the term from re.findall() (['1','0','2','-','1'])
        to more easier way like ['102-1','102-2'...]

        Args:
            gri_list_in_rex ([type]): [description] the term from using re.findall()

        Returns:
            [type]: [description] return the term like ['102-1','102-2'...]
        """
        splited_text = self.__get_gri_plain_text(
            gri_list_in_rex=gri_list_in_rex)

        return self.__get_numbers_part_from_gri_plain_text(splited_text)

    def __get_gri_plain_text(self, gri_list_in_rex):
        """
        __get_gri_plain_text [summary]
            get plain text splited with line (\n)W
        Args:
            gri_list_in_rex ([type]): [description]  the term from using re.findall()
        Returns:
            [type]: [description]
        """
        plain_text = ""
        empty_list = list()
        # turn the list of .findall function into more cleaner view
        for temp in range(len(gri_list_in_rex) - 1):
            plain_text = plain_text + (gri_list_in_rex[temp])
        splited_text = plain_text.splitlines()

        #remove the redundant part in the list
        for temp in range(len(splited_text) - 1):
            if splited_text[temp] == '':
                empty_list.append(temp)
        splited_text = list(
            set([i for i in splited_text if i not in empty_list]))

        no_hyphen_col = list()
        #replace the hyphens which is not offcial format
        for temp in range(len(splited_text) - 1):
            splited_text[temp] = self.check_hyphen_exception(
                splited_text, temp)

            if self.__is_contain_hyphen(splited_text[temp]):
                no_hyphen_col.append(splited_text[temp])

        with_hyphen_text = list(
            set([i for i in splited_text if i not in no_hyphen_col]))

        for temp in range(len(with_hyphen_text)-1):
            with_hyphen_text[temp] = with_hyphen_text[temp].strip()

        return with_hyphen_text

    def __get_non_numbers_part_from_gri_plain_text(self, splited_text):
        del_list = list()
        # delete some redundant data in the splited_text list
        # all we need is like 'xxx-x' term
        for temp in range(len(splited_text) - 1):
            if "-" not in splited_text[temp]:
                del_list.append(splited_text[temp])
            if "--" in splited_text[temp]:
                del_list.append(splited_text[temp])
        return del_list

    def __get_numbers_part_from_gri_plain_text(self, splited_text):
        del_list = self.__get_non_numbers_part_from_gri_plain_text(
            splited_text=splited_text)
        gri_pointers = set([i for i in splited_text if i not in del_list])
        return gri_pointers

    def output_B_pointers(self):
        today = datetime.date.today()
        self.csv_file.to_csv(f'.\\csv_file\\{today}_gri_pointers_b.csv',
                             encoding='utf-8-sig')

    def get_gri_pointers_csv_name(self):
        return self.gri_pointers_csv_name

    def get_files_list(self):
        return self._files_list

    def get_csr_report_path(self):
        return self.csr_report_path

    def set_pattern(self, pattern):
        self.pattern = pattern


In [3]:
b_sheets_process = GRIPointers_B(
    # csr_report_path='.\\testing_reports',
    csr_report_path='C:\\Users\\user\\Desktop\\CSR_project\\csr_reports\\csr_reports_2020',
                                 gri_pointers_csv_name=".\\csv_file\\gri_pointers_b_frame")
b_sheets_process.set_pattern(pattern = r"[0-9-－–\s]")
b_sheets_process.init_gri_pointers_csv_file(
    b_sheets_process.gri_pointers_csv_name)
b_sheets_process.catch_gri_pointers(
    csr_report_path=b_sheets_process.get_csr_report_path(), search_term='GRI')
print(b_sheets_process.csv_file)
b_sheets_process.output_B_pointers()


Now processing 1101_台泥_2019(v1).pdf
page 2 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'2-1', '1-1', '1-3', '-', '1-4', '8-1', '1-7', '4-1', '7-1', '1-0'}
page 5 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'-34', '-24', '2019-2018', '-', '-25'}


mupdf: invalid page object


page 7 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'027719-7721', '022531-6897', '2013-', '-', '3000 '}
page 9 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'-', '3000'}
page 13 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'4-480', '-', '-2194', '0008-2', '-13715', '70'}


mupdf: invalid page object
mupdf: invalid page object


page 18 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'-01', '-05', '11-37', '-02', '-03', '-04', '12-41', '10'}
page 19 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'-'}
page 20 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'3-5-1-2', '119'}
page 21 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'22'}
page 22 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'22-53', '-', '23-63', '21-50'}
page 25 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'109512'}
page 28 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'-', '30156721'}
page 29 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'331-504', '-'}
page 30 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_

mupdf: invalid page object


page 35 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'32-81', '33-84', '31-72'}
page 44 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'-', '43-98', '41-90', '42-92', '14001-50001-14064'}
page 46 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'14064-14046-8001', '14001-50001', '3-', '1-2019', '2019-14046', '32018-2019', '643'}
page 47 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'1-2019-2019', '-', '2-0-1-9', '2-2019-5239637'}
page 50 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'53-109', '51-102', '-', '52-106', '202740000'}
page 54 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'202740000'}


mupdf: invalid page object
mupdf: invalid page object


page 58 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'62-122', '63-128', '61-117', '-', '45001-15506', '117'}
page 62 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'2018-826', '2018-852', '2019-876', '2019-981532', '2018-795066', '2019-965', '2018-963', '2019                    1017'}
page 64 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'31-50', '11931-5061351', '8'}
page 66 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1101_台泥_2019(v1).pdf
{'102-18', '103-2', '102-42', '102-49', '206-1', '206-2016', '205-3', '102-11', '102-7', '102-55', '103-1', '102-40', '102-44', '102-46', '301-1', '102-56', '301-2016', '102-41', '103-3', '102-9', '201-1', '102-53', '102-13', '102-5', '102-54', '102-6', '102-1', '302-1', '102-48', '102-4', '102-50', '302-2016', '102-8', '102-3', '302-3', '301-2', '102-2', '102-43', '102-12', '102-14', '102-51', '102-45',

mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object


page 19 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1328_中油_2019(v1).pdf
{'2019-3', ' 2008 '}
page 22 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1328_中油_2019(v1).pdf
{'2020-23', '2019-13', '2019-6', '4-1912', '30 '}
page 23 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1328_中油_2019(v1).pdf
{'2019-6-2-9-55', '24-2019', '2019-5', '7-9', '3-1-2019'}
page 26 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1328_中油_2019(v1).pdf
{'-', '102-11', '205-3', '2020-27', '22'}
page 27 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1328_中油_2019(v1).pdf
{'11-12', '-', '2020-12-2019'}
page 34 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1328_中油_2019(v1).pdf
{'2020-35', '2    3 '}
page 36 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1328_中油_2019(v1).pdf
{'2020-37'}
page 39 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1328_中油_2019(v1).pdf
{'1

mupdf: invalid page object


page 138 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1328_中油_2019(v1).pdf
{'2020-139', '- '}
page 139 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1328_中油_2019(v1).pdf
{'14-2019', '102-18', '102-42', '102-11', '102-40', '102-7', '102-44', '102-46', '102-9', '102-41', '102-5', '102-13', '102-6', '102-1', '102-4', '102-8', '-', '102-3', '102-2', '102-43', '102-12', '102-14', '102-45', '102-16', '102-10', '41 '}
page 140 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1328_中油_2019(v1).pdf
{'14-2019', '103-2', '102-49', '203-1', '306-1', '102-47', '205-3', '305-2', '102-55', '102-56', '103-1', '306-2', '103-3', '2020-141', '102-53', '102-54', '305-1', '102-48', '102-50', '-', '102-51', '102-52', '10'}
page 141 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1328_中油_2019(v1).pdf
{'201-1', '403-1', '305-5', '103-2', '418-1', '304-2', '403-3', '103-1', '201-2', '305-7', '305-4', '103-3'}
page 142 of C:\Users\user\D

mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find Pattern resource 'R218'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find Pattern resource 'R218'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find Pattern resource 'R218'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find Pattern resource 'R218'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find ColorSpace resource 'R1299'
mupdf: cannot find Pattern resource 'R1300'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find ColorSpace resource 'R1299'
mupdf: cannot find Pattern resource 'R1315'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find ColorSpace resource 'R1299'
mupdf: cannot find Pattern resource 'R1300'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find ColorSpace resource 'R1299'
mupdf: cannot find Pattern resource 'R1315'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find ColorSpace resource 'R1299'
mupdf: canno

page 16 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1709_和益化工_2019(v1).pdf
{'1  '}
page 28 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1709_和益化工_2019(v1).pdf
{'26'}
page 29 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1709_和益化工_2019(v1).pdf
set()
page 30 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1709_和益化工_2019(v1).pdf
{'-'}
page 32 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1709_和益化工_2019(v1).pdf
{'30'}
page 33 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1709_和益化工_2019(v1).pdf
{'15-2030', '17-169', '2014 9 17   '}
page 48 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1709_和益化工_2019(v1).pdf
{'487386'}
page 49 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1709_和益化工_2019(v1).pdf
{'554763'}


mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find ColorSpace resource 'R1632'
mupdf: cannot find Pattern resource 'R1633'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find ColorSpace resource 'R1632'
mupdf: cannot find Pattern resource 'R1633'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find ColorSpace resource 'R1632'
mupdf: cannot find Pattern resource 'R1633'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find ColorSpace resource 'R1632'
mupdf: cannot find Pattern resource 'R1633'


page 63 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1709_和益化工_2019(v1).pdf
{'2-0-1-5-2-0-1-9', '579848'}
page 65 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1709_和益化工_2019(v1).pdf
{'67746579-8203', '31570-32103', '2019164465'}
page 85 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1709_和益化工_2019(v1).pdf
{'-111', '-9', '-', '-265'}
page 94 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1709_和益化工_2019(v1).pdf
{'  '}
page 95 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1709_和益化工_2019(v1).pdf
{'102-18', '102-11', '102-7', '102-9', '102-25', '102-5', '102-13', '102-6', '102-19', '102-1', '102-4', '102-8', '102-3', '102-2', '102-12', '102-14', '102-15', '102-16', '102-21', '102-10'}
page 96 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1709_和益化工_2019(v1).pdf
{'103-2', '102-49', '102-42', '201-3', '102-47', '102-40', '102-55', '102-46', '103-1', '102-56', '102-44', '102-4

mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find Pattern resource 'R2567'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find Pattern resource 'R2567'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find Pattern resource 'R2567'
mupdf: cannot find ExtGState resource 'R216'
mupdf: cannot find Pattern resource 'R2567'


Now processing 1710_東聯_2019(v1).pdf
page 1 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1710_東聯_2019(v1).pdf
{'14064-1', '2021-6', '1000-2008', '2019-1-1-12-31', '-', '2019-6', ' 2017  2018 '}
page 3 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1710_東聯_2019(v1).pdf
{'41 '}
page 22 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1710_東聯_2019(v1).pdf
{'1998-14001-2018', '-'}
page 25 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1710_東聯_2019(v1).pdf
{'2019-11762636-2018', '3-2019-46', '20-22174-33618', '2-2019-62', '2018  '}
page 32 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1710_東聯_2019(v1).pdf
{'-'}
page 33 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1710_東聯_2019(v1).pdf
{'2-2019', '2020-2025', '1-2018'}
page 34 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\1710_東聯_2019(v1).pdf
{'1-1', '3-2', '2014-14064-1', '12019-116', '42018-2020-3'}
page 36

mupdf: invalid page object
mupdf: invalid page object



{'24 '}
page 26 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2023_燁輝_2019(v1).pdf
{' 2 '}
page 27 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2023_燁輝_2019(v1).pdf
{' 57'}
page 37 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2023_燁輝_2019(v1).pdf
{'  '}
page 38 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2023_燁輝_2019(v1).pdf
{'-', '--'}
page 55 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2023_燁輝_2019(v1).pdf
{'-', '765000-2019'}
page 61 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2023_燁輝_2019(v1).pdf
{'16-8-24', '60 '}
page 65 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2023_燁輝_2019(v1).pdf
{'2019-1'}
page 83 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2023_燁輝_2019(v1).pdf
{'102-18', '102-11', '102-40', '102-7', '7-9', '22-23', '102-9', '102-41', '102-13', '102-5', '102-6', '102-1', '102-4', '-', '102-3', '102-2', '

mupdf: invalid page object


page 22 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2204_中華汽車_2019(v1).pdf
{'11-3', '51 '}
page 28 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2204_中華汽車_2019(v1).pdf
{'2019-3', '3388 '}
page 36 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2204_中華汽車_2019(v1).pdf
{'2019-2', '1-0-0-0', '2019-11', '2019 '}
page 37 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2204_中華汽車_2019(v1).pdf
{'0800-8002200'}
page 39 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2204_中華汽車_2019(v1).pdf
{'        '}
page 40 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2204_中華汽車_2019(v1).pdf
{'5-3', '2020-450012018', '90-14001'}
page 41 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2204_中華汽車_2019(v1).pdf
{'922931-', '03-47831913625', '-', '1-2', '155062011-180012007', '14001-16949-9001', '0800-030-580', '0800-', '1-3-1', '03-47831912050', '03-47831912548', '03-2707781', ' 1400

mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object


page 53 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2206_三陽工業_2019(v1).pdf
{'8'}
page 54 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2206_三陽工業_2019(v1).pdf
{'22'}
page 59 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2206_三陽工業_2019(v1).pdf
{'2-627', '2-100', '3-100', '4-100', '1217-4512', '1-1440', '5-100', '1217'}
page 79 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2206_三陽工業_2019(v1).pdf
{'102-18', '102-42', '102-49', '102-47', '102-11', '102-40', '102-7', '102-44', '102-46', '102-9', '102-41', '102-5', '102-13', '102-6', '102-1', '102-48', '102-4', '102-50', '102-8', '102-3', '102-2', '102-43', '102-12', '102-14', '102-45', '102-16', '102-10', '102'}
page 80 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\2206_三陽工業_2019(v1).pdf
{'103-2', '203-1', '201-3', '102-56', '102-55', '103-1', '204-1', '201-1', '102-53', '102-54', '201-4', '-', '202-1', '202-2', '102-51', '102-52', '201-2', 

mupdf: expected object number


Now processing 3504_揚明光學_2019(v1).pdf
page 1 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\3504_揚明光學_2019(v1).pdf
{'886-3-620-6789', '300-7', '886-3-623-1111', '2019-1-1-2019-12-31', '        '}
page 2 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\3504_揚明光學_2019(v1).pdf
{'   --------------------------54 '}
page 7 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\3504_揚明光學_2019(v1).pdf
{'  '}
page 8 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\3504_揚明光學_2019(v1).pdf
{'-', ' 302'}
page 16 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\3504_揚明光學_2019(v1).pdf
{'2006-50-23-40', '500-101', '2016-1000', '2011-397', '          '}
page 26 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\3504_揚明光學_2019(v1).pdf
{'33  '}
page 34 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\3504_揚明光學_2019(v1).pdf
{'6-100', '2-17'}
page 38 of C:\Users\user\Desktop\CSR_project\csr_repo

mupdf: invalid page object


page 28 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\4904_遠傳_2019(v1).pdf
{'2030-12', '7-3', '-', '2019-6', '2016-203-3-2016-172', '1-5353'}
page 44 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\4904_遠傳_2019(v1).pdf
{'2018 '}
page 46 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\4904_遠傳_2019(v1).pdf
{'--1-4'}
page 51 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\4904_遠傳_2019(v1).pdf
{'2020-6-88-100', '-', '1-1--'}
page 52 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\4904_遠傳_2019(v1).pdf
{'2019-6-28', '2019-2019', '2019-1-22', '2020-2-21-2019-2020', '2019  2020 '}
page 53 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\4904_遠傳_2019(v1).pdf
{'20-2021', ' 2018 －  2019  1  2019  2 '}
page 54 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\4904_遠傳_2019(v1).pdf
{'2018-123', '2019-3-2019', '2019-2', '8-3-2', '   5'}
page 55 of C:\Users\user\Desktop\CSR_proj

mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object


page 2 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\8926_台汽電_2019(v1).pdf
{'32  '}
page 3 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\8926_台汽電_2019(v1).pdf
{'4-2019-5', '12-16', '4-13', '100-2019', '2019-901-5', ' 150  1035  2020 '}
page 10 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\8926_台汽電_2019(v1).pdf
{'22865-27505', '114-149', '10-99', '3447634-2', '241-9433', '0-0', '115846-37656', '2019-6671356', '371059-2', '5-85', '1-48', '109-64', ' 1649835 \u3000\u3000  9956 '}
page 11 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\8926_台汽電_2019(v1).pdf
{'1000-2015'}
page 12 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\8926_台汽電_2019(v1).pdf
{'2019-9', '2019-4'}
page 13 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\8926_台汽電_2019(v1).pdf
{'16'}
page 14 of C:\Users\user\Desktop\CSR_project\csr_reports\csr_reports_2020\8926_台汽電_2019(v1).pdf
set()
page 15 of C:\Users\user\D