In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from docx import Document
import os
from PIL import Image, ImageOps
import io
import numpy as np
import fitz  # PyMuPDF
from skimage import feature, color, exposure
import pandas as pd

from time import sleep
from tqdm import tqdm

# PDF图片提取及中心坐标提取
## 检测图片是否为反色

**函数名称：** is_image_inverted_combined

**目标：** 综合使用三种不同的方法来检测图像是否被反色。

### 亮度分布检测
**原理：** 该方法计算图像中明亮像素（值大于128）的比例，并与预设的亮度阈值进行比较。

如果明亮像素的比例超过阈值，该方法将投票为“反色”。
### 颜色直方图比较
**原理：** 这种方法是通过比较图像的颜色直方图和其反色版本的直方图来工作的。

使用相关系数来测量原始图像直方图和其反色直方图之间的相似性。
如果相关系数超过预设的直方图阈值，该方法将投票为“反色”。
### 边缘检测比较
**原理：** 首先，使用边缘检测算法（如Canny）来提取图像的边缘。然后，比较原始图像和其反色版本的边缘强度。

使用相关系数来测量原始图像的边缘和其反色版本之间的边缘强度相似性。
如果相关系数超过预设的边缘检测阈值，该方法将投票为“反色”。
### 投票机制
每种方法都会基于其独特的检测策略进行投票。
如果至少有两种方法投票为“反色”，那么函数将返回True，表示图像被认为是反色的。否则，返回False。

In [3]:
def is_image_inverted_combined(img, brightness_threshold=0.75, 
                histogram_threshold=0.8, edge_threshold=0.8):
    # Convert image to RGB mode if it's in "P" mode (palette mode)
    
    img = img.convert("RGB")
    
    # Convert image to grayscale
    img_gray = img.convert("L")
    img_gray_np = np.array(img_gray)
    
    # 1. Brightness distribution
    light_fraction = np.mean(img_gray_np > 128)
    brightness_decision = light_fraction > brightness_threshold
#     print(light_fraction)
#     print(brightness_decision)
    
    # 2. Color histogram comparison
    img_hist = exposure.histogram(np.array(img.convert("RGB")), nbins=256)
    inverted_img_hist = exposure.histogram(np.array(ImageOps.invert(img).convert("RGB")), nbins=256)
    hist_correlation = np.corrcoef(img_hist[0], inverted_img_hist[0])[0, 1]
    histogram_decision = hist_correlation > histogram_threshold
#     print(hist_correlation)
#     print(histogram_decision)
    
    # 3. Edge detection comparison
    edges_original = feature.canny(color.rgb2gray(np.array(img)))
    edges_inverted = feature.canny(color.rgb2gray(np.array(ImageOps.invert(img))))
    edge_correlation = np.corrcoef(edges_original.flatten(), edges_inverted.flatten())[0, 1]
    edge_decision = edge_correlation > edge_threshold
#     print(edge_correlation)
#     print(edge_decision)
    
    # Majority voting
    return sum([brightness_decision, histogram_decision, edge_decision]) >= 256

## 检测低信息含量图片

In [4]:
# def is_low_variance_image_in_memory(img, threshold=3000):
#     try:
#         data = np.array(img.convert('RGB'))
#         variance = np.var(data)
#         return variance < threshold
#     except Exception as e:
#         print(f"Error processing image: {e}")
#         return None

## PyMuPDF获取图片及坐标

In [5]:
# 建立dataframe存储坐标信息
coordinate_df = pd.DataFrame(columns=["file_name","x0","y0","x1","y1","centre_coordinate"])

In [13]:
def PyMuPDF_get_img(file_path, subfolder_name = 'PyMu_img_folder', coordinate_df=coordinate_df):
    i = 0
    # 打开PDF并获取名称
    pdf_file = fitz.open(file_path)
    pdf_file_name = os.path.basename(file_path)

    # 检查子目录是否存在，如果不存在则创建
    if not os.path.exists(subfolder_name):
        os.makedirs(subfolder_name)
    
    for page_number in range(len(pdf_file)):
        # 获取页数
        page = pdf_file[page_number]
        
        # 获取图像信息和图像坐标
        image_list = page.get_images(full=True)

        for img_index, img_info in enumerate(image_list):
            
            #获取图片名称
            image_filename = f"{pdf_file_name}_page_{page_number + 1}_img_{img_index + 1}.png"
            
            img_xref = img_info[0]
            base_image = pdf_file.extract_image(img_info[0])
            if isinstance(base_image, dict):
                image_bytes = base_image["image"]
                image = Image.open(io.BytesIO(image_bytes))

                # 获取图像坐标
                try:
                    image_rect = page.get_image_rects(img_xref)
                    if not image_rect:
                        print(f"Warning: No image rectangle found for xref {img_xref} on page {page_number + 1} in PDF {pdf_file_name}. Skipping this image.")
                        continue
                    rect = image_rect[0]
                    x0, y0, x1, y1 = rect.x0, rect.y0, rect.x1, rect.y1
                    center_coord = (x0 + x1)/2 , (y0 + y1)/2
                except Exception as e:
                    print(f"Error getting image rectangle for xref {img_xref} on page {page_number + 1} in PDF {pdf_file_name}: {str(e)}. \nSkipping this image.")

                # 判断图片大小，过滤过小的图片
                if abs(x0-x1) < 100:
                    continue

                # 打开图像并检查颜色模式
                image = Image.open(io.BytesIO(image_bytes))
                if image.mode == "CMYK":
                    image = image.convert("RGB")

                # 检查图片是否反色
                if is_image_inverted_combined(image):
                    print('Inverted image detected: {} is inverted.'.format(image_filename))
                    image = image.convert("RGB")  # Ensure the image is in RGB mode
                    image = ImageOps.invert(image)

                # 图片保存
                image_filepath = os.path.join(subfolder_name, image_filename)
                image.save(open(image_filepath, "wb"), "PNG")

                # 坐标保存
                info = pd.Series([image_filename, x0, y0, x1, y1, center_coord],
                                index=["file_name","x0","y0","x1","y1","centre_coordinate"])
                coordinate_df = pd.concat([coordinate_df, pd.DataFrame([info])], ignore_index=True)

                # 打印
                """
                输出的坐标是一个包含四个值的元组 (x0,y0,x1,y1)，
                分别代表图像矩形框的左下角 (x0, y0) 和右上角 (x1, y1) 的坐标
                """
    #             print(f"Image {image_filename}; coordinates: {image_rect}; center_coord: {center_coord}")
                i = i+1
            else:
                print(f"Failed to extract image from xref {img_xref}")
            
    if i < len(pdf_file) + 5:
        print("Anomaly found in PDF {} ({} pages), only {} pics.".format(pdf_file_name, len(pdf_file),i))
    
    # Close the PDF
    pdf_file.close()
    
    # 保存坐标表格
    df_filepath = os.path.join(subfolder_name, "coordinate_df.csv")
    coordinate_df.to_csv(df_filepath)
    
    return coordinate_df

In [7]:
def get_file_path(directory):
    files = [f for f in os.listdir(directory) if f != '.DS_Store']
    return [os.path.join(directory, file) for file in files if os.path.isfile(os.path.join(directory, file))]

In [11]:
# 读取PDF
file_bundle = get_file_path("/Users/improvise/Desktop/保研/实证论文/ESG/Playground/00_DataBase/01_ESGReport_PDF/2021")
# file_bundle = get_file_path("/Users/improvise/Downloads/test")

# 保存到：
subfolder_name = "/Users/improvise/Desktop/保研/实证论文/ESG/Playground/01_Extraction/ESG 2021/02 PyMu_img"
# subfolder_name = "/Users/improvise/Downloads/test_output"

In [14]:
try:
    for file in tqdm(file_bundle):
        print("\nProcessing file:{}".format(file))
        coordinate_df = pd.concat([coordinate_df, PyMuPDF_get_img(file, subfolder_name)],
                                 ignore_index=True)
except OSError as e:
    print(f"Error processing file {file}: {e}")

coordinate_df

  0%|                                                     | 0/1 [00:00<?, ?it/s]


Processing file:/Users/improvise/Downloads/test/002203.SZ-海亮股份-海亮股份 环境、社会、公司治理(ESG)报告暨社会责任报告-2022-04-28.pdf
Failed to extract image from xref 324


100%|█████████████████████████████████████████████| 1/1 [00:14<00:00, 14.92s/it]






Unnamed: 0,file_name,x0,y0,x1,y1,centre_coordinate
0,600350.SH-山东高速-山东高速股份有限公司2021年环境、社会及治理报告-2022-...,-1.100604,-1.134460,596.513000,809.110962,"(297.7061984539032, 403.9882507324219)"
1,600350.SH-山东高速-山东高速股份有限公司2021年环境、社会及治理报告-2022-...,39.002197,25.685434,201.660690,48.466736,"(120.3314437866211, 37.07608509063721)"
2,600350.SH-山东高速-山东高速股份有限公司2021年环境、社会及治理报告-2022-...,-1.196211,577.812927,240.420120,766.871460,"(119.61195480823517, 672.3421936035156)"
3,600350.SH-山东高速-山东高速股份有限公司2021年环境、社会及治理报告-2022-...,239.922119,577.812866,489.011108,766.871399,"(364.46661376953125, 672.3421325683594)"
4,600350.SH-山东高速-山东高速股份有限公司2021年环境、社会及治理报告-2022-...,488.513428,577.812866,737.851501,766.871399,"(613.1824645996094, 672.3421325683594)"
...,...,...,...,...,...,...
9499,002203.SZ-海亮股份-海亮股份 环境、社会、公司治理(ESG)报告暨社会责任报告-2...,652.340698,88.663773,901.140503,244.932007,"(776.7406005859375, 166.79788970947266)"
9500,002203.SZ-海亮股份-海亮股份 环境、社会、公司治理(ESG)报告暨社会责任报告-2...,881.466797,322.399414,1130.411011,478.489136,"(1005.9389038085938, 400.44427490234375)"
9501,002203.SZ-海亮股份-海亮股份 环境、社会、公司治理(ESG)报告暨社会责任报告-2...,652.253113,555.939453,901.139099,712.232605,"(776.6961059570312, 634.0860290527344)"
9502,002203.SZ-海亮股份-海亮股份 环境、社会、公司治理(ESG)报告暨社会责任报告-2...,56.347000,492.093994,535.223022,712.229004,"(295.7850112915039, 602.1614990234375)"


In [13]:
# coordinate_df.to_csv("/Users/improvise/Desktop/保研/实证论文/ESG/Playground/PyMu_img01//coordinate_df2.csv")
coordinate_df.to_excel("/Users/improvise/Desktop/保研/实证论文/ESG/Playground/01_Extraction/ESG 2020/03 PyMu_img_cord/img_cord.xlsx",
                      index = False)