In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from docx import Document
import os
from PIL import Image, ImageOps
import io
import numpy as np
import fitz  # PyMuPDF
from skimage import feature, color, exposure
import pandas as pd

from time import sleep
from tqdm import tqdm

# PDF图片提取及中心坐标提取
## 检测图片是否为反色

**函数名称：** is_image_inverted_combined

**目标：** 综合使用三种不同的方法来检测图像是否被反色。

### 亮度分布检测
**原理：** 该方法计算图像中明亮像素（值大于128）的比例，并与预设的亮度阈值进行比较。

如果明亮像素的比例超过阈值，该方法将投票为“反色”。
### 颜色直方图比较
**原理：** 这种方法是通过比较图像的颜色直方图和其反色版本的直方图来工作的。

使用相关系数来测量原始图像直方图和其反色直方图之间的相似性。
如果相关系数超过预设的直方图阈值，该方法将投票为“反色”。
### 边缘检测比较
**原理：** 首先，使用边缘检测算法（如Canny）来提取图像的边缘。然后，比较原始图像和其反色版本的边缘强度。

使用相关系数来测量原始图像的边缘和其反色版本之间的边缘强度相似性。
如果相关系数超过预设的边缘检测阈值，该方法将投票为“反色”。
### 投票机制
每种方法都会基于其独特的检测策略进行投票。
如果至少有两种方法投票为“反色”，那么函数将返回True，表示图像被认为是反色的。否则，返回False。

In [3]:
def is_image_inverted_combined(img, brightness_threshold=0.75, 
                histogram_threshold=0.8, edge_threshold=0.8):
    # Convert image to RGB mode if it's in "P" mode (palette mode)
    
    img = img.convert("RGB")
    
    # Convert image to grayscale
    img_gray = img.convert("L")
    img_gray_np = np.array(img_gray)
    
    # 1. Brightness distribution
    light_fraction = np.mean(img_gray_np > 128)
    brightness_decision = light_fraction > brightness_threshold
#     print(light_fraction)
#     print(brightness_decision)
    
    # 2. Color histogram comparison
    img_hist = exposure.histogram(np.array(img.convert("RGB")), nbins=256)
    inverted_img_hist = exposure.histogram(np.array(ImageOps.invert(img).convert("RGB")), nbins=256)
    hist_correlation = np.corrcoef(img_hist[0], inverted_img_hist[0])[0, 1]
    histogram_decision = hist_correlation > histogram_threshold
#     print(hist_correlation)
#     print(histogram_decision)
    
    # 3. Edge detection comparison
    edges_original = feature.canny(color.rgb2gray(np.array(img)))
    edges_inverted = feature.canny(color.rgb2gray(np.array(ImageOps.invert(img))))
    edge_correlation = np.corrcoef(edges_original.flatten(), edges_inverted.flatten())[0, 1]
    edge_decision = edge_correlation > edge_threshold
#     print(edge_correlation)
#     print(edge_decision)
    
    # Majority voting
    return sum([brightness_decision, histogram_decision, edge_decision]) >= 256

## 检测低信息含量图片

In [4]:
# def is_low_variance_image_in_memory(img, threshold=3000):
#     try:
#         data = np.array(img.convert('RGB'))
#         variance = np.var(data)
#         return variance < threshold
#     except Exception as e:
#         print(f"Error processing image: {e}")
#         return None

## PyMuPDF获取图片及坐标

In [5]:
# 建立dataframe存储坐标信息
coordinate_df = pd.DataFrame(columns=["file_name","x0","y0",
                                     "x1","y1","centre_coordinate"])

In [6]:
def PyMuPDF_get_img(file_path, subfolder_name = 'PyMu_img_folder', coordinate_df=coordinate_df):
    i = 0
    # 打开PDF并获取名称
    pdf_file = fitz.open(file_path)
    pdf_file_name = os.path.basename(file_path)

    # 检查子目录是否存在，如果不存在则创建
    if not os.path.exists(subfolder_name):
        os.makedirs(subfolder_name)
    
    for page_number in range(len(pdf_file)):
        # 获取页数
        page = pdf_file[page_number]
        
        # 获取图像信息和图像坐标
        image_list = page.get_images(full=True)

        for img_index, img_info in enumerate(image_list):
            
            #获取图片名称
            image_filename = f"{pdf_file_name}_page_{page_number + 1}_img_{img_index + 1}.png"
            
            img_xref = img_info[0]
            base_image = pdf_file.extract_image(img_info[0])
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            # 获取图像坐标
            try:
                image_rect = page.get_image_rects(img_xref)
                if not image_rect:
                    print(f"Warning: No image rectangle found for xref {img_xref} on page {page_number + 1} in PDF {pdf_file_name}. Skipping this image.")
                    continue
                rect = image_rect[0]
                x0, y0, x1, y1 = rect.x0, rect.y0, rect.x1, rect.y1
                center_coord = (x0 + x1)/2 , (y0 + y1)/2
            except Exception as e:
                print(f"Error getting image rectangle for xref {img_xref} on page {page_number + 1} in PDF {pdf_file_name}: {str(e)}. \nSkipping this image.")
            
            # 判断图片大小，过滤过小的图片
            if abs(x0-x1) < 100:
                continue
            
            # 打开图像并检查颜色模式
            image = Image.open(io.BytesIO(image_bytes))
            if image.mode == "CMYK":
                image = image.convert("RGB")
                
            # 检查图片是否反色
            if is_image_inverted_combined(image):
                print('Inverted image detected: {} is inverted.'.format(image_filename))
                image = image.convert("RGB")  # Ensure the image is in RGB mode
                image = ImageOps.invert(image)
    
            # 图片保存
            image_filepath = os.path.join(subfolder_name, image_filename)
            image.save(open(image_filepath, "wb"), "PNG")
            
            # 坐标保存
            info = pd.Series([image_filename, x0, y0, x1, y1, center_coord],
                            index=["file_name","x0","y0","x1","y1","centre_coordinate"])
            coordinate_df = pd.concat([coordinate_df, pd.DataFrame([info])], ignore_index=True)
            
            # 打印
            """
            输出的坐标是一个包含四个值的元组 (x0,y0,x1,y1)，
            分别代表图像矩形框的左下角 (x0, y0) 和右上角 (x1, y1) 的坐标
            """
#             print(f"Image {image_filename}; coordinates: {image_rect}; center_coord: {center_coord}")
            i = i+1
            
    if i < len(pdf_file) + 5:
        print("Anomaly found in PDF {} ({} pages), only {} pics.".format(pdf_file_name, len(pdf_file),i))
    
    # Close the PDF
    pdf_file.close()
    
    # 保存坐标表格
    df_filepath = os.path.join(subfolder_name, "coordinate_df.csv")
    coordinate_df.to_csv(df_filepath)
    
    return coordinate_df

In [7]:
def get_file_path(directory):
    files = [f for f in os.listdir(directory) if f != '.DS_Store']
    return [os.path.join(directory, file) for file in files if os.path.isfile(os.path.join(directory, file))]

In [8]:
file_bundle = get_file_path("/Users/improvise/Desktop/保研/实证论文/ESG/Playground/00_DataBase/00_SustainabilityReport_PDF/2020")

# 保存到：
subfolder_name = "/Users/improvise/Desktop/保研/实证论文/ESG/Playground/01_Extraction/SUS 2020/02 PyMu_img"

In [11]:
try:
    for file in tqdm(file_bundle):
        coordinate_df = pd.concat([coordinate_df, PyMuPDF_get_img(file, subfolder_name)],
                                 ignore_index=True)
except OSError as e:
    print(f"Error processing file {file}: {e}")

coordinate_df

  9%|███▊                                        | 3/35 [01:06<10:32, 19.75s/it]

Anomaly found in PDF 300347.SZ-泰格医药-泰格医药_泰格医药2020年可持续发展报告暨环境、社会与公司治理报告-2021-03-30.pdf (82 pages), only 17 pics.


 14%|██████▎                                     | 5/35 [02:32<15:17, 30.59s/it]

Anomaly found in PDF 300349.SZ-金卡智能-金卡智能_2020年可持续发展报告-2021-04-27.pdf (44 pages), only 38 pics.
Error processing file /Users/improvise/Desktop/保研/实证论文/ESG/Playground/00_DataBase/00_SustainabilityReport_PDF/2020/601866.SH-中远海发-中远海发2020年度可持续发展报告-2021-03-31.pdf: broken data stream when reading image file





Unnamed: 0,file_name,x0,y0,x1,y1,centre_coordinate
0,603650.SH-彤程新材-彤程新材2020可持续发展报告-2021-04-16.pdf_...,7.484946,395.415039,445.039856,744.113525,"(226.2624008655548, 569.7642822265625)"
1,603650.SH-彤程新材-彤程新材2020可持续发展报告-2021-04-16.pdf_...,662.362305,370.616730,1145.026001,692.643921,"(903.6941528320312, 531.6303253173828)"
2,603650.SH-彤程新材-彤程新材2020可持续发展报告-2021-04-16.pdf_...,100.456299,723.191040,218.392700,776.971741,"(159.42449951171875, 750.0813903808594)"
3,603650.SH-彤程新材-彤程新材2020可持续发展报告-2021-04-16.pdf_...,340.077515,378.196716,446.974518,442.814911,"(393.52601623535156, 410.5058135986328)"
4,603650.SH-彤程新材-彤程新材2020可持续发展报告-2021-04-16.pdf_...,331.465393,225.435257,459.083496,262.310730,"(395.2744445800781, 243.87299346923828)"
...,...,...,...,...,...,...
946,300349.SZ-金卡智能-金卡智能_2020年可持续发展报告-2021-04-27.pd...,198.649994,76.249969,397.190002,225.149963,"(297.9199981689453, 150.69996643066406)"
947,300349.SZ-金卡智能-金卡智能_2020年可持续发展报告-2021-04-27.pd...,181.220001,228.999969,298.410004,385.249969,"(239.81500244140625, 307.1249694824219)"
948,300349.SZ-金卡智能-金卡智能_2020年可持续发展报告-2021-04-27.pd...,298.230011,229.500000,415.280029,385.559998,"(356.75502014160156, 307.5299987792969)"
949,300349.SZ-金卡智能-金卡智能_2020年可持续发展报告-2021-04-27.pd...,167.520004,75.599976,427.929993,255.659973,"(297.7249984741211, 165.62997436523438)"


In [12]:
# coordinate_df.to_csv("/Users/improvise/Desktop/保研/实证论文/ESG/Playground/PyMu_img01//coordinate_df2.csv")
coordinate_df.to_excel("/Users/improvise/Desktop/保研/实证论文/ESG/Playground/01_Extraction/SUS 2020/03 PyMu_img_cord/img_cord.xlsx",
                      index = False)