# In [None]:
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import cv2
import matplotlib.pyplot as plt
import statistics
from copy import deepcopy


def get_pdf_images(pdf_path):
    """Render every page of a PDF file to a PIL image.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list with one ``PIL.Image.Image`` (mode "RGB") per page, in page
        order.
    """
    images_list = []

    # The context manager closes the document even when rendering a page
    # raises, so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pixmap = page.get_pixmap()

            # The sample buffer is interpreted as packed RGB bytes.
            # NOTE(review): this relies on get_pixmap() rendering RGB without
            # an alpha channel - confirm if render options are ever added.
            pil_img = Image.frombytes(
                "RGB", (pixmap.width, pixmap.height), pixmap.samples
            )
            images_list.append(pil_img)

    return images_list

def images_to_numpy(images_list):
    """Convert each image in *images_list* to a NumPy array, keeping order."""
    arrays = []
    for picture in images_list:
        arrays.append(np.array(picture))
    return arrays

def mask_extracted(image_array):
    """Keep only the red, cyan and orange pixels of each image.

    Every pixel whose channel values fall inside one of the three inclusive
    color ranges below is copied to the output unchanged; all other pixels
    are set to zero.

    Args:
        image_array: Sequence of 3-channel images. The bounds below are
            written in R, G, B channel order.

    Returns:
        A list with one filtered image per input image, in the same order.
    """
    # Inclusive (lower, upper) bounds per color, in R, G, B order.
    color_ranges = (
        (np.array([150, 0, 0]), np.array([255, 100, 100])),    # red
        (np.array([0, 100, 150]), np.array([100, 255, 255])),  # cyan
        (np.array([200, 100, 0]), np.array([255, 200, 100])),  # orange
    )

    combined_images = []
    for image_rgb in image_array:
        # OR the per-color masks together before applying them. The red and
        # orange ranges both contain G == 100, so extracting each color
        # separately and summing the results would add such pixels twice and
        # distort their values.
        combined_mask = None
        for lower, upper in color_ranges:
            color_mask = cv2.inRange(image_rgb, lower, upper)
            if combined_mask is None:
                combined_mask = color_mask
            else:
                combined_mask = cv2.bitwise_or(combined_mask, color_mask)

        combined_image = cv2.bitwise_and(image_rgb, image_rgb, mask=combined_mask)
        combined_images.append(combined_image)
    return combined_images


def process_pdf_images(pdf_images):
    """Apply ``mask_extracted`` to every page image.

    ``mask_extracted`` takes a sequence of images and iterates over it, so
    the whole page list is handed over in a single call. Passing one page
    array at a time would make it iterate over the rows of that page instead
    of over pages.

    Args:
        pdf_images: Iterable of page images as NumPy arrays.

    Returns:
        A list with one color-filtered image per page, in page order.
    """
    # list() lets any iterable be passed, since mask_extracted may need
    # len() and indexing on its argument.
    return mask_extracted(list(pdf_images))

def resize_and_crop(images):
    """Center-crop each image horizontally to a width equal to its height.

    Images that are wider than tall are cropped to a centered square.
    Images that are not wider than tall are returned with all of their
    columns, because there is nothing to trim horizontally.

    Args:
        images: Iterable of NumPy image arrays shaped (height, width, ...).

    Returns:
        A list of cropped arrays, in input order. Each element is a slice
        (view) of the corresponding input array, not a copy.
    """
    cropped_images = []
    for image in images:
        height, width = image.shape[:2]

        # Column range centered on the horizontal middle. The start is
        # clamped to 0: for a portrait image (width < height) a negative
        # start would wrap around and select an off-center, narrower strip.
        crop_start = max((width - height) // 2, 0)
        crop_end = crop_start + height

        # Only the first two axes are indexed so that 2-D (single-channel)
        # images are accepted as well as multi-channel ones.
        cropped_images.append(image[:, crop_start:crop_end])

    return cropped_images

def _channel_mode(channel):
    """Return the most frequent value in *channel*; first-seen value wins ties."""
    values, first_seen, counts = np.unique(
        channel.ravel(), return_index=True, return_counts=True
    )
    tied = counts == counts.max()
    return values[tied][np.argmin(first_seen[tied])]


def masking(cropped_images):
    """Fill every non-background region of each image with solid green.

    The background is estimated per channel as the most frequent value of
    that channel. Pixels that differ from the background value in all three
    channels form a binary mask; each contour found in that mask is filled
    with (0, 255, 0).

    The output keeps the channel order of the input. No BGR/RGB conversion
    is applied, because the other functions in this file treat the arrays as
    RGB and the result is displayed with matplotlib; a conversion here would
    swap the red and blue channels of the pixels that are not filled.

    Args:
        cropped_images: Iterable of 3-channel image arrays.

    Returns:
        A list of new arrays, one per input image, with the detected regions
        filled. The input arrays are not modified.
    """
    mask_images = []
    for cropped_image in cropped_images:
        # Private C-contiguous copy: the input may be a column slice of a
        # larger array, and fillPoly draws into the array it is given.
        img_with_area = np.array(cropped_image, order="C")

        # Most frequent value of each channel, taken as the background.
        mode_0 = _channel_mode(img_with_area[:, :, 0])
        mode_1 = _channel_mode(img_with_area[:, :, 1])
        mode_2 = _channel_mode(img_with_area[:, :, 2])

        # NOTE(review): a pixel counts as foreground only when ALL three
        # channels differ from the background value, so a pixel that differs
        # in just one or two channels is treated as background - confirm
        # that this is intended.
        foreground = (
            (img_with_area[:, :, 0] != mode_0)
            & (img_with_area[:, :, 1] != mode_1)
            & (img_with_area[:, :, 2] != mode_2)
        )
        mask = foreground.astype("uint8") * 255

        contours, _ = cv2.findContours(mask, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

        # Fill one contour per call: passing several polygons to a single
        # fillPoly call could leave overlapping or nested areas unfilled.
        for contour in contours:
            img_with_area = cv2.fillPoly(
                img_with_area, [contour[:, 0, :]], (0, 255, 0),
                lineType=cv2.LINE_8, shift=0,
            )
        mask_images.append(img_with_area)
    return mask_images

if __name__ == "__main__":
    # Path of the PDF file to process.
    pdf_path = r"C:\Users\ME-PC2\OneDrive - Hiroshima University\デスクトップ\try_pptx-nii-convrt\s2023-03-31_14-00-140952-00001-00001-1.pdf"

    # Pipeline: render pages -> NumPy arrays -> color extraction
    # -> horizontal center crop -> region filling.
    pdf_images = get_pdf_images(pdf_path)
    numpy_images = images_to_numpy(pdf_images)
    combined_images = mask_extracted(numpy_images)
    cropped_images = resize_and_crop(combined_images)
    mask_images = masking(cropped_images)

    # One figure per page: the rendered page on the left, the processed
    # result on the right.
    for original, processed in zip(numpy_images, mask_images):
        panels = (('Original', original), ('Processed', processed))
        for slot, (caption, picture) in enumerate(panels, start=1):
            plt.subplot(1, 2, slot)
            plt.imshow(picture)
            plt.title(caption)
            plt.axis('off')

        plt.show()
