In [24]:
import pytesseract
from PIL import Image
import os
from pdf2image import convert_from_path
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter,column_index_from_string 
import re
import easyocr
import pandas as pd
# Folder scanned for input PDF files (passed to pdf2img by main).
pdf_folder_path = './pdf/'
# Excel template / output locations. NOTE(review): neither constant is
# referenced by the code visible in this file -- confirm whether they are
# still needed.
excel_template_path = 'template.xlsx'
output_excel_path = 'output.xlsx'
# Directory prefix under which crop_images writes its intermediate PNG files.
temp_img_path = "./temp_img/" 

""" 
    用于将pdf文件夹中的文件逐个提取成为图片 
    pdf_folder_path: pdf文件夹的路径
    return: images，提取出来的图像列表
"""
def pdf2img(pdf_folder_path):
    """Render the first page of every PDF in a folder as an image.

    Directory entries are visited in sorted name order; only names ending in
    '.pdf' are converted, and only the first page of each file is kept.

    pdf_folder_path: path of the folder that holds the PDF files
    return: list with one page image per PDF file, in sorted file-name order
    """
    pdf_names = [
        entry
        for entry in sorted(os.listdir(pdf_folder_path))
        if entry.endswith('.pdf')
    ]

    first_pages = []
    for entry in pdf_names:
        pages = convert_from_path(os.path.join(pdf_folder_path, entry))
        first_pages.append(pages[0])
    return first_pages

""" 
    裁剪图片函数
    images: 图像列表
    return: person_info_img_cropped_list, 用户信息；test_data_img_cropped_list, 测试数据；data_curve_img_cropped_list, 数据曲线图
"""
def crop_images(images):
    """Crop every page image into its three report regions and save them.

    For each page the full image and the three crops are written as PNG files
    under ``temp_img_path`` (``page_N.png``, ``person_info_N_cropped.png``,
    ``test_data_N_cropped.png``, ``data_curve_N_cropped.png``, N starting at 1).

    Each crop box is clamped to the size of the page it is applied to. The
    clamped values are computed per page, so an undersized page no longer
    shrinks the boxes used for the pages that follow it.

    images: list of page images (objects offering save/crop/width/height)
    return: (person_info_img_cropped_list, test_data_img_cropped_list,
             data_curve_img_cropped_list), each with one crop per page
    """
    # Nominal (left, top, right, bottom) pixel boxes of the three regions.
    regions = (
        ("person_info", (245, 360, 1340, 540)),    # personal information
        ("test_data", (245, 540, 1050, 1630)),     # test data table
        ("data_curve", (1050, 540, 1340, 1630)),   # data curve plot
    )
    cropped = {region_name: [] for region_name, _ in regions}

    # Saving fails if the temp directory is missing, so create it up front.
    os.makedirs(temp_img_path, exist_ok=True)

    for page_number, image in enumerate(images, start=1):
        # Keep a copy of the full page on disk next to the crops.
        image.save(os.path.join(temp_img_path, f"page_{page_number}.png"))

        for region_name, (left, top, right, bottom) in regions:
            # Clamp against THIS page only; never overwrite the nominal box.
            box = (
                max(0, left),
                max(0, top),
                min(image.width, right),
                min(image.height, bottom),
            )
            region_img = image.crop(box)
            region_img.save(
                os.path.join(temp_img_path, f"{region_name}_{page_number}_cropped.png")
            )
            cropped[region_name].append(region_img)

    return cropped["person_info"], cropped["test_data"], cropped["data_curve"]
    
""" 
    使用Tesseract进行OCR识别 
    img_cropped_list: 裁剪后的图像列表
    return: person_info_list，识别的用户信息文本列表；test_data_list，识别的测试数据文本列表
""" 
def ocr(person_info_img_cropped_list, test_data_img_cropped_list):
    """Run EasyOCR over the cropped region images saved under temp_img_path.

    The text is read from the PNG files ``person_info_N_cropped.png`` and
    ``test_data_N_cropped.png`` (N starting at 1); the list arguments only
    determine how many files of each kind are read.

    A single EasyOCR reader is created and reused for every image instead of
    building a new one per image.

    person_info_img_cropped_list: cropped personal-information images
    test_data_img_cropped_list: cropped test-data images
    return: (person_info_list, test_data_list); each entry is the list of
            text fragments recognised in the corresponding image
    """
    # Nothing to recognise: skip creating the reader altogether.
    if not person_info_img_cropped_list and not test_data_img_cropped_list:
        return [], []

    reader = easyocr.Reader(['ch_sim', 'en'])

    def read_region(prefix, number):
        # detail=0 makes readtext return only the recognised strings.
        path = os.path.join(temp_img_path, f'{prefix}_{number}_cropped.png')
        return reader.readtext(path, detail=0)

    person_info_list = [
        read_region('person_info', number)
        for number, _ in enumerate(person_info_img_cropped_list, start=1)
    ]
    test_data_list = [
        read_region('test_data', number)
        for number, _ in enumerate(test_data_img_cropped_list, start=1)
    ]

    return person_info_list, test_data_list

""" 判断字符串是否为小数 """
def is_float(s):
    """Return True if ``s`` can be converted with float(), otherwise False.

    Values that float() rejects because of their type (for example None or a
    list) yield False instead of raising TypeError.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True

def _pair_labels_with_values(fragments, headers):
    """Build a {header: value-or-None} record from one page's OCR fragments."""
    record = {header: None for header in headers}
    for header in record:
        if header not in fragments:
            continue
        value_pos = fragments.index(header) + 1
        # The label is the last fragment, or is directly followed by another
        # label: there is no value to take.
        if value_pos >= len(fragments) or fragments[value_pos] in record:
            continue
        value = fragments[value_pos]
        # A second non-label fragment (e.g. '160' followed by 'Cm') is glued
        # onto the value.
        extra_pos = value_pos + 1
        if extra_pos < len(fragments) and fragments[extra_pos] not in record:
            value += fragments[extra_pos]
        record[header] = value
    return record


def _clean_test_data(fragments):
    """Drop leading non-data fragments and non-numeric noise from one page."""
    body = fragments[2:]
    # Remove positions 0 and 2 of the remainder, as the original
    # `del list1[0]; del list1[1]` sequence did, without failing on short input.
    # NOTE(review): which OCR fragments sit at these positions is not visible
    # here -- confirm against real OCR output.
    body = body[1:2] + body[3:]
    # The first two remaining items are kept as-is; everything after them is
    # kept only when it is numeric.
    numeric_tail = [
        item for item in body[2:] if item.isdigit() or is_float(item)
    ]
    return body[:2] + numeric_tail


def text2excel(person_info_list, test_data_list):
    """Pair OCR'd personal-info labels with values and clean the OCR'd test data.

    The template workbook 'template.xlsx' is only read: the first 11 cells of
    the first row of its active sheet are used as the personal-info field
    names. Nothing is written to Excel by this function. Each cleaned result
    is printed and the results are also returned.

    Every person gets a fresh record, so a field missing on one page is None
    rather than the value left over from the previous page. The input lists
    are not modified.

    person_info_list: one list of OCR text fragments per page, with labels
                      and values interleaved
    test_data_list: one list of OCR text fragments per page
    return: (person_records, cleaned_test_data) -- a list of
            {header: value-or-None} dicts and a list of cleaned fragment lists
    """
    template_excel_path = 'template.xlsx'
    workbook = load_workbook(template_excel_path)
    sheet = workbook.active  # the field names are taken from the active sheet

    # Field names: first 11 cells of the first row.
    headers = [cell.value for cell in next(sheet.iter_rows())][:11]

    person_records = []
    for person_info in person_info_list:
        # Strip one trailing ASCII or full-width colon from every fragment.
        fragments = [
            item[:-1] if item.endswith((":", "：")) else item
            for item in person_info
        ]
        print(fragments)

        record = _pair_labels_with_values(fragments, headers)
        print(record)
        person_records.append(record)

    cleaned_test_data = []
    for test_data in test_data_list:
        cleaned = _clean_test_data(test_data)
        print(cleaned)
        cleaned_test_data.append(cleaned)

    return person_records, cleaned_test_data


"""         # 填充数据到Excel表格
        for key, value in data_dict.items():
            # 找到对应的列号
            col_index = column_index_from_string(key)  # 使用column_index_from_string获取列号
            # 从第二行开始填充数据（假设表头在第一行）
            for row in sheet.iter_rows(min_row=2, max_row=sheet.max_row, values_only=False):
                for cell in row:
                    if cell.value == key:  # 找到匹配的列名
                        cell.offset(row=-1).value = value  # 填充数据到上一行的对应列
                        break  # 找到后不需要继续迭代

    # 保存工作簿
    workbook.save('filled_template.xlsx') """
            

def main():
    """Run the whole pipeline: PDFs -> page images -> crops -> OCR -> text2excel."""
    page_images = pdf2img(pdf_folder_path)
    person_crops, test_crops, _curve_crops = crop_images(page_images)
    person_texts, test_texts = ocr(person_crops, test_crops)
    # Debugging aid: print each entry of person_texts and test_texts here to
    # inspect the raw OCR output before it is post-processed.
    text2excel(person_texts, test_texts)
    

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

['姓名', '杨会琴', '测试号', '16889648-19', '住院号', '1973315', '临床印象', '出生日期', '1957/11/30', '操作者', '性别', '女', '籍贯', '身高', '160', 'Cm', '体重', '73', 'kg', '年龄', '66', '岁']
['24/3/04', '16:05:20"', '0.52', '0.72', '138.8', '20.00', '53.44', '267.2', '10.43', '38.67', '370.8', '0.70', '0.49', '70.4', '2.59', '2.32', '89.5', '2.48', '2.17', '87.4', '2.07', '1.64', '79.3', '75.68', '76.56', '70.78', '92.5', '5.71', '5.06', '88.6', '75', '5.10', '3.87', '75.9', '3.43', '1.43', '41.7', '1.14', '0.46', '40.6', '2.68', '0.98', '36.7', '84.80', '56.96', '67.2', '84.80', '49.26', '58.1', '1.95', '1.49', '76.3', '4.77', '3.49', '73.2', '41.40', '42.64', '103.0', '2.65', '1.98', '74.8', '55.66', '56.70', '101.9', '7.11', '6.27', '88.1']
{'姓名': '杨会琴', '测试号': '16889648-19', '住院号': '1973315', '临床印象': None, '出生日期': '1957/11/30', '操作者': None, '性别': '女', '籍贯': None, '身高': '160Cm', '体重': '73kg', '年龄': '66岁'}


In [23]:
# Scratch check: str.isdigit() on the non-numeric string 'VT' (prints False).
a = ['VT']
print(a[0].isdigit())

False
