# In [None]:
import os
import re
from collections import defaultdict
from docx import Document
from docx.shared import Inches
import pandas as pd
from PIL import Image
import io
from datetime import datetime

def extract_text_from_table_cell(cell):
    """Return the plain text held by a table cell.

    The text of every paragraph in ``cell.paragraphs`` is concatenated in
    order, with no separator between paragraphs, and leading/trailing
    whitespace is stripped from the combined result.
    """
    fragments = [para.text for para in cell.paragraphs]
    return "".join(fragments).strip()

def get_unique_folder_name(base_name, existing_names):
    """Return a folder name that is not yet present in ``existing_names``.

    ``base_name`` is used unchanged when it is free; otherwise the first
    free name of the form ``base_name(1)``, ``base_name(2)``, ... is chosen.
    The returned name is added to ``existing_names`` (the set is mutated).
    """
    candidate = base_name
    suffix = 0
    # Walk the numbered variants until one is unused.
    while candidate in existing_names:
        suffix += 1
        candidate = f"{base_name}({suffix})"

    existing_names.add(candidate)
    return candidate

def _image_filename_for(cell_title, page_code, position):
    """Build the base file name for one extracted image.

    Args:
        cell_title: Caption text associated with the image; may be empty.
        page_code: Card number, placed between full-width parentheses.
        position: 1-based index used for the ``图片N`` fallback stem when the
            sanitized title is empty.

    Returns:
        A name of the form ``<stem>（<code>）.png``.
    """
    safe_title = re.sub(r'[^\w\u4e00-\u9fff\-_()（）]', '_', cell_title)
    # Drop the underscores the substitution leaves at the ends, then cap the length.
    safe_title = safe_title.strip('_')[:50]
    # The card number is raw document text; neutralise path separators and
    # characters that are not allowed in file names so the save cannot be
    # redirected into (or fail on) a non-existent sub-path.
    safe_code = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', str(page_code))
    stem = safe_title or f"图片{position}"
    return f"{stem}（{safe_code}）.png"


def _claim_unique_filename(filename, taken):
    """Return ``filename`` or a ``_2``, ``_3``... variant not yet in ``taken``.

    The chosen name is added to ``taken`` so later calls cannot reuse it.
    """
    stem, ext = os.path.splitext(filename)
    candidate = filename
    serial = 1
    while candidate in taken:
        serial += 1
        candidate = f"{stem}_{serial}{ext}"
    taken.add(candidate)
    return candidate


def extract_images_from_table(table, folder_path, page_code, doc):
    """Save every picture embedded in ``table`` into ``folder_path``.

    Each picture is named after a caption: for cells in the first row the
    cell's own text, otherwise the text of the cell directly above (same
    column index in the previous row). When the caption is empty after
    sanitizing, ``图片N`` is used instead.

    Args:
        table: Table object exposing ``rows`` / ``cells`` / ``paragraphs`` /
            ``runs`` in the python-docx style.
        folder_path: Existing directory the image files are written to.
        page_code: Card number appended to every file name.
        doc: Document owning the table; image bytes are looked up through
            ``doc.part.related_parts``.

    Returns:
        ``(image_info, expected_images)`` where ``image_info`` lists the
        successfully written files (``title``, ``filename``, ``filepath``)
        and ``expected_images`` lists every detected picture with its
        ``expected_filename`` and a ``saved`` flag.
    """
    image_info = []
    expected_images = []

    # Pass 1: collect each embedded picture together with its caption.
    pending = []
    # A merged cell is reported once per grid position it covers, which
    # would make the same drawing element show up repeatedly; remember the
    # elements already handled so each picture is processed exactly once.
    seen_drawings = set()
    previous_cells = None

    for row_idx, row in enumerate(table.rows):
        cells = list(row.cells)
        for cell_idx, cell in enumerate(cells):
            if previous_cells is None:
                # First row: the cell's own text serves as the caption.
                cell_title = extract_text_from_table_cell(cell)
            else:
                try:
                    cell_title = extract_text_from_table_cell(previous_cells[cell_idx])
                except IndexError:
                    # The previous row has fewer cells than this one.
                    cell_title = ""

            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    for drawing in run._element.xpath('.//w:drawing'):
                        if drawing in seen_drawings:
                            continue
                        seen_drawings.add(drawing)
                        try:
                            # The relationship id links the drawing to its image part.
                            for blip in drawing.xpath('.//a:blip[@r:embed]'):
                                embed_id = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
                                if embed_id:
                                    pending.append({
                                        'embed_id': embed_id,
                                        'cell_title': cell_title,
                                        'row_idx': row_idx,
                                        'cell_idx': cell_idx
                                    })
                        except Exception as e:
                            print(f"检测图片元素时出错: {e}")
                            continue
        previous_cells = cells

    # Pass 2: write the collected pictures to disk.
    used_filenames = set()
    for idx, img in enumerate(pending):
        cell_title = img['cell_title']
        # Reserve a distinct name up front so two pictures sharing a caption
        # do not overwrite each other, and so the failure record below can
        # report the same name the save attempted.
        filename = _claim_unique_filename(
            _image_filename_for(cell_title, page_code, idx + 1),
            used_filenames,
        )
        filepath = os.path.join(folder_path, filename)

        try:
            image_part = doc.part.related_parts[img['embed_id']]
            image_data = image_part._blob
            # NOTE: the bytes are written unchanged; the ".png" suffix is
            # applied regardless of the embedded image's actual format.
            with open(filepath, 'wb') as f:
                f.write(image_data)
        except Exception as e:
            # Best effort per image: report, record as missing, carry on.
            print(f"保存图片时出错: {e}")
            expected_images.append({
                'title': cell_title,
                'expected_filename': filename,
                'saved': False
            })
            continue

        image_info.append({
            'title': cell_title,
            'filename': filename,
            'filepath': filepath
        })
        expected_images.append({
            'title': cell_title,
            'expected_filename': filename,
            'saved': True
        })
        print(f"    已保存图片: {filename} (标题: {cell_title})")

    return image_info, expected_images

def _match_info_card_label(cell_text):
    """Return the info-card key whose label occurs in ``cell_text``, else None.

    Labels are tested in a fixed order and the first hit wins. Matching is
    by substring, so a label cell may carry extra text around the label.
    """
    for label in ('编号', '天气', '路面材质', '长(m)', '宽(m)', '埋深(m)', '净深(m)'):
        if label in cell_text:
            return label
    # Area / volume labels are accepted with any of several unit spellings.
    # Further spellings can be added to the tuples below.
    if '面积' in cell_text and any(unit in cell_text for unit in ('m²', '㎡', '平方米')):
        return '面积(m²)'
    if '体积' in cell_text and any(unit in cell_text for unit in ('m³', '立方米')):
        return '体积(m³)'
    for label in ('位置描述', '验证时间', '隐患等级', '隐患类型', '雷达型号'):
        if label in cell_text:
            return label
    if '天线主频' in cell_text and 'MHz' in cell_text:
        return '天线主频（MHz）'
    for label in ('成因初步分析', '初步处置建议', '钻探及其他验证结果'):
        if label in cell_text:
            return label
    return None


def parse_info_card_from_table(table):
    """Extract the info-card fields from one table.

    Every cell is treated as a potential label. For ordinary fields the
    value is the text of the cell immediately to the right. For the
    coordinate fields, a cell whose text is exactly ``E`` or ``N`` and whose
    left neighbour contains ``经纬度坐标`` takes its value from the cell two
    positions to the right.

    Args:
        table: Table object exposing ``rows`` and ``row.cells``.

    Returns:
        Dict mapping field name to the extracted text. Fields that were not
        found are absent. Any exception raised while parsing is reported via
        ``print`` and the fields collected so far are returned.
    """
    info_card = {}
    coordinate_keys = {'E': '经纬度坐标E', 'N': '经纬度坐标N'}

    try:
        for row in table.rows:
            # Read the cell sequence once per row instead of on every access.
            cells = row.cells
            cell_count = len(cells)

            for cell_idx, cell in enumerate(cells):
                # Every rule reads at least one cell to the right.
                if cell_idx + 1 >= cell_count:
                    continue

                cell_text = extract_text_from_table_cell(cell)

                field = _match_info_card_label(cell_text)
                if field is not None:
                    info_card[field] = extract_text_from_table_cell(cells[cell_idx + 1])
                    continue

                if cell_text in coordinate_keys:
                    # The value sits two cells to the right, so that index
                    # (not just cell_idx + 1) must exist; otherwise the
                    # lookup would raise and abort parsing of the whole table.
                    if cell_idx >= 1 and cell_idx + 2 < cell_count:
                        left_cell_text = extract_text_from_table_cell(cells[cell_idx - 1])
                        if '经纬度坐标' in left_cell_text:
                            info_card[coordinate_keys[cell_text]] = extract_text_from_table_cell(cells[cell_idx + 2])

        # Debug preview of what was recognised (non-empty values only).
        print("  解析结果预览:")
        for key, value in info_card.items():
            if value:
                print(f"    {key}: {value[:50]}{'...' if len(str(value)) > 50 else ''}")

    except Exception as e:
        # Best effort: keep whatever was parsed before the failure.
        print(f"解析信息卡时出错: {e}")

    return info_card

def check_data_completeness(info_card, expected_images):
    """Report which required fields and which images are missing.

    Args:
        info_card: Dict of parsed field name -> text.
        expected_images: List of dicts carrying ``title`` and ``saved`` keys.

    Returns:
        ``(missing_fields, missing_images)``: the required field names that
        are absent or contain only whitespace, and the titles of the images
        whose ``saved`` flag is falsy.
    """
    # Fields every info card is required to provide.
    required_fields = (
        '编号', '天气', '路面材质', '长(m)', '宽(m)', '埋深(m)', '净深(m)',
        '面积(m²)', '体积(m³)', '位置描述', '验证时间', '隐患等级', '隐患类型',
        '雷达型号', '天线主频（MHz）', '经纬度坐标E', '经纬度坐标N',
        '成因初步分析', '初步处置建议', '钻探及其他验证结果',
    )

    missing_fields = [
        name for name in required_fields
        if not (name in info_card and info_card[name].strip())
    ]
    missing_images = [
        image['title'] for image in expected_images if not image['saved']
    ]

    return missing_fields, missing_images

def generate_log_report(all_info_cards, all_expected_images, log_file_path):
    """Write a UTF-8 text report on the completeness of every info card.

    The report has a timestamped header, one entry per card (paired
    position-wise with its expected-image list) stating whether it is
    complete and, if not, which fields and images are missing, and a closing
    summary with totals and the completion percentage.

    Args:
        all_info_cards: List of parsed info-card dicts.
        all_expected_images: List of expected-image lists, parallel to
            ``all_info_cards``.
        log_file_path: Destination file; overwritten if it exists.
    """
    banner = "=" * 60 + "\n"
    divider = "-" * 40 + "\n"
    complete_cards = 0
    incomplete_cards = 0

    with open(log_file_path, 'w', encoding='utf-8') as log_file:
        # Header
        log_file.writelines([
            banner,
            "信息卡处理日志报告\n",
            f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
            banner,
            "\n",
        ])

        # One entry per card
        paired = zip(all_info_cards, all_expected_images)
        for position, (info_card, expected_images) in enumerate(paired, start=1):
            card_number = info_card.get('编号', f'信息卡_{position}')
            missing_fields, missing_images = check_data_completeness(info_card, expected_images)

            entry = [f"编号：{card_number}信息卡\n"]
            if missing_fields or missing_images:
                entry.append("状态：存在缺失\n")
                incomplete_cards += 1
                if missing_fields:
                    entry.append(f"缺失字段：{', '.join(missing_fields)}\n")
                entry.extend(f"图片未保存：{img_title}\n" for img_title in missing_images)
            else:
                entry.append("状态：完整输出\n")
                complete_cards += 1
            entry.append(divider)
            log_file.writelines(entry)

        # Summary
        total_cards = complete_cards + incomplete_cards
        summary = [
            "\n",
            banner,
            "处理总结\n",
            banner,
            f"总信息卡数量：{total_cards}\n",
            f"完整输出：{complete_cards}张\n",
            f"存在缺失：{incomplete_cards}张\n",
        ]
        if total_cards > 0:
            completion_rate = (complete_cards / total_cards) * 100
            summary.append(f"完整率：{completion_rate:.2f}%\n")
        summary.append(banner)
        log_file.writelines(summary)

def process_word_document(word_file_path, output_excel_path, images_folder_path, log_file_path):
    """Extract every info card from a Word file into Excel, images and a log.

    Each table in the document is treated as one info card. For every card
    the fields are parsed, its pictures are saved into a dedicated
    sub-folder of ``images_folder_path``, and a row is added to the output
    sheet. A completeness log is always written.

    Args:
        word_file_path: Path of the ``.docx`` file to read.
        output_excel_path: Path of the Excel file to write. If neither
            ``openpyxl`` nor ``xlsxwriter`` can be imported, a CSV with the
            same base name is written instead.
        images_folder_path: Root directory for the per-card image folders;
            created if missing.
        log_file_path: Path of the text log to write.
    """
    os.makedirs(images_folder_path, exist_ok=True)

    doc = Document(word_file_path)

    # Parallel lists: one parsed card and one expected-image list per table.
    all_info_cards = []
    all_expected_images = []

    # Folder names handed out so far, so repeated card numbers stay distinct.
    used_folder_names = set()

    for table_count, table in enumerate(doc.tables, start=1):
        print(f"正在处理第 {table_count} 个信息卡...")

        try:
            info_card = parse_info_card_from_table(table)

            # `or` rather than a .get default: a card whose number cell exists
            # but is empty must also fall back, otherwise the folder name is
            # '' and the images land directly in the root images folder.
            page_code = info_card.get('编号') or f'信息卡_{table_count}'

            # The number is raw document text; replace path separators and
            # characters that are invalid in file/directory names before it
            # is used on disk.
            safe_code = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', page_code)

            unique_folder_name = get_unique_folder_name(safe_code, used_folder_names)

            page_folder_path = os.path.join(images_folder_path, unique_folder_name)
            os.makedirs(page_folder_path, exist_ok=True)

            image_info, expected_images = extract_images_from_table(
                table, page_folder_path, safe_code, doc
            )

            print(f"  - 编号: {page_code}")
            print(f"  - 文件夹: {unique_folder_name}")
            print(f"  - 提取图片数量: {len(image_info)}")
            print(f"  - 期望图片数量: {len(expected_images)}")

            all_info_cards.append(info_card)
            all_expected_images.append(expected_images)

        except Exception as e:
            print(f"处理第 {table_count} 个信息卡时出错: {e}")
            # Still add a placeholder so rows stay aligned with the tables.
            all_info_cards.append({'编号': f'错误_信息卡_{table_count}'})
            all_expected_images.append([])

    generate_log_report(all_info_cards, all_expected_images, log_file_path)
    print(f"日志报告已生成: {log_file_path}")

    if not all_info_cards:
        print("未找到任何信息卡数据!")
        return

    # Column order of the output sheet.
    columns = [
        '编号', '天气', '路面材质', '长(m)', '宽(m)', '埋深(m)', '净深(m)',
        '面积(m²)', '体积(m³)', '位置描述', '验证时间', '隐患等级', '隐患类型',
        '雷达型号', '天线主频（MHz）', '经纬度坐标E', '经纬度坐标N',
        '成因初步分析', '初步处置建议', '钻探及其他验证结果',
    ]

    # One row per card; fields that were not parsed become empty strings.
    df = pd.DataFrame(
        [{col: info_card.get(col, '') for col in columns} for info_card in all_info_cards],
        columns=columns,
    )

    # Prefer openpyxl, then xlsxwriter, and fall back to CSV if neither
    # engine can be imported.
    try:
        df.to_excel(output_excel_path, index=False, engine='openpyxl')
    except ImportError:
        try:
            df.to_excel(output_excel_path, index=False, engine='xlsxwriter')
        except ImportError:
            # Swap only the real extension; a plain str.replace would leave
            # the path untouched for non-.xlsx names and could rewrite a
            # directory component that happens to contain ".xlsx".
            csv_path = os.path.splitext(output_excel_path)[0] + '.csv'
            df.to_csv(csv_path, index=False, encoding='utf-8-sig')
            print(f"注意：由于缺少Excel库，数据已保存为CSV格式: {csv_path}")
            return

    print(f"\n处理完成!")
    print(f"总共处理了 {len(all_info_cards)} 个信息卡")
    print(f"Excel文件已保存到: {output_excel_path}")
    print(f"图片文件夹: {images_folder_path}")
    print(f"日志文件: {log_file_path}")

# Usage example
def main():
    """Run the extraction with the paths configured below.

    Verifies that the Word file exists, reports which Excel engine will be
    available, then calls ``process_word_document``. Any exception raised by
    the processing step is printed together with its traceback instead of
    propagating.
    """
    # Path configuration. A leading "~" is only meaningful after
    # os.path.expanduser(); without it (or with a full-width "～") the paths
    # would point at a literal directory of that name under the working
    # directory.
    word_file_path = os.path.expanduser("~/深圳市探地雷达检测地面坍塌隐患信息卡.docx")  # replace with the path of your Word file
    output_excel_path = os.path.expanduser("~/信息卡数据.xlsx")
    images_folder_path = os.path.expanduser("~/信息卡图片")
    log_file_path = os.path.expanduser("~/处理日志.txt")

    # Make sure the Word file exists before doing any work.
    if not os.path.exists(word_file_path):
        print(f"错误: Word文件 '{word_file_path}' 不存在!")
        print("请将Word文件放在脚本同目录下，或修改word_file_path变量为正确路径")
        return

    # Probe the optional Excel engines purely to tell the user what to expect;
    # the imported modules themselves are not used here.
    try:
        import openpyxl  # noqa: F401
    except ImportError:
        try:
            import xlsxwriter  # noqa: F401
            print("注意：将使用xlsxwriter引擎保存Excel文件")
        except ImportError:
            print("警告：没有安装openpyxl或xlsxwriter，将保存为CSV格式")

    try:
        process_word_document(word_file_path, output_excel_path, images_folder_path, log_file_path)
    except Exception as e:
        # Top-level boundary: report the failure with full detail.
        print(f"处理文档时发生错误: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
    