In [1]:
from pptx import Presentation
from docx import Document
import os
import re

def extract_english_words_from_slide(slide):
    """
    Extract the English words that appear in a slide's text.

    Args:
        slide: A slide object exposing ``shapes``; every shape whose
            ``has_text_frame`` is true contributes its ``text``.

    Returns:
        list[str]: Runs of ASCII letters in reading order, duplicates kept.
        Tokens touching a digit or underscore (e.g. ``H2O``) are ignored.
    """
    shape_texts = [shape.text for shape in slide.shapes if shape.has_text_frame]
    combined_text = ' '.join(shape_texts)
    # re.ASCII makes \b treat CJK characters as non-word characters. Without
    # it, English embedded directly in Chinese text (no surrounding spaces)
    # has no word boundary and is silently skipped.
    return re.findall(r'\b[a-zA-Z]+\b', combined_text, flags=re.ASCII)

def process_pptx_files(input_folder, output_word_file):
    """
    Export the English words of every PPTX file in a folder to a Word document.

    Each presentation that yields at least one word gets a level-2 heading
    (its file name) followed by one paragraph of space-separated words.

    Args:
        input_folder: Directory to scan (not recursive).
        output_word_file: Path of the .docx file to write.
    """
    doc = Document()
    doc.add_heading("Extracted English Words", level=1)

    # Sort the listing so the sections appear in a reproducible order;
    # os.listdir() returns entries in arbitrary order.
    for file_name in sorted(os.listdir(input_folder)):
        # "~$name.pptx" entries are lock files Office leaves next to a deck
        # that is currently open; they are not presentations and cannot be
        # parsed, so skip them.
        if file_name.startswith('~$'):
            continue
        # Compare the extension case-insensitively so "DECK.PPTX" is included.
        if not file_name.lower().endswith('.pptx'):
            continue

        pptx_path = os.path.join(input_folder, file_name)
        print(f"Processing: {pptx_path}")

        presentation = Presentation(pptx_path)

        # Collect the words of all slides of this presentation.
        all_words = []
        for slide in presentation.slides:
            all_words.extend(extract_english_words_from_slide(slide))

        # Only presentations that contain English words get a section.
        if all_words:
            doc.add_heading(file_name, level=2)
            doc.add_paragraph(' '.join(all_words))

    doc.save(output_word_file)
    print(f"All English words have been saved to {output_word_file}")

# Input folder and output Word file path for this run
input_folder = r"Z:\Jiacheng Zheng\山东大学课程\24 cell engineering (1)"  # replace with the folder that holds the PPTX files
output_word_file = r"Z:\Jiacheng Zheng\山东大学课程\24 cell engineering (1)\your_output_word_file.docx"  # replace with the path of the Word file to generate

process_pptx_files(input_folder, output_word_file)


Processing: Z:\Jiacheng Zheng\山东大学课程\24 cell engineering (1)\3.2. 细胞核移植技术.pptx
All English words have been saved to Z:\Jiacheng Zheng\山东大学课程\24 cell engineering (1)\your_output_word_file.docx


In [2]:
import os
import re
from pptx import Presentation
from docx import Document
from googletrans import Translator
from win32com import client

def convert_ppt_to_pptx(file_path):
    """
    Convert a .ppt file to .pptx through PowerPoint COM automation.

    Requires Windows with Microsoft Office installed.

    Args:
        file_path: Path of the .ppt file to convert.

    Returns:
        str: ``file_path + "x"``, the path the .pptx was saved under.

    Raises:
        Exception: Whatever the COM layer raises when PowerPoint cannot be
            started or the file cannot be opened or saved. The presentation
            and the application are still closed in that case.
    """
    new_file_path = file_path + "x"
    # Hand PowerPoint absolute paths: the COM server is a separate process,
    # so a relative path would not be resolved against this notebook's
    # working directory.
    source_path = os.path.abspath(file_path)
    target_path = os.path.abspath(new_file_path)

    # NOTE(review): Dispatch may attach to a PowerPoint instance the user
    # already has open, in which case Quit() below closes it too -- confirm
    # whether that matters for this workflow.
    ppt_app = client.Dispatch("PowerPoint.Application")
    try:
        ppt = ppt_app.Presentations.Open(source_path, WithWindow=False)
        try:
            # 24 selects the .pptx format (ppSaveAsOpenXMLPresentation).
            ppt.SaveAs(target_path, 24)
        finally:
            ppt.Close()
    finally:
        # Always shut PowerPoint down, otherwise a failed conversion leaves
        # a hidden PowerPoint process behind.
        ppt_app.Quit()
    return new_file_path

def extract_english_phrases_from_slide(slide):
    """
    Extract multi-word English phrases from a slide's text.

    A phrase is a run of two or more ASCII-letter words separated only by
    whitespace. Phrases are matched inside one paragraph of one shape at a
    time, so the text of separate text boxes or separate paragraphs is never
    fused into a single phrase.

    Args:
        slide: A slide object exposing ``shapes``; every shape whose
            ``has_text_frame`` is true contributes its ``text``.

    Returns:
        list[str]: Phrases in reading order, duplicates kept, with every
        internal whitespace run collapsed to a single space.
    """
    # re.ASCII makes \b treat CJK characters as non-word characters, so
    # English embedded directly in Chinese text is still found.
    phrase_pattern = re.compile(r'\b[a-zA-Z][a-zA-Z\s]*[a-zA-Z]\b', re.ASCII)

    phrases = []
    for shape in slide.shapes:
        if not shape.has_text_frame:
            continue
        # Paragraphs are separated by "\n" in the shape text; matching each
        # one on its own keeps \s from bridging paragraph boundaries.
        for paragraph_text in shape.text.split('\n'):
            for candidate in phrase_pattern.findall(paragraph_text):
                words = candidate.split()
                # Single words are not phrases.
                if len(words) > 1:
                    phrases.append(' '.join(words))
    return phrases

def process_ppt_and_pptx_files(input_folder, output_word_file):
    """
    Extract English phrases from every PPT/PPTX file in a folder, translate
    them to Chinese and save the result to a Word document.

    .ppt files are first converted with ``convert_ppt_to_pptx``. Files that
    cannot be converted or opened are reported and skipped, so one bad deck
    does not discard the work done for the others. Each presentation with at
    least one phrase gets a level-2 heading (its file name) followed by one
    ``phrase -> translation`` paragraph per phrase.

    Args:
        input_folder: Directory to scan (not recursive).
        output_word_file: Path of the .docx file to write.
    """
    doc = Document()
    doc.add_heading("提取的英文短语和翻译", level=1)

    translator = Translator()
    # Successful translations, keyed by phrase, so a phrase repeated across
    # slides or decks costs only one request to the translation service.
    translation_cache = {}

    # Sort the listing so the sections appear in a reproducible order;
    # os.listdir() returns entries in arbitrary order.
    file_names = sorted(os.listdir(input_folder))
    existing_names = {name.lower() for name in file_names}

    for file_name in file_names:
        lower_name = file_name.lower()
        # "~$name.pptx" entries are lock files Office leaves next to an open
        # deck; they are not presentations.
        if file_name.startswith('~$'):
            continue
        if not lower_name.endswith(('.ppt', '.pptx')):
            continue

        pptx_path = os.path.join(input_folder, file_name)

        # A .ppt file has to be converted to .pptx first.
        if lower_name.endswith('.ppt'):
            # If the converted file is already in the folder (for example
            # from an earlier run), it is processed on its own. Converting
            # again would overwrite it and report the same deck twice.
            if lower_name + 'x' in existing_names:
                print(f"Skipping {file_name}: a matching .pptx is already in the folder")
                continue
            try:
                pptx_path = convert_ppt_to_pptx(pptx_path)
                print(f"Converted {file_name} to {pptx_path}")
            except Exception as e:
                print(f"Failed to convert {file_name}: {e}")
                continue

        print(f"Processing: {pptx_path}")
        # Same skip-and-continue policy as for conversion failures.
        try:
            presentation = Presentation(pptx_path)
        except Exception as e:
            print(f"Failed to open {file_name}: {e}")
            continue

        # Collect the phrases of all slides of this presentation.
        all_phrases = []
        for slide in presentation.slides:
            all_phrases.extend(extract_english_phrases_from_slide(slide))

        if not all_phrases:
            continue

        doc.add_heading(file_name, level=2)
        for phrase in all_phrases:
            if phrase in translation_cache:
                translation = translation_cache[phrase]
            else:
                try:
                    translation = translator.translate(phrase, src='en', dest='zh-cn').text
                    translation_cache[phrase] = translation
                except Exception as e:
                    # Failures are written into the document but not cached,
                    # so a later occurrence of the phrase is retried.
                    translation = f"翻译失败: {e}"

            doc.add_paragraph(f"{phrase} -> {translation}")

    doc.save(output_word_file)
    print(f"All phrases and translations have been saved to {output_word_file}")

# Input folder and output Word file path for this run
input_folder = r"Z:\Jiacheng Zheng\山东大学课程\24 cell engineering (1)"  # replace with the folder that holds the PPT/PPTX files
output_word_file = r"Z:\Jiacheng Zheng\山东大学课程\24 cell engineering (1)\your_output_word_file.docx"  # replace with the path of the Word file to generate

process_ppt_and_pptx_files(input_folder, output_word_file)


Converted 1.细胞培养.ppt to Z:\Jiacheng Zheng\山东大学课程\24 cell engineering (1)\1.细胞培养.pptx
Processing: Z:\Jiacheng Zheng\山东大学课程\24 cell engineering (1)\1.细胞培养.pptx
Converted 1绪论cul.ppt to Z:\Jiacheng Zheng\山东大学课程\24 cell engineering (1)\1绪论cul.pptx
Processing: Z:\Jiacheng Zheng\山东大学课程\24 cell engineering (1)\1绪论cul.pptx
Converted 2-1实验室.ppt to Z:\Jiacheng Zheng\山东大学课程\24 cell engineering (1)\2-1实验室.pptx
Processing: Z:\Jiacheng Zheng\山东大学课程\24 cell engineering (1)\2-1实验室.pptx
Converted 2-2基本培养技术.ppt to Z:\Jiacheng Zheng\山东大学课程\24 cell engineering (1)\2-2基本培养技术.pptx
Processing: Z:\Jiacheng Zheng\山东大学课程\24 cell engineering (1)\2-2基本培养技术.pptx
Converted 2.干细胞工程.ppt to Z:\Jiacheng Zheng\山东大学课程\24 cell engineering (1)\2.干细胞工程.pptx
Processing: Z:\Jiacheng Zheng\山东大学课程\24 cell engineering (1)\2.干细胞工程.pptx
Converted 3-1植物脱毒.ppt to Z:\Jiacheng Zheng\山东大学课程\24 cell engineering (1)\3-1植物脱毒.pptx
Processing: Z:\Jiacheng Zheng\山东大学课程\24 cell engineering (1)\3-1植物脱毒.pptx
Converted 3-2植物快繁.ppt to Z:\Jiacheng 