In [21]:
from openai import OpenAI
from PIL import Image, ImageDraw, ImageFont
import base64
from io import BytesIO
import json
import os

class ImageTranslator:
    def __init__(self, api_key):
        self.client = OpenAI(api_key=api_key, base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",)
        self.font_paths = [
            "/System/Library/Fonts/PingFang.ttc",
            "/System/Library/Fonts/STHeiti Light.ttc",
            "/Library/Fonts/Arial Unicode.ttf"
        ]
        self.font_cache = {}
        
    def get_font(self, size):
        """获取可用的中文字体"""
        print("尝试加载字体，大小:", size)
        if size in self.font_cache:
            return self.font_cache[size]
            
        for font_path in self.font_paths:
            print(f"尝试字体路径: {font_path}")
            try:
                if os.path.exists(font_path):
                    print(f"找到字体文件: {font_path}")
                    font = ImageFont.truetype(font_path, size)
                    self.font_cache[size] = font
                    return font
                else:
                    print(f"字体文件不存在: {font_path}")
            except Exception as e:
                print(f"加载字体失败: {e}")
        
    def encode_image(self, image_path):
        """将图片转换为base64编码"""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def get_text_positions(self, image_path):
        """使用GPT-4 Vision获取图片中的文本及位置"""
        base64_image = self.encode_image(image_path)
        
        # 构建提示词，要求输出JSON格式的文本位置信息
        prompt = """请识别图片中的所有英文文本。对每个文本，提供以下信息：
        1. 原始英文文本
        2. 文本在图片中的位置（左上角x,y坐标和右下角x,y坐标）
        3. 文本的大概字体大小
        
        请以JSON格式返回，格式如下：
        {
            "texts": [
                {
                    "original": "English text",
                    "position": {
                        "x1": 100,
                        "y1": 100,
                        "x2": 200,
                        "y2": 120
                    },
                    "font_size": 20
                }
            ]
        }"""

        response = self.client.chat.completions.create(
            model="qwen-vl-plus",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ],
            max_tokens=1000
        )
        
        try:
            # 从响应中提取JSON字符串并解析
            json_str = response.choices[0].message.content
            print(json_str)
            # 找到JSON内容的开始和结束
            start = json_str.find('{')
            end = json_str.rfind('}') + 1
            return json.loads(json_str[start:end])
        except Exception as e:
            print(f"解析响应时出错: {e}")
            return None

    def translate_texts(self, texts):
        """使用GPT-4翻译文本列表"""
        text_list = [text["original"] for text in texts]
        prompt = f"请将以下英文文本翻译成中文，保持原始格式：\n{text_list}"
        
        response = self.client.chat.completions.create(
            model="qwen-plus",
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        
        translations = response.choices[0].message.content.split('\n')
        print("translations", translations)
        return translations

    def process_image(self, input_path, output_path):
        """处理图片，将英文替换为中文"""
        # 获取文本位置信息
        text_info = self.get_text_positions(input_path)
        if not text_info:
            return False
            
        # 获取翻译
        translations = self.translate_texts(text_info["texts"])
        if not translations:
            return False
            
        # 打开原始图片，保持 RGBA 模式
        img = Image.open(input_path)
        if img.mode != 'RGBA':
            img = img.convert('RGBA')
            
        # 创建一个新图层用于绘制文本
        txt_layer = Image.new('RGBA', img.size, (255, 255, 255, 0))
        draw = ImageDraw.Draw(txt_layer)
        
        # 处理每个文本块
        for text_block, translation in zip(text_info["texts"], translations):
            pos = text_block["position"]
            font_size = text_block["font_size"]
            
            # 获取字体
            font = self.get_font(font_size)
            
            # 计算文本大小以调整位置
            text_bbox = draw.textbbox((0, 0), translation, font=font)
            text_width = text_bbox[2] - text_bbox[0]
            text_height = text_bbox[3] - text_bbox[1]
            
            # 创建白色背景矩形
            rect_coords = [
                pos["x1"], 
                pos["y1"], 
                pos["x2"], 
                pos["y2"]
            ]
            draw.rectangle(
                rect_coords,
                fill=(255, 255, 255, 255)  # 白色不透明背景
            )
            
            # 计算文本居中位置
            x = pos["x1"]
            y = pos["y1"] + (pos["y2"] - pos["y1"] - text_height) / 2
            
            # 绘制文本
            draw.text(
                (x, y),
                translation,
                font=font,
                fill=(0, 0, 0, 255)  # 黑色不透明文本
            )
        
        # 将文本图层与原图合并
        result = Image.alpha_composite(img, txt_layer)
        
        # 保存结果
        if output_path.lower().endswith('.png'):
            result.save(output_path, 'PNG')
        else:
            # 如果不是 PNG，转换为 RGB 模式
            result = result.convert('RGB')
            result.save(output_path)
            
        return True

# 使用示例
if __name__ == "__main__":
    API_KEY = "xxxxx"
    
    translator = ImageTranslator(API_KEY)
    success = translator.process_image(
        "input_image.png",
        "output_image.png"
    )
    
    if success:
        print("图片处理完成!")
    else:
        print("处理失败，请检查错误信息。")

```json
{
    "texts": [
        {
            "original": "Sift through hundreds of original wallpapers hand crafted in house by the the Backdrops team.",
            "position": {
                "x1": 45, 
                "y1": 678, 
                "x2": 980, 
                "y2": 730
            }, 
            "font_size": 24
        },
        {
            "original": "Enjoy exclusive walls designed specifically for your devices.", 
            "position": {  
                "x1": 120, 
                "y1": 730, 
                "x2": 980, 
                "y2": 780
            }, 
            "font_size": 24
        }
    ]
}
```

在这个例子中，有两个段落被提取出来作为文本内容：

第一个段落在图像中心偏上的位置，其原始文字是"Sift through hundreds of original wallpapers hand crafted in house by the the Backdrops team."。

第二个段落在第一行下面一点的位置，原文为"Enjoy exclusive walls designed specifically for your devices."

这两个段落都使用了相同的字体，并且都是大号字幕。
translations ["['浏览由Backdrops团队精心手工制作的数百张原创壁纸。', '享受专为您的设备设计的独家墙纸。']"]
尝试加载字体，大小: 24
尝试字体路径: 