<a href="https://colab.research.google.com/github/KKKKeybird/LightNovelTranslate/blob/main/LNT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GoogleDrive挂载

In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 查看配置

In [None]:
# Print the OS release file, the kernel/architecture string, and the NVIDIA GPU status.
!cat /etc/lsb-release
!uname -a
!/opt/bin/nvidia-smi

# 环境配置

## 第一次使用下载模型

In [None]:
# First-time setup: fetch the web UI, the GGUF model file and the Python dependencies.
# Every step is safe to re-run: the clone is skipped when the checkout already exists,
# and `wget -c` resumes a partial download instead of starting the model file over.
![ -d /content/text-generation-webui ] || git clone https://github.com/oobabooga/text-generation-webui.git /content/text-generation-webui
# Download straight into the models directory so the result does not depend on the current directory.
!wget -c -P /content/text-generation-webui/models/ https://huggingface.co/SakuraLLM/Sakura-14B-LNovel-v0.9b-GGUF/resolve/main/sakura-13b-lnovel-v0.9b-Q4_K_M.gguf
# %pip installs into the environment of the running kernel.
%pip install -r /content/text-generation-webui/requirements.txt
%pip install ebooklib

# 端口转发

## 使用Tailscale内网穿透访问WebUI

In [None]:
# Register Tailscale's apt signing key and package list (the "bionic" variants), then install the package.
!curl -fsSL https://pkgs.tailscale.com/stable/ubuntu/bionic.gpg | sudo apt-key add -
!curl -fsSL https://pkgs.tailscale.com/stable/ubuntu/bionic.list | sudo tee /etc/apt/sources.list.d/tailscale.list
!sudo apt-get update
!sudo apt-get install tailscale
# Recreate the state and socket directories from scratch and hand them to the `irc` account,
# which is the account tailscaled is started under below.
!rm -rf /tmp/tailscaled
!mkdir -p /tmp/tailscaled
!chown irc.irc /tmp/tailscaled
!rm -rf /var/run/tailscale
!mkdir -p /var/run/tailscale
!chown irc.irc /var/run/tailscale
# Seed the new state directory with the state file from /var/lib/tailscaled.
# NOTE(review): this source file may not exist on a fresh install - confirm.
!cp /var/lib/tailscaled/tailscaled.state /tmp/tailscaled/tailscaled.state
!chown irc.irc /tmp/tailscaled/tailscaled.state
# Start tailscaled in the background as `irc` with userspace networking and a SOCKS5 proxy on localhost:1055.
!nohup sudo -u irc tailscaled --tun=userspace-networking --socks5-server=localhost:1055 --state=/tmp/tailscaled/tailscaled.state --socket=/var/run/tailscale/tailscaled.sock --port 41641 &
# Retry `tailscale up` once per second until it succeeds.
!until tailscale up; do sleep 1; done

In [None]:
# Start tailscaled again with the same flags as the install cell (background, userspace networking,
# SOCKS5 proxy on localhost:1055), reusing the existing state file and socket paths.
!nohup sudo -u irc tailscaled --tun=userspace-networking --socks5-server=localhost:1055 --state=/tmp/tailscaled/tailscaled.state --socket=/var/run/tailscale/tailscaled.sock --port 41641 &
# Retry `tailscale up` once per second until it succeeds.
!until tailscale up; do sleep 1; done
# Request http://100.100.1.1 through the SOCKS5 proxy.
# NOTE(review): presumably a connectivity check against a tailnet address - confirm the target.
!curl --socks5-hostname localhost:1055 http://100.100.1.1

nohup: appending output to 'nohup.out'
curl: (97) connection to proxy closed


# 文件目录格式
- content/
  - 原文.epub
  - 替换字典.json


# 服务启动

In [None]:
# Launch the web UI's start_linux.sh in the background; nohup appends its output to nohup.out.
!nohup bash /content/text-generation-webui/start_linux.sh &

nohup: appending output to 'nohup.out'


### 服务启动后访问 `http://{tailscale_ip}:7860`，在 Session 页面勾选 api 并点击 Apply and Restart；重启完成后在 Model 页面加载模型（翻译脚本通过 `http://127.0.0.1:5000` 调用该 API）。

# 翻译脚本启动

In [None]:
import requests
import time
import sys
import json
import unicodedata
import os
import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub


def strQ2B(ustring):
    """Convert full-width characters to their half-width equivalents.

    The ideographic space (U+3000) becomes an ASCII space and every code
    point in U+FF01..U+FF5E is shifted down by 0xFEE0; all other characters
    pass through unchanged.

    Args:
        ustring: A string, or an iterable of strings, to convert.

    Returns:
        A single string holding the converted characters of every piece,
        concatenated in order.
    """
    def to_halfwidth(char):
        code_point = ord(char)
        if code_point == 0x3000:
            # Full-width space has its own mapping outside the shifted range.
            return chr(0x20)
        if 0xFF01 <= code_point <= 0xFF5E:
            return chr(code_point - 0xFEE0)
        return chr(code_point)

    return ''.join(to_halfwidth(char) for piece in ustring for char in piece)


# Parse the EPUB and collect the plain text of every document item.
# Notebook kernels do not define __file__, so fall back to the current working
# directory when this code is not executed as a script file.
try:
    script_folder = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_folder = os.getcwd()

# Check the same path that is read below, so the guard cannot pass for a
# different file than the one being opened.
epub_path = os.path.join(script_folder, "原文.epub")
if not os.path.exists(epub_path):
    print(f"原文.epub不存在")
    sys.exit(1)

book = epub.read_epub(epub_path)
original_htmls = []
for item in book.get_items():
    if item.get_type() == ebooklib.ITEM_DOCUMENT:
        soup = BeautifulSoup(item.get_content(), 'html')
        # Remove <rt> elements (ruby annotation text) so they are not mixed
        # into the extracted text.
        for tag in soup.select('rt'):
            tag.extract()
        # Normalize full-width characters before any replacement or translation.
        original_htmls.append(strQ2B(soup.get_text()))


#处理名词替换


def is_empty(filename):
    """Report whether ``filename`` contains a JSON object with no entries.

    Args:
        filename: Path of the JSON file to inspect (read as UTF-8).

    Returns:
        True only when the file parses as JSON and its top-level value is an
        empty dict. False for any other JSON value, for a missing file, and
        for content that is not valid JSON.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except (FileNotFoundError, json.JSONDecodeError):
        # A missing or malformed file is not reported as "empty".
        return False
    return isinstance(parsed, dict) and len(parsed) == 0


# Apply the user-maintained term replacements to every extracted document.
# The emptiness guard and the load both use this one path, so the guard
# applies to the file that is actually read.
replacements_file = os.path.join(script_folder, '替换字典.json')

if is_empty(replacements_file):
    print("特殊名词替换文件内部为空，请手动配置")
    print("exiting")
    # Actually stop here: continuing would leave `htmls` undefined for main().
    sys.exit(1)

with open(replacements_file, 'r', encoding='utf-8') as file:
    replacement_dict = json.load(file)

# NFKC-normalize the replacement values so full-width forms in the dictionary
# are folded the same way the source text was.
replacement_dict = {
    key: unicodedata.normalize('NFKC', value)
    for key, value in replacement_dict.items()
}

htmls = []
for content in original_htmls:
    for old_text, new_text in replacement_dict.items():
        content = content.replace(old_text, new_text)
    # Keep only documents that contain visible text (drops empty and
    # whitespace-only documents alike).
    if content.strip():
        htmls.append(content)

print("特殊名词替换任务完成。")

# Chat-completions endpoint of the locally running web UI API.
APIURL = "http://127.0.0.1:5000" + "/v1/chat/completions"


def translate_text(text, temperature=0.2, frequency_penalty=0.0, timeout=600):
    """Translate one chunk of Japanese text to Chinese through the API at APIURL.

    The request is retried up to five times, one second apart, whenever the
    HTTP call fails, the server answers with a non-200 status, or the response
    body does not have the expected shape.

    Args:
        text: Source text to translate.
        temperature: Sampling temperature sent with the request.
        frequency_penalty: Frequency penalty sent with the request.
        timeout: Seconds to wait for the server before an attempt is counted
            as failed; prevents a stalled server from blocking forever.

    Returns:
        The translated text with surrounding whitespace stripped.

    Raises:
        Exception: When all attempts have failed.
    """
    max_attempts = 5
    # The payload does not change between attempts, so build it once.
    payload = {
        "messages": [{"role": "user", "content": f"将下面的日文文本翻译成中文：{text}"}],
        "max_tokens": 1024,
        "temperature": temperature,
        "mode": "instruct",
        "instruction_template": "ChatML",
        "frequency_penalty": frequency_penalty,
        "negative_prompt": "你是一个轻小说翻译模型，可以流畅通顺地以日本轻小说的风格将日文翻译成简体中文，并联系上下文正确使用人称代词，不擅自添加原文中没有的代词。",
        "stop": ["\n###", "\n\n", "[PAD151645]", "<|im_end|>"]
    }

    last_error = None       # human-readable description of the latest failure
    last_exception = None   # latest exception object, if the failure was one
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.post(APIURL, json=payload, timeout=timeout)
            if response.status_code == 200:
                return response.json()['choices'][0]['message']['content'].strip()
            # Record the status so the final report is not just "None".
            last_error = f"HTTP {response.status_code}"
            last_exception = None
        except Exception as e:
            # Deliberately broad: any failure of a single attempt is retried.
            last_error = e
            last_exception = e
        print(f"尝试 {attempt}/{max_attempts} 次失败: {last_error}")
        time.sleep(1)

    print(f"API调用出错: {last_error}")
    raise Exception(f"API调用连续失败{max_attempts}次，停止脚本。") from last_exception


def print_progress_bar(iteration, total, prefix='', suffix='', length=50, fill='█'):
    """Redraw a one-line text progress bar on standard output.

    Args:
        iteration: Number of steps completed so far.
        total: Total number of steps.
        prefix: Text shown before the bar.
        suffix: Text shown after the percentage.
        length: Width of the bar in cells.
        fill: String used for each completed cell.

    The line starts and ends with a carriage return so the next call
    overwrites it; a newline is emitted once ``iteration`` equals ``total``.
    """
    percent = f"{100 * (iteration / float(total)):.1f}"
    completed_cells = int(length * iteration // total)
    pending_cells = length - completed_cells
    bar = ''.join([fill * completed_cells, '-' * pending_cells])
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end="\r")
    if iteration != total:
        return
    # Finished: move to a fresh line so later output does not overwrite the bar.
    print()


def split_paragraphs(text, max_length=600):
    """Group the lines of ``text`` into chunks for translation.

    Consecutive non-blank lines are joined with newlines as long as the
    joined chunk stays shorter than ``max_length`` characters. A blank line
    closes the current chunk and is kept as an empty-string entry, so the
    blank-line structure of the input can be rebuilt by joining the result
    with newlines. A single line is never split, even when it alone is
    longer than ``max_length``.

    Args:
        text: The text to split.
        max_length: Exclusive upper bound on the length of a chunk built from
            more than one line. Defaults to 600.

    Returns:
        A list of strings: text chunks interleaved with ``""`` entries, one
        per blank line in the input.
    """
    chunks = []
    buffer = ""

    for line in text.split('\n'):
        if not line.strip():
            # Blank line: flush whatever has accumulated, then keep a separator.
            if buffer:
                chunks.append(buffer)
                buffer = ""
            chunks.append("")
            continue

        if not buffer:
            # First line of a new chunk is always accepted, whatever its length.
            buffer = line
        elif len(buffer) + 1 + len(line) < max_length:
            buffer += '\n' + line
        else:
            chunks.append(buffer)
            buffer = line

    # Flush the trailing chunk.
    if buffer:
        chunks.append(buffer)

    return chunks


def main():
    """Translate every document in ``htmls`` and write one text file per document.

    Each document is split into chunks with ``split_paragraphs``; blank chunks
    are carried over unchanged and the others are sent to ``translate_text``.
    When a translation comes back empty the original chunk is kept instead.

    If anything raises while a document is being processed, the chunks
    finished so far are written to the document's output file, the chunks not
    yet processed are written to a checkpoint file, and the function returns
    without touching the remaining documents.
    """
    for doc_index, document in enumerate(htmls):
        chunks = split_paragraphs(document)
        chunk_count = len(chunks)
        output_name = '翻译完成' + str(doc_index) + '.txt'
        checkpoint_name = '断点记录' + str(doc_index) + '.txt'

        finished = []
        try:
            for position, chunk in enumerate(chunks, start=1):
                if chunk.strip():
                    print('\n')
                    print(f'正在翻译：\n{chunk}')
                    result = translate_text(chunk)
                    print('\n')
                    print(f'翻译完成：\n{result}')
                    if not result:
                        print("翻译失败，跳过该段落。")
                        # Fall back to the source text so the chunk is not lost.
                        result = chunk
                    finished.append(result)
                else:
                    # Blank separators are kept as they are.
                    finished.append(chunk)
                print_progress_bar(position, chunk_count, prefix='进度:', suffix='完成', length=50)
        except Exception as error:
            print(error)
            with open(output_name, 'w', encoding='utf-8') as out_file:
                out_file.write('\n'.join(finished))
            with open(checkpoint_name, 'w', encoding='utf-8') as checkpoint_file:
                # `finished` lines up one-to-one with `chunks`, so everything
                # past its length is still untranslated.
                pending = chunks[len(finished):]
                checkpoint_file.write('\n'.join(pending))
            return

        with open(output_name, 'w', encoding='utf-8') as out_file:
            out_file.write('\n'.join(finished))
            print("翻译完成")


if __name__ == "__main__":
    main()


# 打包文件

In [16]:
# Rebuild the archive from scratch at a fixed absolute path.
!rm -rf /content/翻译.zip
# Delete the translated text files only after zip has succeeded, so a failed
# archive step cannot lose the translation results.
!zip -r /content/翻译.zip /content/翻译完成* && rm -rf /content/翻译完成*

  adding: content/翻译完成0.txt (deflated 1%)
  adding: content/翻译完成10.txt (deflated 53%)
  adding: content/翻译完成11.txt (deflated 51%)
  adding: content/翻译完成12.txt (deflated 56%)
  adding: content/翻译完成13.txt (deflated 50%)
  adding: content/翻译完成14.txt (deflated 54%)
  adding: content/翻译完成15.txt (deflated 54%)
  adding: content/翻译完成16.txt (deflated 49%)
  adding: content/翻译完成17.txt (deflated 30%)
  adding: content/翻译完成18.txt (deflated 56%)
  adding: content/翻译完成19.txt (deflated 47%)
  adding: content/翻译完成1.txt (deflated 18%)
  adding: content/翻译完成20.txt (deflated 47%)
  adding: content/翻译完成21.txt (deflated 37%)
  adding: content/翻译完成22.txt (deflated 6%)
  adding: content/翻译完成23.txt (deflated 11%)
  adding: content/翻译完成24.txt (deflated 14%)
  adding: content/翻译完成2.txt (stored 0%)
  adding: content/翻译完成3.txt (deflated 28%)
  adding: content/翻译完成4.txt (deflated 36%)
  adding: content/翻译完成5.txt (deflated 53%)
  adding: content/翻译完成6.txt (deflated 50%)
  adding: content/翻译完成7.txt (deflated 56%)
 