In [1]:
import yaml
import os

def load_config(filename='config.yml'):
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            data = yaml.safe_load(file)
            return data if data is not None else []  # 如果文件为空或无内容，返回空列表或字典
    except FileNotFoundError:
        return []  # 文件不存在时返回空列表

def write_yaml(path,data):
    with open(path, 'w', encoding='utf-8') as file:
        yaml.dump(data, file, allow_unicode=True)

In [6]:
import requests
import json

config = load_config("config.yml")

# --------------------------------------------------
"""单词,句子翻译相关"""
def private_sentence_translate(text):
    """
    这个api-key在Linux.do的connect中可以查看，
    创建./key.yml，写入:
    deeplx:XXXXX
    """
    key = load_config("key.yml")
    api_key = key["deeplx"]
    if api_key is None:
        print("请先创建key.yml文件，并写入:\ndeeplx: api-key\n注意:后留一个空格\n或者使用local deeplx")
    url = f"https://api.deeplx.org/{api_key}/translate"
    headers = {
        "Content-Type": "application/json"
    }
    data = {"text":text, "source_lang":"EN","target_lang":"ZH"}
    response = requests.post(url, headers=headers, data=json.dumps(data))
    if response.status_code == 200:
        return response.json()["data"]
    else:
        print(response)
        return text

def local_deeplx_sentence_translate(text):
    """本地并发请求现存的api接口,
       感谢:https://github.com/ycvk/deeplx-local"""
    url = "http://localhost:62155/translate"
    headers = {
        "Content-Type": "application/json"
    }
    data = {"text":text, "source_lang":"EN","target_lang":"ZH"}
    response = requests.post(url, headers=headers, data=json.dumps(data))
    if response.status_code == 200:
        return response.json()["data"]
    else:
        print(response)
        return text

trans = private_sentence_translate

In [None]:
import pandas as pd
from tqdm import tqdm
from time import sleep

def combine_paras(non_empty_para, char_limit=10000):
    combined_paras = []
    current_para = ""
    for para in non_empty_para:
        if len(current_para) + len(para) + 1 <= char_limit:  # +1 for the newline character
            current_para += para + "\n"
        else:
            if current_para:
                combined_paras.append(current_para.strip())
            current_para = para + "\n"

    return combined_paras

def trans_combine_and_split_into_para(combine_paras,df):
    trans_combine_paras = trans(combine_paras)
    if 'en' not in df:
        df = pd.DataFrame({'en': [], 'cn': []})
    en_ = combine_paras.split("\n")
    cn_ = trans_combine_paras.split("\n")
    if len(en_) != len(cn_):
        raise "Except en_ para length == cn_ para length"

    df = pd.concat([df, pd.DataFrame({'en': en_, 'cn': cn_})], ignore_index=True)
    return df



def start_single_book_trans_task(book_txt_path,sleep_time):
    from pathlib import Path
    df = pd.DataFrame()
    book = Path(book_txt_path)
    csv = Path(book_txt_path.replace('.txt','.csv'))
    with book.open() as f:
        input_data = f.read()
    input_data = english_text_preprocess(input_data)
    para = input_data.split("\n")
    non_empty_para = [item for item in para if item != '']
    for combine_para in tqdm(combine_paras(non_empty_para)):
        df = trans_combine_and_split_into_para(combine_para,df)
        sleep(sleep_time)
    if csv.exists():
        csv.unlink()
    df.to_csv(book_txt_path.replace('.txt','.csv'), index=False, encoding='utf-8')
    return df

In [14]:
from pathlib import Path
from basic_clean import english_text_preprocess
book = Path("./input/海明威.txt")
with book.open() as f:
    input_data = f.read()
input_data = english_text_preprocess(input_data)
para = input_data.split("\n")
non_empty_para = [item for item in para if item != '']
len(non_empty_para),len(combine_paras(non_empty_para))

(40293, 615)

In [16]:
book_path_list = os.listdir("./input")
for book_path in tqdm(book_path_list):
    if os.path.isfile("./input/"+book_path.replace(".txt",".csv")):
        print(f"{book_path} 已经存在，跳过.")
        continue
    if os.path.isdir("./input/"+book_path.replace(".txt",".csv")):
        print(f"{book_path}是文件夹,跳过.")
        continue
    if book_path[-3:] == "csv":
        print(f"{book_path}已经处理完成,跳过.")
        continue
    start_single_book_trans_task("./input/"+book_path,sleep_time=10)
    

  6%|▌         | 2/36 [00:27<07:50, 13.84s/it]
  0%|          | 0/15 [00:27<?, ?it/s]


SSLError: HTTPSConnectionPool(host='api.deeplx.org', port=443): Max retries exceeded with url: /nE_5THBPLGytmmkJXqcFz14vDDUMiQo35hkCrDvEO1c/translate (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1007)')))