In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from time import sleep
# Read the books CSV and keep only the first row for each distinct `link` value.
data = pd.read_csv("../csv/rulate_books.csv").drop_duplicates(subset=["link"])
# Trailing expression: the notebook displays (rows, columns) after de-duplication.
data.shape

(70310, 17)

In [None]:
from transformers import MarianMTModel, MarianTokenizer
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from transformers import MarianTokenizer
import nltk
import torch
import time
import re
import warnings


# Silence UserWarning / FutureWarning messages for the rest of the session.
warnings.simplefilter("ignore", category=UserWarning)
warnings.simplefilter("ignore", category=FutureWarning)
# warnings.filterwarnings("ignore", category=TracerWarning)

# Timestamp taken when this cell is executed (main() below keeps its own local timer).
start_time = time.time()


def clean_text(text: str) -> str:
    """Remove unsupported characters from ``text`` and normalise its whitespace.

    Kept characters: ASCII letters and digits, whitespace, the punctuation
    ``. , ! ? ; : ( )``, straight single and double quotes, the hyphen, and the
    en dash (U+2013) / em dash (U+2014). Everything else is deleted.

    Args:
        text: Raw input text.

    Returns:
        The filtered text with every run of whitespace collapsed to a single
        space and leading/trailing whitespace stripped.
    """
    # The dashes are spelled as \u escapes so the allow-list does not depend on
    # how the bytes of this source file happen to be decoded.
    disallowed = re.compile(r"[^a-zA-Z0-9\s.,!?;:()\"'\-\u2013\u2014]")
    text = disallowed.sub("", text)
    # Collapse repeated whitespace (spaces, tabs, newlines) into single spaces.
    return re.sub(r"\s+", " ", text).strip()





def translate_texts(texts, tokenizer, model, device="cpu", batch_size=8):
    """Translate ``texts`` batch by batch and return the decoded outputs in order.

    Each slice of ``batch_size`` items is tokenized (PyTorch tensors, padded,
    truncated), moved to ``device`` and passed to ``model.generate`` with
    4 beams, length penalty 1.1, no repeated 3-grams and a 512-token limit.
    Every generated sequence is decoded with special tokens skipped, and a
    progress line is printed after each batch.

    Args:
        texts: Sequence of strings to translate.
        tokenizer: Callable tokenizer that also provides ``decode``.
        model: Object providing ``generate``.
        device: Target passed to the encoded batch's ``to`` method.
        batch_size: Number of texts handled per ``generate`` call.

    Returns:
        List of decoded strings, one per generated sequence.
    """
    total = len(texts)
    results = []
    for start in range(0, total, batch_size):
        chunk = texts[start:start + batch_size]
        encoded = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
        encoded = encoded.to(device)
        generated = model.generate(
            **encoded,
            num_beams=4,
            length_penalty=1.1,
            no_repeat_ngram_size=3,
            max_length=512,
        )
        for sequence in generated:
            results.append(tokenizer.decode(sequence, skip_special_tokens=True))
        print(f"‚úÖ {start + len(chunk)}/{total} done")
    return results


def main():
    """Translate ``Data/en/chapter_1.txt`` with the opus-mt-en-ru model and save the result.

    Steps: load the Marian tokenizer and an ONNX Runtime export of
    ``Helsinki-NLP/opus-mt-en-ru``, read and clean the source text, split it
    into sentences with NLTK, translate them on CPU via ``translate_texts``
    and write one translated sentence per line to
    ``Data/ru/chapter_1_translated.txt``. The elapsed time is printed at the end.
    """
    import os  # local import: this cell's import block does not bring in `os`

    start_time = time.time()

    model_name = "Helsinki-NLP/opus-mt-en-ru"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = ORTModelForSeq2SeqLM.from_pretrained(model_name, export=True)

    # Context manager so the input file handle is closed deterministically.
    with open("Data/en/chapter_1.txt", "r", encoding="utf-8") as source_file:
        text = clean_text(source_file.read())

    # Newer NLTK releases load the sentence splitter from "punkt_tab", older
    # ones from "punkt"; fetch both so sent_tokenize works on either.
    for resource in ("punkt", "punkt_tab"):
        nltk.download(resource, quiet=True)
    sentences = nltk.sent_tokenize(text)

    torch.set_num_threads(4)

    translations = translate_texts(sentences, tokenizer, model, device="cpu")

    output_path = "Data/ru/chapter_1_translated.txt"
    # Create the target folder first; open(..., "w") does not create directories.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for line in translations:
            f.write(line + "\n")

    print(f"üèÅ Completed in {time.time() - start_time:.1f} s")


if __name__ == "__main__":
    main()


In [None]:
import os
import re
import time
import warnings
import subprocess
import nltk
import torch
from langdetect import detect, LangDetectException
from transformers import M2M100Tokenizer
from optimum.onnxruntime import ORTModelForSeq2SeqLM

# Silence UserWarning / FutureWarning messages for the rest of the session.
warnings.simplefilter("ignore", category=UserWarning)
warnings.simplefilter("ignore", category=FutureWarning)

# Model locations
BASE_MODEL_NAME = "facebook/m2m100_418M"      # model id passed to the ONNX export and the tokenizer
ONNX_MODEL_DIR = "./m2m-onnx"                 # output directory of the ONNX export
QUANT_MODEL_DIR = "./m2m-onnx-quantized"      # output directory of the quantized model

# Data locations
INPUT_FILE = "Data/en/chapter_1.txt"                   # source text read by main()
OUTPUT_FILE = "Data/ru/chapter_1_translated.txt"       # translation written by main()


# --- Text cleaning ---
def clean_text(text: str) -> str:
    """Filter ``text`` down to a basic English character set and tidy it up.

    Kept characters: ASCII letters and digits, whitespace, the punctuation
    ``. , ! ? ; : ( )``, straight quotes, the typographic apostrophe (U+2019),
    the hyphen, and the en dash (U+2013) / em dash (U+2014). Everything else
    is deleted. Whitespace runs are then collapsed to single spaces and
    immediately repeated words are reduced to one occurrence.

    Args:
        text: Raw input text (main() passes one sentence at a time).

    Returns:
        The cleaned text.
    """
    # Non-ASCII members of the allow-list are written as \u escapes so the
    # pattern does not depend on how this source file's bytes are decoded.
    text = re.sub(r"[^a-zA-Z0-9\s.,!?;:()\"'\u2019\-\u2013\u2014]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    # Collapse back-to-back repetitions of the same word ("the the" -> "the").
    text = re.sub(r"\b(\w+)( \1\b)+", r"\1", text)
    return text


# --- Language check ---
def is_english(sentence: str) -> bool:
    """Return True when langdetect classifies ``sentence`` as English.

    Args:
        sentence: Text to classify.

    Returns:
        ``True`` if ``detect`` returns ``"en"``; ``False`` for any other
        language or when detection raises ``LangDetectException``.
    """
    # langdetect's algorithm is randomised: short or ambiguous sentences can be
    # classified differently from run to run unless a seed is set. Pin one here
    # (only if the caller has not chosen a seed) so the set of sentences kept
    # for translation is reproducible.
    from langdetect import DetectorFactory

    if DetectorFactory.seed is None:
        DetectorFactory.seed = 0

    try:
        return detect(sentence) == "en"
    except LangDetectException:
        return False


# --- Model check and preparation ---
def ensure_quantized_model():
    """Make sure the quantized model directory exists, building it if necessary.

    If ``QUANT_MODEL_DIR`` already exists, a message is printed and nothing
    else happens. Otherwise ``optimum-cli`` is invoked twice: first to export
    ``BASE_MODEL_NAME`` to ONNX in ``ONNX_MODEL_DIR``, then to quantize that
    export (``--avx2``) into ``QUANT_MODEL_DIR``.

    Raises:
        subprocess.CalledProcessError: if either command exits with a non-zero
            status (both are run with ``check=True``).
    """
    if not os.path.exists(QUANT_MODEL_DIR):
        export_command = [
            "optimum-cli", "export", "onnx",
            "--model", BASE_MODEL_NAME,
            ONNX_MODEL_DIR,
        ]
        quantize_command = [
            "optimum-cli", "onnxruntime", "quantize",
            "--onnx_model", ONNX_MODEL_DIR,
            "--output", QUANT_MODEL_DIR,
            "--avx2",
        ]

        # Message: quantized model not found, running the ONNX export.
        print("‚ö†Ô∏è –ö–≤–∞–Ω—Ç–æ–≤–∞–Ω–Ω–∞—è –º–æ–¥–µ–ª—å –Ω–µ –Ω–∞–π–¥–µ–Ω–∞. –í—ã–ø–æ–ª–Ω—è—é —ç–∫—Å–ø–æ—Ä—Ç ONNX...")
        subprocess.run(export_command, check=True)

        # Message: quantizing the model.
        print("‚ö° –í—ã–ø–æ–ª–Ω—è—é –∫–≤–∞–Ω—Ç–æ–≤–∞–Ω–∏–µ –º–æ–¥–µ–ª–∏...")
        subprocess.run(quantize_command, check=True)

        # Message: quantized model created successfully.
        print(f"‚úÖ –ö–≤–∞–Ω—Ç–æ–≤–∞–Ω–Ω–∞—è –º–æ–¥–µ–ª—å —É—Å–ø–µ—à–Ω–æ —Å–æ–∑–¥–∞–Ω–∞: {QUANT_MODEL_DIR}")
    else:
        # Message: quantized model found.
        print(f"‚úÖ –ù–∞–π–¥–µ–Ω–∞ –∫–≤–∞–Ω—Ç–æ–≤–∞–Ω–Ω–∞—è –º–æ–¥–µ–ª—å: {QUANT_MODEL_DIR}")


# --- Translation ---
def translate_batch(sentences, tokenizer, model, device="cpu", batch_size=8,
                    src_lang="en", tgt_lang="ru"):
    """Translate ``sentences`` in batches and return the decoded strings in order.

    Each batch is tokenized (PyTorch tensors, padded, truncated), moved to
    ``device`` and passed to ``model.generate`` with 4 beams, a 512-token limit
    and the target language id forced as the first generated token. A progress
    line is printed after every batch.

    Args:
        sentences: Sequence of source-language strings.
        tokenizer: Tokenizer exposing ``src_lang``, ``get_lang_id`` and
            ``batch_decode``.
        model: Object providing ``generate``.
        device: Target passed to the encoded batch's ``to`` method.
        batch_size: Number of sentences per ``generate`` call.
        src_lang: Source language code assigned to ``tokenizer.src_lang``
            (default ``"en"``, the previously hard-coded value).
        tgt_lang: Target language code passed to ``tokenizer.get_lang_id``
            (default ``"ru"``, the previously hard-coded value).

    Returns:
        List of translated strings; empty when ``sentences`` is empty.
    """
    translations = []
    total = len(sentences)
    if total == 0:
        # Nothing to do; leave the tokenizer untouched, as before.
        return translations

    # Loop-invariant setup: configure the source language and look up the
    # target-language token id once instead of once per batch.
    tokenizer.src_lang = src_lang
    target_bos_id = tokenizer.get_lang_id(tgt_lang)

    for i in range(0, total, batch_size):
        batch = sentences[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)

        outputs = model.generate(
            **inputs,
            forced_bos_token_id=target_bos_id,
            num_beams=4,
            max_length=512
        )

        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        translations.extend(decoded)
        # Progress message: "<done>/<total> sentences translated".
        print(f"‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ {i + len(batch)}/{total} –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π")
    return translations


def main():
    """Translate ``INPUT_FILE`` from English to Russian and write ``OUTPUT_FILE``.

    Steps: make sure the quantized ONNX model exists and load it together with
    the M2M100 tokenizer; read the input and split it into sentences with
    NLTK; clean every sentence and keep those longer than 3 characters that
    are detected as English; translate them with ``translate_batch``; write
    one translation per line. Returns early (writing nothing) when no sentence
    survives the filtering. Progress and timing messages are printed.
    """
    start_time = time.time()

    # 1. Check for and load the model
    ensure_quantized_model()
    tokenizer = M2M100Tokenizer.from_pretrained(BASE_MODEL_NAME)
    model = ORTModelForSeq2SeqLM.from_pretrained(QUANT_MODEL_DIR)

    # 2. Load the text and split it into sentences
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        raw_text = f.read()

    # Newer NLTK releases load the sentence splitter from "punkt_tab", older
    # ones from "punkt"; fetch both so sent_tokenize works on either.
    for resource in ("punkt", "punkt_tab"):
        nltk.download(resource, quiet=True)
    sentences = nltk.sent_tokenize(raw_text)
    # Message: number of sentences found in the source text.
    print(f"üìÑ –ù–∞–π–¥–µ–Ω–æ {len(sentences)} –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π –≤ –∏—Å—Ö–æ–¥–Ω–æ–º —Ç–µ–∫—Å—Ç–µ")

    # 3. Preprocessing: clean each sentence, drop very short or non-English ones
    cleaned = []
    for s in sentences:
        s = clean_text(s)
        if len(s) > 3 and is_english(s):
            cleaned.append(s)

    # Message: number of sentences left after cleaning.
    print(f"üßº –ü–æ—Å–ª–µ –æ—á–∏—Å—Ç–∫–∏ –æ—Å—Ç–∞–ª–æ—Å—å {len(cleaned)} –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π")

    if not cleaned:
        # Message: nothing to translate.
        print("‚ùå –ù–µ—Ç –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π –¥–ª—è –ø–µ—Ä–µ–≤–æ–¥–∞.")
        return

    # 4. Translation
    torch.set_num_threads(4)
    translations = translate_batch(cleaned, tokenizer, model)

    # 5. Saving
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        # Guarded because os.makedirs("") raises when OUTPUT_FILE is a bare file name.
        os.makedirs(output_dir, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for t in translations:
            f.write(t + "\n")

    # Messages: elapsed time and location of the result file.
    print(f"üèÅ –ü–µ—Ä–µ–≤–æ–¥ –∑–∞–≤–µ—Ä—à—ë–Ω –∑–∞ {time.time() - start_time:.1f} —Å–µ–∫.")
    print(f"üìÅ –†–µ–∑—É–ª—å—Ç–∞—Ç: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()


In [1]:
import os, re, time, random
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys





# full_text = open("../Data/en/chapter_1.txt", "r", encoding="utf-8").read()
# Path of a source text file, relative to the notebook's working directory.
# Presumably the file to hand to translate_chapter() - TODO confirm; nothing in
# the visible cells reads this variable.
# NOTE(review): the name looks like a typo of `origin_path`; left unchanged in
# case other cells refer to it.
origigin_path = "../Data/Dragon_Ball_-_The_True_Legendary_Super_Saiyan_.txt"

def sleep(multiply = 1, base_wait_time = 1):
    """Block for ``base_wait_time * multiply`` seconds.

    Args:
        multiply: Scale factor applied to the base wait.
        base_wait_time: Length of one wait unit, in seconds.
    """
    pause_seconds = base_wait_time * multiply
    time.sleep(pause_seconds)
    
def _read_finished_translation(input_box, output_box, base_wait_time = 1):
    """Give DeepL time to finish the current batch, then return the target box text."""
    sleep(3, base_wait_time)
    # Heuristic: a target that is still short relative to the source suggests
    # the translation has not caught up yet, so wait longer.
    if len(input_box.text) > 0.75 * len(output_box.text):
        sleep(10, base_wait_time)
    return output_box.text


def translate_chapter(chapter_path, base_wait_time = 1):
    """Translate a text file from English to Russian through the DeepL web page.

    The file is split into sentences on single periods (runs of dots such as
    ellipses are left intact). Sentences are typed into DeepL's source box one
    at a time; whenever adding the next sentence would push the source box past
    1400 characters, the current translation is collected and the source box is
    cleared before typing continues.

    Args:
        chapter_path: Path of the UTF-8 text file to translate.
        base_wait_time: Length in seconds of one wait unit; every pause in this
            function is a multiple of it, so a larger value slows the whole run.

    Returns:
        The translated batches joined with single spaces (an empty string when
        the file contains no text).
    """
    # Context manager so the file handle is closed deterministically.
    with open(chapter_path, "r", encoding="utf-8") as chapter_file:
        full_text = chapter_file.read()

    # Split on a period that is neither preceded nor followed by another period.
    parts = []
    for raw_part in re.split(r'(?<!\.)\.(?!\.)', full_text):
        sentence = raw_part.strip()
        if not sentence:
            continue
        # Restore the period consumed by the split; the check runs on the
        # stripped text so "Wait... " is not turned into "Wait....".
        if not sentence.endswith('.'):
            sentence += '.'
        parts.append(sentence)

    url = "https://www.deepl.com/translator#en/ru/"
    driver = uc.Chrome()
    try:
        driver.get(url)
        sleep(base_wait_time=base_wait_time)
        input_box = driver.find_element(By.NAME, "source")
        output_box = driver.find_element(By.NAME, "target")

        translated_batches = []
        for part in parts:
            input_box.click()
            if len(input_box.text) + len(part) > 1400:
                translated_batches.append(
                    _read_finished_translation(input_box, output_box, base_wait_time)
                )
                # NOTE(review): assumes the source element exposes a writable
                # `.value`; confirm against DeepL's current page structure.
                driver.execute_script("arguments[0].value = '';", input_box)
                print(f"next batch after{part[:30]}")

            # Trailing space keeps consecutive sentences from being glued together.
            input_box.send_keys(part + " ")
            sleep(random.uniform(0.4, 0.6), base_wait_time)

        if parts:
            # Collect the translation of the final batch from the *target* box.
            translated_batches.append(
                _read_finished_translation(input_box, output_box, base_wait_time)
            )

        return " ".join(batch for batch in translated_batches if batch)
    finally:
        # Always shut the browser down, even when a step above fails.
        driver.quit()


In [None]:
# NOTE(review): this looks like an unfinished call to one of the `translate_*`
# functions. No name `translate_` is defined in the visible cells, so the
# expression is expected to raise NameError on a fresh kernel - complete the
# call or delete the cell.
translate_

14

TypeError: Random.random() takes no arguments (1 given)

Бумажные деньги, которые не переставали гореть, заполняли воздух, густой от дыма ладана. Чу Хао, одетый в монашескую рясу с бритой головой, тихо напевал, чувствуя себя совершенно спокойно.
