In [1]:
import os
import torch
from transformers import MarianMTModel, MarianTokenizer, pipeline

In [2]:
# Pick the compute device once for the notebook: the GPU when PyTorch can
# see one, otherwise the CPU. The choice is echoed so it shows in the output.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [43]:
import os
import torch
from transformers import MarianMTModel, MarianTokenizer

# Function to read content from a file
def read_file(file_path):
    """Return the entire contents of ``file_path`` as a string.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

# Function to write content to a new file
def write_file(file_path, content):
    """Write ``content`` to ``file_path`` as UTF-8 text.

    The file is opened in ``'w'`` mode, so an existing file at that path is
    overwritten. Nothing is returned.
    """
    with open(file_path, mode='w', encoding='utf-8') as handle:
        handle.write(content)

# Function to split the text into individual sentences
def split_into_sentences(text):
    """Split ``text`` on the Chinese full stop and return the pieces as a list.

    The delimiter itself is not kept, and empty or whitespace-only pieces
    (for example after a trailing full stop) are returned as-is.
    """
    full_stop = '。'
    pieces = text.split(full_stop)
    return pieces

# Function to translate text using a MarianMTModel
def translate_text(sentence):
    """Translate one sentence with the ``Helsinki-NLP/opus-mt-zh-en`` model.

    The tokenizer and model are loaded on the first call and reused on every
    later call, so translating many sentences no longer reloads the
    checkpoint (and re-moves it to the device) once per sentence.

    Args:
        sentence: Source text to translate. The tokenizer is called with
            ``truncation=True``, so over-long input is truncated.

    Returns:
        The decoded translation as a string, with special tokens removed.
    """
    tokenizer, model, device = _get_translation_resources('Helsinki-NLP/opus-mt-zh-en')

    # Tokenize and move every input tensor to the same device as the model.
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # A single sentence is passed in, so only the first generated sequence is decoded.
    translated_tokens = model.generate(**inputs, max_length=512)
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)


# Loaded (tokenizer, model, device) triples keyed by model name. Re-running
# this cell resets the cache, which simply triggers one fresh load.
_TRANSLATION_RESOURCES = {}


def _get_translation_resources(model_name):
    """Return the cached ``(tokenizer, model, device)`` for ``model_name``, loading it on first use."""
    if model_name not in _TRANSLATION_RESOURCES:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)

        # Move model to GPU if available
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)

        _TRANSLATION_RESOURCES[model_name] = (tokenizer, model, device)
    return _TRANSLATION_RESOURCES[model_name]

# Main function to read, translate, and write the translated content
def main(input_file_path):
    """Translate the file at ``input_file_path`` sentence by sentence.

    The file is read, split into sentences, and every non-blank sentence is
    translated. The translations are joined with single spaces and written
    to a sibling file named ``translated_<original file name>``; the output
    path is printed when done.
    """
    source_text = read_file(input_file_path)

    # Translate in source order, skipping pieces that are empty or whitespace-only.
    translated_sentences = []
    for piece in split_into_sentences(source_text):
        if not piece.strip():
            continue
        translated_sentences.append(translate_text(piece))
    translated_text = ' '.join(translated_sentences)

    # The output lives in the same directory as the input.
    input_dir, input_file_name = os.path.split(input_file_path)
    translated_file_path = os.path.join(input_dir, f'translated_{input_file_name}')

    write_file(translated_file_path, translated_text)

    print(f'Translation completed. Translated content written to {translated_file_path}.')

# Example usage
if __name__ == "__main__":
    # Hard-coded sample input for this run; main() derives the output path from it.
    # NOTE(review): the folder name '浣撹偛' looks like mis-decoded text (possibly a
    # UTF-8 directory name read with a different codec) — confirm it matches
    # the real directory on disk before relying on this path.
    input_file_path = r'C:\Users\yule_\M1Bert\THUCNews\浣撹偛\0.txt'
    main(input_file_path)


Translation completed. Translated content written to C:\Users\yule_\M1Bert\THUCNews\浣撹偛\translated_0.txt.


In [45]:
import os
import torch
from transformers import MarianMTModel, MarianTokenizer

# Function to read content from a file
def read_file(file_path):
    """Read ``file_path`` as UTF-8 text and return everything in it as one string."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        text = source.read()
    return text

# Function to write content to a new file
def write_file(file_path, content):
    """Save ``content`` to ``file_path`` as UTF-8 text, replacing any existing file.

    Returns nothing.
    """
    with open(file_path, mode='w', encoding='utf-8') as sink:
        sink.write(content)

# Function to split the text into paragraphs
def split_into_paragraphs(text):
    """Split ``text`` into paragraphs at every blank line (two consecutive newlines).

    The pieces are returned in order as a list; empty pieces are not removed.
    """
    paragraph_break = '\n\n'
    chunks = text.split(paragraph_break)
    return chunks

# Function to split a paragraph into sentences
def split_into_sentences(paragraph):
    """Break ``paragraph`` apart at each Chinese full stop.

    Returns the pieces as a list. The full stops are dropped, and empty
    pieces (such as the one after a trailing full stop) are kept.
    """
    full_stop = '。'
    segments = paragraph.split(full_stop)
    return segments

# Function to translate a paragraph while preserving sentence structure
def translate_paragraph(paragraph, model, tokenizer, device):
    """Translate ``paragraph`` sentence by sentence and rebuild it as one string.

    The paragraph is split with ``split_into_sentences``; blank segments are
    skipped and every other segment is translated on its own. A period is
    appended to a translation only when the source segment was followed by a
    full stop and the translation does not already end in terminal
    punctuation. This avoids the doubled ".." that a plain ``'. '.join``
    produces when the model emits its own period, and it keeps the final
    sentence terminated.

    Args:
        paragraph: Source paragraph text.
        model: Translation model; must provide ``generate(**inputs, max_length=...)``.
        tokenizer: Matching tokenizer; called on a sentence and used for ``decode``.
        device: Device the tokenized inputs are moved to before generation.

    Returns:
        The translated sentences joined by single spaces, or ``''`` when the
        paragraph contains nothing to translate.
    """
    sentences = split_into_sentences(paragraph)
    last_index = len(sentences) - 1
    translated_sentences = []

    for index, sentence in enumerate(sentences):
        if not sentence.strip():  # Avoid translating empty sentences
            continue

        inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        translated_tokens = model.generate(**inputs, max_length=512)
        translated_sentence = tokenizer.decode(translated_tokens[0], skip_special_tokens=True).strip()
        if not translated_sentence:
            continue

        # Every segment except the last one was followed by a full stop in the
        # source; the last segment is whatever trailed the final full stop.
        source_had_full_stop = index < last_index
        if source_had_full_stop and not _ends_with_terminal_punctuation(translated_sentence):
            translated_sentence += '.'
        translated_sentences.append(translated_sentence)

    return ' '.join(translated_sentences)


def _ends_with_terminal_punctuation(text):
    """Return True when ``text`` already ends a sentence, ignoring closing quotes and brackets."""
    return text.rstrip('"\'”’)]').endswith(('.', '!', '?', '…'))

# Main function to read, translate, and write the translated content
def main(input_file_path):
    """Translate the file at ``input_file_path`` paragraph by paragraph.

    The ``Helsinki-NLP/opus-mt-zh-en`` tokenizer and model are loaded once and
    the model is moved to the GPU when available. Each non-blank paragraph is
    then translated, the results are joined with blank lines, and the text is
    written to a sibling file named ``translated_<original file name>``. The
    output path is printed when done.
    """
    # Load pre-trained model and tokenizer
    model_name = 'Helsinki-NLP/opus-mt-zh-en'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    # Move model to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    source_text = read_file(input_file_path)

    # Translate in source order, skipping paragraphs that are empty or whitespace-only.
    translated_paragraphs = []
    for block in split_into_paragraphs(source_text):
        if not block.strip():
            continue
        translated_paragraphs.append(translate_paragraph(block, model, tokenizer, device))
    translated_text = '\n\n'.join(translated_paragraphs)

    # The output lives in the same directory as the input.
    input_dir, input_file_name = os.path.split(input_file_path)
    translated_file_path = os.path.join(input_dir, f'translated_{input_file_name}')

    write_file(translated_file_path, translated_text)

    print(f'Translation completed. Translated content written to {translated_file_path}.')

# Example usage
if __name__ == "__main__":
    # Hard-coded sample input for this run; main() derives the output path from it.
    # NOTE(review): the folder name '浣撹偛' looks like mis-decoded text (possibly a
    # UTF-8 directory name read with a different codec) — confirm it matches
    # the real directory on disk before relying on this path.
    input_file_path = r'C:\Users\yule_\M1Bert\THUCNews\浣撹偛\0.txt'
    main(input_file_path)


Translation completed. Translated content written to C:\Users\yule_\M1Bert\THUCNews\浣撹偛\translated_0.txt.
