<a href="https://colab.research.google.com/github/Hiromi06/machine-translation/blob/main/KOKORO_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from transformers import MarianMTModel, MarianTokenizer
import os
import re
from tqdm import tqdm
import time

In [2]:
# Define model and tokenizer directories
original_model_name = 'Helsinki-NLP/opus-mt-ja-en'
model_dir = '/content/drive/MyDrive/machine_learning/MarianMT/marian_model_chunk10'

# Load the original tokenizer
tokenizer = MarianTokenizer.from_pretrained(original_model_name)

# Load the trained model
model = MarianMTModel.from_pretrained(model_dir)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/782k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]



In [3]:
# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(60716, 512, padding_idx=60715)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(60716, 512, padding_idx=60715)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [4]:
start_time = time.time()

def jpNum_to_number(kanji):
    jpNum_dict = {
        '零': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
        '十': 10
    }

    def parse_number(s):
        number = 0
        unit = 0
        current_multiplier = 1

        for char in reversed(s):
            if char in jpNum_dict:
                value = jpNum_dict[char]
                if value == 10:
                    if unit == 0:
                        unit = 1
                    current_multiplier = 10
                else:
                    unit += value
                    number += unit * current_multiplier
                    unit = 0
                    current_multiplier = 1

        number += unit * current_multiplier
        return number

    return re.sub(
        r'([一二三四五六七八九十]+)',
        lambda m: str(parse_number(m.group(1))),
        kanji
    )

def translate_sentence(sentence):
    batch = tokenizer([sentence], return_tensors="pt", padding=True, truncation=True)
    batch = {k: v.to(device) for k, v in batch.items()}  # Move tensors to the same device as the model
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
    return tgt_text

def preprocess_and_translate(text):
    # Convert Kanji chapter numbers to Arabic numerals
    text = re.sub(r'[一二三四五六七八九十]+', lambda x: jpNum_to_number(x.group()), text)

    # Preserve PART sections and their titles
    part_titles = ["上\n先生と私", "中\n両親と私", "下\n先生と遺書"]
    for part in part_titles:
        text = text.replace(part, f"\n\n{part}\n\n")

    # Split the document into paragraphs based on two newlines
    paragraphs = text.split('\n\n')

    translated_paragraphs = []
    for paragraph in tqdm(paragraphs, desc="Translating"):
        if paragraph.strip():  # Ensure that we don't translate empty paragraphs
            translated_paragraph = translate_sentence(paragraph.strip())
            translated_paragraphs.append(translated_paragraph)

    return '\n\n'.join(translated_paragraphs)

def preprocess_and_translate_file(input_file, output_file):
    # Load text from file
    with open(input_file, mode='r', encoding='utf-8') as infile:
        content = infile.read()

    # Preprocess and translate the text
    translated_text = preprocess_and_translate(content)

    # Write translated text to an output file
    with open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.write(translated_text)

    print(f"Preprocessing and translation complete. The result is saved at: {output_file}")

if __name__ == "__main__":
    input_file = '/content/drive/MyDrive/machine_learning/ja_preprocessed_kokoro.txt'
    output_file = '/content/drive/MyDrive/machine_learning/MarianMT/translated_text/translation_chunk10/Marian_translated_kokoro_chunk10.txt'
    preprocess_and_translate_file(input_file, output_file)

end_time = time.time()
processing_time = end_time - start_time

def format_time(seconds):
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    seconds = seconds % 60
    return f"Processing time: {hours}h {minutes}m {seconds:.2f}s"

print(format_time(processing_time))

Translating: 100%|██████████| 1335/1335 [11:07<00:00,  2.00it/s]

Preprocessing and translation complete. The result is saved at: /content/drive/MyDrive/machine_learning/MarianMT/translated_text/translation_chunk10/Marian_translated_kokoro_chunk10.txt
Processing time: 0h 11m 7.97s





In [None]:
start_time = time.time()



def translate_chapter(chapter, model, tokenizer, device):
    # Tokenize the input text
    encoded_input = tokenizer(chapter, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    # Generate translation
    translated_tokens = model.generate(**encoded_input)
    # Decode the translated tokens back to text
    decoded_output = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    return decoded_output[0]


def main(output_dir):
    # Define model and tokenizer directories
    original_model_name = 'Helsinki-NLP/opus-mt-ja-en'
    model_dir = '/content/drive/MyDrive/machine_learning/marian_model_chunk1_test'

    # Load the original tokenizer
    tokenizer = MarianTokenizer.from_pretrained(original_model_name)

    # Load the trained model
    model = MarianMTModel.from_pretrained(model_dir)

    # Move model to device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Preprocess the text and get chapters
    print("Preprocessing text...")
    with tqdm(total=1, desc="Preprocessing") as pbar:
        chapters = preprocess()
        pbar.update(1)

    # Translate each chapter
    translated_chapters = []
    model.eval()
    with torch.no_grad():
        for chapter in chapters:
            translated_text = translate_chapter(chapter, model, tokenizer, device)
            translated_chapters.append(translated_text)

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)
    output_file_path = os.path.join(output_dir, 'translated_kokoro.txt')

    # Translate and save each chapter individually
    model.eval()
    with torch.no_grad():
      with open(output_file_path, 'w', encoding='utf-8') as f:
          for i, chapter in enumerate(tqdm(chapters, desc="Translating", unit="chapter")):
              f.write(f"Chapter {i + 1} Translated:\n")
              f.write(translated_text)
              f.write("\n\n")
              # Clear memory after processing each chapter
              torch.cuda.empty_cache()

end_time = time.time()
processing_time = end_time - start_time

def format_time(seconds):
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    seconds = seconds % 60
    return f"Processing time: {hours}h {minutes}m {seconds:.2f}s"

print(format_time(processing_time))


if __name__ == "__main__":
  input_dir =
  output_dir = '/content/drive/MyDrive/machine_learning/MarianMT/translated_text'
  main(output_dir)
