<a href="https://colab.research.google.com/github/Hiromi06/machine-translation/blob/main/Marian_encoding_JESC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import torch
from tqdm import tqdm
from transformers import MarianTokenizer
import time

In [2]:
# Function to load text data
def load_data(file_path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Args:
        file_path: Path of the text file to read.

    Returns:
        One entry per line of the file, in file order, with leading and
        trailing whitespace (including the newline) stripped. Blank lines
        are kept as empty strings, so line positions are preserved.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]


# Locations of the four text files (English/Japanese x train/test)
en_train_path = '/content/drive/MyDrive/machine_learning/JESC/JESC_en_train.txt'
en_test_path = '/content/drive/MyDrive/machine_learning/JESC/JESC_en_test.txt'
ja_train_path = '/content/drive/MyDrive/machine_learning/JESC/JESC_ja_train.txt'
ja_test_path = '/content/drive/MyDrive/machine_learning/JESC/JESC_ja_test.txt'

# Read every file into memory as a list of stripped lines.
# The files are read in the order listed: en train, en test, ja train, ja test.
en_train, en_test, ja_train, ja_test = map(
    load_data,
    (en_train_path, en_test_path, ja_train_path, ja_test_path),
)

In [3]:
# Tokenizer
# Name of the pretrained checkpoint whose tokenizer is loaded below.
# NOTE(review): the "ja-en" suffix suggests a Japanese -> English model — confirm
# this is the intended translation direction for the data encoded later.
model_name = 'Helsinki-NLP/opus-mt-ja-en'
# Load the Marian tokenizer that belongs to `model_name`.
tokenizer = MarianTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/782k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]



In [4]:
# Tokenize and save data in chunks
def tokenize_and_save_chunks(texts, tokenizer, prefix, output_dir, num_chunks=5, max_length=512):
    """Tokenize ``texts`` and save the result as ``num_chunks`` ``.pt`` files.

    The texts are split into ``num_chunks`` consecutive slices of
    ``len(texts) // num_chunks`` items each; the last slice also receives the
    remainder. Every text is tokenized on its own, padded/truncated to
    ``max_length``, and the per-text tensors of a slice are stacked into a
    dict ``{'input_ids': ..., 'attention_mask': ...}`` that is written with
    ``torch.save`` to ``<output_dir>/<prefix>_encoded_chunk_<k>.pt``
    (``k`` counts from 1).

    Args:
        texts: Sequence of strings to tokenize (must support ``len`` and slicing).
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", padding="max_length",
            truncation=True, max_length=max_length)`` and returning a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        prefix: File-name prefix for the chunk files.
        output_dir: Directory that receives the chunk files; created if missing.
        num_chunks: Number of chunk files to write. Must be at least 1.
        max_length: Padding/truncation length passed to the tokenizer.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 1.
    """
    if num_chunks < 1:
        raise ValueError(f"num_chunks must be a positive integer, got {num_chunks!r}")

    # exist_ok=True removes the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    chunk_size = len(texts) // num_chunks

    for i in range(num_chunks):
        start_idx = i * chunk_size
        # The last chunk absorbs whatever is left after the integer division.
        end_idx = (i + 1) * chunk_size if i != num_chunks - 1 else len(texts)
        chunk_texts = texts[start_idx:end_idx]

        input_ids = []
        attention_mask = []
        for text in tqdm(chunk_texts, desc=f"Tokenizing chunk {i + 1}/{num_chunks}"):
            tokenized = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=max_length)
            # Drop the leading batch dimension of size 1 added by return_tensors="pt".
            input_ids.append(tokenized['input_ids'].squeeze(0))
            attention_mask.append(tokenized['attention_mask'].squeeze(0))

        if input_ids:
            # Convert lists to tensors
            tokenized_data = {
                'input_ids': torch.stack(input_ids),
                'attention_mask': torch.stack(attention_mask),
            }
        else:
            # torch.stack() refuses an empty list, which happens when there are
            # fewer texts than chunks. Save zero-row tensors instead so that all
            # `num_chunks` files still exist for downstream loaders.
            # NOTE(review): dtype long assumes the tokenizer returns int64
            # tensors (the Hugging Face default) — confirm for custom tokenizers.
            tokenized_data = {
                'input_ids': torch.empty((0, max_length), dtype=torch.long),
                'attention_mask': torch.empty((0, max_length), dtype=torch.long),
            }

        # Save chunk
        file_path = os.path.join(output_dir, f'{prefix}_encoded_chunk_{i + 1}.pt')
        torch.save(tokenized_data, file_path)

# Start the wall-clock timer for the whole encoding run
start_time = time.time()

# Folder that receives the encoded chunk files
output_dir = '/content/drive/MyDrive/machine_learning/JESC/MarianMT/MarianMT_encoded_data'

# Encode the four splits one after another, in a fixed order.
# NOTE(review): English and Japanese are both encoded with the tokenizer's
# default call. If the checkpoint is ja -> en, the English side may need
# target-side tokenization (`text_target=`) — confirm against the Marian docs.
for split_texts, split_prefix in (
    (en_train, 'JESC_en_train'),
    (en_test, 'JESC_en_test'),
    (ja_train, 'JESC_ja_train'),
    (ja_test, 'JESC_ja_test'),
):
    tokenize_and_save_chunks(split_texts, tokenizer, split_prefix, output_dir)

# Store the tokenizer files next to the encoded data
tokenizer_save_path = '/content/drive/MyDrive/machine_learning/JESC/MarianMT/tokenizer'
tokenizer.save_pretrained(tokenizer_save_path)

print("Data has been tokenized, split into chunks, and saved successfully.")
print(f"Tokenizer has been saved to {tokenizer_save_path}.")

# Stop the timer; the elapsed time is in seconds
end_time = time.time()
processing_time = end_time - start_time

def format_time(seconds):
    """Render a duration in seconds as a "Processing time: Hh Mm S.SSs" string.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        A string with whole hours, whole minutes, and the remaining seconds
        formatted with two decimals.
    """
    whole_hours, within_hour = divmod(seconds, 3600)
    whole_minutes = within_hour // 60
    leftover = seconds % 60
    return f"Processing time: {int(whole_hours)}h {int(whole_minutes)}m {leftover:.2f}s"

# Print the elapsed time (end_time - start_time) in hours/minutes/seconds form.
print(format_time(processing_time))

Tokenizing chunk 1/5: 100%|██████████| 504249/504249 [02:16<00:00, 3684.65it/s]
Tokenizing chunk 2/5: 100%|██████████| 504249/504249 [02:33<00:00, 3290.31it/s]
Tokenizing chunk 3/5: 100%|██████████| 504249/504249 [02:29<00:00, 3374.96it/s]
Tokenizing chunk 4/5: 100%|██████████| 504249/504249 [02:26<00:00, 3448.01it/s]
Tokenizing chunk 5/5: 100%|██████████| 504253/504253 [02:26<00:00, 3443.67it/s]
Tokenizing chunk 1/5: 100%|██████████| 56027/56027 [00:15<00:00, 3514.41it/s]
Tokenizing chunk 2/5: 100%|██████████| 56027/56027 [00:16<00:00, 3310.41it/s]
Tokenizing chunk 3/5: 100%|██████████| 56027/56027 [00:17<00:00, 3207.21it/s]
Tokenizing chunk 4/5: 100%|██████████| 56027/56027 [00:17<00:00, 3174.81it/s]
Tokenizing chunk 5/5: 100%|██████████| 56031/56031 [00:16<00:00, 3402.79it/s]
Tokenizing chunk 1/5: 100%|██████████| 504249/504249 [02:05<00:00, 4012.78it/s]
Tokenizing chunk 2/5: 100%|██████████| 504249/504249 [02:12<00:00, 3814.37it/s]
Tokenizing chunk 3/5: 100%|██████████| 504249/5042

Data has been tokenized, split into chunks, and saved successfully.
Tokenizer has been saved to /content/drive/MyDrive/machine_learning/JESC/MarianMT/tokenizer.
Processing time: 0h 29m 18.36s
