<a href="https://colab.research.google.com/github/Hiromi06/machine-translation/blob/main/MarianMT_train_chunk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import torch
import gc
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from transformers import MarianMTModel, MarianTokenizer, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import time

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
class TranslationDataset(Dataset):
    def __init__(self, input_data, label_data):
        self.input_ids = input_data['input_ids']
        self.attention_mask = input_data['attention_mask']
        self.labels = label_data['input_ids']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

def load_chunk(file_path):
    return torch.load(file_path)

def clear_memory():
    gc.collect()
    torch.cuda.empty_cache()

def train_on_chunk(model, optimizer, scheduler, device, input_chunk_path, label_chunk_path, batch_size=16):
    input_data = load_chunk(input_chunk_path)
    label_data = load_chunk(label_chunk_path)

    #print(f"Input data input_ids shape: {input_data['input_ids'].shape}")
    #print(f"Label data input_ids shape: {label_data['input_ids'].shape}")


    dataset = TranslationDataset(input_data, label_data)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    total_loss = 0
    model.train()
    scaler = GradScaler()

    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

         # トークンIDの範囲をチェック
        #print(f"Max token ID in input_ids: {input_ids.max()}")
        #print(f"Min token ID in input_ids: {input_ids.min()}")
        #print(f"Max token ID in labels: {labels.max()}")
        #print(f"Min token ID in labels: {labels.min()}")

        # 追加：データの範囲チェック
        #print(f"Batch input_ids shape: {input_ids.shape}")
        #print(f"Batch attention_mask shape: {attention_mask.shape}")
        #print(f"Batch labels shape: {labels.shape}")

        optimizer.zero_grad()

        with autocast():
            # 入力形状を確認
            #print(f"Input IDs: {input_ids.shape}, Attention Mask: {attention_mask.shape}, Labels: {labels.shape}")
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            # 出力の形状を確認
            #print(f"Model outputs shape: {outputs.logits.shape}")
            #print(f"Model loss: {loss.item()}")


        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

        loop.set_postfix(loss=total_loss/len(dataloader))

        scheduler.step()

    avg_train_loss = total_loss / len(dataloader)
    print(f"Train Loss: {avg_train_loss}")

    clear_memory()

def validate_on_chunks(model, device, input_chunk_paths, label_chunk_paths, batch_size=16):
    model.eval()
    total_eval_loss = 0

    for input_chunk_path, label_chunk_path in zip(input_chunk_paths, label_chunk_paths):
        input_data = load_chunk(input_chunk_path)
        label_data = load_chunk(label_chunk_path)

        dataset = TranslationDataset(input_data, label_data)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        with torch.no_grad():
            loop = tqdm(dataloader, leave=True)
            for i, batch in enumerate(loop):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_eval_loss += loss.item()

                loop.set_postfix(loss=total_eval_loss / ((i+1) * len(dataloader)))

        clear_memory()

    avg_val_loss = total_eval_loss / len(input_chunk_paths)
    print(f"Validation Loss: {avg_val_loss}")



In [4]:
start_time = time.time()

model_dir = '/content/drive/MyDrive/machine_learning/MarianMT/ep2/marian_model_chunk8_ep2'
original_model_name = 'Helsinki-NLP/opus-mt-ja-en'

model = MarianMTModel.from_pretrained(model_dir)
#print("model: ", model)
tokenizer = MarianTokenizer.from_pretrained(original_model_name)

model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

encoded_data_dir = '/content/drive/MyDrive/machine_learning/MarianMT/Marian_encoded_data'
ja_train_chunk = os.path.join(encoded_data_dir, 'ja_train_encoded_chunk_9.pt')
en_train_chunk = os.path.join(encoded_data_dir, 'en_train_encoded_chunk_9.pt')
ja_test_chunk = os.path.join(encoded_data_dir, 'ja_test_encoded_chunk_9.pt')
en_test_chunk = os.path.join(encoded_data_dir, 'en_test_encoded_chunk_9.pt')


num_training_steps = 1 * 1
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

print("Training on the chunk")
train_on_chunk(model, optimizer, scheduler, device, ja_train_chunk, en_train_chunk, batch_size=16)

# Perform validation on the first chunk
print("Validation on the chunk")
validate_on_chunks(model, device, [ja_test_chunk], [en_test_chunk], batch_size=16)

# Save the model
model_save_path_2 = '/content/drive/MyDrive/machine_learning/MarianMT/ep2/marian_model_chunk9_ep2'
model.save_pretrained(model_save_path_2)
tokenizer.save_pretrained(model_save_path_2)

end_time = time.time()
processing_time = end_time - start_time

def format_time(seconds):
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    seconds = seconds % 60
    return f"Processing time: {hours}h {minutes}m {seconds:.2f}s"

print(format_time(processing_time))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/782k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]



Training on the chunk


100%|██████████| 49298/49298 [1:37:10<00:00,  8.46it/s, loss=0.611]


Train Loss: 0.6113307106232052
Validation on the chunk


100%|██████████| 5478/5478 [10:01<00:00,  9.11it/s, loss=0.000108]
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[60715]], 'forced_eos_token_id': 0}


Validation Loss: 3233.9736083447933
Processing time: 1h 49m 20.34s
