### RecurrentMemoryTransformer Training using Huggingface Trainer

This notebook demonstrates how to train RecurrentMemoryTransformer model using Huggingface Trainer.

In [1]:
# Import necessary libraries
import os
import torch
from transformers import (
    Trainer, 
    TrainingArguments, 
    AutoTokenizer, 
    AutoConfig, 
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset

# Import RecurrentMemoryTransformer modules
import sys
sys.path.append("../..")
from recurrent_memory_transformer.RecurrentMemoryTransformer import RecurrentMemoryTransformer
from recurrent_memory_transformer.PreTrainedRMTConfig import PreTrainedRMTConfig

  from .autonotebook import tqdm as notebook_tqdm


### Training Parameter Setup

In [12]:
# Dataset path and model parameters setup
dataset_path = "HuggingFaceFW/fineweb-edu"
dataset_name = "CC-MAIN-2024-10"

# Base model setup
base_model_name = "gpt2"  # Can be changed to any base model

# RMT parameters
is_memory_all = True
max_n_segments = 3
input_seg_len = 512
output_seg_len = 512
align = "left"
num_mem_tokens = 10

# Training parameters
output_dir = "./rmt_model_output"
learning_rate = 5e-5
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
num_train_epochs = 0.1
max_seq_length = 1024

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Loading and Preprocessing the Dataset

In [3]:
# データセットのロード
dataset = load_dataset(dataset_path, dataset_name)

dataset['train'] = dataset['train'].train_test_split(test_size=0.999, seed=42)['train']
if  "test" not in dataset:
    try:
        dataset = dataset["train"].train_test_split(test_size=100, seed=42)
    except:
        dataset = dataset.train_test_split(test_size=100, seed=42)
            
print(f"Dataset loaded: {dataset}")

Dataset loaded: DatasetDict({
    train: Dataset({
        features: ['text', 'id', 'dump', 'url', 'date', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
        num_rows: 19883
    })
    test: Dataset({
        features: ['text', 'id', 'dump', 'url', 'date', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
        num_rows: 100
    })
})


In [4]:
# トークナイザーのロード
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# データセットの前処理関数
def tokenize_function(examples):
    return tokenizer(
        examples["text"], 
        padding="max_length",
        truncation=True,
        max_length=max_seq_length
    )

# データセットの前処理
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=[col for col in dataset["train"].column_names if col != "text"],
    desc="Tokenizing dataset",
)

# データセットのフォーマットを設定
tokenized_dataset = tokenized_dataset.with_format("torch")

### Initialize RecurrentMemoryTransformer Model

In [5]:
# ベースモデルの設定をロード
base_config = AutoConfig.from_pretrained(base_model_name)

# RecurrentMemoryTransformerの設定を作成
rmt_config = PreTrainedRMTConfig(
    base_model_config=base_config,
    base_model_type=base_model_name,
    is_memory_all=is_memory_all,
    max_n_segments=max_n_segments,
    input_seg_len=input_seg_len,
    output_seg_len=output_seg_len,
    align=align,
    num_mem_tokens=num_mem_tokens
)

# ベースモデルをロード
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

# RecurrentMemoryTransformerモデルを初期化
model = RecurrentMemoryTransformer(rmt_config, base_model=base_model)
print(f"Model initialized with config: {rmt_config}")

Model initialized with config: PreTrainedRMTConfig {
  "_attn_implementation_autoset": true,
  "activation_function": "gelu_new",
  "align": "left",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "base_model_config": {
    "_attn_implementation_autoset": false,
    "_name_or_path": "gpt2",
    "activation_function": "gelu_new",
    "add_cross_attention": false,
    "architectures": [
      "GPT2LMHeadModel"
    ],
    "attn_pdrop": 0.1,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": 50256,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "embd_pdrop": 0.1,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 50256,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "id2label": {
    

### Data Collator Setup

In [6]:
# データコレーター（言語モデリング用）の設定
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # MLMではなくCLMを使用
)

### Trainer Setup and Training Execution

In [7]:
# トレーニング引数の設定
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    logging_dir=os.path.join(output_dir, "logs"),
    logging_steps=100,
    save_steps=1000,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=1000,
    fp16=torch.cuda.is_available(),  # GPUが利用可能なら半精度で学習
    gradient_accumulation_steps=2,   # 勾配蓄積ステップ
)

# Trainerの初期化
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"] if "test" in tokenized_dataset else None,
    tokenizer=tokenizer,
)

  trainer = Trainer(


[2025-03-17 19:25:11,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [8]:
# モデルの学習を実行
print("Starting training...")
train_result = trainer.train()

# 学習結果と指標の表示
print(f"Training metrics: {train_result.metrics}")

# モデルの保存
trainer.save_model()
print(f"Model saved to {output_dir}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Starting training...


[34m[1mwandb[0m: Currently logged in as: [33mshin2021001[0m ([33mshin2021001-osaka-city-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss


Training metrics: {'train_runtime': 181.4286, 'train_samples_per_second': 10.959, 'train_steps_per_second': 0.689, 'total_flos': 1045262499840000.0, 'train_loss': 3.598515930175781, 'epoch': 0.1005631536604988}
Model saved to ./rmt_model_output


### Model Evaluation

In [9]:
# モデルの評価（検証データセットがある場合）
if "validation" in tokenized_dataset:
    eval_results = trainer.evaluate()
    print(f"Evaluation results: {eval_results}")

### Testing Text Generation

In [13]:
# Test text generation
test_input = "Today is a wonderful day."
inputs = tokenizer(test_input, return_tensors="pt").to(device)
out = model.generate(input_ids = inputs["input_ids"], max_length=50)
out_text = tokenizer.decode(out[0], skip_special_tokens=True)
print(out_text)

 It is a day of celebration for all of us. It is a day of celebration for all of us. It is a day of celebration for all of us. It is a day of celebration for all of us.
