# 论文复现：Munkhdalai et al. - 2024 - Leave No Context Behind: Efficient Infinite Context Transformers with Infini-attention

韩子坚 2024.6.9

## 准备工作
### 数据集说明

使用论文中提到的 PG-19 和 BookSum 数据集

`./data/pg19`

`./data/booksum`

### 环境说明
`conda=24.4.0`

`python=3.12.3`

`cuda=12.1.105`

`transformers=4.41.2` transformers必须大于等于4.40.0才能运行 qwen2


## 实验一 Long-context Language Modeling

论文中给模型输入超长上下文，但是并没有提及分段的方式。

transformers 中的几种 DataCollator 似乎都无法实现这种分段的需求，于是尝试实现一个 segmented_data_collator。

In [1]:
import torch
from typing import Any, Dict, List, NewType, Mapping
InputDataClass = NewType("InputDataClass", Any)


def segmented_data_collator(features: List[InputDataClass], segment_length: int) -> Dict[str, Any]:
    """Collate ``features`` into batched tensors and cut them into segments.

    Each feature is a mapping (or an object whose ``vars()`` is a mapping)
    from a field name such as ``"input_ids"`` to a token sequence. Every field
    is stacked into a ``(batch, seq_len)`` tensor and then split along the
    sequence dimension into consecutive pieces of at most ``segment_length``
    tokens; the last piece may be shorter.

    Args:
        features: The examples of one batch. All examples must share the same
            keys and the same sequence length per key.
        segment_length: Maximum number of tokens per segment; must be positive.

    Returns:
        A dict mapping every field name to a tuple of tensors, one tensor per
        segment, in sequence order. An empty ``features`` list yields ``{}``.

    Raises:
        ValueError: If ``segment_length`` is not positive.
    """
    if segment_length <= 0:
        raise ValueError(f"segment_length must be positive, got {segment_length}")
    if not features:
        return {}

    # Accept plain objects as well as mappings.
    if not isinstance(features[0], Mapping):
        features = [vars(f) for f in features]

    batch = {}
    for key, first_value in features[0].items():
        column = [f[key] for f in features]
        if isinstance(first_value, torch.Tensor):
            stacked = torch.stack(column)
        else:
            stacked = torch.tensor(column)
        # "labels" is segmented exactly like every other field so that segment
        # i of the labels lines up with segment i of the inputs.
        batch[key] = torch.split(stacked, segment_length, dim=1)

    return batch

分析论文：

> We trained and evaluated small Infini-Transformer models on PG19 (Rae et al., 2019) and Arxiv-math (Wu et al., 2022) benchmarks.

可知论文使用了 PG19 和 Arxiv-math 数据集

接下来引入相关依赖，前面照抄 transformers 的 run_clm_no_trainer.py，后面再多引入一个 Dataset 即可


In [None]:
# `pathlib` is part of the Python 3 standard library; the PyPI package of that
# name is an obsolete backport, so it is not installed here.
! pip install transformers datasets accelerate torch huggingface-hub tqdm packaging ninja
# flash_attn compiles against the already-installed torch, so it is installed
# in a second step and without build isolation.
! pip install flash_attn --no-build-isolation

In [1]:
# Install pipreqs and let it (re)write requirements.txt for the current
# directory; --force overwrites an existing requirements.txt.
# NOTE(review): pipreqs scans .py files by default — confirm whether notebook
# imports are picked up without an extra flag.
! pip install pipreqs
! pipreqs . --force

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
Collecting pipreqs
  Downloading http://mirrors.aliyun.com/pypi/packages/36/38/cc1343c3a63655e18328e51e00c6e6851be648f1b8babffc5131f1b9f226/pipreqs-0.5.0-py3-none-any.whl (33 kB)
Collecting docopt==0.6.2 (from pipreqs)
  Downloading http://mirrors.aliyun.com/pypi/packages/a2/55/8f8cab2afd404cf578136ef2cc5dfb50baa1761b68c9da1fb1e4eed343c9/docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting ipython==8.12.3 (from pipreqs)
  Downloading http://mirrors.aliyun.com/pypi/packages/8d/97/8fe103906cd81bc42d3b0175b5534a9f67dccae47d6451131cf8d0d70bb2/ipython-8.12.3-py3-none-any.whl (798 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.3/798.3 kB[0m [31m393.8 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting nbconvert<8.0.0,>=7.11.0 (from pipreqs)
  Downloading http://mirrors.aliyun.com/pypi/packages/b8/bb/bb5b6a515d1584aa2fd89965b11db6632e4bdc69495a52374bcc36e56cfa/

In [3]:
import logging
import math
import os
import random
from itertools import chain
from pathlib import Path

import datasets
import torch
from accelerate import Accelerator, DistributedType
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    SchedulerType,
    default_data_collator,
    get_scheduler,
)
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
from datasets import Dataset

# Abort early if the installed transformers is older than the version this
# notebook was written against.
check_min_version("4.40.0.dev0")

# accelerate's logger wrapper (accepts main_process_only=..., used below).
logger = get_logger(__name__)

# Same early check for the datasets package; the second argument is the hint
# shown to the user when the requirement is not met.
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

# Config classes registered in MODEL_MAPPING and their model_type strings.
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

  from .autonotebook import tqdm as notebook_tqdm


分析论文：

> We set the Infini-attention segment length N to 2048 for all attention layers and the input sequence length to 32768 for training. 

> For the long-context language modeling task, we set the learning rate to 0.01 by performing small search over values of 0.003, 0.005, 0.01 and 0.03.

确定参数：

`segment_length=2048`

`block_size=32768`

`learning_rate=0.01`


设置参数

In [4]:
# Keyword arguments forwarded to Accelerator(); collected in a dict first so
# tracking options can be toggled in one place.
accelerator_log_kwargs = {}


accelerator_log_kwargs["log_with"] = "all"
# NOTE(review): this is the string "None", not the None object, so it is
# passed on as a directory name — confirm that is intended.
accelerator_log_kwargs["project_dir"] = "None"
segment_length = 2048 # From the paper: "We set the Infini-attention segment length N to 2048 for all attention layers"
# gradient_accumulation_steps = args.block_size // segment_length
accelerator = Accelerator(**accelerator_log_kwargs)

# Make one log on every process with the configuration for debugging.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger.info(accelerator.state, main_process_only=False)
# Only the local main process keeps verbose library logging; the other
# processes are reduced to errors.
if accelerator.is_local_main_process:
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()
else:
    datasets.utils.logging.set_verbosity_error()
    transformers.utils.logging.set_verbosity_error()

# Barrier: let every process finish its setup before continuing.
accelerator.wait_for_everyone()

Detected kernel version 4.19.90, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
06/09/2024 13:23:20 - INFO - __main__ - Distributed environment: DistributedType.NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cpu

Mixed precision type: no



下载数据集

下载数据集失败了，设置一下代理

In [4]:
import os
# Send HTTP and HTTPS traffic through the local proxy. Both the lower-case and
# the upper-case spelling of each variable are set.
proxy = 'http://127.0.0.1:7897'
os.environ.update(
    dict.fromkeys(("http_proxy", "HTTP_PROXY", "https_proxy", "HTTPS_PROXY"), proxy)
)

load数据集的时候特别慢，关闭之后又得重新下，查阅手册发现三个优化方法：

-  并行加载，num_proc=8
-  使用cache_dir保存数据集，cache_dir="./cache_directory"
-  保存数据集，`.save_to_disk()`

In [5]:
dataset_name = "pg19"
dataset_config_name = None
validation_split_percentage = 5
data_path = "./data/pg19"
# Load PG-19 from the local copy, using 8 worker processes.
raw_datasets = load_dataset(data_path, dataset_config_name, num_proc=8)
if "validation" not in raw_datasets.keys():
    # No validation split available: carve the first
    # `validation_split_percentage` percent of "train" off as validation.
    # These loads read from the same local `data_path` as the load above, so
    # the fallback does not depend on reaching the hub.
    raw_datasets["validation"] = load_dataset(
        data_path,
        dataset_config_name,
        num_proc=8,
        # cache_dir='./cache_directory',
        split=f"train[:{validation_split_percentage}%]",
    )
    raw_datasets["train"] = load_dataset(
        data_path,
        dataset_config_name,
        num_proc=8,
        # cache_dir='./cache_directory',
        split=f"train[{validation_split_percentage}%:]",
    )

# Persist the loaded splits so later sessions do not have to rebuild them.
# NOTE(review): this writes into the same directory the data was loaded from
# (`data_path`) — confirm that mixing both in one directory is intended.
save_directory = "./data/pg19"
raw_datasets.save_to_disk(save_directory)

Saving the dataset (1/1 shards): 100%|██████████| 28602/28602 [00:00<00:00, 1018771.88 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 50/50 [00:00<00:00, 22021.97 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 45202.11 examples/s]


选用Qwen2-7B

在 jupyter 里面一直下不动，代理和镜像也无济于事，只好在命令行 git clone 了

需要先配置一下 git lfs

```bash
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
apt install git-lfs

git lfs install
```


In [6]:
import os
# SECURITY: an access token must never be written into the notebook. If the
# model needs authentication, export HUGGING_FACE_HUB_TOKEN in the shell that
# starts Jupyter (or run `huggingface-cli login`). The token that used to be
# hard-coded here has been exposed and should be revoked.
# NOTE(review): HF_ENDPOINT is presumably read when huggingface_hub is first
# imported; transformers was already imported in an earlier cell, so confirm
# that setting it here still takes effect.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
model_name_or_path = "Qwen/Qwen2-7B-Instruct"
trust_remote_code = False
# Local directory that receives a copy of the tokenizer and model weights.
save_directory = "./local_model_save"
config = AutoConfig.from_pretrained(
    model_name_or_path,
    trust_remote_code=trust_remote_code,
)

tokenizer_name = model_name_or_path
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name, use_fast=True, trust_remote_code=trust_remote_code)
tokenizer.save_pretrained(save_directory)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    # Only treat the source as a TensorFlow checkpoint when its name says so.
    from_tf=bool(".ckpt" in model_name_or_path),
    config=config,
    trust_remote_code=trust_remote_code,
)
model.save_pretrained(save_directory)


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2-7B-Instruct/snapshots/41c66b0be1c3081f13defc6bdf946c2ef240d6a6/config.json
Model config Qwen2Config {
  "_name_or_path": "Qwen/Qwen2-7B-Instruct",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 28,
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "rms_norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "sliding_window": 131072,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 152064
}

loading file vocab.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2-7B

KeyboardInterrupt: 

tokenize

In [None]:
preprocessing_num_workers = None
overwrite_cache = False
block_size = 32768 
# Grow the embedding matrix only when the tokenizer has more entries than the
# model's input embedding has rows.
embedding_size = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_size:
    model.resize_token_embeddings(len(tokenizer))


# Tokenize the "text" column when it exists, otherwise the first column.
column_names = raw_datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]

def tokenize_function(examples):
    """Run the tokenizer over the text column of one batch of examples."""
    return tokenizer(examples[text_column_name])

# The main process runs the map first; the original columns are dropped so
# only the tokenizer output remains.
with accelerator.main_process_first():
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not overwrite_cache,
        desc="Running tokenizer on dataset",
    )



# Clamp block_size to the tokenizer's maximum length, warning when the
# requested value had to be reduced.
if block_size > tokenizer.model_max_length:
    logger.warning(
        f"The block_size passed ({block_size}) is larger than the maximum length for the model "
        f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
    )
block_size = min(block_size, tokenizer.model_max_length)



使用自定义的 segmented_data_collator 切块

In [None]:

def group_texts(examples):
    """Concatenate a batch of tokenized texts and re-cut it into blocks.

    Every column of ``examples`` is flattened into one long sequence, the tail
    that does not fill a whole block of ``block_size`` items is dropped, and
    the rest is cut into consecutive blocks. A ``"labels"`` column holding a
    shallow copy of the ``"input_ids"`` blocks is added to the result.
    """
    flattened = {}
    for column in examples.keys():
        flattened[column] = list(chain.from_iterable(examples[column]))

    # The first column determines how many whole blocks are available.
    first_column = list(flattened)[0]
    usable_length = len(flattened[first_column]) // block_size * block_size
    block_starts = range(0, usable_length, block_size)

    result = {}
    for column, tokens in flattened.items():
        result[column] = [tokens[start : start + block_size] for start in block_starts]

    result["labels"] = list(result["input_ids"])
    return result


# Regroup the tokenized texts into rows of exactly `block_size` tokens.
with accelerator.main_process_first():
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=preprocessing_num_workers,
        load_from_cache_file=not overwrite_cache,
        desc=f"Grouping texts in chunks of {block_size}",
    )

# The mapped splits are used as they are. Cutting each row into segments of
# `segment_length` tokens happens in the collate function of the dataloaders
# below; the `datasets.Dataset` constructor does not accept a dataset and a
# segment length.
train_dataset = lm_datasets["train"]
eval_dataset = lm_datasets["validation"]


# Log a few random training rows; never ask for more rows than exist.
for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
    logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

per_device_train_batch_size=1
train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=lambda batch: segmented_data_collator(batch, segment_length), batch_size=per_device_train_batch_size
)
per_device_eval_batch_size=8
eval_dataloader = DataLoader(
    eval_dataset, collate_fn=lambda batch: segmented_data_collator(batch, segment_length), batch_size=per_device_eval_batch_size
)

# Two optimizer parameter groups: parameters whose name contains one of the
# `no_decay` substrings get no weight decay, all others get `weight_decay`.
# NOTE(review): with weight_decay=0.0 both groups behave the same. Before
# enabling weight decay, check that these substrings match the norm-layer
# parameter names of the model actually loaded.
no_decay = ["bias", "layer_norm.weight"]
weight_decay=0.0
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]


训练

In [None]:
learning_rate=0.01
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=learning_rate)

# The number of optimizer steps is derived from the epoch count, so it has to
# be recomputed after accelerator.prepare() (the dataloader length may change).
overrode_max_train_steps = False
gradient_accumulation_steps=1
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)

num_train_epochs=10
max_train_steps = num_train_epochs * num_update_steps_per_epoch
overrode_max_train_steps = True

lr_scheduler_type="linear"
num_warmup_steps=0
lr_scheduler = get_scheduler(
    name=lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps * accelerator.num_processes,
    num_training_steps=max_train_steps
    if overrode_max_train_steps
    else max_train_steps * accelerator.num_processes,
)

model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)

if accelerator.distributed_type == DistributedType.TPU:
    model.tie_weights()


# Recompute the step counts with the prepared dataloader.
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
if overrode_max_train_steps:
    max_train_steps = num_train_epochs * num_update_steps_per_epoch
num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)


# Train!
total_batch_size = per_device_train_batch_size * accelerator.num_processes * gradient_accumulation_steps

logger.info("***** Running training *****")
logger.info(f"  Num examples = {len(train_dataset)}")
logger.info(f"  Num Epochs = {num_train_epochs}")
logger.info(f"  Instantaneous batch size per device = {per_device_train_batch_size}")
logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
logger.info(f"  Total optimization steps = {max_train_steps}")

progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
completed_steps = 0
starting_epoch = 0



progress_bar.update(completed_steps)

for epoch in range(starting_epoch, num_train_epochs):
    model.train()
    # model.gradient_checkpointing_enable()
    total_loss = 0

    active_dataloader = train_dataloader
    for step, batch in enumerate(active_dataloader):
        # Each batch field is a sequence of segments. Run the segments in
        # order and accumulate their gradients before one optimizer step.
        segments = zip(batch["input_ids"], batch["attention_mask"], batch["labels"])
        for input_ids, attention_mask, labels in segments:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            accelerator.backward(loss)
            total_loss += loss.detach().float()
        # NOTE(review): reset_memory() is not a stock transformers method;
        # this assumes `model` is an Infini-attention variant that provides it.
        model.reset_memory()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()


        if accelerator.sync_gradients:
            progress_bar.update(1)
            completed_steps += 1
        # Log the training loss and lr every 100 steps
        if completed_steps % 100 == 0:
            print(f"Step: {completed_steps}, Loss: {loss.item()}, LR: {lr_scheduler.get_last_lr()[0]}")

        if completed_steps >= max_train_steps:
            break
    model.eval()
    losses = []
    for batch in eval_dataloader:
        # Average the loss over all segments of the batch, so the reported
        # perplexity covers the whole context rather than only the last
        # segment.
        segment_losses = []
        segments = zip(batch["input_ids"], batch["attention_mask"], batch["labels"])
        with torch.no_grad():
            for input_ids, attention_mask, labels in segments:
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                segment_losses.append(outputs.loss.detach().float())
        model.reset_memory()
        loss = torch.stack(segment_losses).mean()
        # The batch size is a notebook-level variable; no `args` object exists here.
        losses.append(accelerator.gather_for_metrics(loss.repeat(per_device_eval_batch_size)))

    losses = torch.cat(losses)
    try:
        eval_loss = torch.mean(losses)
        perplexity = math.exp(eval_loss)
    except OverflowError:
        perplexity = float("inf")

    logger.info(f"epoch {epoch}: perplexity: {perplexity} eval_loss: {eval_loss}")


## 实验二 passkey retrieval

分析论文：

> The passkey task hides a random number into a long text and asks it back at the model output. The length of the distraction text is varied by repeating a text chunk multiple times. The previous work (Chen et al., 2023a) showed that a 8B LLaMA model can solve the task up to 32K length when fine-tuned with the same 32K length inputs with Position Interpolation. We take this challenge further and fine-tune on only 5K length inputs to test on 1M length regime.

可知，32K 是前人工作（Chen et al., 2023a）的设置：8B LLaMA 在 32K 长度的输入上微调后，可以在 32K 长度以内完成该任务。而论文中的 Infini-Transformer 只在 5K 长度的输入上微调，并在最长 1M 长度的输入上测试。本复现的输入长度取 32k 量级（见下方参数）。

确定参数：

`num_tokens=32000`



设计 PasskeyRetrievalDataset 类

In [None]:
import os
import math
import torch
import argparse
import random
import numpy as np
from numpy import random
from tqdm import tqdm
import transformers
import peft
from peft import LoraConfig, get_peft_model
from peft import TaskType
from torch.utils.data import DataLoader
from accelerate import Accelerator
from datasets import Dataset
from transformers import default_data_collator
import bitsandbytes as bnb
class PasskeyRetrievalDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-generated ``(prompt, answer)`` pairs.

    The base class is spelled out as ``torch.utils.data.Dataset`` because the
    bare name ``Dataset`` in this notebook refers to the Hugging Face
    ``datasets.Dataset`` class, which is not meant to be subclassed with a
    custom ``__init__`` like this one.
    """

    def __init__(self, data):
        # `data` is any indexable, sized collection of (prompt, answer) pairs.
        self.data = data

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the pair at ``index`` as a ``(prompt, answer)`` tuple."""
        prompt, answer = self.data[index]
        return prompt, answer

设定参数

In [None]:
# Checkpoint used for the passkey experiment.
base_model = "Qwen/Qwen2-7B-Instruct"
# Size of the distraction text.
# NOTE(review): later cells pass this value on as `n_garbage`, which is used
# to slice characters rather than tokens — confirm the intended unit.
num_tokens = 32000
# Training batch size handed to passkey_retrieval_test.
batch_size = 1

分析论文：

> Below we showed the input format of the passkey task. There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there. The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again. (repeat x times) The pass key is 9054. Remember it. 9054 is the pass key. The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again. (repeat y times) What is the pass key? The pass key is

> For each test subset, we controlled the position of the passkey so that it is either located around the beginning, middle or the end of the input sequence. We reported both zero-shot accuracy and finetuning accuracy. Infini-Transformers solved the task with up to 1M context length after fine-tuning on 5K length inputs for 400 steps.

由此可知这个实验实际上是在一段长文本中隐藏一个关键词，看模型能否在长文本中提取关键词。

In [None]:
def generate_prompt_landmark(n_garbage, seed):
    """Build one passkey-retrieval prompt and its expected answer.

    The prompt consists of a task description, a filler prefix, the line that
    states the pass key, a filler suffix and the final question, joined by
    newlines. Prefix and suffix together contain exactly ``n_garbage``
    characters of repeated filler text; the split point and the pass key are
    drawn from NumPy's global random generator seeded with ``seed``.

    Args:
        n_garbage: Total number of filler characters; must be at least 1.
        seed: Seed that makes the split point and the pass key reproducible.

    Returns:
        A tuple ``(prompt, pass_key)`` where ``pass_key`` is the key as a
        string of digits.

    Raises:
        ValueError: If ``n_garbage`` is smaller than 1.
    """
    if n_garbage < 1:
        raise ValueError(f"n_garbage must be at least 1, got {n_garbage}")

    task_description = "There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there."
    garbage = "The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again."
    # Repeat the filler just often enough to cover n_garbage characters
    # (each repetition contributes the sentence plus one joining space).
    repeats = n_garbage // (len(garbage) + 1) + 1
    garbage_inf = " ".join([garbage] * repeats)

    # Seed NumPy's global generator only for the two draws below, and always
    # restore the caller's state, even if a draw raises.
    rnd_state = np.random.get_state()
    try:
        np.random.seed(seed)
        n_garbage_prefix = np.random.randint(0, n_garbage)
        pass_key = np.random.randint(1, 50000)
    finally:
        np.random.set_state(rnd_state)

    n_garbage_suffix = n_garbage - n_garbage_prefix
    garbage_prefix = garbage_inf[:n_garbage_prefix]
    garbage_suffix = garbage_inf[:n_garbage_suffix]
    information_line = f"The pass key is {pass_key}. Remember it. {pass_key} is the pass key."
    final_question = "What is the pass key? The pass key is"
    lines = [
        task_description,
        garbage_prefix,
        information_line,
        garbage_suffix,
        final_question,
    ]
    return "\n".join(lines), str(pass_key)

In [None]:
def passkey_retrieval_test(model, tokenizer, accelerator, use_cache=False, n_garbage=60000, seed=666, segment_length=2048, num_train_epochs=3, train_batch_size=1, learning_rate=3e-4, num_train_examples=1000):
    """Fine-tune ``model`` with LoRA on passkey prompts, then test one prompt.

    Args:
        model: Causal LM to fine-tune. It must provide ``reset_memory()``.
        tokenizer: Tokenizer matching ``model``.
        accelerator: ``Accelerator`` used to prepare and run training.
        use_cache: Forwarded to ``model.generate``.
        n_garbage: Amount of filler text per prompt, forwarded to
            ``generate_prompt_landmark``.
        seed: Base seed. Training example ``i`` uses ``seed + i``; the
            evaluation prompt uses ``seed + num_train_examples``, so it is not
            one of the training prompts.
        segment_length: Number of tokens fed to the model per forward pass.
        num_train_epochs: Number of passes over the training prompts.
        train_batch_size: Batch size of the training dataloader.
        learning_rate: Initial learning rate, decayed linearly to zero over
            all optimizer steps.
        num_train_examples: Number of training prompts to generate.

    Returns:
        ``True`` when the generated tokens equal the tokenized pass key.
    """
    # Generate training data. Every example gets its own seed; reusing one
    # seed would produce the same prompt every time.
    train_data = []
    for offset in range(num_train_examples):
        prompt, answer = generate_prompt_landmark(n_garbage, seed + offset)
        train_data.append({'text': prompt, 'labels': answer})

    train_dataset = Dataset.from_list(train_data)

    def tokenize_function(examples):
        # Tokenize the text and labels
        # NOTE(review): the labels are the tokenized answer alone, padded to
        # the same length as the prompt, so they are not position-aligned with
        # `input_ids` — confirm this is the intended training target.
        inputs = tokenizer(examples['text'], padding="max_length", truncation=True)
        inputs['labels'] = tokenizer(examples['labels'], padding="max_length", truncation=True)['input_ids']
        return inputs


    tokenized_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)

    train_dataloader = DataLoader(tokenized_dataset, batch_size=train_batch_size, shuffle=True, collate_fn=default_data_collator)

    # Prepare the model for LoRa training
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=['beta',"q_proj", "v_proj", "k_proj", "o_proj"], # Include 'beta' in the target modules
    )
    model = get_peft_model(model, peft_config)

    # Prepare the optimizer and scheduler. The scheduler is stepped once per
    # batch, so the decay has to span every optimizer step, not one step per
    # epoch.
    optimizer = bnb.optim.Adam8bit(model.parameters(), lr=learning_rate)
    total_steps = max(1, num_train_epochs * len(train_dataloader))
    lr_scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.0, total_iters=total_steps)

    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, lr_scheduler
    )

    # Training loop
    for epoch in range(num_train_epochs):
        model.train()
        total_loss = 0

        for step, batch in enumerate(train_dataloader):

            # Segment along the sequence dimension (dim=1); dim 0 is the batch.
            input_segments = torch.split(batch['input_ids'], segment_length, dim=1)
            label_segments = torch.split(batch['labels'], segment_length, dim=1)

            for input_segment, label_segment in zip(input_segments, label_segments):
                outputs = model(input_ids=input_segment, labels=label_segment)
                loss = outputs.loss
                accelerator.backward(loss)
                total_loss += loss.detach().float()
            model.reset_memory()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}, Loss: {avg_loss.item()}")

    # Evaluation on a prompt whose seed was not used for training.
    model.eval()
    prompt, answer = generate_prompt_landmark(n_garbage, seed + num_train_examples)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(accelerator.device)

    # Tokenize the answer without special tokens. Slicing off the first token
    # "to drop BOS" would remove a real answer token when the tokenizer does
    # not prepend one.
    answer_ids = tokenizer(answer, return_tensors="pt", add_special_tokens=False).input_ids

    # Segment the input_ids into smaller chunks of shape (1, <=segment_length)
    input_segments = torch.split(input_ids, segment_length, dim=1)

    with torch.no_grad():
        # Feed every segment except the last one.
        # NOTE(review): this presumably lets the model fold them into its
        # memory — confirm against the model implementation.
        for segment in input_segments[:-1]:
            model(input_ids=segment)

        # Generate the answer from the last segment. No memory argument is
        # passed here, since none is defined in this function.
        generation_output = model.generate(
            input_ids=input_segments[-1], max_new_tokens=answer_ids.shape[-1], num_beams=1, use_cache=use_cache
        )

    model_answer = generation_output[0, -answer_ids.shape[-1]:].cpu()
    # Both tensors are on the CPU, so they can be compared directly
    answer_ids = answer_ids.cpu()
    is_correct = (model_answer == answer_ids[0]).all().item()
    print(f"The correct answer is {tokenizer.decode(answer_ids[0].cpu())}")
    print(f"The model answer is {tokenizer.decode(model_answer.cpu())}, is_correct : {is_correct}")
    return is_correct

In [None]:
# Set RoPE scaling factor
# Load the base model's config and override a few fields before the model is
# instantiated. No RoPE scaling option is set here.
config = transformers.AutoConfig.from_pretrained(
    base_model,
)
config.use_cache=False
# NOTE(review): confirm the loaded model reads `num_experts_per_tok`.
config.num_experts_per_tok = 1
# NOTE(review): 2048 equals the default `segment_length` of
# passkey_retrieval_test — presumably chosen to match; confirm.
config.max_position_embeddings = 2048

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    config=config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

tokenizer = transformers.AutoTokenizer.from_pretrained(
    base_model,
    padding_side="right",
    use_fast=False,
)

accelerator = Accelerator(mixed_precision='bf16')

# NOTE(review): `n_garbage` is used to slice characters of filler text, while
# `num_tokens` is named after tokens — confirm the intended unit.
n_garbage = num_tokens
is_correct = passkey_retrieval_test(model, tokenizer, accelerator, use_cache=False, n_garbage=n_garbage, seed=420, train_batch_size=batch_size)
print(f"Accuracy: {'Passed' if is_correct else 'Failed'}")