
Reduction scheduler does not handle allocation domain properly and triggers an assert when the reduction output has a specified allocation domain #2202

Open
jjsjann123 opened this issue May 5, 2024 · 21 comments
Labels: allocation domain (issues related to allocation domain support)

Comments

@jjsjann123 (Collaborator)

Found a repro here:

TEST_F(AllocationDomainTest, ReductionOnPermutedReshape) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());

  TensorView* in = makeContigConcreteTensor({16, 12, 128, 192});
  in->setAllocationDomain(
      {in->axis(0), in->axis(2), in->axis(1), in->axis(3)}, true);
  TensorView* dqkv = permute(in, {0, 2, 1, 3});
  dqkv = reshape(dqkv, {16, 128, 12, 192}, {16, 128, 2304});
  TensorView* sum_out = sum(dqkv, {0, 1});
  sum_out->setAllocationDomain(
      {sum_out->axis(2), sum_out->axis(0), sum_out->axis(1)}, true);

  fusion->addInput(in);
  fusion->addOutput(sum_out);

  FusionExecutorCache fec(std::move(fusion));
  at::Tensor in_tensor =
      at::randn({16 * 12 * 128 * 192})
          .cuda()
          .as_strided({16, 12, 128, 192}, {128 * 12 * 192, 192, 12 * 192, 1});
  std::vector<at::Tensor> out_tensors = fec.runFusionWithInputs({in_tensor});
  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
}

I tried to patch it by just fixing the reduction properties (#2201), but that hit another issue with vectorization.

I realized that the reduction scheduler assumes the inputs to a reduction operation have the same allocation domain as its output, which is not necessarily true.
IIUC, an inner reduction means that the reduction dimension contains the innermost iter domain on the inputs, rather than on the output.

@jjsjann123 (Collaborator, Author) commented May 5, 2024

Adding another Python example, from a Thunder einsum, that fails when my simple allocation domain propagation is enabled. Note that this is only for my own reference as a repro.

import torch
from nvfuser import FusionDefinition, DataType

def nvfuser_fusion_id33(fd : FusionDefinition) -> None :
    T0 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[None, True, True, True], dtype=DataType.Float, is_cpu=False, stride_order=[3, 2, 1, 0])
    T1 = fd.define_tensor(shape=[1, 1, -1], contiguity=[None, None, True], dtype=DataType.Float, is_cpu=False, stride_order=[2, 1, 0])
    T2 = fd.define_tensor(shape=[-1, 1, -1], contiguity=[True, None, True], dtype=DataType.Float, is_cpu=False, stride_order=[2, 1, 0])
    T3 = fd.ops.sum(T2, dims=[1], keepdim=False, dtype=DataType.Null)
    T4 = fd.ops.permute(T3, dims=[1, 0])
    S5 = fd.define_scalar(1, dtype=DataType.Int)
    S6 = fd.define_scalar(1, dtype=DataType.Int)
    S7 = fd.define_scalar(1, dtype=DataType.Int)
    S8 = fd.define_scalar(2, dtype=DataType.Int)
    S9 = fd.define_scalar(2, dtype=DataType.Int)
    V10 = fd.define_vector([S5, S6, S7, S8, S9], dtype=DataType.Int)
    T11 = fd.ops.reshape(T4, new_shape=V10)
    S12 = fd.define_scalar(1, dtype=DataType.Int)
    S13 = fd.define_scalar(4, dtype=DataType.Int)
    S14 = fd.define_scalar(5, dtype=DataType.Int)
    S15 = fd.define_scalar(2, dtype=DataType.Int)
    S16 = fd.define_scalar(2, dtype=DataType.Int)
    V17 = fd.define_vector([S12, S13, S14, S15, S16], dtype=DataType.Int)
    T18 = fd.ops.broadcast_in_dim(T11, shape=V17, broadcast_dims=[0, 1, 2, 3, 4])
    S19 = fd.define_scalar(1.00000, dtype=DataType.Double)
    S20 = fd.define_scalar(1, dtype=DataType.Int)
    S21 = fd.define_scalar(4, dtype=DataType.Int)
    S22 = fd.define_scalar(5, dtype=DataType.Int)
    S23 = fd.define_scalar(2, dtype=DataType.Int)
    S24 = fd.define_scalar(2, dtype=DataType.Int)
    V25 = fd.define_vector([S20, S21, S22, S23, S24], dtype=DataType.Int)
    T26 = fd.ops.full(shape=V25, fill_value=S19, dtype=DataType.Float)
    T27 = fd.ops.permute(T26, dims=[0, 1, 2, 4, 3])
    T28 = fd.ops.mul(T18, T27)
    T29 = fd.ops.sum(T28, dims=[0, 3, 4], keepdim=False, dtype=DataType.Null)
    S30 = fd.define_scalar(1, dtype=DataType.Int)
    S31 = fd.define_scalar(4, dtype=DataType.Int)
    S32 = fd.define_scalar(5, dtype=DataType.Int)
    S33 = fd.define_scalar(1, dtype=DataType.Int)
    S34 = fd.define_scalar(1, dtype=DataType.Int)
    V35 = fd.define_vector([S30, S31, S32, S33, S34], dtype=DataType.Int)
    T36 = fd.ops.broadcast_in_dim(T29, shape=V35, broadcast_dims=[1, 2])
    S37 = fd.define_scalar(1, dtype=DataType.Int)
    S38 = fd.define_scalar(4, dtype=DataType.Int)
    S39 = fd.define_scalar(5, dtype=DataType.Int)
    V40 = fd.define_vector([S37, S38, S39], dtype=DataType.Int)
    T41 = fd.ops.reshape(T36, new_shape=V40)
    T42 = fd.ops.permute(T41, dims=[2, 0, 1])
    S43 = fd.define_scalar(3, dtype=DataType.Int)
    S44 = fd.define_scalar(5, dtype=DataType.Int)
    S45 = fd.define_scalar(1, dtype=DataType.Int)
    S46 = fd.define_scalar(4, dtype=DataType.Int)
    V47 = fd.define_vector([S43, S44, S45, S46], dtype=DataType.Int)
    T48 = fd.ops.broadcast_in_dim(T42, shape=V47, broadcast_dims=[1, 2, 3])
    T49 = fd.ops.sum(T1, dims=[0], keepdim=False, dtype=DataType.Null)
    S50 = fd.define_scalar(1, dtype=DataType.Int)
    S51 = fd.define_scalar(5, dtype=DataType.Int)
    S52 = fd.define_scalar(1, dtype=DataType.Int)
    S53 = fd.define_scalar(1, dtype=DataType.Int)
    V54 = fd.define_vector([S50, S51, S52, S53], dtype=DataType.Int)
    T55 = fd.ops.reshape(T49, new_shape=V54)
    S56 = fd.define_scalar(3, dtype=DataType.Int)
    S57 = fd.define_scalar(5, dtype=DataType.Int)
    S58 = fd.define_scalar(1, dtype=DataType.Int)
    S59 = fd.define_scalar(4, dtype=DataType.Int)
    V60 = fd.define_vector([S56, S57, S58, S59], dtype=DataType.Int)
    T61 = fd.ops.broadcast_in_dim(T55, shape=V60, broadcast_dims=[0, 1, 2, 3])
    T62 = fd.ops.mul(T61, T48)
    T63 = fd.ops.sum(T62, dims=[1, 2], keepdim=False, dtype=DataType.Null)
    T64 = fd.ops.sum(T0, dims=[1], keepdim=False, dtype=DataType.Null)
    T65 = fd.ops.permute(T64, dims=[1, 0, 2])
    S66 = fd.define_scalar(3, dtype=DataType.Int)
    S67 = fd.define_scalar(1, dtype=DataType.Int)
    S68 = fd.define_scalar(1, dtype=DataType.Int)
    S69 = fd.define_scalar(4, dtype=DataType.Int)
    V70 = fd.define_vector([S66, S67, S68, S69], dtype=DataType.Int)
    T71 = fd.ops.reshape(T65, new_shape=V70)
    S72 = fd.define_scalar(3, dtype=DataType.Int)
    S73 = fd.define_scalar(5, dtype=DataType.Int)
    S74 = fd.define_scalar(1, dtype=DataType.Int)
    S75 = fd.define_scalar(4, dtype=DataType.Int)
    V76 = fd.define_vector([S72, S73, S74, S75], dtype=DataType.Int)
    T77 = fd.ops.broadcast_in_dim(T71, shape=V76, broadcast_dims=[0, 1, 2, 3])
    T78 = fd.ops.mul(T77, T48)
    T79 = fd.ops.sum(T78, dims=[0, 2, 3], keepdim=False, dtype=DataType.Null)
    T80 = fd.ops.mul(T61, T77)
    T81 = fd.ops.sum(T80, dims=[0], keepdim=False, dtype=DataType.Null)
    T82 = fd.ops.permute(T81, dims=[1, 2, 0])
    S83 = fd.define_scalar(1, dtype=DataType.Int)
    S84 = fd.define_scalar(4, dtype=DataType.Int)
    S85 = fd.define_scalar(5, dtype=DataType.Int)
    S86 = fd.define_scalar(1, dtype=DataType.Int)
    S87 = fd.define_scalar(1, dtype=DataType.Int)
    V88 = fd.define_vector([S83, S84, S85, S86, S87], dtype=DataType.Int)
    T89 = fd.ops.reshape(T82, new_shape=V88)
    S90 = fd.define_scalar(1, dtype=DataType.Int)
    S91 = fd.define_scalar(4, dtype=DataType.Int)
    S92 = fd.define_scalar(5, dtype=DataType.Int)
    S93 = fd.define_scalar(2, dtype=DataType.Int)
    S94 = fd.define_scalar(2, dtype=DataType.Int)
    V95 = fd.define_vector([S90, S91, S92, S93, S94], dtype=DataType.Int)
    T96 = fd.ops.broadcast_in_dim(T89, shape=V95, broadcast_dims=[0, 1, 2, 3, 4])
    T97 = fd.ops.mul(T96, T27)
    T98 = fd.ops.sum(T97, dims=[0, 1, 2], keepdim=False, dtype=DataType.Null)
    fd.add_output(T79)
    fd.add_output(T63)
    fd.add_output(T98)

with FusionDefinition() as fd:
    nvfuser_fusion_id33(fd)

inputs = [
    torch.randn((24,), dtype=torch.float32, device='cuda:0').as_strided((1, 2, 3, 4), (24, 12, 4, 1)),
    torch.randn((5,), dtype=torch.float32, device='cuda:0').as_strided((1, 1, 5), (5, 5, 1)),
    torch.randn((4,), dtype=torch.float32, device='cuda:0').as_strided((2, 1, 2), (2, 2, 1)),
]
fd.execute(inputs)

jjsjann123 added the allocation domain label on May 5, 2024
@jjsjann123 (Collaborator, Author)

cc'ing @liqiangxl in case this is something you might be interested in. 😜

@jjsjann123 (Collaborator, Author)

We should take a couple of reduction/normalization schedulers and iterate through permutations of the allocation domain (from rfactor) for inputs/outputs. That should verify that the schedulers are properly handling allocation domains.

@jjsjann123 (Collaborator, Author)

We should take a couple of reduction/normalization schedulers and iterate through permutations of the allocation domain (from rfactor) for inputs/outputs. That should verify that the schedulers are properly handling allocation domains.

Highlighting a comment from #2200 (comment) regarding comprehensive test cases for allocation domain support in the reduction scheduler.

We'll want some representative end-to-end tests to validate our functional support.
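
As a rough sketch of that coverage idea (the test name and shapes are placeholders; it reuses only the harness calls that appear in the repros in this issue, and on ToT some permutations are expected to hit the assert discussed here):

TEST_F(AllocationDomainTest, ReductionInputAllocationPermutations) {
  // Iterate over all permutations of the input's axes as its allocation
  // order; needs <algorithm> for std::next_permutation.
  std::vector<int64_t> perm = {0, 1, 2};
  std::vector<int64_t> sizes = {16, 128, 1024};
  do {
    auto fusion = std::make_unique<Fusion>();
    FusionGuard fg(fusion.get());

    TensorView* in = makeContigConcreteTensor({16, 128, 1024});
    in->setAllocationDomain(
        {in->axis(perm[0]), in->axis(perm[1]), in->axis(perm[2])}, true);
    TensorView* out = sum(in, {1});
    fusion->addInput(in);
    fusion->addOutput(out);

    // Stride the input so it is contiguous in the permuted allocation
    // order, mirroring the as_strided calls in the repros above.
    std::vector<int64_t> strides(3, 0);
    int64_t stride = 1;
    for (int i = 2; i >= 0; --i) {
      strides[perm[i]] = stride;
      stride *= sizes[perm[i]];
    }
    at::Tensor in_tensor =
        at::randn({16 * 128 * 1024}).cuda().as_strided(sizes, strides);

    FusionExecutorCache fec(std::move(fusion));
    std::vector<at::Tensor> out_tensors = fec.runFusionWithInputs({in_tensor});
    testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
  } while (std::next_permutation(perm.begin(), perm.end()));
}

A full sweep would also permute the output's allocation domain and cover normalization patterns, but the shape of the loop stays the same.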

@naoyam (Collaborator) commented May 8, 2024

Could you add the error message here?

@jjsjann123 (Collaborator, Author)

The old assert happens here:

NVF_ERROR(merge_3d(tv) == 3, "Tried 3D merge, but result is not 3D.");

That one can be patched with #2201, but afterwards our vectorization is wrong, since the input tensor cannot be vectorized.
This is not surprising: looking at the reduction output as the reference, the scheduler mistakenly thinks this is an inner reduction.

@naoyam (Collaborator) commented May 8, 2024

So, with #2201, the issue is now with the vectorization analysis?

@jjsjann123 (Collaborator, Author)

So, with #2201, the issue is now with the vectorization analysis?

That's what is failing right now; admittedly it is a problem.

Though I don't think that's the root cause. #2201 just patched the assert, but the scheduler is still wrong... I think the reduction scheduler should be looking at the reduction inputs instead of its output to decide which reduction variant to use (inner vs. outer). Does that sound right to you as well?

@naoyam (Collaborator) commented May 8, 2024

Admittedly, I need to refresh my memory about the reduction scheduler, but why does it matter? Because the input and output may have different orderings of allocation domains?

@jjsjann123 (Collaborator, Author)

Admittedly, I need to refresh my memory about the reduction scheduler, but why does it matter? Because the input and output may have different orderings of allocation domains?

Exactly, and that would change what the read pattern is, which I think dictates which variant of the reduction scheduler we should use.

@naoyam (Collaborator) commented May 8, 2024

I think it would be nice to have some simple concrete example.

@jjsjann123 (Collaborator, Author)

Sounds good. I'll get that in place to help with the discussion.

@jjsjann123 (Collaborator, Author)

TEST_F(AllocationDomainTest, PlayGround) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  TensorView* in = makeContigConcreteTensor({16, 1024, 1024});
  TensorView* sum_out = sum(in, {0, 1});
  sum_out->setAllocationDomain(
      {sum_out->axis(2), sum_out->axis(0), sum_out->axis(1)}, true);
  fusion->addInput(in);
  fusion->addOutput(sum_out);
  FusionExecutorCache fec(std::move(fusion));
  at::Tensor in_tensor =
      at::randn({16 * 1024 * 1024})
          .cuda()
          .as_strided({16, 1024, 1024}, {1024 * 1024, 1024, 1});
  std::vector<at::Tensor> out_tensors = fec.runFusionWithInputs({in_tensor});
  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
}

Does this simpler example look better?

TensorView* sum_out = sum(in, {0, 1}); is an outer reduction on the input tensor, but it is mistakenly treated as an inner reduction when analyzing sum_out.
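
To spell out the mismatch for this example (descriptive labels only, not scheduler output):

// What the inner-vs-outer check sees for the PlayGround fusion above:
//
//   in       logical: [i16, i1024a, i1024b]    alloc: [i16, i1024a, i1024b]
//   sum_out  logical: [r16, r1024a, i1024b]    alloc: [i1024b, r16, r1024a]
//
// Walking sum_out's allocation domain from the innermost position, the first
// non-broadcast id is r1024a, a reduction id, so the reduction is classified
// as an inner (fastest-dim) reduction. On the input, however, the innermost
// allocated id is i1024b, which is not reduced, so the actual read pattern
// is that of an outer reduction.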

@naoyam (Collaborator) commented May 8, 2024

sum_out->setAllocationDomain({sum_out->axis(2), sum_out->axis(0), sum_out->axis(1)}, true);

When would this happen in practice? sum_out doesn't actually allocate the reduction domains, so I wonder what would cause this ordering.

@jjsjann123 (Collaborator, Author)

sum_out->setAllocationDomain({sum_out->axis(2), sum_out->axis(0), sum_out->axis(1)}, true);

When would this happen in practice? sum_out doesn't actually allocate the reduction domains, so I wonder what would cause this ordering.

^^^ That's a valid argument. Do we want to have reduction domains in the allocation domain at all, and what meaning do they carry?

To keep the question simpler, we can assume that the order is propagated from the inputs.

How about this example:

TEST_F(AllocationDomainTest, PlayGround) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  TensorView* in = makeContigConcreteTensor({16, 1024, 1024});
  in->setAllocationDomain(
      {in->axis(2), in->axis(0), in->axis(1)}, true);
  TensorView* sum_out = sum(in, {0, 1});
  sum_out->setAllocationDomain(
      {sum_out->axis(2), sum_out->axis(0), sum_out->axis(1)}, true);
  fusion->addInput(in);
  fusion->addOutput(sum_out);
  FusionExecutorCache fec(std::move(fusion));
  at::Tensor in_tensor =
      at::randn({16 * 1024 * 1024})
          .cuda()
          .as_strided({16, 1024, 1024}, {1024 * 1024, 1024, 1});
  std::vector<at::Tensor> out_tensors = fec.runFusionWithInputs({in_tensor});
  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
}

We can see here that the allocation order is propagated from input to output, so maybe it makes more sense?!

On ToT, this example triggers the same issue in the merge_3d(tv) == 3 assert, but it is patched with #2201. I hope it conveys the idea.

@naoyam (Collaborator) commented May 9, 2024

That's a valid argument. Do we want to have reduction domains in the allocation domain at all, and what meaning do they carry?

Yeah, we don't allocate those domains, so it seems to make more sense to drop them from allocation domains.

In this example, IIUC, both the input and output just look like an inner reduction, so nothing inconsistent?
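
A minimal sketch of what dropping them could look like when analyzing a reduction output (a hypothetical helper, not existing code; it only uses accessors quoted elsewhere in this thread):

std::vector<IterDomain*> allocatedIterDomainsOnly(TensorView* tv) {
  // Keep the allocation order but skip reduction iter domains, since they
  // are never actually allocated for a reduction output.
  std::vector<IterDomain*> result;
  for (IterDomain* id : tv->getMaybeAllocationDomain()) {
    if (!id->isReduction()) {
      result.push_back(id);
    }
  }
  return result;
}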

@jjsjann123 (Collaborator, Author)

That's a valid argument. Do we want to have reduction domains in the allocation domain at all, and what meaning do they carry?

Yeah, we don't allocate those domains, so it seems to make more sense to drop them from allocation domains.

In this example, IIUC, both the input and output just look like an inner reduction, so nothing inconsistent?

You are right. The second example does not have the inconsistency. The problem with this one is that our reduction scheduler isn't handling the allocation domain properly, and that's where the assert happens.

So really there are two problems:

  1. Functional issue: allocation domain handling in the reduction scheduler isn't totally right, and we can hit an assert.
  2. Performance issue: we are not choosing the right reduction scheduler when inputs are permuted.

For the performance issue, we can look at the example below; these two reductions are identical in concept.

TEST_F(AllocationDomainTest, PlayGround) {
  preseg_passes::OptimizationPassGuard<preseg_passes::AllocationDomainPass> guard(false);
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  TensorView* in = makeContigConcreteTensor({16*1024, 1024*36});
  in->setAllocationDomain(
      {in->axis(1), in->axis(0)}, true);
  TensorView* sum_out = sum(in, {1});
  fusion->addInput(in);
  fusion->addOutput(sum_out);
  FusionExecutorCache fec(std::move(fusion));
  at::Tensor in_tensor =
      at::randn({16 * 1024 * 1024 * 36}) 
          .cuda()
          .as_strided({16 * 1024, 1024 * 36}, {1, 1024*16});
  std::vector<at::Tensor> out_tensors = fec.runFusionWithInputs({in_tensor});
  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
}

TEST_F(AllocationDomainTest, PlayGround_ref) {
  preseg_passes::OptimizationPassGuard<preseg_passes::AllocationDomainPass> guard(false);
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  TensorView* in = makeContigConcreteTensor({1024*36, 16*1024});
  TensorView* sum_out = sum(in, {0});
  fusion->addInput(in);
  fusion->addOutput(sum_out);
  FusionExecutorCache fec(std::move(fusion));
  at::Tensor in_tensor =
      at::randn({16 * 1024 * 1024 * 36}) 
          .cuda()
          .as_strided({1024 * 36, 16 * 1024}, {1024*16, 1}); 
  std::vector<at::Tensor> out_tensors = fec.runFusionWithInputs({in_tensor});
  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
}

But PlayGround runs through an inner reduction with
kernelreduction_f0_c1_r0_g0 run in 11.7279 ms, achieved: 206.004 GB/s

Meanwhile, PlayGround_ref runs through an outer reduction with
kernelreduction_f0_c1_r0_g0 run in 1.65069 ms, achieved: 1463.62 GB/s

This is the simple logic our reduction scheduler uses to choose between inner vs. outer reduction:

Fuser/csrc/scheduler/utils.cpp, lines 2224 to 2239 at 66b2a0a:

bool isFastestDimReduction(TensorView* tv) {
  for (auto it = tv->getMaybeAllocationDomain().rbegin();
       it != tv->getMaybeAllocationDomain().rend();
       ++it) {
    auto root_id = *it;
    if (root_id->isBroadcast()) {
      continue;
    } else if (root_id->isReduction()) {
      return true;
    } else {
      return false;
    }
  }
  return false;
}

FYI, the reference tv in the function above is the output of a reduction.
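
For comparison, a sketch of what an input-based check could look like (illustrative only: isFastestDimReductionOnInput and its reduction_axes parameter are hypothetical, and it assumes an unscheduled input whose allocation domain is a permutation of its logical domain, as in the examples above; needs <unordered_set>):

bool isFastestDimReductionOnInput(
    TensorView* in,
    const std::vector<int64_t>& reduction_axes) {
  // Collect the iter domains being reduced, by their positions in the
  // (unscheduled) input's domain, e.g. {1} for the PlayGround example above.
  std::unordered_set<IterDomain*> reduced_ids;
  for (int64_t axis : reduction_axes) {
    reduced_ids.insert(in->axis(axis));
  }
  // Walk the input's allocation domain from the innermost position outwards,
  // mirroring the loop in isFastestDimReduction.
  for (auto it = in->getMaybeAllocationDomain().rbegin();
       it != in->getMaybeAllocationDomain().rend();
       ++it) {
    IterDomain* id = *it;
    if (id->isBroadcast()) {
      continue;
    }
    return reduced_ids.count(id) > 0;
  }
  return false;
}

With this view, PlayGround above is classified as an outer reduction (the innermost allocated id of in is axis 0, which is not reduced), matching the much faster schedule that PlayGround_ref gets.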

@naoyam (Collaborator) commented May 9, 2024

The examples make sense, but isn't the performance problem fixed by the allocation order propagation?

@jjsjann123 (Collaborator, Author)

Touché. We can discuss whether we want to prioritize the performance issue; maybe it's something that can wait until the issue shows up. I'm wondering how channels-last vision models would behave with this (convolution followed by normalization).

Allocation order propagation doesn't always fix the performance problem in practice. There could be a user-specified output format, which isn't something the propagation can change; also, the propagation is currently done on the whole fusion segment, so it could struggle to figure out the right TVs to propagate from/to.

I'll follow up on the performance issue in a separate thread.

So, back to the original issue this one is about: the scheduler should at least be functional, and we need to patch that.

@naoyam (Collaborator) commented May 10, 2024

Allocation order propagation doesn't always fix the performance problem in practice. There could be a user-specified output format, which isn't something the propagation can change; also, the propagation is currently done on the whole fusion segment, so it could struggle to figure out the right TVs to propagate from/to.

This sounds like a case where a user specifies different allocation orderings between inputs and outputs. There doesn't seem to be an obviously right decision here, i.e., whether the input order or the output order should be adopted. I hope this is uncommon.

The scheduler should at least be functional, and we need to patch that.

Is it fixed by #2201?

@jjsjann123 (Collaborator, Author)

The scheduler should at least be functional, and we need to patch that.
Is it fixed by #2201?

It's not properly fixed. The C++ example in that PR still has wrong vectorization after the patch.

jjsjann123 added a commit that referenced this issue May 14, 2024
Refactored the allocation order inference pass:
* Instead of per-operation propagation rules, we now use IdModel
mapping to map the allocation domain of a reference tensor to the rfactor
domain of a target tensor.
* Updated the inference API to allow specifying sources and destinations
for the propagation.
  ```
void inferenceAllocationOrder(
    Fusion* fusion,
    const std::vector<TensorView*>& srcs,
    const std::vector<TensorView*>& dsts);
  ```

* The propagation tries to keep the memory format of `dsts` close to
the `srcs` to simplify scheduling as well as facilitate vectorization.
It works roughly as follows:
* For each entry `dst`, among all its producers in `srcs`, we find
the one with the most loop iter domains in its allocation domain as the
reference `ref`.
* We try to map each iter domain in `dst`'s rfactor domain to `ref`'s
allocation domain and push the mapped ones as the inner dimensions in
`dst`'s new allocation domain, while pushing unmapped iter domains as
outer dimensions.
* I had to put in a WAR for the mapping logic for now, since the reduction
scheduler is struggling with permuted output. See issue #2202. The WAR
simply keeps a reduction iter domain at the same position in the new
allocation domain as it has in the rfactor domain. This WAR is supposed
to be removed at a later point once we fix the reduction scheduler. I
kept both code paths in the PR for easier future cleanup.

---------

Co-authored-by: Naoya Maruyama <naoyam@users.noreply.github.com>
Co-authored-by: Jingyue Wu <wujingyue@gmail.com>
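
Rendered as pseudocode, the propagation described in that commit works roughly like this (a sketch only: pickReferenceProducer and mapsByIdModel stand in for the actual reference selection and IdModel mapping queries and are not real APIs; accessor names follow the rfactor terminology in the commit message; needs <algorithm> for std::find):

void inferenceAllocationOrderSketch(
    Fusion* fusion,
    const std::vector<TensorView*>& srcs,
    const std::vector<TensorView*>& dsts) {
  for (TensorView* dst : dsts) {
    // Among dst's producers that appear in srcs, pick the one whose
    // allocation domain has the most loop iter domains as the reference.
    TensorView* ref = pickReferenceProducer(dst, srcs);

    // Iter domains of dst that map to ref's allocation domain become the
    // inner dimensions, ordered by ref's allocation order.
    std::vector<IterDomain*> inner;
    for (IterDomain* ref_id : ref->getMaybeAllocationDomain()) {
      for (IterDomain* dst_id : dst->getMaybeRFactorDomain()) {
        if (mapsByIdModel(dst_id, ref_id) &&
            std::find(inner.begin(), inner.end(), dst_id) == inner.end()) {
          inner.push_back(dst_id);
        }
      }
    }
    // Unmapped iter domains stay outer, in their original rfactor order.
    std::vector<IterDomain*> new_alloc;
    for (IterDomain* dst_id : dst->getMaybeRFactorDomain()) {
      if (std::find(inner.begin(), inner.end(), dst_id) == inner.end()) {
        new_alloc.push_back(dst_id);
      }
    }
    new_alloc.insert(new_alloc.end(), inner.begin(), inner.end());
    dst->setAllocationDomain(new_alloc, true);
  }
}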