<a href="https://colab.research.google.com/github/LeSaUi/DLtest/blob/main/better_transformer_with_torchtext.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Load torchtext and initialize XLM-R model

In [1]:
import torch
import torch.nn as nn
import torchtext

from torchtext.models import RobertaClassificationHead
from torchtext.functional import to_tensor

xlmr_large = torchtext.models.XLMR_LARGE_ENCODER
classifier_head = torchtext.models.RobertaClassificationHead(num_classes=2, input_dim = 1024)
model = xlmr_large.get_model(head=classifier_head)

# Put model into inference mode (reduces runtime even without BT - esp for GPU execution, required for Better Transformer)
model.eval()

# Define input transform
transform = xlmr_large.transform()


# System Information

In [2]:
import platform

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

cpu = platform.processor()
gpu = torch.cuda.get_device_name(DEVICE)

print(f"torch version: {torch.__version__}")
print(f"torch cuda available: {torch.cuda.is_available()}")
print(f"CPU type: {cpu}")
print(f"GPU type: {gpu}")

torch version: 2.0.1+cu118
torch cuda available: True
CPU type: x86_64
GPU type: Tesla T4


# Check default sparsity support setting
Sparsity support enables transformers to skip padding in inputs.

In [3]:
model.encoder.transformer.layers.enable_nested_tensor

True

# Benchmark setup

###Define inputs

In [4]:
small_input_batch = [
               "Hello world",
               "How are you!"
]
big_input_batch = [
               "Hello world",
               "How are you!",
               """`Well, Prince, so Genoa and Lucca are now just family estates of the
Buonapartes. But I warn you, if you don't tell me that this means war,
if you still try to defend the infamies and horrors perpetrated by
that Antichrist- I really believe he is Antichrist- I will have
nothing more to do with you and you are no longer my friend.
"""

#               `Well, Prince, so Genoa and Lucca are now just family estates of the
#Buonapartes. But I warn you, if you don't tell me that this means war,
#if you still try to defend the infamies and horrors perpetrated by
#that Antichrist- I really believe he is Antichrist- I will have
#nothing more to do with you and you are no longer my friend, no longer
#my 'faithful slave,' as you call yourself! But how do you do? I see
#I have frightened you- sit down and tell me all the news.`

#It was in July, 1805, and the speaker was the well-known Anna
#Pavlovna Scherer, maid of honor and favorite of the Empress Marya
#Fedorovna. With these words she greeted Prince Vasili Kuragin, a man
#of high rank and importance, who was the first to arrive at her
#reception. Anna Pavlovna had had a cough for some days. She was, as
#she said, suffering from la grippe; grippe being then a new word in
#St. Petersburg, used only by the elite."""
]

###Select small or big input set

Modify the assignment to input_batch below to select either the small_input_batch or big_inoput_batch, or substitute your own inputs.

In [5]:
input_batch=big_input_batch

model_input = to_tensor(transform(input_batch), padding_value=1)
output = model(model_input)
output.shape

torch.Size([3, 2])

###Iteration count for performance measurements

In [6]:
ITERATIONS=10

#Measure CPU  performance with slow and fast path, without and with sparsity

Sparsity support enables transformers to skip padding in inputs.


### CPU performance without BT sparsity

In [7]:
model.encoder.transformer.layers.enable_nested_tensor = False

In [8]:
print("slow path:")
print("==========")
with torch.autograd.profiler.profile(use_cuda=False) as prof:
  for i in range(ITERATIONS):
    output = model(model_input)
print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cpu_time_total", row_limit=5))

print("fast path:")
print("==========")
with torch.autograd.profiler.profile(use_cuda=False) as prof:
  with torch.no_grad():
    for i in range(ITERATIONS):
      output = model(model_input)
print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cpu_time_total", row_limit=5))



slow path:
--------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
--------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                 aten::addmm        66.90%       19.966s        68.39%       20.411s      27.582ms           740  
                                    aten::mm        22.18%        6.619s        22.18%        6.619s      27.579ms           240  
                                  aten::gelu         2.72%     810.839ms         2.72%     810.839ms       3.378ms           240  
                                 aten::copy_         2.57%     767.710ms         2.57%     767.710ms     350.553us          2190  
                                   aten::bmm         2.00%     597.387ms

  return torch._transformer_encoder_layer_fwd(


----------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
----------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             aten::addmm        37.60%       10.823s        37.80%       10.880s      21.760ms           500  
                 aten::_addmm_activation        30.20%        8.691s        32.24%        9.280s      38.666ms           240  
                                aten::mm        22.48%        6.471s        22.48%        6.471s      26.964ms           240  
                   aten::_masked_softmax         2.45%     704.373ms         2.46%     706.848ms       2.945ms           240  
       aten::_transform_bias_rescale_qkv         1.86%     535.882ms         1.91%     549.817ms       2.291ms 

###CPU performance with BT sparsity

In [9]:
model.encoder.transformer.layers.enable_nested_tensor = True

In [10]:
print("slow path:")
print("==========")
with torch.autograd.profiler.profile(use_cuda=False) as prof:
  for i in range(ITERATIONS):
    output = model(model_input)
print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cpu_time_total", row_limit=5))

print("fast path:")
print("==========")
with torch.autograd.profiler.profile(use_cuda=False) as prof:
  with torch.no_grad():
    for i in range(ITERATIONS):
      output = model(model_input)
print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cpu_time_total", row_limit=5))



slow path:
--------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
--------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                 aten::addmm        68.98%       20.622s        69.61%       20.812s      28.124ms           740  
                                    aten::mm        22.69%        6.782s        22.69%        6.782s      28.260ms           240  
                                   aten::bmm         1.96%     586.740ms         1.96%     586.761ms       1.222ms           480  
                                  aten::gelu         1.67%     499.206ms         1.67%     499.206ms       2.080ms           240  
                                 aten::copy_         1.40%     419.243ms

  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)


-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::addmm        34.25%        4.763s        34.42%        4.786s       9.572ms           500  
                    aten::_addmm_activation        27.54%        3.830s        29.20%        4.061s      16.920ms           240  
                                   aten::mm        20.58%        2.861s        20.58%        2.861s      11.922ms           240  
          aten::_transform_bias_rescale_qkv         3.86%     536.441ms         6.07%     843.539ms       3.515ms           240  
                                  aten::bmm         3.55%     494.317ms         3.56%     

#Measure DEVICE performance with slow and fast path, without and with sparsity

Please ensure that the runtime has GPUs enabled to see the performance benefits of Better Transformer fastpath execution on GPUs. You can confirm and change the Runtime type in the Google Colab menu with (Runtime > Change Runtime Type)

In [11]:
model.to(DEVICE)
model.eval()
model_input = model_input.to(DEVICE)

### DEVICE performance without BT sparsity

In [12]:
model.encoder.transformer.layers.enable_nested_tensor=False

In [13]:
print("slow path:")
print("==========")
with torch.autograd.profiler.profile(use_cuda=True) as prof:
  for i in range(ITERATIONS):
    output = model(model_input)
print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total", row_limit=5))

print("fast path:")
print("==========")
with torch.autograd.profiler.profile(use_cuda=True) as prof:
  with torch.no_grad():
    for i in range(ITERATIONS):
      output = model(model_input)
print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total", row_limit=5))

slow path:
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                               aten::mm         2.08%      83.077ms        80.17%        3.204s      13.349ms        3.306s        78.66%        3.306s      13.776ms           240  
                                            aten::addmm         1.74%      69.481ms         2.44%      97.386ms     131.603us     374.939ms         8.92%     382.236ms     516.535us           740 

### DEVICE performance performance with BT sparsity

In [14]:
model.encoder.transformer.layers.enable_nested_tensor = True

In [15]:
model.to(DEVICE)
model_input = model_input.to(DEVICE)

print("slow path:")
print("==========")
with torch.autograd.profiler.profile(use_cuda=True) as prof:
  for i in range(ITERATIONS):
    output = model(model_input)
print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total", row_limit=5))

print("fast path:")
print("==========")
with torch.autograd.profiler.profile(use_cuda=True) as prof:
  with torch.no_grad():
    for i in range(ITERATIONS):
      output = model(model_input)
print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total", row_limit=5))

slow path:
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            aten::addmm         9.01%      47.495ms        12.17%      64.170ms      86.716us     357.134ms        47.46%     360.866ms     487.657us           740  
                                               aten::mm         1.81%       9.528ms         2.45%      12.931ms      53.879us     116.679ms        15.51%     116.679ms     486.163us           240 