Skip to content

Commit

Permalink
Extend Nemo AutoTokenizer for TRT-LLM evaluation usage
Browse files Browse the repository at this point in the history
Signed-off-by: Jan Lasek <janek.lasek@gmail.com>
  • Loading branch information
janekl committed Apr 4, 2024
1 parent 71c73ae commit 0251e96
Showing 1 changed file with 24 additions and 0 deletions.
24 changes: 24 additions & 0 deletions nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,18 @@ def ids_to_text(self, ids):
text = self.tokens_to_text(tokens_clean)
return text

def encode(self, *args, **kwargs):
    """Delegate to ``self.tokenizer.encode`` and return its result unchanged.

    All positional and keyword arguments are forwarded as-is; this wrapper
    adds no processing of its own.
    """
    wrapped = self.tokenizer
    return wrapped.encode(*args, **kwargs)

def batch_encode_plus(self, *args, **kwargs):
    """Delegate to ``self.tokenizer.batch_encode_plus`` and return its result unchanged.

    All positional and keyword arguments are forwarded as-is; this wrapper
    adds no processing of its own.
    """
    wrapped = self.tokenizer
    return wrapped.batch_encode_plus(*args, **kwargs)

def decode(self, *args, **kwargs):
    """Delegate to ``self.tokenizer.decode`` and return its result unchanged.

    All positional and keyword arguments are forwarded as-is; this wrapper
    adds no processing of its own.
    """
    wrapped = self.tokenizer
    return wrapped.decode(*args, **kwargs)

def batch_decode(self, *args, **kwargs):
    """Delegate to ``self.tokenizer.batch_decode`` and return its result unchanged.

    All positional and keyword arguments are forwarded as-is; this wrapper
    adds no processing of its own.
    """
    wrapped = self.tokenizer
    return wrapped.batch_decode(*args, **kwargs)

@property
def vocab(self):
id2vocab = {v: k for k, v in self.tokenizer.vocab.items()}
Expand All @@ -241,6 +253,18 @@ def eos_id(self):
return None
return self.tokens_to_ids([getattr(self, 'eos_token')])[0]

@property
def pad_token_id(self):
    """Return the padding token id; an alias for ``self.pad_id``."""
    current_pad_id = self.pad_id
    return current_pad_id

@pad_token_id.setter
def pad_token_id(self, value):
    """Set the padding token from an id.

    ``value`` is passed to ``self.ids_to_tokens`` and whatever that returns
    is stored in ``self.pad_token``.
    """
    # NOTE(review): assumes ids_to_tokens accepts a single id here - confirm.
    resolved_token = self.ids_to_tokens(value)
    self.pad_token = resolved_token

@property
def eos_token_id(self):
    """Return the end-of-sequence token id; an alias for ``self.eos_id``."""
    current_eos_id = self.eos_id
    return current_eos_id

@property
def eod(self):
"""Returns EOS token id. Exact copy of the eos_id function. Required for megatron-core."""
Expand Down

0 comments on commit 0251e96

Please sign in to comment.