Merge pull request #14 from NMZivkovic/feature/custom-vocab
Classes for custom vocabulary
NMZivkovic committed Sep 9, 2022
2 parents ac1c9f1 + 0f29cef commit 150e40a
Showing 5 changed files with 30 additions and 4 deletions.
5 changes: 4 additions & 1 deletion README.md
@@ -67,7 +67,7 @@ While working with BERT Models from Huggingface in combination with ML.NET, I st
I documented them in [here](https://rubikscode.net/2021/10/25/using-huggingface-transformers-with-ml-net/).</br>
However, the biggest challenge by far was that I needed to implement my own tokenizer and pair it with the correct vocabulary.
So, I decided to extend it and publish my implementation as a NuGet package and an open-source project.
- More info about this project can be found in this [blog post](https://rubikscode.net/2021/11/01/bert-tokenizers-for-ml-net/)
+ More info about this project can be found in this [blog post](https://rubikscode.net/2021/11/01/bert-tokenizers-for-ml-net/). </br>

This repository contains tokenizers for the following models:<br />
· BERT Base<br />
@@ -77,6 +77,8 @@ This repository contains tokenizers for the following models:<br />
· BERT Base Uncased<br />
· BERT Large Uncased<br />

+ There are also classes that let you load your own vocabulary.

<p align="right">(<a href="#top">back to top</a>)</p>
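As a usage sketch of the custom-vocabulary classes this commit adds: the vocabulary file path is hypothetical, and the `Encode(sequenceLength, text)` call shape is an assumption based on the library's public tokenizer API, not something shown in this diff.

```csharp
using System;
using BERTTokenizers;

class CustomVocabDemo
{
    static void Main()
    {
        // Hypothetical path to a WordPiece vocabulary file (one token per line).
        var tokenizer = new BertCasedCustomVocabulary("vocab/custom_vocab.txt");

        // Assumed API: encode a single text to a fixed sequence length.
        var encoded = tokenizer.Encode(64, "Custom vocabularies are handy.");
        Console.WriteLine(encoded.Count);
    }
}
```

This requires the BERTTokenizers NuGet package and a real vocabulary file, so it is a sketch rather than a runnable sample.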

### Built With
@@ -194,6 +196,7 @@ n.zivkovic@rubikscode.net</br>
## Acknowledgments

* Gianluca Bertani - Performance Improvements
+ * [Paul Calot](https://github.com/PaulCalot) - First Token bugfix

<p align="right">(<a href="#top">back to top</a>)</p>

2 changes: 1 addition & 1 deletion src/BERTTokenizers.csproj
@@ -21,7 +21,7 @@
· BERT Large Uncased</Description>
<PackageReleaseNotes>Open-source project for BERT tokenizers that can be used in C#.</PackageReleaseNotes>
<PackageTags>BERT, Tokenizer, csharp, dotnet</PackageTags>
- <Version>1.1.0</Version>
+ <Version>1.2.0</Version>
</PropertyGroup>

<ItemGroup>
6 changes: 4 additions & 2 deletions src/Base/TokenizerBase.cs
@@ -2,6 +2,7 @@
using System;
using System.Collections.Generic;
using System.Linq;
+ using System.Text.RegularExpressions;

namespace BERTTokenizers.Base
{
@@ -118,7 +119,7 @@ private IEnumerable<(string Token, int VocabularyIndex)> TokenizeSubwords(string
{
string prefix = null;
int subwordLength = remaining.Length;
- while (subwordLength >= 2)
+ while (subwordLength >= 1) // was 2, which prevented single-character tokens from ever matching
{
string subword = remaining.Substring(0, subwordLength);
if (!_vocabularyDict.ContainsKey(subword))
@@ -138,7 +139,8 @@
return tokens;
}

- remaining = remaining.Replace(prefix, "##");
+ var regex = new Regex(prefix);
+ remaining = regex.Replace(remaining, "##", 1);

tokens.Add((prefix, _vocabularyDict[prefix]));
}
11 changes: 11 additions & 0 deletions src/BertCasedCustomVocabulary.cs
@@ -0,0 +1,11 @@
using BERTTokenizers.Base;

namespace BERTTokenizers
{
public class BertCasedCustomVocabulary : CasedTokenizer
{
public BertCasedCustomVocabulary(string vocabularyFilePath) : base(vocabularyFilePath) { }

}

}
10 changes: 10 additions & 0 deletions src/BertUncasedCustomVocabulary.cs
@@ -0,0 +1,10 @@
using BERTTokenizers.Base;

namespace BERTTokenizers
{
public class BertUncasedCustomVocabulary : UncasedTokenizer
{
public BertUncasedCustomVocabulary(string vocabularyFilePath) : base(vocabularyFilePath) { }

}
}
