# Unsupervised Subword Tokenizers vs. Morphology

Let's explore how unsupervised tokenizers, commonly used in Deep Learning, relate to the more linguistic aspects of Morphology. Your task is to tweak the code in order to see if subword tokenization could be a proxy for real morphological analysis. 




## Things you may need to do before running the code

### Install NLTK and Tokenizers packages:

```
pip install tokenizers
pip install nltk
```

### Download the Brown Corpus from NLTK


```
import nltk
nltk.download('brown')
```
 

In [1]:
# !pip install tokenizers
# !pip install nltk

In [2]:
import nltk
from nltk.corpus import brown
from datasets import load_dataset   
# Dump the BabyLM "strict_small" training text to a plain-text file, one
# dataset entry per line, so the tokenizer trainers can read it from disk.
# The dataset is loaded first so a failed download does not leave behind an
# empty, truncated corpus file.
corpus = load_dataset("cambridge-climb/BabyLM","strict_small",trust_remote_code = True)
# corpus['train']['text']
# count = 0
# # vocab = set()
# The context manager closes (and therefore flushes) the file before any later
# cell reads it; the explicit encoding avoids a platform-dependent default.
with open("BabyLM-corpus-strict-small.txt", "w", encoding="utf-8") as corpus_f:
    for s in corpus['train']['text']:
        corpus_f.write("".join(s) + '\n')
    
#     words =str(s).split()
#     count += len(words)
#     vocab.update(words)

# print("No. of words:", count)
# print("No. of unique words:", len(vocab))

  from .autonotebook import tqdm as notebook_tqdm


# Tokenizers

In [3]:
from tokenizers import Tokenizer

from tokenizers import pre_tokenizers
from tokenizers.pre_tokenizers import Whitespace

from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer

from tokenizers.models import Unigram
from tokenizers.trainers import UnigramTrainer

In [4]:
# Vocabulary size passed as `vocab_size` to the BPE, WordPiece and Unigram
# trainers in this notebook; vary it to see how the segmentations change.
VOCAB_SIZE = 50000   # You should be playing with this threshold

# Byte-Pair Encoding (BPE)  tokenization

In [5]:
import os

# Train a Byte-Pair Encoding tokenizer on the corpus file and save it as JSON.
BPE_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

trainer = BpeTrainer(vocab_size=VOCAB_SIZE,
                     special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

BPE_tokenizer.pre_tokenizer = Whitespace()    # This is optional...

files = ["BabyLM-corpus-strict-small.txt"]

BPE_tokenizer.train(files, trainer)

# Make sure the output directory exists so saving does not fail on a fresh
# checkout that has no "tokenizers/" folder yet.
os.makedirs("tokenizers", exist_ok=True)
BPE_tokenizer.save("tokenizers/BPE-tokenizer.json")

# Wordpiece tokenization

In [6]:
import os

# Train a WordPiece tokenizer on the corpus file and save it as JSON.
WP_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

WP_trainer = WordPieceTrainer(vocab_size=VOCAB_SIZE,
                              special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

WP_tokenizer.pre_tokenizer = Whitespace()    # This is optional...

files = ["BabyLM-corpus-strict-small.txt"]

WP_tokenizer.train(files, WP_trainer)

# Make sure the output directory exists so saving does not fail on a fresh
# checkout that has no "tokenizers/" folder yet.
os.makedirs("tokenizers", exist_ok=True)
WP_tokenizer.save("tokenizers/WP-tokenizer.json")

#  Unigram tokenization

In [7]:
import os

# Train a Unigram tokenizer on the corpus file and save it as JSON.
UG_tokenizer = Tokenizer(Unigram())

# The unknown token uses the same "[UNK]" string as the BPE and WordPiece
# tokenizers, so it is also one of the special tokens declared just below
# (the previous "<UNK>" matched none of them).
UG_trainer = UnigramTrainer(vocab_size=VOCAB_SIZE,
                            unk_token="[UNK]",
                            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

UG_tokenizer.pre_tokenizer = Whitespace()    # This is optional...

files = ["BabyLM-corpus-strict-small.txt"]

UG_tokenizer.train(files, UG_trainer)

# Make sure the output directory exists so saving does not fail on a fresh
# checkout that has no "tokenizers/" folder yet.
os.makedirs("tokenizers", exist_ok=True)
UG_tokenizer.save("tokenizers/UG-tokenizer.json")

# Let's compare the tokenizers

Your task here will be to use a small evaluation corpus to test how the different algorithms perform against one another, while varying the size of the vocabulary above.

Feel free to add other words to see how they are segmented (but you need to provide a gold segmentation for it to work).

In [8]:
# Some data extracted from https://github.com/sigmorphon/2022SegmentationST
#
# Each entry is [word, gold_morphemes]. The evaluation cell compares a
# tokenizer's pieces against gold_morphemes with exact list equality.
#
# NOTE(review): some gold lists do not concatenate to the surface word
# (e.g. "entaming" -> en + tame + ing, "wargamer" -> war + game + er,
# "encodability" -> en + code + ability). Presumably a tokenizer that only
# splits the written word can never match those entries exactly -- confirm
# whether that is intended when interpreting the scores.
test_corpus = [
    ["assistant", ["assist","ant"]],
    ["assistants", ["assist","ant","s"]],
    ["assist", ["assist"]],
    ["assisted",["assist","ed"]],
    ["assisting", ["assist","ing"]],
    ["assistance",["assist", "ance"]],
    ["assistive", ["assist","ive"]],
    ["assistful", ["assist","ful"]],
    ["assister", ["assist","er"]],
    ["unassisted", ["un","assist","ed"]],
    ["coassistance", ["co","assist","ance"]],
    ["coassists", ["co","assist","s"]],
    ["overassisting",["over","assist","ing"]],
    ["entaming", ["en", "tame", "ing"]],
    ["hoarders", ["hoard", "er", "s"]],
    ["visitorship", ["visit","or","ship"]],
    ["reorganises", ["re","organise","s"]],
    ["wargamer", ["war","game","er"]],               
    ["encodability", ["en","code","ability"]],
    ["healthy", ["health","y"]],
    ["buildings", ["build","ing","s"]],
    ["socioeconomy", ["socio","economy"]],
]    

In [9]:
# Print every [word, gold_morphemes] entry on its own line as a sanity check.
for instance in test_corpus:
    print(instance)

['assistant', ['assist', 'ant']]
['assistants', ['assist', 'ant', 's']]
['assist', ['assist']]
['assisted', ['assist', 'ed']]
['assisting', ['assist', 'ing']]
['assistance', ['assist', 'ance']]
['assistive', ['assist', 'ive']]
['assistful', ['assist', 'ful']]
['assister', ['assist', 'er']]
['unassisted', ['un', 'assist', 'ed']]
['coassistance', ['co', 'assist', 'ance']]
['coassists', ['co', 'assist', 's']]
['overassisting', ['over', 'assist', 'ing']]
['entaming', ['en', 'tame', 'ing']]
['hoarders', ['hoard', 'er', 's']]
['visitorship', ['visit', 'or', 'ship']]
['reorganises', ['re', 'organise', 's']]
['wargamer', ['war', 'game', 'er']]
['encodability', ['en', 'code', 'ability']]
['healthy', ['health', 'y']]
['buildings', ['build', 'ing', 's']]
['socioeconomy', ['socio', 'economy']]


In [10]:
# Exact-match evaluation: a tokenizer scores a point for a word only when the
# whitespace-separated pieces of its decoded output equal the gold morph list.
# A per-word comparison is collected in `report` and printed after the totals.
count_wp, count_bpe, count_ug = 0, 0, 0

report_lines = []
for word, morphs in test_corpus:

    # Encode/decode each word once per tokenizer and reuse the text for both
    # the score and the report (previously every word was tokenized twice).
    # The "#" characters in the WordPiece output are stripped so that its
    # pieces can be compared with the plain gold morphs.
    wp_text = WP_tokenizer.decode(WP_tokenizer.encode(word).ids).replace("#", '')
    bpe_text = BPE_tokenizer.decode(BPE_tokenizer.encode(word).ids)
    ug_text = UG_tokenizer.decode(UG_tokenizer.encode(word).ids)

    wp = wp_text.split()
    bpe = bpe_text.split()
    ug = ug_text.split()

    if wp == morphs:
        count_wp += 1
    if bpe == morphs:
        count_bpe += 1
    if ug == morphs:
        count_ug += 1

    # Collect the lines in a list and join once at the end instead of
    # growing a string with repeated concatenation.
    report_lines.append("GOLD: " + " ".join(morphs) + "\n")
    report_lines.append("Wordpiece: " + wp_text + "\n")
    report_lines.append("BPE: " + bpe_text + "\n")
    report_lines.append("Unigram: " + ug_text + "\n")
    report_lines.append("------------------------------------------\n")

report = "".join(report_lines)

print("\n")
print("------------------------------------------")
print("RESULTS:")
print("------------------------------------------")
print("Wordpiece:", count_wp)
print("BPE:", count_bpe)
print("Unigram:", count_ug)
print("------------------------------------------")
print("\n\n")
print(report)




------------------------------------------
RESULTS:
------------------------------------------
Wordpiece: 4
BPE: 3
Unigram: 2
------------------------------------------



GOLD: assist ant
Wordpiece: assistant
BPE: assistant
Unigram: assistant
------------------------------------------
GOLD: assist ant s
Wordpiece: assistants
BPE: assistants
Unigram: assistant s
------------------------------------------
GOLD: assist
Wordpiece: assist
BPE: assist
Unigram: assist
------------------------------------------
GOLD: assist ed
Wordpiece: assisted
BPE: assisted
Unigram: assiste d
------------------------------------------
GOLD: assist ing
Wordpiece: assisting
BPE: assisting
Unigram: assisting
------------------------------------------
GOLD: assist ance
Wordpiece: assistance
BPE: assistance
Unigram: a s sistance
------------------------------------------
GOLD: assist ive
Wordpiece: assist ive
BPE: assis tive
Unigram: assist i ve
------------------------------------------
GOLD: assist ful
Word