In [3]:
import torch
import nltk
from nltk.tokenize import word_tokenize
import subword_nmt
import os

# Word Tokenization

In [9]:
# Download NLTK tokenization data
# nltk.download('punkt')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [4]:
print(os.listdir(r'C:\Users\Hi\AppData\Roaming\nltk_data\tokenizers'))
nltk.data.path

['punkt', 'punkt.zip']


['C:\\Users\\Hi/nltk_data',
 'C:\\Users\\Hi\\miniconda3\\envs\\py_3.11\\nltk_data',
 'C:\\Users\\Hi\\miniconda3\\envs\\py_3.11\\share\\nltk_data',
 'C:\\Users\\Hi\\miniconda3\\envs\\py_3.11\\lib\\nltk_data',
 'C:\\Users\\Hi\\AppData\\Roaming\\nltk_data',
 'C:\\nltk_data',
 'D:\\nltk_data',
 'E:\\nltk_data']

In [5]:
text = "I love tokenization in NLP."

# Word-based tokenization
tokens = word_tokenize(text, preserve_line=True)
print(f"Tokens: {tokens}")

ytb_title = "Alexander Rybak - Fairytale (LIVE) | Norway 🇳🇴 | Grand Final | Winner of Eurovision 2009"
word_tokenize(ytb_title, preserve_line=True)

Tokens: ['I', 'love', 'tokenization', 'in', 'NLP', '.']


['Alexander',
 'Rybak',
 '-',
 'Fairytale',
 '(',
 'LIVE',
 ')',
 '|',
 'Norway',
 '🇳🇴',
 '|',
 'Grand',
 'Final',
 '|',
 'Winner',
 'of',
 'Eurovision',
 '2009']

In [6]:
chars = list(text)
print(f"Characters: {chars}")

# Create a vocabulary for characters
vocab_char = {char: i for i, char in enumerate(set(chars))}  # Character vocabulary
print(f"Character Vocabulary: {vocab_char}")

# Convert characters to indices
indices_char = torch.tensor([vocab_char[char] for char in chars], dtype=torch.long)
print(f"Character Indices: {indices_char}")

Characters: ['I', ' ', 'l', 'o', 'v', 'e', ' ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n', ' ', 'i', 'n', ' ', 'N', 'L', 'P', '.']
Character Vocabulary: {'n': 0, 'I': 1, 'N': 2, 'e': 3, 'v': 4, '.': 5, ' ': 6, 'L': 7, 'l': 8, 'k': 9, 'i': 10, 'P': 11, 'a': 12, 'o': 13, 'z': 14, 't': 15}
Character Indices: tensor([ 1,  6,  8, 13,  4,  3,  6, 15, 13,  9,  3,  0, 10, 14, 12, 15, 10, 13,
         0,  6, 10,  0,  6,  2,  7, 11,  5])


# Subword Tokenization

In [8]:
!pip install sentencepiece subword_nmt 

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   ---------- ----------------------------- 262.1/991.5 kB ? eta -:--:--
   ---------- ----------------------------- 262.1/991.5 kB ? eta -:--:--
   ---------- ----------------------------- 262.1/991.5 kB ? eta -:--:--
   ---------- ----------------------------- 262.1/991.5 kB ? eta -:--:--
   ---------- ---------------------------

## Byte Pair Encoding (BPE)

In [13]:
import sentencepiece as spm

def train_bpe(corpus_path, vocab_size=5000, model_prefix="bpe_model"):
    """
    Train a BPE tokenizer using SentencePiece.
    
    Args:
    - corpus_path (str): Path to the text corpus for training.
    - vocab_size (int): Size of the vocabulary.
    - model_prefix (str): Prefix for saving the model.
    
    Returns:
    - None
    """
    spm.SentencePieceTrainer.train(
        f"--input={corpus_path} --model_prefix={model_prefix} --vocab_size={vocab_size} --model_type=bpe"
    )

def apply_bpe(corpus_path, model_path, output_path):
    """
    Apply the trained BPE model to the corpus.
    
    Args:
    - corpus_path (str): Path to the text corpus.
    - model_path (str): Path to the trained BPE model.
    - output_path (str): Output path for the tokenized text.
    
    Returns:
    - None
    """
    sp = spm.SentencePieceProcessor()
    sp.load(model_path)
    
    with open(corpus_path, 'r') as f_in, open(output_path, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            tokenized = sp.encode_as_pieces(line.strip())  # Tokenizes the line using BPE
            f_out.write(' '.join(tokenized) + '\n')


In [11]:
train_bpe('data.txt', vocab_size=465, model_prefix="bpe_model")

In [15]:
apply_bpe('data.txt', 'bpe_model.model', 'Byte Pair Encoding.txt')

## Unigram Language Modeling Tokenization (UnigramLM)

In [21]:
def train_unigram(corpus_path, vocab_size=5000, model_prefix="unigram_model"):
    """
    Train a UnigramLM tokenizer using SentencePiece.
    
    Args:
    - corpus_path (str): Path to the text corpus for training.
    - vocab_size (int): Size of the vocabulary.
    - model_prefix (str): Prefix for saving the model.
    
    Returns:
    - None
    """
    spm.SentencePieceTrainer.train(
        f"--input={corpus_path} --model_prefix={model_prefix} --vocab_size={vocab_size} --model_type=unigram"
    )

def apply_unigram(corpus_path, model_path, output_path):
    """
    Apply the trained Unigram LM model to the corpus.
    
    Args:
    - corpus_path (str): Path to the text corpus.
    - model_path (str): Path to the trained Unigram LM model.
    - output_path (str): Output path for the tokenized text.
    
    Returns:
    - None
    """
    sp = spm.SentencePieceProcessor()
    sp.load(model_path)
    
    with open(corpus_path, 'r') as f_in, open(output_path, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            tokenized = sp.encode_as_pieces(line.strip())  # Tokenizes the line using Unigram LM
            f_out.write(' '.join(tokenized) + '\n')


In [18]:
train_unigram('data.txt', vocab_size=94, model_prefix="unigram_model")

In [22]:
apply_unigram('data.txt', 'unigram_model.model', 'Unigram Language Modeling Tokenization.txt')

## WordPiece Tokenization

In [32]:
# !pip install tokenizers

In [30]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import NFD, StripAccents, Lowercase

def train_wordpiece(corpus_path, vocab_size=5000, model_prefix="wordpiece_model"):
    # Initialize the WordPiece model
    tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    tokenizer.normalizer = NFD()
    tokenizer.pre_tokenizer = Whitespace()

    # Define the trainer
    trainer = WordPieceTrainer(vocab_size=vocab_size, min_frequency=2, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

    # Train the model
    tokenizer.train([corpus_path], trainer)

    # Save the trained model
    tokenizer.save(f"{model_prefix}.json")

def apply_wordpiece(corpus_path, model_path, output_path):
    # Load the trained model
    tokenizer = Tokenizer.from_file(model_path)

    with open(corpus_path, 'r', encoding='utf-8') as f_in, open(output_path, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            tokenized = tokenizer.encode(line.strip())  # Tokenizes the line using WordPiece
            f_out.write(' '.join(tokenized.tokens) + '\n')


In [31]:
train_wordpiece('data.txt', vocab_size=5000, model_prefix="wordpiece_model")

In [34]:
apply_wordpiece('data.txt', 'wordpiece_model.json', 'WordPiece Tokenization.txt')

##  Byte-Level BPE (BBPE)

In [39]:
def train_bbpe(corpus_path, vocab_size=5000, model_prefix="bbpe_model"):
    """
    Train a Byte-level BPE tokenizer using SentencePiece.
    
    Args:
    - corpus_path (str): Path to the text corpus for training.
    - vocab_size (int): Size of the vocabulary.
    - model_prefix (str): Prefix for saving the model.
    
    Returns:
    - None
    """
    spm.SentencePieceTrainer.train(
        f"--input={corpus_path} --model_prefix={model_prefix} --vocab_size={vocab_size} --model_type=bpe --character_coverage=1.0"
    )

def apply_bbpe(corpus_path, model_path, output_path):
    """
    Apply the trained BBPE model to the corpus.
    
    Args:
    - corpus_path (str): Path to the text corpus.
    - model_path (str): Path to the trained BBPE model.
    - output_path (str): Output path for the tokenized text.
    
    Returns:
    - None
    """
    sp = spm.SentencePieceProcessor()
    sp.load(model_path)
    
    with open(corpus_path, 'r') as f_in, open(output_path, 'w',encoding='utf-8') as f_out:
        for line in f_in:
            tokenized = sp.encode_as_pieces(line.strip())  # Tokenizes the line using Byte-level BPE
            f_out.write(' '.join(tokenized) + '\n')


In [37]:
train_bbpe('data.txt', vocab_size=465, model_prefix="bbpe_model")

In [40]:
apply_bbpe('data.txt', 'bbpe_model.model', 'Byte-Level BPE (BBPE).txt')