# Exercise 3

In [None]:
import os
import numpy as np
import tqdm
import torch
from sklearn.datasets import load_svmlight_file
import re
import glob
import collections
import string
from typing import List, Dict

In [None]:
def sorter(item):
    """Sort key for the review files: the integer that precedes the first '_' in the file name."""
    file_name = os.path.basename(item)
    leading_id, _, _ = file_name.partition('_')
    return int(leading_id)

def read_raw_text(path_data):
    """ Function for reading the raw data in the .txt files.

    Files are collected from ``./aclImdb/<path_data>/pos`` first and then from
    ``./aclImdb/<path_data>/neg``. Inside each folder they are visited in
    ascending order of the number that starts the file name.

    Parameters
    ----------
    path_data: str
        name of the folder inside ``aclImdb`` that contains the data that is going to be used. (should be test or train)

    Returns
    ---------
    data: list of str
        complete text of every file that was read
    scores: list of int
        number found after the first '_' of each file name (``<id>_<score>.txt``), aligned with ``data``
    """

    data = []
    scores = []

    for sentiment in ('pos', 'neg'):
        pattern = os.path.join(".", "aclImdb", path_data, sentiment, "*.txt")

        for filename in sorted(glob.glob(pattern), key=sorter):
            with open(filename, encoding='utf8') as f:
                data.append(f.read())

            # splitext drops exactly the extension; str.strip('.txt') would
            # instead strip any of the characters '.', 't', 'x' from both ends
            stem, _ = os.path.splitext(os.path.basename(filename))
            scores.append(int(stem.split('_')[1]))

    return data, scores


def read_vocab():
    """ Function for reading the vocabulary file ``./aclImdb/imdb.initial_vocab``.

    Parameters
    ----------
    None

    Returns
    ---------
    vocab: list
        one entry per line of the file, in file order. The content is split on
        '\\n', so a file that ends with a newline yields a final empty string.
    """
    vocab_file = os.path.join(".", "aclImdb", "imdb.initial_vocab")

    with open(vocab_file, encoding='utf-8') as handle:
        content = handle.read()

    return content.split('\n')

## Task 1: Pre-Processing

In [None]:
# copy your pre-processing pipeline code from the previous exercise here
def pre_process(
    reviews,
    tokenize_punct=False,
    lowercase=False,
    remove_punct=False,
    remove_high_freq_terms=False,
    high_freq_threshold=0.5,
    replace_numbers=False,
    remove_stopwords=False
):
    """ Tokenize the reviews and apply the selected clean-up steps.

    The steps always run in the order in which the parameters are listed.

    Parameters
    ----------
    reviews: iterable of str
        raw review texts
    tokenize_punct: bool
        if True, split on every run of characters other than letters, digits,
        ``'`` and ``-``; if False, split on whitespace
    lowercase: bool
        lowercase every token
    remove_punct: bool
        delete all ``string.punctuation`` characters from every token
    remove_high_freq_terms: bool
        drop the most frequent distinct tokens
    high_freq_threshold: float
        fraction of the distinct tokens (ranked by count) that is dropped when
        ``remove_high_freq_terms`` is set
    replace_numbers: bool
        replace every single digit by ``<NUM>`` and drop tokens containing ``_``
    remove_stopwords: bool
        drop tokens whose lowercase form is in ``stopwords.words('english')``

    Returns
    ---------
    tokens: list of list of str
        one token list per review; empty-string tokens are never returned from
        the tokenization and punctuation-removal steps
    """
    if tokenize_punct:
        split_reviews = [re.split(r"[^a-zA-Z0-9'-]+", review) for review in reviews]
    else:
        # whitespace tokens as the default, so that `tokens` is always defined
        split_reviews = [review.split() for review in reviews]
    # re.split yields '' when a review starts or ends with a separator
    tokens = [[word for word in words if word] for words in split_reviews]

    if lowercase:
        tokens = [[word.lower() for word in words] for words in tokens]

    if remove_punct:
        punct_table = str.maketrans('', '', string.punctuation)
        stripped = ([word.translate(punct_table) for word in words] for words in tokens)
        # tokens made only of punctuation become '' and are discarded
        tokens = [[word for word in words if word] for words in stripped]

    if remove_high_freq_terms:
        word_counts = collections.Counter(word for words in tokens for word in words)
        # the threshold is a share of the distinct tokens, not of the occurrences
        num_top_words = int(len(word_counts) * high_freq_threshold)
        high_freq_words = {word for word, _ in word_counts.most_common()[:num_top_words]}

        tokens = [[word for word in words if word not in high_freq_words] for words in tokens]

    if replace_numbers:
        # one "<NUM>" per digit; tokens containing '_' are filtered out here as well
        tokens = [[re.sub(r'\d', "<NUM>", word) for word in words if "_" not in word] for words in tokens]

    if remove_stopwords:
        # NOTE(review): `stopwords` is not imported in the visible part of this
        # file; presumably nltk.corpus.stopwords — confirm before enabling this flag
        stop_words = set(stopwords.words('english'))
        tokens = [[word for word in words if word.lower() not in stop_words] for words in tokens]
    return tokens

In [None]:
# load the raw data: texts and scores from the 'train' folder and from the 'test' folder
data, scores = read_raw_text('train')
data_test, scores_test = read_raw_text('test')

In [None]:
# apply the pre-processing pipeline to both splits with identical settings:
# lowercasing and punctuation removal are switched on, every other option keeps its default
pre_processed_data = pre_process(data, lowercase=True, remove_punct=True)
pre_processed_data_test = pre_process(data_test, lowercase=True, remove_punct=True)

## Task 2: Byte-Pair Tokenizer

You can refer to this [Hugging Face tutorial](https://huggingface.co/learn/llm-course/en/chapter6/5) for a detailed explanation of the BPE algorithm.

In [None]:
class BPETokenizer:
    # Skeleton of a byte-pair-encoding tokenizer: the constructor stores its
    # settings, `train` and `tokenize` are still to be implemented.
    # NOTE(review): later cells of this notebook call `tokenize_word(...)` and
    # `vocab.items()`; neither is provided by this skeleton yet.
    def __init__(self, base_vocab: List[str], num_merges: int = 1000):
        # initial symbols of the vocabulary (one cell below passes a list of single characters)
        self.base_vocab = base_vocab
        # NOTE(review): presumably the number of merge iterations performed by train() — confirm
        self.num_merges = num_merges
        # stays None until it is assigned elsewhere
        self.vocab = None
        
    # Implementation notes for train():
    #   - the corpus needs the frequency of each word
    #   - tokenize on characters first and start populating the vocabulary with
    #     the character-level tokens plus the frequency of each character
    #   - then iterate over the initial vocabulary to compute the frequency of adjacent characters
    #   - the iteration should stay within the range given by num_merges

    def train(self, texts: List[str]):
        pass # todo

    def tokenize(self, text: str) -> List[List[str]]:
        pass # todo


### 2 (a): Base Vocabulary = Characters from Reviews

In [None]:
# Extract all unique characters from the data
# (exercise placeholder: the assignment below has no right-hand side yet, so this cell does not parse as is)
unique_chars = # todo

# Train the tokenizer on the pre-processed training reviews,
# with those characters as base vocabulary and 1000 merges
bpe_char_based = BPETokenizer(base_vocab=unique_chars, num_merges=1000)
bpe_char_based.train(pre_processed_data)

### 2 (b): Base Vocabulary = ASCII Characters

In [None]:
# string.printable contains the ASCII digits, letters, punctuation and whitespace characters
ascii_chars = list(string.printable)

# same training data and number of merges as in 2 (a); only the base vocabulary differs
bpe_ascii_based = BPETokenizer(base_vocab=ascii_chars, num_merges=1000)
bpe_ascii_based.train(pre_processed_data)

In [None]:
# Analyze the two vocabularies: print the first 10 entries of each one
# (`.items()` requires `vocab` to be a mapping, e.g. a dict, by the time this cell runs)
print("Char-based initial_vocab sample:")
print(list(bpe_char_based.vocab.items())[:10])

print("ASCII-based initial_vocab sample:")
print(list(bpe_ascii_based.vocab.items())[:10])

In [None]:
# Compare Tokenization of a New/Unknown Word
# NOTE(review): the BPETokenizer skeleton in this notebook declares `train` and
# `tokenize` only; `tokenize_word` has to be added with the implementation — confirm
unknown_word = "backpropagationlessness"

print("Char-based tokenization:", bpe_char_based.tokenize_word(unknown_word))
print("ASCII-based tokenization:", bpe_ascii_based.tokenize_word(unknown_word))

In [None]:
# more comparison ...

## Task 3: WordPiece Tokenizer

You can refer to this [Hugging Face tutorial](https://huggingface.co/learn/llm-course/en/chapter6/6) for a detailed explanation of the WordPiece algorithm.

In [None]:
import collections
import re

class WordPieceTokenizer:
    # Skeleton of a WordPiece tokenizer: the constructor stores its settings,
    # `train` and `tokenize` are still to be implemented.
    def __init__(self, vocab_size: int = 1000, initial_vocab=None):
        # NOTE(review): presumably the target size of the trained vocabulary — confirm
        self.vocab_size = vocab_size
        # kept as a set (duplicates dropped); None or an empty collection gives an empty set
        self.initial_vocab = set(initial_vocab) if initial_vocab else set()
        # starts as an empty dict
        self.vocab = {}
        

    def train(self, texts: List[str]):
        pass # todo

    def tokenize(self, text: str) -> List[List[str]]:
        pass # todo


### 3 (a): Base Vocabulary = Characters from Reviews

In [None]:
# exercise placeholder: the assignment below has no right-hand side yet, so this cell does not parse as is
corpus_chars = # todo
tokenizer_a = WordPieceTokenizer(vocab_size=1000, initial_vocab=corpus_chars)
# NOTE(review): the result of `train` is sorted and printed below, so `train`
# is expected to return an iterable vocabulary — confirm once it is implemented
vocab_a = tokenizer_a.train(pre_processed_data)
print(f"Vocabulary A (corpus chars): {sorted(vocab_a)}")

### 3 (b): Base Vocabulary = Characters from Reviews + ASCII Characters

In [None]:
# string.printable contains the ASCII digits, letters, punctuation and whitespace characters
ascii_chars = set(string.printable)
# `.union` requires corpus_chars to be a set (or frozenset); the merged characters are passed as a sorted list
initial_vocab = sorted(corpus_chars.union(ascii_chars))
tokenizer_b = WordPieceTokenizer(vocab_size=1000, initial_vocab=initial_vocab)
vocab_b = tokenizer_b.train(pre_processed_data)

In [None]:
# Compare Tokenization of a New/Unknown Word
# NOTE(review): the WordPieceTokenizer skeleton in this notebook declares `train`
# and `tokenize` only; `tokenize_word` has to be added with the implementation — confirm
unknown_word = "backpropagationlessness"
print("WordPiece A tokenization:", tokenizer_a.tokenize_word(unknown_word))
print("WordPiece B tokenization:", tokenizer_b.tokenize_word(unknown_word))

In [None]:
# more comparison ...

## Task 4: Hugging Face Implementations

In [None]:
# Hugging Face Byte-Pair Encoder (BPE)
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace


In [None]:
# Hugging Face WordPiece Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer


In [None]:
# compare the different tokenizers