In [1]:
# ord() maps a single character to its integer Unicode code point.
ord('牛')

29275

In [2]:
# chr() is the inverse of ord(): it turns a code point back into its character.
chr(29275)

'牛'

In [2]:
# Encode a short ASCII sentence as UTF-8 and show the resulting bytes object.
test_string = "hello! this is is a test"
utf8_encoded = bytes(test_string, "utf-8")
print(utf8_encoded)

b'hello! this is is a test'


In [3]:
# Show the container type, then the individual byte values.
# Iterating a bytes object already yields ints, so list() is enough.
print(type(utf8_encoded))
print(list(utf8_encoded))

<class 'bytes'>
[104, 101, 108, 108, 111, 33, 32, 116, 104, 105, 115, 32, 105, 115, 32, 105, 115, 32, 97, 32, 116, 101, 115, 116]


In [4]:
from collections import defaultdict
import regex as re

class BPE:
    """Byte-level byte-pair-encoding (BPE) tokenizer trained on an in-memory corpus.

    Token IDs 0-255 are the raw byte values. The next ``len(special_tokens)``
    IDs are reserved for the special tokens, in the order given. Every merge
    learned by :meth:`merge` receives the next free ID after those.

    Special tokens are atomic: they are never split into bytes by
    :meth:`encode`, and the training corpus is cut at every special token so
    that no learned merge can span one.
    """

    def __init__(self, corpus: str, vocab_size: int, special_tokens: list[str]) -> None:
        """Set up the base vocabulary and pre-tokenize ``corpus``.

        Args:
            corpus: Training text.
            vocab_size: Target total vocabulary size (bytes + special tokens +
                merges). :meth:`merge` stops once the vocabulary reaches it.
            special_tokens: Strings that must always map to a single token ID.
        """
        self.corpus: str = corpus
        self.vocab_size: int = vocab_size
        self.special_tokens: list[str] = special_tokens

        # Pre-tokenization pattern. The \p{L} / \p{N} classes need the
        # third-party `regex` module (imported as `re` in this notebook).
        self.PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

        # token id -> the bytes it stands for
        self.vocab: dict[int, bytes] = {i: bytes([i]) for i in range(256)}
        # merged token id -> the (left, right) token ids it was built from
        self.merge_sets: dict[int, tuple[int, int]] = {}

        self.special_token_to_id: dict[str, int] = {}

        # Add special tokens to vocab
        for i, token in enumerate(special_tokens):
            token_id = 256 + i  # reserve IDs above byte values
            self.vocab[token_id] = token.encode("utf-8")
            self.special_token_to_id[token] = token_id

        # First ID not yet in use; kept in sync by update_pre_token_table().
        self.next_token_id = 256 + len(special_tokens)

        # pre-token (as a tuple of token ids) -> number of occurrences in corpus
        self.pretoken_table_count: dict[tuple[int, ...], int] = self.get_pre_token_freq_table()

    def _split_on_special_tokens(self, text: str) -> list[tuple[str, bool]]:
        """Cut ``text`` into ``(segment, is_special)`` pieces, in order.

        Empty segments are dropped. Longer special tokens are tried first so
        that a token which is a prefix of another cannot shadow it.
        """
        specials = sorted((tok for tok in self.special_token_to_id if tok), key=len, reverse=True)
        if not specials:
            return [(text, False)]

        pattern = "(" + "|".join(re.escape(tok) for tok in specials) + ")"
        # With a single capturing group, split() alternates between ordinary
        # text (even positions) and the matched special token (odd positions).
        pieces = re.split(pattern, text)
        return [(piece, idx % 2 == 1) for idx, piece in enumerate(pieces) if piece]

    @staticmethod
    def _merge_pair(sequence: "list[int] | tuple[int, ...]", pair: tuple[int, int], new_token_id: int) -> list[int]:
        """Return ``sequence`` with every left-to-right, non-overlapping ``pair`` replaced by ``new_token_id``."""
        first, second = pair
        merged: list[int] = []
        last = len(sequence) - 1
        i = 0
        while i <= last:
            if i < last and sequence[i] == first and sequence[i + 1] == second:
                merged.append(new_token_id)
                i += 2
            else:
                merged.append(sequence[i])
                i += 1
        return merged

    def get_pretoken(self) -> list[str]:
        """Return the corpus pre-tokens in order, skipping special tokens."""
        pre_tokens: list[str] = []
        for segment, is_special in self._split_on_special_tokens(self.corpus):
            if not is_special:
                pre_tokens.extend(re.findall(self.PAT, segment))
        return pre_tokens

    def get_pre_token_freq_table(self) -> dict[tuple[int, ...], int]:
        """Count how often each pre-token (as a tuple of UTF-8 byte values) occurs in the corpus."""
        freq_pre_tokens: dict[tuple[int, ...], int] = defaultdict(int)
        for pre_token in self.get_pretoken():
            pretoken_stream = tuple(pre_token.encode("utf-8"))
            freq_pre_tokens[pretoken_stream] += 1
        return freq_pre_tokens

    def get_token_freq_pairs(self) -> tuple[dict[tuple[int, int], int], dict[tuple[int, int], list[tuple[int, ...]]]]:
        """Count adjacent token pairs over the current pre-token table.

        Returns:
            ``(pair_counts, pair_index)``. ``pair_counts`` maps each adjacent
            pair to its total frequency, weighted by pre-token count.
            ``pair_index`` maps each pair to the pre-tokens containing it,
            each listed once.
        """
        token_pairs: dict[tuple[int, int], int] = defaultdict(int)
        index_table: dict[tuple[int, int], list[tuple[int, ...]]] = defaultdict(list)

        for pretoken, freq in self.pretoken_table_count.items():
            for pair in zip(pretoken, pretoken[1:]):
                token_pairs[pair] += freq
                holders = index_table[pair]
                # A pair can occur several times inside one pre-token; those
                # hits are consecutive, so checking the tail avoids duplicates.
                if not holders or holders[-1] is not pretoken:
                    holders.append(pretoken)

        return token_pairs, index_table

    def update_pre_token_table(self, most_frequent: tuple[int, int], pretokens_appeared: list[tuple[int, ...]]) -> None:
        """Register a new merged token and rewrite the pre-tokens that contain it.

        Args:
            most_frequent: The ``(left, right)`` token-id pair to merge.
            pretokens_appeared: Pre-tokens (keys of ``pretoken_table_count``)
                that contain the pair. Duplicates and stale keys are ignored.
        """
        token1, token2 = most_frequent
        new_token_id = max(self.vocab.keys()) + 1
        new_token_bytes = self.vocab[token1] + self.vocab[token2]

        self.vocab[new_token_id] = new_token_bytes
        self.merge_sets[new_token_id] = (token1, token2)
        self.next_token_id = new_token_id + 1

        table = self.pretoken_table_count
        for pretoken in pretokens_appeared:
            if pretoken not in table:
                # Already rewritten earlier in this call (duplicate entry).
                continue
            freq = table.pop(pretoken)
            rewritten = tuple(self._merge_pair(pretoken, most_frequent, new_token_id))
            # .get() keeps this working whether or not the table is a defaultdict.
            table[rewritten] = table.get(rewritten, 0) + freq

    def merge(self) -> None:
        """Learn merges until the vocabulary reaches ``vocab_size`` or no pairs remain.

        Each round merges the most frequent adjacent pair; ties go to the pair
        encountered first while scanning the pair-frequency table.
        """
        while len(self.vocab) < self.vocab_size:
            pair_freq, pair_index = self.get_token_freq_pairs()

            if not pair_freq:
                break

            most_frequent = max(pair_freq, key=pair_freq.get)
            self.update_pre_token_table(most_frequent, pair_index[most_frequent])

    def encode(self, text: str) -> list[int]:
        """
        Tokenizes new input text using the trained merge rules.

        Special tokens found in ``text`` are emitted as their reserved ID;
        everything else is pre-tokenized, UTF-8 encoded and merged.
        """
        encoded_tokens: list[int] = []

        for segment, is_special in self._split_on_special_tokens(text):
            if is_special:
                encoded_tokens.append(self.special_token_to_id[segment])
                continue

            for token in re.findall(self.PAT, segment):
                byte_sequence = list(token.encode("utf-8"))
                encoded_tokens.extend(self.apply_merges(byte_sequence))

        return encoded_tokens

    def apply_merges(self, byte_sequence: list[int]) -> list[int]:
        """
        Applies learned BPE merges to a sequence of byte-level token IDs.

        Merges are applied in ascending token-ID order, i.e. the order in which
        they were learned. The input list itself may be returned when nothing
        can be merged.
        """
        for token_id, pair in sorted(self.merge_sets.items()):
            if len(byte_sequence) < 2:
                # Nothing left that could form a pair.
                break
            byte_sequence = self._merge_pair(byte_sequence, pair, token_id)

        return byte_sequence

    def decode(self, tokens: list[int]) -> str:
        """
        Converts token IDs back into a string using the vocab mapping.

        Invalid UTF-8 in the concatenated bytes is replaced rather than raised.
        Raises ``KeyError`` for a token ID that is not in the vocabulary.
        """
        byte_stream = b''.join(self.vocab[token] for token in tokens)
        return byte_stream.decode("utf-8", errors="replace")



In [5]:

# Train a small tokenizer (256 byte tokens + up to 44 merges, no special tokens)
# on one paragraph, then check that unseen text survives encode -> decode.
# The literals are plain strings: they contain no placeholders, so no f-prefix.
corpus = "LOUISVILLE, Ky. — A few unflattering reviews are to be expected with any hotel. The lobby of Hotel Louisville Pat McDonogh for Al Jazeera America Every homeless shelter has a NIMBY problem. Try building a new facility or renovating an old one and the neighbors come out of the woodwork to protest each additional bed. But the battle waged against Hotel Louisville was unusual even in the long history of Wayside Christian Mission, founded in 1957. The saga began six years ago, after the group finally raised enough money to replace its worn-out transitional-housing facility for women and kids. Initially, the married couple at Wayside’s helm — Tim Moseley, a bearded, heavyset minister, and his wife, Nina, an attorney with waist-length platinum blonde hair — intended to build on property it already owned along gentrifying Market Street. Real-estate developers with city-hall ties killed the plan, claiming the need for"
bpe = BPE(corpus, vocab_size=300, special_tokens=[])
bpe.merge()
new_text = "Then, in early 2009, the Moseleys heard that the downtown Holiday Inn, nicknamed “Hotel Louisville,” would be sold at a foreclosure auction. The final price tag of $10 million depleted all the funds Wayside had raised through its years-long capital campaign and proceeds from the Market Street sale, but at 187 rooms and 169,400 square feet, the building could house hundreds. Eighty-three homeless women moved into the hotel in November. Shortly thereafter, with utility costs mounting and many floors vacant, the Moseleys saw an opportunity. “People kept coming through and asking for a room,” Nina Moseley recalled. So Wayside opened Hotel Louisville to the public while continuing to provide shelter and substance-abuse recovery services to women in need, free of charge."
encoded = bpe.encode(new_text)
decoded = bpe.decode(encoded)
print(f"is the new text the same thing as the decoded? {new_text == decoded}")

is the new text the same thing as the decoded? True


In [1]:
from multiprocessing import Pool

def square(x):
    """Return ``x`` multiplied by itself."""
    squared = x * x
    return squared

# Square 1..4 on a pool of four worker processes and show the collected list.
with Pool(processes=4) as pool:
    result = pool.map(square, range(1, 5))
print(result)


[1, 4, 9, 16]


In [1]:
import threading

def task(n):
    """Print a one-line message announcing that task ``n`` is running."""
    message = f"Task {n} running"
    print(message)

# Create and start five threads, one per task index.
threads = []
for i in range(5):
    threads.append(threading.Thread(target=task, args=(i,)))
    threads[-1].start()

# Wait for every thread to finish before the cell ends.
for t in threads:
    t.join()


Task 0 running
Task 1 running
Task 2 running
Task 3 running
Task 4 running


In [1]:
from multiprocessing import Process, cpu_count

def task(n):
    """Print a one-line message announcing that process ``n`` is running."""
    message = f"Process {n} running"
    print(message)

# Create and start five child processes, one per task index.
processes = []
for i in range(5):
    processes.append(Process(target=task, args=(i,)))
    processes[-1].start()

# Block until every child process has exited.
for p in processes:
    p.join()

print(f"the number of CPU in your computer is: {cpu_count()}")

Process 0 running
Process 1 running
Process 2 running
Process 3 running
Process 4 running
the number of CPU in your computer is: 8


In [4]:
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def task(n):
    """Return the completion message for task ``n``."""
    message = f"Task {n} done"
    return message

# Run the same five tasks twice: first on a thread pool (concurrency), then on
# a process pool (parallelism). Each pass prints the collected results.
for executor_cls in (ThreadPoolExecutor, ProcessPoolExecutor):
    with executor_cls() as executor:
        results = executor.map(task, range(5))
        print(list(results))


['Task 0 done', 'Task 1 done', 'Task 2 done', 'Task 3 done', 'Task 4 done']
['Task 0 done', 'Task 1 done', 'Task 2 done', 'Task 3 done', 'Task 4 done']
