In [21]:
# Negate the priority and wrap the item in a reverse-ordering comparator, so that ties on priority are broken by the lexicographically largest item
import heapq

class Rev:
    """Wrapper that reverses the ordering of the value it holds.

    ``Rev(a) < Rev(b)`` is true exactly when ``a > b``, so a min-heap of
    ``Rev`` objects yields the largest wrapped value first. Equality is the
    equality of the wrapped values.
    """

    def __init__(self, val):
        self.val = val

    def __lt__(self, other):
        # Only another Rev has a comparable ``val``; for anything else let
        # Python fall back to its default handling (a TypeError for ``<``).
        if not isinstance(other, Rev):
            return NotImplemented
        return self.val > other.val   # invert the natural order

    def __eq__(self, other):
        # A Rev is never equal to a non-Rev; NotImplemented makes ``==``
        # evaluate to False instead of failing on a missing ``val``.
        if not isinstance(other, Rev):
            return NotImplemented
        return self.val == other.val

    def __repr__(self):
        # Show the wrapped value itself so displayed heap entries stay readable.
        return repr(self.val)

pq = []
def push(pq, priority, item):
    """Insert `item` into the heap list `pq` with the given `priority`.

    The stored entry is ``(-priority, Rev(item))``: negating the priority makes
    the min-heap surface the highest priority first, and wrapping the item in
    ``Rev`` makes ties on priority resolve towards the largest item.
    """
    entry = (-priority, Rev(item))
    heapq.heappush(pq, entry)
def pop(pq):
    """Remove the smallest entry from the heap list `pq` and return its item.

    Entries are 2-tuples whose second element exposes the original item as
    ``.val``; the first element (the sort key) is discarded.
    """
    _key, wrapped = heapq.heappop(pq)
    item = wrapped.val
    return item

# Two entries with the same priority, so the order is decided by the item alone.
push(pq, priority=4, item=(b"aa", b"a"))
push(pq, priority=4, item=(b"a", b"ab"))
# First value: the item popped first (the lexicographically largest tuple).
# Second value: the plain tuple comparison, shown for reference.
pop(pq), (b"aa", b"a") < (b"a", b"ab")


((b'aa', b'a'), False)

In [8]:
# Tuples compare element by element, and "aa" > "a" because a string is greater than its own proper prefix.
max(("aa", "a"), ("a", "ab")), "aa" > "a"

(('aa', 'a'), True)

In [1]:
import threading
import time

threads = []

def worker(i, delay=5):
    """Sleep for `delay` seconds, then print `i`.

    Args:
        i: value printed once the sleep has finished.
        delay: number of seconds to sleep before printing. Defaults to 5,
            which is the pause used when only `i` is passed.
    """
    time.sleep(delay)
    print(i)

# Start ten worker threads, keeping a handle on each one so it can be joined.
for idx in range(10):
    thread = threading.Thread(target=worker, args=(idx,))
    thread.start()
    threads.append(thread)

# Wait until every started thread has finished.
for thread in threads:
    thread.join()

0
2
1
3
4
5
6
7
8
9


In [7]:
from concurrent.futures import ThreadPoolExecutor
from itertools import chain

def worker(i):
    """Return a new one-element list containing `i`."""
    wrapped = []
    wrapped.append(i)
    return wrapped

# map() yields results in input order; each result is a list, so the lists are
# flattened into a single list while the pool is still open.
with ThreadPoolExecutor(max_workers=4) as pool:
    combined = list(chain.from_iterable(pool.map(worker, range(10))))

print(combined)


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [14]:
def gpt2_bytes_to_unicode():
    """Map every byte value 0..255 to the UTF-8 encoding of a printable character.

    Byte values in '!'..'~', 0xA1..0xAC and 0xAE..0xFF map to the character
    with the same code point. The remaining 68 byte values are assigned
    chr(256 + n), where n counts those values in increasing byte order.

    Returns:
        dict: 256 entries, int byte value -> bytes (UTF-8 encoded character).
    """
    # The Latin-1 bounds are written as numeric code points so they cannot be
    # corrupted when the file is saved or read with the wrong encoding.
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(0xA1, 0xAC + 1))
        + list(range(0xAE, 0xFF + 1))
    )
    cs = bs[:]
    # Set copy of the directly-representable bytes for O(1) membership tests.
    directly_representable = set(bs)
    # Assign the remaining 68 byte values the code points 256, 257, ... in
    # increasing byte order.
    n = 0
    for b in range(2**8):
        if b not in directly_representable:
            # This byte is not in the list of visually representable
            # characters, so map it to the next free character (offset by 256).
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    characters = [chr(code_point).encode('utf-8') for code_point in cs]
    return dict(zip(bs, characters))

# Display every (byte value, encoded character) pair, ordered by byte value.
sorted(gpt2_bytes_to_unicode().items())

[(0, b'\xc4\x80'),
 (1, b'\xc4\x81'),
 (2, b'\xc4\x82'),
 (3, b'\xc4\x83'),
 (4, b'\xc4\x84'),
 (5, b'\xc4\x85'),
 (6, b'\xc4\x86'),
 (7, b'\xc4\x87'),
 (8, b'\xc4\x88'),
 (9, b'\xc4\x89'),
 (10, b'\xc4\x8a'),
 (11, b'\xc4\x8b'),
 (12, b'\xc4\x8c'),
 (13, b'\xc4\x8d'),
 (14, b'\xc4\x8e'),
 (15, b'\xc4\x8f'),
 (16, b'\xc4\x90'),
 (17, b'\xc4\x91'),
 (18, b'\xc4\x92'),
 (19, b'\xc4\x93'),
 (20, b'\xc4\x94'),
 (21, b'\xc4\x95'),
 (22, b'\xc4\x96'),
 (23, b'\xc4\x97'),
 (24, b'\xc4\x98'),
 (25, b'\xc4\x99'),
 (26, b'\xc4\x9a'),
 (27, b'\xc4\x9b'),
 (28, b'\xc4\x9c'),
 (29, b'\xc4\x9d'),
 (30, b'\xc4\x9e'),
 (31, b'\xc4\x9f'),
 (32, b'\xc4\xa0'),
 (33, b'!'),
 (34, b'"'),
 (35, b'#'),
 (36, b'$'),
 (37, b'%'),
 (38, b'&'),
 (39, b"'"),
 (40, b'('),
 (41, b')'),
 (42, b'*'),
 (43, b'+'),
 (44, b','),
 (45, b'-'),
 (46, b'.'),
 (47, b'/'),
 (48, b'0'),
 (49, b'1'),
 (50, b'2'),
 (51, b'3'),
 (52, b'4'),
 (53, b'5'),
 (54, b'6'),
 (55, b'7'),
 (56, b'8'),
 (57, b'9'),
 (58, b':'),
 (59, b';'),

In [3]:
# One character outside the Basic Multilingual Plane: U+1F643 (upside-down face).
# Written as an escape so the literal cannot be corrupted by a wrong file encoding.
a = "\U0001F643"

# Iterating a str yields code points, so a single 4-byte UTF-8 sequence is printed.
for ch in a:
    print(ch.encode('utf-8'))

b'\xf0\x9f\x99\x83'


In [7]:
import regex as re

text = "test<|prova|>test2<|prova2|>"
# Raw strings: "\|" is not a valid string escape, so a plain literal only works
# by accident and triggers an invalid-escape warning. The pattern text is unchanged.
reg = [r"<\|prova\|>", r"<\|prova2\|>"]

# The capturing group makes split() keep the matched delimiters in the result.
result = re.split(r'(' + '|'.join(reg) + ')', text)
result, '()'.join(["ciao", "prova"])

(['test', '<|prova|>', 'test2', '<|prova2|>', ''], 'ciao()prova')

In [9]:
# bytes is not a subclass of str, so this check evaluates to False.
isinstance(b"test", str)

False