In [12]:
# Toy example of 17 per-position labels, written as runs so the segment
# boundaries are visible: the ground truth switches at indices 6 and 12,
# the prediction switches once at index 11.
true_labels = [0] * 6 + [1] * 6 + [0] * 5
pred_labels = [0] * 11 + [1] * 6

In [13]:
def find_change_points(arr):
    """Return the indices at which ``arr`` differs from its previous element.

    Args:
        arr: An indexable sequence that supports ``len()`` (e.g. a list of
            labels).

    Returns:
        A list of indices ``i`` with ``1 <= i < len(arr)``, in ascending
        order, such that ``arr[i] != arr[i - 1]``. The list is empty when the
        sequence has fewer than two elements or never changes.
    """
    return [idx for idx in range(1, len(arr)) if arr[idx] != arr[idx - 1]]

def calc_tp_fp_fn_tn(true_labels, pred_labels, tolerance=2):
    """Count change-point detection hits and misses with a position tolerance.

    Change points are extracted from both label sequences with
    ``find_change_points``. Predicted change points are visited in order and
    each one claims the first true change point that is at most ``tolerance``
    positions away and has not been claimed yet.

    Args:
        true_labels: Ground-truth label sequence.
        pred_labels: Predicted label sequence.
        tolerance: Maximum absolute index distance for a predicted change
            point to count as matching a true one.

    Returns:
        A tuple ``(tp, fp, fn, tn)`` where
        ``tp`` is the number of predicted change points that claimed a true one,
        ``fp`` is the number of predicted change points that claimed nothing,
        ``fn`` is the number of true change points left unclaimed, and
        ``tn`` is ``len(true_labels) - (tp + fp + fn)``.
    """
    true_cps = find_change_points(true_labels)
    pred_cps = find_change_points(pred_labels)

    claimed = set()        # true change points already paired with a prediction
    unmatched_preds = 0

    for pred_cp in pred_cps:
        # First still-unclaimed true change point within tolerance, if any.
        partner = next(
            (
                true_cp
                for true_cp in true_cps
                if abs(pred_cp - true_cp) <= tolerance and true_cp not in claimed
            ),
            None,
        )
        if partner is None:
            unmatched_preds += 1
        else:
            claimed.add(partner)

    # Every successful match adds exactly one new entry to `claimed`.
    tp = len(claimed)
    fp = unmatched_preds
    fn = len(true_cps) - tp

    # Everything among the len(true_labels) positions not counted above.
    tn = len(true_labels) - (tp + fp + fn)

    return tp, fp, fn, tn

def calc_precision_recall(tp, fp, fn, tn):
    """Compute precision and recall from confusion counts.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.
        tn: Number of true negatives; accepted for a uniform signature but
            not used by either metric.

    Returns:
        A tuple ``(precision, recall)`` with ``precision = tp / (tp + fp)``
        and ``recall = tp / (tp + fn)``. A metric is ``0.0`` when its
        denominator is not positive.
    """
    predicted_positives = tp + fp
    actual_positives = tp + fn

    if predicted_positives > 0:
        precision = tp / predicted_positives
    else:
        precision = 0.0

    if actual_positives > 0:
        recall = tp / actual_positives
    else:
        recall = 0.0

    return precision, recall



In [14]:
# Change points of the ground-truth sequence (indices where the label differs from the previous one).
find_change_points(true_labels)

[6, 12]

In [15]:
# Change points of the predicted sequence (indices where the label differs from the previous one).
find_change_points(pred_labels)

[11]

In [16]:
# Score the toy example: a predicted change point is a hit when it lies within
# 2 positions of a true change point that has not been matched yet.
tp, fp, fn, tn = calc_tp_fp_fn_tn(true_labels, pred_labels, tolerance=2)
print(f"TP: {tp}, FP: {fp}, FN: {fn}, TN: {tn}")

TP: 1, FP: 0, FN: 1, TN: 15


In [17]:
# Turn the confusion counts from the previous cell into precision and recall,
# printed with two decimals.
precision, recall = calc_precision_recall(tp, fp, fn, tn)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}")

Precision: 1.00, Recall: 0.50


### Calculate the tokenizer compression

In [1]:
from transformers import AutoTokenizer

In [2]:
# Names passed to AutoTokenizer.from_pretrained in a later cell.
# NOTE(review): judging by the names, the first is the stock multilingual BERT
# tokenizer and the second a continually pre-trained Tibetan variant — confirm.
mbert_tokenizer_name = "google-bert/bert-base-multilingual-cased"
cpt_mbert_tokenizer_name = "OMRIDRORI/mbert-tibetan-continual-wylie-final"

In [None]:
# Read the whole evaluation file into memory as a single UTF-8 string; its
# character count is the numerator of the compression ratio computed below.
text_path = "../dataset/tokenizer/tibetan_bert_ready_512.txt"
with open(text_path, "r", encoding="utf-8") as f:
    text = f.read()

In [None]:
# Instantiate both tokenizers from the names configured above.
mbert_tokenizer = AutoTokenizer.from_pretrained(mbert_tokenizer_name)
cpt_mbert_tokenizer = AutoTokenizer.from_pretrained(cpt_mbert_tokenizer_name)

In [None]:
# Tokenize the full text with each tokenizer; only the lengths of the results
# are used afterwards.
# NOTE(review): encode() is called with default arguments, so any special
# tokens the tokenizer adds by default are presumably included in the counts
# — confirm whether that is intended for the compression figure.
mBERT_tokens = mbert_tokenizer.encode(text)
cpt_mBERT_tokens = cpt_mbert_tokenizer.encode(text)

In [None]:
# Compression = characters of raw text per produced token (a higher value
# means fewer tokens are needed for the same text).
# The token sequences come from the encoding cell above; `tokens1`/`tokens2`
# exist only as locals inside compare_tokenizers and are not defined here.
chars = len(text)
comp1 = chars / len(mBERT_tokens)
comp2 = chars / len(cpt_mBERT_tokens)

print(f"Text length: {chars} characters\n")
print(f"{mbert_tokenizer_name}: {len(mBERT_tokens)} tokens, compression = {comp1:.2f} chars/token")
print(f"{cpt_mbert_tokenizer_name}: {len(cpt_mBERT_tokens)} tokens, compression = {comp2:.2f} chars/token")

In [None]:

def compare_tokenizers(
    text: str,
    tok1_name: str,
    tok2_name: str,
    add_special_tokens: bool = True,
) -> None:
    """Print the token count and chars-per-token ratio of two tokenizers.

    Args:
        text: The text to tokenize.
        tok1_name: Name of the first tokenizer, passed to
            ``AutoTokenizer.from_pretrained``.
        tok2_name: Name of the second tokenizer, passed to
            ``AutoTokenizer.from_pretrained``.
        add_special_tokens: Forwarded to ``encode``. The default ``True``
            keeps ``encode``'s default behaviour; pass ``False`` to count
            only tokens produced from ``text`` itself.

    Returns:
        None. The comparison is printed to stdout.
    """
    # Load tokenizers
    tok1 = AutoTokenizer.from_pretrained(tok1_name)
    tok2 = AutoTokenizer.from_pretrained(tok2_name)

    # Tokenize
    tokens1 = tok1.encode(text, add_special_tokens=add_special_tokens)
    tokens2 = tok2.encode(text, add_special_tokens=add_special_tokens)

    # Compute compression (average chars represented per token).
    # A tokenizer that yields no tokens (e.g. empty text without special
    # tokens) reports 0.0 instead of raising ZeroDivisionError.
    chars = len(text)
    comp1 = chars / len(tokens1) if tokens1 else 0.0
    comp2 = chars / len(tokens2) if tokens2 else 0.0

    print(f"Text length: {chars} characters\n")
    print(f"{tok1_name}: {len(tokens1)} tokens, compression = {comp1:.2f} chars/token")
    print(f"{tok2_name}: {len(tokens2)} tokens, compression = {comp2:.2f} chars/token")

# Example: a short Tibetan sentence. It is kept under its own name so the
# corpus stored in `text` by the file-loading cell is not overwritten, which
# would make the corpus compression cell inconsistent if re-run afterwards.
example_text = "བོད་ཡིག་ནི་སློབ་གསོ་དང་གཞུང་ལས་སྒྲིག་བཀོད་ལ་སྤྱོད་མཁན་གྱིས་མང་པོ་ཡོད།"
compare_tokenizers(example_text, "bert-base-multilingual-cased", "meta-llama/Llama-3.2-3B")
