In [12]:
# Ground truth: 6 zeros, 6 ones, 5 zeros -> label changes at indices 6 and 12.
true_labels = [0] * 6 + [1] * 6 + [0] * 5
# Prediction: 11 zeros, 6 ones -> a single label change at index 11.
pred_labels = [0] * 11 + [1] * 6

In [13]:
def find_change_points(arr):
    """Return the indices at which a sequence changes value.

    An index ``k`` (with ``k >= 1``) is reported when ``arr[k]`` is not equal
    to ``arr[k - 1]``. Index 0 is never reported because it has no predecessor.

    Args:
        arr: An indexable sequence supporting ``len()``.

    Returns:
        A list of indices in ascending order; empty when the sequence has
        fewer than two elements or never changes.
    """
    return [idx for idx in range(1, len(arr)) if arr[idx] != arr[idx - 1]]

def calc_tp_fp_fn_tn(true_labels, pred_labels, tolerance=2):
    """Count change-point detection outcomes with a positional tolerance.

    Change points are extracted from both label sequences. Each predicted
    change point, in order, is paired with the first true change point that
    is not yet paired and lies within ``tolerance`` indices of it.

    Args:
        true_labels: Ground-truth label sequence.
        pred_labels: Predicted label sequence.
        tolerance: Maximum absolute index distance for a prediction to count
            as matching a true change point.

    Returns:
        A ``(tp, fp, fn, tn)`` tuple where
        * tp is the number of predicted change points that were paired,
        * fp is the number of predicted change points left unpaired,
        * fn is the number of true change points left unpaired,
        * tn is ``len(true_labels) - (tp + fp + fn)``.
    """
    gt_points = find_change_points(true_labels)
    claimed = set()  # true change points already paired with a prediction
    hits = 0
    false_alarms = 0

    for predicted in find_change_points(pred_labels):
        # First still-unclaimed true change point close enough to this prediction.
        partner = next(
            (
                gt
                for gt in gt_points
                if abs(predicted - gt) <= tolerance and gt not in claimed
            ),
            None,
        )
        if partner is None:
            false_alarms += 1
        else:
            hits += 1
            claimed.add(partner)

    misses = len(gt_points) - len(claimed)

    # Every position of the sequence not accounted for above counts as a TN.
    true_negatives = len(true_labels) - (hits + false_alarms + misses)

    return hits, false_alarms, misses, true_negatives

def calc_precision_recall(tp, fp, fn, tn):
    """Compute precision and recall from confusion counts.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.
        tn: Number of true negatives (accepted but not used here).

    Returns:
        A ``(precision, recall)`` tuple. Precision is ``tp / (tp + fp)`` and
        recall is ``tp / (tp + fn)``; either is ``0.0`` when its denominator
        is not positive.
    """
    predicted_positive = tp + fp
    if predicted_positive > 0:
        precision = tp / predicted_positive
    else:
        precision = 0.0

    actual_positive = tp + fn
    if actual_positive > 0:
        recall = tp / actual_positive
    else:
        recall = 0.0

    return precision, recall



In [14]:
# Indices at which the ground-truth labels switch value.
find_change_points(true_labels)

[6, 12]

In [15]:
# Indices at which the predicted labels switch value.
find_change_points(pred_labels)

[11]

In [16]:
# Score predicted change points against the true ones; a prediction counts as a
# match when it lies within 2 indices of a not-yet-matched true change point.
tp, fp, fn, tn = calc_tp_fp_fn_tn(true_labels, pred_labels, tolerance=2)
print(f"TP: {tp}, FP: {fp}, FN: {fn}, TN: {tn}")

TP: 1, FP: 0, FN: 1, TN: 15


In [17]:
# Derive precision and recall from the confusion counts computed above.
precision, recall = calc_precision_recall(tp, fp, fn, tn)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}")

Precision: 1.00, Recall: 0.50


### Calculate the tokenizer compression

In [1]:
from transformers import AutoTokenizer

In [2]:
# Model identifiers handed to AutoTokenizer.from_pretrained for the two tokenizers
# being compared.
# NOTE(review): the second name suggests an mBERT variant continually pre-trained
# on Wylie-transliterated Tibetan — confirm against the model card.
mbert_tokenizer_name = "google-bert/bert-base-multilingual-cased"
cpt_mbert_tokenizer_name = "OMRIDRORI/mbert-tibetan-continual-wylie-final"

In [4]:
# Read the whole evaluation corpus into memory as one UTF-8 string.
text_path = "../dataset/tokenizer/tibetan_bert_ready_512.txt"
with open(text_path, "r", encoding="utf-8") as f:
    text = f.read()

In [3]:
# Instantiate both tokenizers from their model identifiers.
mbert_tokenizer = AutoTokenizer.from_pretrained(mbert_tokenizer_name)
cpt_mbert_tokenizer = AutoTokenizer.from_pretrained(cpt_mbert_tokenizer_name)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]



tokenizer_config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/170k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/642k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [5]:
# Tokenize the full corpus with each tokenizer.
# encode() adds the model's special tokens by default; add_special_tokens=False
# keeps them out so the token counts reflect corpus content only.
mBERT_tokens = mbert_tokenizer.encode(text, add_special_tokens=False)
cpt_mBERT_tokens = cpt_mbert_tokenizer.encode(text, add_special_tokens=False)

: 

In [None]:
# Corpus sizes used by the metrics below: token count per tokenizer, and the
# number of whitespace-delimited units in the raw text (str.split with no
# argument splits on any run of whitespace).
n_tokens_mBERT = len(mBERT_tokens)
n_tokens_cpt_mBERT = len(cpt_mBERT_tokens)
n_words = len(text.split())

In [None]:
# Compression: whitespace-delimited words per token, computed per tokenizer.
# NaN is reported when a tokenizer produced no tokens (avoids division by zero).

mBERT_compression = (n_words / n_tokens_mBERT) if n_tokens_mBERT else float("nan")
cpt_mBERT_compression = (n_words / n_tokens_cpt_mBERT) if n_tokens_cpt_mBERT else float("nan")


print(f"mBERT Tokens: {n_tokens_mBERT}")
print(f"cpt-mBERT Tokens: {n_tokens_cpt_mBERT}")
print(f"Words: {n_words}")
# Both values are words/token; label each line with the tokenizer it belongs to.
print(f"mBERT Compression words/token: {mBERT_compression:.6f}")
print(f"cpt-mBERT Compression words/token: {cpt_mBERT_compression:.6f}")


In [None]:
# Fertility: tokens per whitespace-delimited word, computed per tokenizer
# (the reciprocal of the compression ratio). NaN when the text has no words.

mBERT_fertility = (n_tokens_mBERT / n_words) if n_words else float("nan")
cpt_mBERT_fertility = (n_tokens_cpt_mBERT / n_words) if n_words else float("nan")

# The denominator is the word count, so the unit is tokens/word.
print(f"mBERT Fertility tokens/word: {mBERT_fertility:.6f}")
print(f"cpt-mBERT Fertility tokens/word: {cpt_mBERT_fertility:.6f}")
