In [None]:
from collections import defaultdict
import sentencepiece as spm


def read_alignment_file(filepath):
    """Load a text file and return its lines.

    Args:
        filepath: Path of the file to read; the content is decoded as UTF-8.

    Returns:
        A list with one string per line of the file, in file order. Line
        terminators are kept, so callers are expected to strip them.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        # Iterating a text file yields its lines, terminators included.
        return list(handle)

# Load the raw lines of the three alignment files. The paths are relative, so
# they are resolved against the current working directory.
forward_alignments = read_alignment_file('forward.align')
reverse_alignments = read_alignment_file('reverse.align')
# Of the three, only this list is consumed by the cells shown in this notebook.
bidirectional_alignments = read_alignment_file('bidirectional.align')

def create_token_mapping(alignments):
    """Count how often each source index is paired with each target index.

    Args:
        alignments: Iterable of strings, each holding whitespace-separated
            ``"<src>-<tgt>"`` integer pairs.

    Returns:
        A nested ``defaultdict`` where ``counts[src][tgt]`` is the number of
        times the pair occurred over all lines. Looking up a missing key
        inserts it (an empty inner mapping, or a count of 0).

    Raises:
        ValueError: If a pair is not exactly two integers joined by one ``-``.
    """
    # NOTE(review): if the input is in the Pharaoh / fast_align "i-j" format,
    # these indices are word positions within a sentence pair rather than
    # vocabulary ids -- confirm against how the .align files were produced.
    pair_counts = defaultdict(lambda: defaultdict(int))
    for row in alignments:
        # split() without arguments already discards surrounding whitespace.
        for pair in row.split():
            src, tgt = (int(field) for field in pair.split('-'))
            pair_counts[src][tgt] += 1
    return pair_counts

# Aggregate the pair counts found in the bidirectional alignment lines.
token_map = create_token_mapping(bidirectional_alignments)

# Keep only target indices seen at least MIN_FREQUENCY times. A source index
# whose targets are all dropped stays in the result with an empty dict.
MIN_FREQUENCY = 5
filtered_token_map = {
    src: dict(entry for entry in tgt_counts.items() if entry[1] >= MIN_FREQUENCY)
    for src, tgt_counts in token_map.items()
}

print(filtered_token_map)

In [None]:
def smooth_mappings(token_map, vocab_size, smoothing_factor=1e-5):
    """Turn alignment counts into smoothed probability distributions.

    For every key of ``token_map`` a list of ``vocab_size`` probabilities is
    built: each slot starts at ``smoothing_factor``, each observed target
    index additionally receives its relative frequency, and the list is then
    renormalized to sum to 1.

    Args:
        token_map: Mapping ``src -> {tgt: count}``. Target keys are passed
            through ``int()``, so integer-like strings are accepted.
        vocab_size: Length of every returned distribution; valid target
            indices are ``0 .. vocab_size - 1``.
        smoothing_factor: Mass added to every slot before renormalization.

    Returns:
        A dict ``src -> list[float]`` with one entry per key of ``token_map``.
        A source with no alignments, or whose counts sum to zero, gets the
        uniform distribution ``1 / vocab_size``.

    Raises:
        IndexError: If a target index lies outside ``0 .. vocab_size - 1``.
            Negative indices are rejected too; otherwise Python's negative
            indexing would silently add mass to the end of the list.
    """
    smoothed_map = {}
    for en_idx, alignments in token_map.items():
        total = sum(alignments.values()) if alignments else 0
        if not total:
            # Nothing observed (or only zero counts, which would otherwise
            # divide by zero below): fall back to a uniform distribution.
            smoothed_map[en_idx] = [1.0 / vocab_size] * vocab_size
            continue

        # Start every slot with a small probability mass.
        probs = [smoothing_factor] * vocab_size
        # Add the observed relative frequencies.
        for bn_idx, freq in alignments.items():
            target = int(bn_idx)
            if not 0 <= target < vocab_size:
                raise IndexError(
                    f"target index {target} is outside the vocabulary "
                    f"range [0, {vocab_size})"
                )
            probs[target] += freq / total

        # Renormalize so the distribution sums to 1.
        total_prob = sum(probs)
        smoothed_map[en_idx] = [prob / total_prob for prob in probs]
    return smoothed_map

bn_vocab_size = 32000  # Set this to the size of your Bengali vocabulary
# Build one probability list of length bn_vocab_size per key of
# filtered_token_map (keys with no surviving alignments get a uniform list).
smoothed_mappings = smooth_mappings(filtered_token_map, bn_vocab_size)

In [None]:
# Sanity-check smooth_mappings on a small hand-built example. The values are
# printed for inspection and also asserted, so a regression fails the cell
# instead of relying on someone reading the output.
test_token_map = {
    0: {0: 10, 1: 5},
    1: {1: 15, 2: 5},
    2: {}  # No alignments for this token
}

test_vocab_size = 5
test_smoothed_mappings = smooth_mappings(test_token_map, test_vocab_size)

for en_idx, probs in test_smoothed_mappings.items():
    print(f"Token {en_idx}: {probs}")
    print(f"Sum of probabilities: {sum(probs)}")
    print()
    # Every distribution must cover the whole vocabulary and sum to 1.
    assert len(probs) == test_vocab_size, f"token {en_idx}: wrong length"
    assert abs(sum(probs) - 1.0) < 1e-9, f"token {en_idx}: does not sum to 1"

# A token without alignments falls back to the uniform distribution.
assert test_smoothed_mappings[2] == [1.0 / test_vocab_size] * test_vocab_size
# The most frequently aligned target keeps the largest probability.
assert max(range(test_vocab_size), key=test_smoothed_mappings[0].__getitem__) == 0
assert max(range(test_vocab_size), key=test_smoothed_mappings[1].__getitem__) == 1

In [None]:
# NOTE(review): this repeats, with the same inputs, the two assignments made
# in an earlier cell; nothing visible in between modifies filtered_token_map,
# so this cell looks redundant -- confirm before removing it.
bn_vocab_size = 32000  # Set this to the size of your Bengali vocabulary
smoothed_mappings = smooth_mappings(filtered_token_map, bn_vocab_size)

In [None]:
# smoothed_mappings has one entry per key of filtered_token_map, so this count
# also includes keys whose alignments were all filtered out.
print(f"Number of tokens mapped: {len(smoothed_mappings)}")

In [None]:
# Print the probability sum of each inspected distribution; each should be ~1.
# NOTE(review): the slice takes up to 91 entries, while the comment here
# previously said 10 -- confirm which count is intended.
for token, probs in list(smoothed_mappings.items())[:91]:  # Check the first 91 entries
    print(f"Token {token} sum: {sum(probs)}")

In [None]:
# Show the five most probable target indices for a handful of source ids.
for token in (0, 1, 10, 100, 1000):
    if token not in smoothed_mappings:
        # Ids without a mapping are skipped without any output.
        continue
    top_5 = sorted(
        enumerate(smoothed_mappings[token]),
        key=lambda pair: pair[1],
        reverse=True,
    )[:5]
    print(f"Token {token} top 5 mappings: {top_5}")