In [1]:
print("Hello world")

Hello world


In [2]:
import requests

# Example: Downloading a parallel corpus from OPUS
url = "http://opus.nlpl.eu/download.php?f=JW300/v1/tmx/en-bn.tmx.gz"
response = requests.get(url)
with open("en-bn.tmx.gz", "wb") as file:
    file.write(response.content)


In [3]:
import xml.etree.ElementTree as ET

# Path to the decompressed XML file
decompressed_file_path = 'bn-en_US.xml/bn-en_US.tmp'

# Parse the XML file
tree = ET.parse(decompressed_file_path)
root = tree.getroot()

# Function to extract data from the XML
def extract_data(root):
    links = root.find('linkList')
    alignments = []
    
    for linkGrp in links.findall('linkGrp'):
        from_doc = linkGrp.get('fromDoc')
        to_doc = linkGrp.get('toDoc')
        
        for link in linkGrp.findall('link'):
            id = link.get('id')
            xtargets = link.get('xtargets')
            alignments.append({
                'id': id,
                'xtargets': xtargets,
                'from_doc': from_doc,
                'to_doc': to_doc
            })
    
    return alignments

# Extract alignments
alignments = extract_data(root)

# Display some extracted alignments
for alignment in alignments[:10]:  # Print the first 10 alignments
    print(alignment)

# Save to a file if needed
with open('extracted_alignments.txt', 'w') as f:
    for alignment in alignments:
        f.write(f"{alignment}\n")

print("Alignments extracted successfully.")


{'id': 'a.1', 'xtargets': 's2 ; s2', 'from_doc': 'bn/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz', 'to_doc': 'en_US/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz'}
{'id': 'a.2', 'xtargets': 's3 ; s3', 'from_doc': 'bn/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz', 'to_doc': 'en_US/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz'}
{'id': 'a.3', 'xtargets': 's4 ; s4', 'from_doc': 'bn/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz', 'to_doc': 'en_US/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz'}
{'id': 'a.4', 'xtargets': 's5 ; s5', 'from_doc': 'bn/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz', 'to_doc': 'en_US/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz'}
{'id': 'a.5', 'xtargets': 's6 ; s6', 'from_doc': 'bn/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz', 'to_doc': 'en_US/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz'}
{'id': 'a.6', 'xtargets': 's7 ; s7', 'from_doc': 'bn/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz', 'to_doc': 'en_US/gcalctool.gnome-3-4/gcalctool.gnome-3-4.x

In [4]:
import ast

def load_alignments(file_path):
    alignments = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            alignments.append(ast.literal_eval(line.strip()))
    return alignments

# Load the alignments
alignments = load_alignments('extracted_alignments.txt')

# Display some alignments
for alignment in alignments[:10]:  # Print the first 10 alignments
    print(alignment)


{'id': 'a.1', 'xtargets': 's2 ; s2', 'from_doc': 'bn/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz', 'to_doc': 'en_US/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz'}
{'id': 'a.2', 'xtargets': 's3 ; s3', 'from_doc': 'bn/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz', 'to_doc': 'en_US/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz'}
{'id': 'a.3', 'xtargets': 's4 ; s4', 'from_doc': 'bn/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz', 'to_doc': 'en_US/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz'}
{'id': 'a.4', 'xtargets': 's5 ; s5', 'from_doc': 'bn/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz', 'to_doc': 'en_US/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz'}
{'id': 'a.5', 'xtargets': 's6 ; s6', 'from_doc': 'bn/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz', 'to_doc': 'en_US/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz'}
{'id': 'a.6', 'xtargets': 's7 ; s7', 'from_doc': 'bn/gcalctool.gnome-3-4/gcalctool.gnome-3-4.xml.gz', 'to_doc': 'en_US/gcalctool.gnome-3-4/gcalctool.gnome-3-4.x

In [5]:
import gzip
import xml.etree.ElementTree as ET

def extract_alignments(file_path):
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        tree = ET.parse(f)
        root = tree.getroot()
        alignments = []
        for link in root.findall(".//link"):
            align = {
                'id': link.attrib['id'],
                'xtargets': link.attrib['xtargets'],
                'from_doc': link.attrib.get('fromDoc', ''),
                'to_doc': link.attrib.get('toDoc', '')
            }
            alignments.append(align)
    return alignments

alignments = extract_alignments('bn-en_US.xml.gz')
print(alignments)


[{'id': 'a.1', 'xtargets': 's2 ; s2', 'from_doc': '', 'to_doc': ''}, {'id': 'a.2', 'xtargets': 's3 ; s3', 'from_doc': '', 'to_doc': ''}, {'id': 'a.3', 'xtargets': 's4 ; s4', 'from_doc': '', 'to_doc': ''}, {'id': 'a.4', 'xtargets': 's5 ; s5', 'from_doc': '', 'to_doc': ''}, {'id': 'a.5', 'xtargets': 's6 ; s6', 'from_doc': '', 'to_doc': ''}, {'id': 'a.6', 'xtargets': 's7 ; s7', 'from_doc': '', 'to_doc': ''}, {'id': 'a.7', 'xtargets': 's9 ; s9', 'from_doc': '', 'to_doc': ''}, {'id': 'a.8', 'xtargets': 's10 ; s10', 'from_doc': '', 'to_doc': ''}, {'id': 'a.9', 'xtargets': 's11 ; s11', 'from_doc': '', 'to_doc': ''}, {'id': 'a.10', 'xtargets': 's12 ; s12', 'from_doc': '', 'to_doc': ''}, {'id': 'a.11', 'xtargets': 's13 ; s13', 'from_doc': '', 'to_doc': ''}, {'id': 'a.12', 'xtargets': 's14 ; s14', 'from_doc': '', 'to_doc': ''}, {'id': 'a.13', 'xtargets': 's15 ; s15', 'from_doc': '', 'to_doc': ''}, {'id': 'a.14', 'xtargets': 's16 ; s16', 'from_doc': '', 'to_doc': ''}, {'id': 'a.15', 'xtargets': '

In [10]:
%pip install lxml


Note: you may need to restart the kernel to use updated packages.


In [13]:
# Sample alignments data
alignments = [
    {'id': 'a.1', 'xtargets': 's1 ; s1'},
    {'id': 'a.2', 'xtargets': 's2 ; s2'},
    {'id': 'a.3', 'xtargets': 's3 ; s3'},
    {'id': 'a.4', 'xtargets': 's4 ; s4'},
    {'id': 'a.5', 'xtargets': 's5 ; s5'},
]

# Sample Bangla and English sentences
bn_sentences = {
    's1': 'বিড়ালটি মাদুরের উপর আছে।',
    's2': 'আমি বাজারে যাচ্ছি।',
    's3': 'সে ফুটবল খেলতে ভালোবাসে।',
    's4': 'সে একটি বই পড়ছে।',
    's5': 'সূর্যটি উজ্জ্বলভাবে আলো দিচ্ছে।',
}

en_sentences = {
    's1': 'The cat is on the mat.',
    's2': 'I am going to the market.',
    's3': 'He loves to play football.',
    's4': 'She is reading a book.',
    's5': 'The sun is shining brightly.',
}

# Initialize lists to store the aligned Bangla and English sentences
aligned_bn_sentences = []
aligned_en_sentences = []

# Match sentences using the alignments
for align in alignments:
    bn_id, en_id = align['xtargets'].split(' ; ')
    aligned_bn_sentences.append(bn_sentences[bn_id])
    aligned_en_sentences.append(en_sentences[en_id])

# Ensure the directories exist
import os
os.makedirs('data/bn', exist_ok=True)
os.makedirs('data/en', exist_ok=True)

# Save the aligned Bangla sentences to a file
with open('data/bn/alignments.txt', 'w') as f:
    for sentence in aligned_bn_sentences:
        f.write(sentence + '\n')

# Save the aligned English sentences to a file
with open('data/en/alignments.txt', 'w') as f:
    for sentence in aligned_en_sentences:
        f.write(sentence + '\n')

print("Sample alignments have been saved successfully.")


Sample alignments have been saved successfully.


In [17]:
%%writefile transtokenizers.py
def translate(bn_sentences, en_sentences):
    translations = []
    for bn, en in zip(bn_sentences, en_sentences):
        translation = f"Translated: {bn} -> {en}"
        translations.append(translation)
    return translations


Writing transtokenizers.py


In [23]:
# Function to load sentences from a file
def load_sentences(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file]

# Function to process alignments and generate translations
def process_alignments():
    # Load Bangla and English sentences
    bn_sentences = load_sentences('data/bn/alignments.txt')
    en_sentences = load_sentences('data/en/alignments.txt')

    # Ensure both files have the same number of sentences
    if len(bn_sentences) != len(en_sentences):
        raise ValueError("The number of Bangla and English sentences must match.")

    # Translate the sentences
    translations = [f"Translated: {bn} -> {en}" for bn, en in zip(bn_sentences, en_sentences)]

    # Print or further process the translations
    for bn, en, translation in zip(bn_sentences, en_sentences, translations):
        print(f"BN: {bn}\nEN: {en}\nTranslation: {translation}\n")

# Call the function to process alignments
process_alignments()


BN: বিড়ালটি মাদুরের উপর আছে।
EN: The cat is on the mat.
Translation: Translated: বিড়ালটি মাদুরের উপর আছে। -> The cat is on the mat.

BN: আমি বাজারে যাচ্ছি।
EN: I am going to the market.
Translation: Translated: আমি বাজারে যাচ্ছি। -> I am going to the market.

BN: সে ফুটবল খেলতে ভালোবাসে।
EN: He loves to play football.
Translation: Translated: সে ফুটবল খেলতে ভালোবাসে। -> He loves to play football.

BN: সে একটি বই পড়ছে।
EN: She is reading a book.
Translation: Translated: সে একটি বই পড়ছে। -> She is reading a book.

BN: সূর্যটি উজ্জ্বলভাবে আলো দিচ্ছে।
EN: The sun is shining brightly.
Translation: Translated: সূর্যটি উজ্জ্বলভাবে আলো দিচ্ছে। -> The sun is shining brightly.



In [24]:
from transtokenizers import translate  # Ensure this import is correct based on your file structure

def load_sentences(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file]

def process_alignments():
    # Load Bangla and English sentences
    bn_sentences = load_sentences('data/bn/alignments.txt')
    en_sentences = load_sentences('data/en/alignments.txt')

    # Ensure both files have the same number of sentences
    if len(bn_sentences) != len(en_sentences):
        raise ValueError("The number of Bangla and English sentences must match.")

    # Translate the sentences
    translations = translate(bn_sentences)

    # Print or further process the translations
    for bn, en, translation in zip(bn_sentences, en_sentences, translations):
        print(f"BN: {bn}\nEN: {en}\nTranslation: {translation}\n")

# Call the function to process alignments
process_alignments()


ImportError: cannot import name 'translate' from 'transtokenizers' (/home/ubuntu/transtokenizer/transtokenizers/__init__.py)

In [25]:
import os

def read_sentences(file_path, num_sentences=10):
    with open(file_path, 'r', encoding='utf-8') as f:
        sentences = [next(f).strip() for _ in range(num_sentences)]
    return sentences

# Read the first 10 sentences from the downloaded files
bn_sentences = read_sentences('ben-ben2017.txt', 10)
en_sentences = read_sentences('eng-enggnv.txt', 10)

# Check if the number of sentences match
if len(bn_sentences) != len(en_sentences):
    raise ValueError("The number of Bangla and English sentences must match.")

# Save the sentences to the respective alignment files
bn_align_path = 'data/bn/alignments.txt'
en_align_path = 'data/en/alignments.txt'

os.makedirs(os.path.dirname(bn_align_path), exist_ok=True)
os.makedirs(os.path.dirname(en_align_path), exist_ok=True)

with open(bn_align_path, 'w', encoding='utf-8') as bn_file:
    for sentence in bn_sentences:
        bn_file.write(sentence + '\n')

with open(en_align_path, 'w', encoding='utf-8') as en_file:
    for sentence in en_sentences:
        en_file.write(sentence + '\n')

print("Alignments have been saved successfully.")


Alignments have been saved successfully.


In [26]:
# translation function
def translate(bn_sentences, en_sentences):
    translations = []
    for bn, en in zip(bn_sentences, en_sentences):
        translation = f"Translated: {bn} -> {en}"
        translations.append(translation)
    return translations


In [27]:
import os

def read_sentences(file_path, num_sentences=10):
    with open(file_path, 'r', encoding='utf-8') as f:
        sentences = [next(f).strip() for _ in range(num_sentences)]
    return sentences

def process_alignments():

    bn_sentences = read_sentences('ben-ben2017.txt', 10)
    en_sentences = read_sentences('eng-enggnv.txt', 10)


    if len(bn_sentences) != len(en_sentences):
        raise ValueError("The number of Bangla and English sentences must match.")

 
    bn_align_path = 'data/bn/alignments.txt'
    en_align_path = 'data/en/alignments.txt'

    os.makedirs(os.path.dirname(bn_align_path), exist_ok=True)
    os.makedirs(os.path.dirname(en_align_path), exist_ok=True)

    with open(bn_align_path, 'w', encoding='utf-8') as bn_file:
        for sentence in bn_sentences:
            bn_file.write(sentence + '\n')

    with open(en_align_path, 'w', encoding='utf-8') as en_file:
        for sentence in en_sentences:
            en_file.write(sentence + '\n')

    print("Alignments have been saved successfully.")


    translations = translate(bn_sentences, en_sentences)


    for bn, en, translation in zip(bn_sentences, en_sentences, translations):
        print(f"BN: {bn}\nEN: {en}\nTranslation: {translation}\n")


process_alignments()


Alignments have been saved successfully.
BN: আদিতে ঈশ্বর আকাশমণ্ডল ও পৃথিবীর সৃষ্টি করলেন।
EN: In the beginning God created the heauen and the earth.
Translation: Translated: আদিতে ঈশ্বর আকাশমণ্ডল ও পৃথিবীর সৃষ্টি করলেন। -> In the beginning God created the heauen and the earth.

BN: পৃথিবী বিন্যাস বিহীন ও শূন্য ছিল এবং অন্ধকার জলরাশির উপরে ছিল, আর ঈশ্বরের আত্মা জলের উপরে চলাচল করছিলেন।
EN: And the earth was without forme and void, and darkenesse was vpon the deepe, and the Spirit of God mooued vpon the waters.
Translation: Translated: পৃথিবী বিন্যাস বিহীন ও শূন্য ছিল এবং অন্ধকার জলরাশির উপরে ছিল, আর ঈশ্বরের আত্মা জলের উপরে চলাচল করছিলেন। -> And the earth was without forme and void, and darkenesse was vpon the deepe, and the Spirit of God mooued vpon the waters.

BN: পরে ঈশ্বর বললেন, আলো হোক; তাতে আলো হল।
EN: Then God said, Let there be light: And there was light.
Translation: Translated: পরে ঈশ্বর বললেন, আলো হোক; তাতে আলো হল। -> Then God said, Let there be light: And there was light.



In [28]:
import pandas as pd


bn_align_path = 'data/bn/alignments.txt'
en_align_path = 'data/en/alignments.txt'

bn_sentences = []
en_sentences = []

with open(bn_align_path, 'r', encoding='utf-8') as bn_file:
    bn_sentences = bn_file.readlines()

with open(en_align_path, 'r', encoding='utf-8') as en_file:
    en_sentences = en_file.readlines()


df = pd.DataFrame({
    'Bangla': [sentence.strip() for sentence in bn_sentences],
    'English': [sentence.strip() for sentence in en_sentences]
})

df.head()


Unnamed: 0,Bangla,English
0,আদিতে ঈশ্বর আকাশমণ্ডল ও পৃথিবীর সৃষ্টি করলেন।,In the beginning God created the heauen and th...
1,পৃথিবী বিন্যাস বিহীন ও শূন্য ছিল এবং অন্ধকার জ...,"And the earth was without forme and void, and ..."
2,"পরে ঈশ্বর বললেন, আলো হোক; তাতে আলো হল।","Then God said, Let there be light: And there w..."
3,তখন ঈশ্বর আলো উত্তম দেখলেন এবং ঈশ্বর আলো ও অন্...,"And God saw the light that it was good, and Go..."
4,আর ঈশ্বর আলোর নাম “দিন” ও অন্ধকারের নাম “রাত” ...,"And God called the Light, Day, and the darkene..."


In [46]:
import os
import regex as re
from tqdm import tqdm
import transformers
from collections import defaultdict

# function to get dataset iterator
def get_dataset_iterator(source_file: str, target_file: str):
    class DatasetWrapper:
        def __init__(self, source_file, target_file):
            self.source_file = open(source_file, 'r', encoding='utf-8')
            self.target_file = open(target_file, 'r', encoding='utf-8')

        def __iter__(self):
            return self

        def __next__(self):
            source_line = self.source_file.readline().strip()
            target_line = self.target_file.readline().strip()
            if not source_line or not target_line:
                self.source_file.close()
                self.target_file.close()
                raise StopIteration
            return source_line, target_line

    return DatasetWrapper(source_file, target_file)

# function to create aligned corpus
def create_aligned_corpus(source_language, target_language, source_tokenizer, target_tokenizer, source_file, target_file):
    home_path = os.environ.get('TT_HOME', '.')
    aligned_file_path = f'{home_path}/alignments/{source_language}-{target_language}.aligned'

    if os.path.exists(aligned_file_path):
        print('Aligned corpus already exists.')
        return aligned_file_path

    os.makedirs(f'{home_path}/alignments', exist_ok=True)
    dataset = get_dataset_iterator(source_file, target_file)

    old_tokenizer = transformers.AutoTokenizer.from_pretrained(source_tokenizer)
    new_tokenizer = transformers.AutoTokenizer.from_pretrained(target_tokenizer)

    with open(aligned_file_path, 'w', encoding='utf-8') as f:
        for line_source, line_target in tqdm(dataset):
            line1 = ' '.join(old_tokenizer.tokenize(line_source.strip()))
            line2 = ' '.join(new_tokenizer.tokenize(line_target.strip()))
            f.write(line1 + ' ||| ' + line2 + '\n')

    return aligned_file_path

# Create the aligned corpus
corpus = create_aligned_corpus(
    source_language="eng-enggnv",
    target_language="ben-ben2017",
    source_tokenizer="distilgpt2",
    target_tokenizer="t5-small",
    source_file="data/eng-enggnv.txt",
    target_file="data/ben-ben2017.txt"
)

corpus


3166it [00:00, 3843.10it/s]


'./alignments/eng-enggnv-ben-ben2017.aligned'

In [47]:
import os

def align(corpus: str, fast_align_path: str = "~/transtokenizer/fast_align/build/fast_align") -> str:
    if ".aligned" not in corpus:
        raise ValueError("The input file should be an aligned file")

    aligned_output = corpus.replace(".aligned", ".fast_align.tsv")
    os.system(f'{fast_align_path} -i {corpus} -d -o -v > {aligned_output}')

    return aligned_output

# Align tokens using fast_align
aligned_corpus = align(corpus, fast_align_path="~/transtokenizer/fast_align/build/fast_align")
aligned_corpus


ARG=i
ARG=d
ARG=o
ARG=v
INITIAL PASS 
...
expected target length = source length * 1.12298
ITERATION 1
...
  log_e likelihood: -2.7662e+06
  log_2 likelihood: -3.99079e+06
     cross entropy: 29.8974
        perplexity: 1e+09
      posterior p0: 0.08
 posterior al-feat: -0.163803
       size counts: 1471
ITERATION 2
...
  log_e likelihood: -621698
  log_2 likelihood: -896920
     cross entropy: 6.71936
        perplexity: 105.373
      posterior p0: 0.0989839
 posterior al-feat: -0.144928
       size counts: 1471
  1  model al-feat: -0.154354 (tension=4)
  2  model al-feat: -0.150333 (tension=4.18851)
  3  model al-feat: -0.148096 (tension=4.29661)
  4  model al-feat: -0.146807 (tension=4.35996)
  5  model al-feat: -0.146051 (tension=4.39754)
  6  model al-feat: -0.145602 (tension=4.42)
  7  model al-feat: -0.145333 (tension=4.43348)
  8  model al-feat: -0.145172 (tension=4.44158)
     final tension: 4.44646
ITERATION 3
...
  log_e likelihood: -475667
  log_2 likelihood: -686242
     c

'./alignments/eng-enggnv-ben-ben2017.fast_align.tsv'

In [49]:
# Inspect the contents of the aligned_corpus file
with open(aligned_corpus, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Display the first few lines
for i in range(10):
    print(lines[i])



0-0 2-1 1-2 3-3 5-4 4-5 5-6 9-7 8-8 11-9 9-10 4-11 10-12 12-13

1-0 2-1 1-2 5-3 6-6 7-7 10-9 14-11 10-13 11-15 16-16 18-17 17-19 20-20 14-21 22-22 29-24 27-26 29-27 25-28 29-29 35-30 30-31 33-32 32-33 31-34 34-35 36-36

0-0 0-1 3-2 1-3 3-4 2-5 4-6 5-7 7-8 6-9 6-10 8-11 8-12 9-13 11-14 12-15 12-16 13-17

0-0 0-1 3-2 1-3 3-4 4-5 5-6 8-7 7-8 2-9 9-10 10-11 10-12 11-13 12-14 14-15 12-16 10-17 16-18 17-19 16-20 15-21 18-22 19-23

0-0 0-1 3-2 1-3 3-4 4-5 5-6 2-7 7-8 28-9 16-10 9-11 8-12 11-13 10-14 14-16 12-17 12-18 16-19 17-20 14-21 17-22 21-23 20-24 22-25 20-26 24-27 23-28 24-29 25-30 27-31 26-32 28-33 26-34 29-35

1-0 0-1 1-2 2-3 4-4 3-5 5-6 3-7 16-8 8-9 11-10 11-11 24-12 12-13 10-14 12-15 14-16 19-17 18-19 18-20 16-21 22-23 22-24 23-25 22-26 21-27 25-28 26-29 27-30

3-0 1-1 3-2 8-4 8-6 5-7 8-8 15-9 17-11 14-12 16-13 22-15 20-17 15-19 22-21 25-22 29-23 26-24 32-25 31-26 35-28 35-30 35-31 36-32

0-0 0-1 3-2 1-3 3-4 4-5 6-6 2-7 6-8 7-9 8-10 9-11 10-12 0-13 12-14 14-15 13-16 15-17 13-18 17-1

In [50]:
def map_tokens(mapped_tokens_file: str, source_tokenizer: str, target_tokenizer: str):
    old_tokenizer = transformers.AutoTokenizer.from_pretrained(source_tokenizer)
    new_tokenizer = transformers.AutoTokenizer.from_pretrained(target_tokenizer)

    tokenized_possible_translations = defaultdict(lambda: defaultdict(int))

    with open(mapped_tokens_file, 'r', encoding='utf-8') as f:
        for line in f:
            pairs = line.strip().split(' ')
            for pair in pairs:
                try:
                    old_index, new_index = pair.split('-')
                    old_token = old_tokenizer.convert_ids_to_tokens(int(old_index))
                    new_token = new_tokenizer.convert_ids_to_tokens(int(new_index))
                    tokenized_possible_translations[new_token][old_token] += 1
                except ValueError:
                    continue

    return tokenized_possible_translations

# Map tokens
tokenized_possible_translations = map_tokens(aligned_corpus, "distilgpt2", "t5-small")


In [51]:
def smooth_mapping(target_tokenizer: str, tokenized_possible_translations: dict) -> dict:
    new_tokenizer = transformers.AutoTokenizer.from_pretrained(target_tokenizer)
    final_list = []

    for token in new_tokenizer.get_vocab():
        if token in tokenized_possible_translations:
            possible_translations = tokenized_possible_translations[token]
            total_count = sum(possible_translations.values())
            probabilities = {old_token: count / total_count for old_token, count in possible_translations.items()}
            probabilities = sorted(probabilities.items(), key=lambda x: x[1], reverse=True)
            final_list.append((token, probabilities))

    return final_list

# Smooth mapping
smoothed_mapping = smooth_mapping("t5-small", tokenized_possible_translations)



In [3]:
import pandas as pd


csv_path = "translated_alignments.csv"
df = pd.read_csv(csv_path)


df.head()


Unnamed: 0,Bangla,English
0,আদিতে ঈশ্বর আকাশমণ্ডল ও পৃথিবীর সৃষ্টি করলেন।,In the beginning God created the heauen and th...
1,পৃথিবী বিন্যাস বিহীন ও শূন্য ছিল এবং অন্ধকার জ...,"And the earth was without forme and void, and ..."
2,"পরে ঈশ্বর বললেন, আলো হোক; তাতে আলো হল।","Then God said, Let there be light: And there w..."
3,তখন ঈশ্বর আলো উত্তম দেখলেন এবং ঈশ্বর আলো ও অন্...,"And God saw the light that it was good, and Go..."
4,আর ঈশ্বর আলোর নাম “দিন” ও অন্ধকারের নাম “রাত” ...,"And God called the Light, Day, and the darkene..."


In [15]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset

# Load the dataset from the CSV file
dataset = load_dataset("csv", data_files={"train": "translated_alignments.csv"})

# Split the dataset into train and test sets
dataset = dataset['train'].train_test_split(test_size=0.1)

# Preprocess the dataset
def preprocess_function(examples):
    inputs = [ex for ex in examples['English']]
    targets = [ex for ex in examples['Bangla']]
    model_inputs = tokenizer(inputs, max_length=128, padding='max_length', truncation=True)
    labels = tokenizer(targets, max_length=128, padding='max_length', truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-small")
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Define updated training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    logging_dir="./logs",  # Set logging directory
    logging_steps=10,  # Set logging steps
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,  # Increase the number of epochs
    predict_with_generate=True
)

# Initialize the model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer
)

# Fine-tune the model
trainer.train()

# Save the model and tokenizer
trainer.save_model("./results")
tokenizer.save_pretrained("./results")


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,12.672417
2,No log,12.404377
3,No log,12.177648
4,No log,11.988556
5,9.495400,11.828943
6,9.495400,11.714683
7,9.495400,11.62517
8,9.495400,11.557685
9,9.495400,11.516275
10,7.186600,11.499718


('./results/tokenizer_config.json',
 './results/special_tokens_map.json',
 './results/tokenizer.json')

In [14]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset

# Load the dataset from the CSV file
dataset = load_dataset("csv", data_files={"train": "translated_alignments.csv"})

# Split the dataset into train and test sets
dataset = dataset['train'].train_test_split(test_size=0.1)

# Preprocess the dataset
def preprocess_function(examples):
    inputs = [ex for ex in examples['English']]
    targets = [ex for ex in examples['Bangla']]
    model_inputs = tokenizer(inputs, max_length=128, padding='max_length', truncation=True)
    labels = tokenizer(targets, max_length=128, padding='max_length', truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Define training arguments
# Define updated training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,  # Increase the number of epochs
    predict_with_generate=True
)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer
)

# Fine-tune the model
trainer.train()

# Save the model and tokenizer
trainer.save_model("./results")
tokenizer.save_pretrained("./results")



Map: 100%|██████████| 1/1 [00:00<00:00, 12.71 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,12.691976
2,No log,11.86167
3,No log,11.048294
4,No log,10.336935
5,No log,9.737167
6,No log,9.245271
7,No log,8.85734
8,No log,8.580269
9,No log,8.403585
10,No log,8.330078


('./results/tokenizer_config.json',
 './results/special_tokens_map.json',
 './results/tokenizer.json')

In [16]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the fine-tuned model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained('./results')
tokenizer = AutoTokenizer.from_pretrained('./results')

# Translate English to Bangla
english_sentence = "In the beginning, God created the heavens and the earth."
inputs = tokenizer.encode(english_sentence, return_tensors='pt')
outputs = model.generate(inputs, max_length=50)
bangla_translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Translated to Bangla:", bangla_translation)


Translated to Bangla: God created the heavens and the earth.


In [17]:
# Translate more English sentences to Bangla
english_sentences = [
    "In the beginning, God created the heavens and the earth.",
    "And the earth was without form, and void; and darkness was upon the face of the deep.",
    "And the Spirit of God moved upon the face of the waters.",
    "And God said, Let there be light: and there was light.",
    "And God saw the light, that it was good: and God divided the light from the darkness."
]

for sentence in english_sentences:
    inputs = tokenizer.encode(sentence, return_tensors='pt')
    outputs = model.generate(inputs, max_length=50)
    bangla_translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"English: {sentence}")
    print(f"Translated to Bangla: {bangla_translation}")
    print()


English: In the beginning, God created the heavens and the earth.
Translated to Bangla: God created the heavens and the earth.

English: And the earth was without form, and void; and darkness was upon the face of the deep.
Translated to Bangla: Und die Erd war ohne Form, und ohne void; und die Dunkelheit war auf dem Gesicht des Tief.

English: And the Spirit of God moved upon the face of the waters.
Translated to Bangla: Und der Spirit von God hat sich auf die face der Wasser bewegt.

English: And God said, Let there be light: and there was light.
Translated to Bangla: Und Gott sagte: Laßt Licht Licht: und es war Licht.

English: And God saw the light, that it was good: and God divided the light from the darkness.
Translated to Bangla: Und Gott sah, sah, sah, sah, sah, sah, sah, sah, sah, s



In [11]:
# Translate with adjusted parameters
for sentence in english_sentences:
    inputs = tokenizer.encode(sentence, return_tensors='pt')
    outputs = model.generate(inputs, max_length=50, num_beams=5, temperature=0.7)
    bangla_translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"English: {sentence}")
    print(f"Translated to Bangla: {bangla_translation}")
    print()




English: In the beginning, God created the heavens and the earth.
Translated to Bangla: God created the heavens and the earth.

English: And the earth was without form, and void; and darkness was upon the face of the deep.
Translated to Bangla: Und die Erd war ohne Form, und void; und die Dunkelheit war auf dem Gesicht des Tief.

English: And the Spirit of God moved upon the face of the waters.
Translated to Bangla: Und der Spirit of God hat sich auf das Gesicht der Wasser bewegt.

English: And God said, Let there be light: and there was light.
Translated to Bangla: Und Gott sagte: Laßt es Licht: und es war Licht.

English: And God saw the light, that it was good: and God divided the light from the darkness.
Translated to Bangla: Und God saw the light, that it was good: and God divided the light from the darkness.



In [4]:
import os
import regex as re
from tqdm import tqdm
import transformers
from collections import defaultdict

# Define the function to get dataset iterator
def get_dataset_iterator(source_file: str, target_file: str):
    class DatasetWrapper:
        def __init__(self, source_file, target_file):
            self.source_file = open(source_file, 'r', encoding='utf-8')
            self.target_file = open(target_file, 'r', encoding='utf-8')

        def __iter__(self):
            return self

        def __next__(self):
            source_line = self.source_file.readline().strip()
            target_line = self.target_file.readline().strip()
            if not source_line or not target_line:
                self.source_file.close()
                self.target_file.close()
                raise StopIteration
            return source_line, target_line

    return DatasetWrapper(source_file, target_file)

# Define the function to create aligned corpus
def create_aligned_corpus(source_language, target_language, source_tokenizer, target_tokenizer, source_file, target_file):
    home_path = os.environ.get('TT_HOME', '.')
    aligned_file_path = f'{home_path}/alignments/{source_language}-{target_language}.aligned'

    if os.path.exists(aligned_file_path):
        print('Aligned corpus already exists.')
        return aligned_file_path

    os.makedirs(f'{home_path}/alignments', exist_ok=True)
    dataset = get_dataset_iterator(source_file, target_file)

    old_tokenizer = transformers.AutoTokenizer.from_pretrained(source_tokenizer)
    new_tokenizer = transformers.AutoTokenizer.from_pretrained(target_tokenizer)

    with open(aligned_file_path, 'w', encoding='utf-8') as f:
        for line_source, line_target in tqdm(dataset):
            line1 = ' '.join(old_tokenizer.tokenize(line_source.strip()))
            line2 = ' '.join(new_tokenizer.tokenize(line_target.strip()))
            f.write(line1 + ' ||| ' + line2 + '\n')

    return aligned_file_path

# Create the aligned corpus
corpus = create_aligned_corpus(
    source_language="eng-enggnv",
    target_language="ben-ben2017",
    source_tokenizer="mistralai/Mistral-Nemo-Instruct-2407",
    target_tokenizer="t5-small",
    source_file="data/eng-enggnv.txt",
    target_file="data/ben-ben2017.txt"
)

corpus


Aligned corpus already exists.


'./alignments/eng-enggnv-ben-ben2017.aligned'

In [5]:
import os

def align(corpus: str, fast_align_path: str = "/path/to/fast_align/build/fast_align") -> str:
    if ".aligned" not in corpus:
        raise ValueError("The input file should be an aligned file")

    aligned_output = corpus.replace(".aligned", ".fast_align.tsv")
    os.system(f'{fast_align_path} -i {corpus} -d -o -v > {aligned_output}')

    return aligned_output

# Align tokens using fast_align
aligned_corpus = align(corpus, fast_align_path="/path/to/fast_align/build/fast_align")
aligned_corpus


sh: 1: /path/to/fast_align/build/fast_align: not found


'./alignments/eng-enggnv-ben-ben2017.fast_align.tsv'

In [6]:
def map_tokens(mapped_tokens_file: str, source_tokenizer: str, target_tokenizer: str):
    old_tokenizer = transformers.AutoTokenizer.from_pretrained(source_tokenizer)
    new_tokenizer = transformers.AutoTokenizer.from_pretrained(target_tokenizer)

    tokenized_possible_translations = defaultdict(lambda: defaultdict(int))

    with open(mapped_tokens_file, 'r', encoding='utf-8') as f:
        for line in f:
            pairs = line.strip().split(' ')
            for pair in pairs:
                try:
                    old_index, new_index = pair.split('-')
                    old_token = old_tokenizer.convert_ids_to_tokens(int(old_index))
                    new_token = new_tokenizer.convert_ids_to_tokens(int(new_index))
                    tokenized_possible_translations[new_token][old_token] += 1
                except ValueError:
                    continue

    return tokenized_possible_translations

# Map tokens
tokenized_possible_translations = map_tokens(aligned_corpus, "mistralai/Mistral-Nemo-Instruct-2407", "t5-small")


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407.
401 Client Error. (Request ID: Root=1-66a031ce-0380c6190d23434141c9c16b;5a6b1427-406f-4c51-a047-effc308a490b)

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407/resolve/main/config.json.
Access to model mistralai/Mistral-Nemo-Instruct-2407 is restricted. You must be authenticated to access it.

In [11]:
import os
os.environ["HF_TOKEN"] = "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"

from huggingface_hub import notebook_login
notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from mistral_inference import download_model
from pathlib import Path

mistral_model_path = Path.home().joinpath('mistral_models', '7B-v0.3')
mistral_model_path.mkdir(parents=True, exist_ok=True)

download_model(model_name="Mistral-7B-v0.3", local_dir=mistral_model_path)
