In [None]:
import sentencepiece as spm

# Construct the processor and load the character-level Bengali model in one step.
sp = spm.SentencePieceProcessor(model_file='bangla_char.model')


In [None]:
# Quick sanity check: encode one Bengali sentence into ids and show them.
text = "আমার বাংলা ভাষা আমার গর্ব"
tokens = sp.encode(text, out_type=int)
print(tokens)


In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login for this notebook session.
# NOTE(review): presumably required for the gated Llama checkpoint loaded later — confirm.
notebook_login()

In [None]:
import os

import wandb

# Log in to Weights & Biases without embedding the API key in the notebook.
# The key is taken from the WANDB_API_KEY environment variable; when it is
# unset, key=None lets wandb.login() fall back to its own credential lookup
# or interactive prompt.
wandb.login(key=os.environ.get("WANDB_API_KEY"))


In [None]:
import os

# Configure PyTorch's CUDA allocator through its environment variable.
# NOTE(review): presumably has to be set before CUDA is first initialised — confirm.
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128")


In [None]:
import wandb
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments
import sentencepiece as spm
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import DataCollatorForLanguageModeling
from custom_tokenizer import CustomBengaliTokenizer 

# Open the W&B run that this training cell reports to.
wandb.init(project="Transtokenizers")

# Project-local tokenizer wrapper built from the character-level SentencePiece files.
tokenizer = CustomBengaliTokenizer(
    model_file="bangla_char.model",
    vocab_file="bangla_char.vocab",
)

def tokenize_text_file(file_path, tokenizer):
    """Encode every non-blank line of a UTF-8 text file.

    Args:
        file_path: Path of the text file to read.
        tokenizer: Object exposing ``encode(str)``; called once per kept line.

    Returns:
        A list holding one ``tokenizer.encode`` result per non-blank line, in
        file order. Lines are stripped of surrounding whitespace before being
        encoded.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_lines = handle.readlines()

    encoded = []
    for raw in raw_lines:
        stripped = raw.strip()
        if not stripped:
            continue  # whitespace-only line: nothing to encode
        encoded.append(tokenizer.encode(stripped))
    return encoded

# Training corpus; every non-blank line becomes one encoded sequence.
file_path = 'final_cleaned_bangla_corpus.txt'
tokenized_data = tokenize_text_file(file_path, tokenizer)

class CustomDataset(Dataset):
    """Map-style dataset over pre-tokenized sequences for causal-LM training."""

    def __init__(self, tokenized_data):
        # Indexable collection; each element is one sequence of token ids.
        self.tokenized_data = tokenized_data

    def __len__(self):
        """Number of stored sequences."""
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        """Return sequence ``idx`` as ``input_ids`` plus an independent ``labels`` copy."""
        token_ids = torch.tensor(self.tokenized_data[idx], dtype=torch.long)
        sample = {}
        sample['input_ids'] = token_ids
        # clone() so that later edits to the labels never touch the inputs
        sample['labels'] = token_ids.clone()
        return sample

dataset = CustomDataset(tokenized_data)
# Stand-alone loader; the Trainer below is handed `dataset` directly and does
# not consume this object.
train_dataloader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)

# Hub repository id of the base checkpoint; it must match the repo name
# exactly (no trailing punctuation), otherwise the lookup fails.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B")

# mlm=False selects causal-LM collation instead of masked-LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10_000,
    save_total_limit=2,
    gradient_accumulation_steps=4,  # 8 * 4 sequences per device per optimizer step
    # Step-based evaluation is deliberately not configured: the Trainer below
    # receives no eval_dataset, so requesting evaluation would fail. Create a
    # held-out split and pass eval_dataset=... before enabling it.
    fp16=True,
    dataloader_num_workers=4,
    report_to="wandb",
    run_name="Transtokenizers_run"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator
)

trainer.train()

# Close the W&B run opened at the top of this cell.
wandb.finish()


In [None]:


from unsloth import FastLanguageModel
import torch
from datasets import Dataset
import sentencepiece as spm
from tqdm import tqdm
import wandb


# Start a W&B run for the Unsloth / LoRA experiment in this cell.
wandb.init(project="Transtokenizers")


def load_and_tokenize_text(file_path, sp_model):
    """Encode each non-blank line of a text file into SentencePiece ids.

    Args:
        file_path: UTF-8 text file, one sample per line.
        sp_model: Path of the SentencePiece ``.model`` file to load.

    Returns:
        List of id lists, one per non-blank line, in file order.
    """
    processor = spm.SentencePieceProcessor(model_file=sp_model)
    with open(file_path, 'r', encoding='utf-8') as corpus:
        lines = corpus.readlines()

    id_sequences = []
    # The progress bar advances over every line, blank ones included.
    for raw_line in tqdm(lines, desc="Tokenizing lines"):
        content = raw_line.strip()
        if content:
            id_sequences.append(processor.encode(content, out_type=int))
    return id_sequences


# Inputs for this run: the cleaned corpus and the BPE SentencePiece model.
file_path = 'final_cleaned_bangla_corpus.txt'
sp_model = 'bangla_bpe.model'
tokenized_data = load_and_tokenize_text(file_path, sp_model)


def create_hf_dataset(tokenized_data):
    """Wrap tokenized sequences in a Hugging Face ``Dataset`` with a ``text`` column.

    Each sequence is rendered as its ids joined by single spaces, e.g.
    ``[5, 12, 7]`` becomes ``"5 12 7"``.

    NOTE(review): the column therefore contains digit strings rather than
    Bengali text, and the trainer later in this cell is pointed at this
    column with the base model's own tokenizer — confirm that is intended.
    """
    rendered = []
    for token_ids in tokenized_data:
        rendered.append(" ".join(str(token_id) for token_id in token_ids))
    return Dataset.from_dict({"text": rendered})

dataset = create_hf_dataset(tokenized_data)

model_name = "unsloth/Meta-Llama-3.1-8B"

# Load the base checkpoint and its tokenizer with 4-bit weights.
# NOTE(review): dtype=None presumably lets Unsloth pick the dtype — confirm.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    load_in_4bit=True,
    dtype=None,
    max_seq_length=2048,
)

# Wrap the model with LoRA adapters on the attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    random_state=3407,
    use_gradient_checkpointing="unsloth",
)

def format_data(examples):
    """Append the tokenizer's EOS token to every entry of a batch's ``text`` column.

    Reads the module-level ``tokenizer``. ``examples`` is a batch mapping whose
    ``"text"`` value is a list of strings; the result has the same shape.
    """
    with_eos = []
    for sample in examples["text"]:
        with_eos.append(sample + tokenizer.eos_token)
    return {"text": with_eos}

# Apply format_data batch-wise; rebinding `dataset` means re-running this line
# appends another EOS token to every sample.
dataset = dataset.map(format_data, batched=True, desc="Formatting data")

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning on the "text" column, one sample per sequence (no packing).
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=2048,
    packing=False,
    args=TrainingArguments(
        output_dir="outputs",
        seed=3407,
        # Schedule: 60 optimizer steps, 5 of them warm-up, linear decay.
        max_steps=60,
        warmup_steps=5,
        lr_scheduler_type="linear",
        learning_rate=2e-4,
        # Batching: 2 samples per device, accumulated over 4 steps.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        # Optimizer.
        optim="adamw_8bit",
        weight_decay=0.01,
        # Precision: bf16 when the hardware supports it, fp16 otherwise.
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
        # Logging.
        logging_steps=1,
        report_to="wandb",
        run_name="Transtokenizers_run",
    ),
)


# Snapshot GPU state BEFORE training: the baseline has to be taken here,
# otherwise it equals the post-training peak and the training-only delta
# reported below is always zero.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)


trainer_stats = trainer.train()


# Peak reserved memory after training, and the share attributable to training.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"Training runtime: {trainer_stats.metrics['train_runtime']} seconds")
print(f"Training runtime in minutes: {round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes")
print(f"Peak reserved memory: {used_memory} GB")
print(f"Peak reserved memory for training: {used_memory_for_lora} GB")
print(f"Peak reserved memory % of max memory: {used_percentage} %")
print(f"Peak reserved memory for training % of max memory: {lora_percentage} %")


# Write the LoRA-wrapped model and the tokenizer side by side into ./lora_model.
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")


# Close the W&B run opened at the top of this cell.
wandb.finish()


In [None]:

import requests

def download_file(url, filename, timeout=30):
    """Download a text resource over HTTP and store it as UTF-8.

    Args:
        url: Address fetched with a GET request.
        filename: Destination path; overwritten if it already exists.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: The server did not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # When the server declares no charset, requests decodes text/* bodies as
    # ISO-8859-1, which would corrupt non-Latin text such as Bengali. The
    # corpora fetched here are expected to be UTF-8, so decode them as such.
    if 'charset' not in response.headers.get('Content-Type', '').lower():
        response.encoding = 'utf-8'
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(response.text)

# Bengali and English Bible texts from the BibleNLP eBible corpus repository.
ben_url = 'https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/ben-ben2017.txt'
eng_url = 'https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/eng-enggnv.txt'


# Store each file locally under the same name it has in the repository.
download_file(ben_url, 'ben-ben2017.txt')
download_file(eng_url, 'eng-enggnv.txt')



import sentencepiece as spm


# Settings for the English BPE tokenizer. The training call and the summary
# message below both read these variables, so they cannot drift apart.
corpus_file = 'eng-enggnv.txt'
model_prefix = 'english_bpe'
vocab_size = 32000

spm.SentencePieceTrainer.train(
    input=corpus_file,
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    model_type='bpe',
    character_coverage=0.9995,
    max_sentence_length=10000
)

print(f"Tokenizer trained and saved as {model_prefix}.model and {model_prefix}.vocab")





In [None]:
# Prepare the parallel corpus
def prepare_parallel_corpus(eng_file, ben_file, output_file):
    """Merge two line-aligned files into ``english ||| bengali`` lines.

    Line *k* of ``eng_file`` is paired with line *k* of ``ben_file``. Both
    sides are stripped, and a pair is written only when neither side is empty.

    Args:
        eng_file: UTF-8 file with the English side, one segment per line.
        ben_file: UTF-8 file with the Bengali side, one segment per line.
        output_file: Destination path; created or overwritten.

    Raises:
        AssertionError: The two inputs have different line counts. The check
            runs before ``output_file`` is opened, so an existing output is
            left untouched when the inputs do not match.
    """
    with open(eng_file, 'r', encoding='utf-8') as eng, open(ben_file, 'r', encoding='utf-8') as ben:
        eng_lines = eng.readlines()
        ben_lines = ben.readlines()

    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if len(eng_lines) != len(ben_lines):
        raise AssertionError("The English and Bengali files must have the same number of lines.")

    with open(output_file, 'w', encoding='utf-8') as out:
        for eng_line, ben_line in zip(eng_lines, ben_lines):
            eng_line = eng_line.strip()
            ben_line = ben_line.strip()
            if eng_line and ben_line:
                out.write(f"{eng_line} ||| {ben_line}\n")

# English goes on the left of "|||", Bengali on the right.
prepare_parallel_corpus('eng-enggnv.txt', 'ben-ben2017.txt', 'parallel_corpus.txt')


In [None]:
# Make the FastAlign binaries executable
import os

# Directory that holds the built FastAlign binaries.
fast_align_dir = 'fast_align/build'

# Add the execute bit to both tools; the exit statuses are not checked.
os.system('chmod +x {}/fast_align'.format(fast_align_dir))
os.system('chmod +x {}/atools'.format(fast_align_dir))


In [None]:
# Align the parallel corpus with FastAlign.
# Forward run over parallel_corpus.txt; output is redirected to forward.align.
!{fast_align_dir}/fast_align -i parallel_corpus.txt -d -o -v > forward.align

# Same options plus -r; output is redirected to reverse.align.
!{fast_align_dir}/fast_align -i parallel_corpus.txt -d -o -v -r > reverse.align

# Combine both runs with the grow-diag-final-and heuristic into bidirectional.align.
!{fast_align_dir}/atools -i forward.align -j reverse.align -c grow-diag-final-and > bidirectional.align


In [None]:
def read_alignment_file(filepath):
    """Return every line of a UTF-8 alignment file, line endings included."""
    with open(filepath, 'r', encoding='utf-8') as handle:
        return list(handle)

forward_alignments = read_alignment_file('forward.align')
reverse_alignments = read_alignment_file('reverse.align')
bidirectional_alignments = read_alignment_file('bidirectional.align')

# Preview the first five lines of each file: heading on one line, list on the next.
print("Forward Alignments ", forward_alignments[:5], sep="\n")
print("\nReverse Alignments ", reverse_alignments[:5], sep="\n")
print("\nBidirectional Alignments:", bidirectional_alignments[:5], sep="\n")


In [None]:
from collections import defaultdict
import sentencepiece as spm


def read_alignment_file(filepath):
    """Read a UTF-8 alignment file and return its lines, line endings included."""
    # NOTE(review): the alignment preview cell earlier in the notebook defines
    # the same function; keep a single definition once the cells are consolidated.
    with open(filepath, 'r', encoding='utf-8') as source:
        rows = [row for row in source]
    return rows

# Load all three alignment files; only the bidirectional one is used below.
forward_alignments = read_alignment_file('forward.align')
reverse_alignments = read_alignment_file('reverse.align')
bidirectional_alignments = read_alignment_file('bidirectional.align')

def create_token_mapping(alignments, sentence_pairs=None):
    """Collect source-to-target links from alignment lines of ``i-j`` pairs.

    Every alignment line is a whitespace-separated list of ``i-j`` pairs,
    where ``i`` indexes the source (English) side and ``j`` the target
    (Bengali) side.

    NOTE(review): in FastAlign output ``i`` and ``j`` are presumably word
    positions inside that line's own sentence pair — confirm. If so, the
    position-only mapping mixes unrelated words from different sentences;
    pass ``sentence_pairs`` to obtain a word-level mapping instead.

    Args:
        alignments: Iterable of alignment lines.
        sentence_pairs: Optional iterable of ``(source_sentence,
            target_sentence)`` strings, parallel to ``alignments``. When
            given, each index is resolved to the whitespace-separated word at
            that position and the mapping is keyed by source word.

    Returns:
        ``defaultdict(list)``. Without ``sentence_pairs``: source position ->
        list of target positions, accumulated over all lines (duplicates
        kept). With ``sentence_pairs``: source word -> list of target words.

    Raises:
        ValueError: A pair is not of the form ``int-int``.
        IndexError: An index lies outside its sentence (word-level mode only).
    """
    token_map = defaultdict(list)

    if sentence_pairs is None:
        # Original behaviour: pool raw positions across all lines.
        for line in alignments:
            for part in line.split():
                en_idx, bn_idx = part.split('-')
                token_map[int(en_idx)].append(int(bn_idx))
        return token_map

    for line, (source_sentence, target_sentence) in zip(alignments, sentence_pairs):
        source_words = source_sentence.split()
        target_words = target_sentence.split()
        for part in line.split():
            en_idx, bn_idx = part.split('-')
            token_map[source_words[int(en_idx)]].append(target_words[int(bn_idx)])
    return token_map


# Build the mapping from the bidirectional alignments and print all of it.
token_map = create_token_mapping(bidirectional_alignments)

print(token_map)


In [None]:
# FIXME: still failing to create a smooth token mapping