In [None]:
import sentencepiece as spm

# Create a SentencePiece processor and load the model file `bangla_char.model`
# (relative path, so it is resolved against the notebook's working directory).
sp = spm.SentencePieceProcessor()
sp.Load('bangla_char.model')


In [None]:
# Smoke test: encode one Bengali sentence into token ids with the processor
# loaded above and print the resulting id list.
text = "আমার বাংলা ভাষা আমার গর্ব"
tokens = sp.encode_as_ids(text)
print(tokens)


In [None]:
# Interactive Hugging Face Hub login from inside the notebook.
# NOTE(review): presumably required to download the Llama checkpoint loaded in
# the training cell — confirm that the account has access to it.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
import os

import wandb

# Login to Weights & Biases.
# The API key is read from the WANDB_API_KEY environment variable instead of
# being written into the notebook, so the secret never ends up in version
# control or in a shared copy of this file. When the variable is unset,
# `key=None` lets wandb fall back to its own credential lookup / prompt.
wandb.login(key=os.environ.get("WANDB_API_KEY"))


In [None]:
import os
# Configure PyTorch's CUDA allocator through its environment variable.
# This cell runs before the cell that imports torch, so the setting is already
# in the process environment when torch starts.
# NOTE(review): `max_split_size_mb:128` is presumably meant to reduce memory
# fragmentation on the GPU — confirm the value against the target hardware.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"


In [None]:
import wandb
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments
import sentencepiece as spm
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import DataCollatorForLanguageModeling
from custom_tokenizer import CustomBengaliTokenizer 

# Start a Weights & Biases run in the "Transtokenizers" project.
wandb.init(project="Transtokenizers")

# Build the project's custom Bengali tokenizer from the SentencePiece vocab and
# model files (both paths are relative to the working directory).
tokenizer = CustomBengaliTokenizer(vocab_file="bangla_char.vocab", model_file="bangla_char.model")

def tokenize_text_file(file_path, tokenizer):
    """Encode every non-blank line of a UTF-8 text file.

    Args:
        file_path: Path of the text file to read.
        tokenizer: Any object exposing ``encode(str)``; it is called once per
            kept line with surrounding whitespace already stripped.

    Returns:
        A list with one ``tokenizer.encode(...)`` result per non-blank line,
        in file order. Lines that are empty after stripping are skipped.
    """
    encoded_lines = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue
            encoded_lines.append(tokenizer.encode(stripped))
    return encoded_lines

# Tokenize the corpus once up front; `tokenized_data` holds one encoded
# sequence per non-blank line of the file.
file_path = 'final_cleaned_bangla_corpus.txt'
tokenized_data = tokenize_text_file(file_path, tokenizer)

class CustomDataset(Dataset):
    """Map-style dataset over pre-tokenized sequences.

    Each item is a dict with ``input_ids`` (a ``torch.long`` tensor built from
    the stored sequence) and ``labels`` (an independent copy of that tensor).
    """

    def __init__(self, tokenized_data):
        # Sequence of token-id lists; kept by reference, not copied.
        self.tokenized_data = tokenized_data

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        """Return the ``idx``-th sequence as tensors for causal-LM training."""
        ids = torch.tensor(self.tokenized_data[idx], dtype=torch.long)
        # Clone so that later in-place edits to labels cannot alias input_ids.
        targets = ids.clone()
        return dict(input_ids=ids, labels=targets)

dataset = CustomDataset(tokenized_data)

# Hold out a small validation split. The training arguments below request an
# evaluation every `eval_steps`, which the Trainer can only perform when it is
# given an `eval_dataset`; the original script passed none.
if len(dataset) < 2:
    raise ValueError("Need at least 2 tokenized lines to build train/eval splits.")
eval_size = max(1, int(0.05 * len(dataset)))
train_size = len(dataset) - eval_size
train_dataset, eval_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, eval_size],
    generator=torch.Generator().manual_seed(42),  # fixed seed -> reproducible split
)

# Not consumed by the Trainer below (it is handed the dataset and collator and
# builds its own loader); kept so that other cells referring to it still work.
train_dataloader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)

# Base checkpoint to fine-tune. The original id ended with a stray backtick,
# which is not a valid Hub repository name.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B")

# mlm=False selects causal-LM collation (no masked-token objective).
# NOTE(review): assumes CustomBengaliTokenizer provides the padding interface
# the collator needs to batch variable-length sequences — confirm.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10_000,
    save_total_limit=2,
    gradient_accumulation_steps=4,  # effective batch per device = 8 * 4
    evaluation_strategy="steps",
    eval_steps=1000,
    fp16=True,
    dataloader_num_workers=4,
    report_to="wandb",
    run_name="Transtokenizers_run"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator
)

trainer.train()

# Close the Weights & Biases run started earlier in this cell.
wandb.finish()
