In [9]:
import os
import torch

In [10]:
!pip install pandas==1.5.3



In [11]:
def save_combined_text(dir_path, output_file):
    combined_text = ""
    for file_name in os.listdir(dir_path):
        if file_name.endswith(".txt"):
            with open(os.path.join(dir_path, file_name), 'r', encoding='utf-8') as file:
                file_content = file.read()
                combined_text += file_content + "<|endoftext|>" + "\n"
    with open(output_file, 'w', encoding='utf-8') as output:
        output.write(combined_text)
    print("Combined text saved to:", output_file)


In [12]:
dir_path_train = "/kaggle/input/bl-splitted2/train"
output_file_train = "/kaggle/working/bengali_literature_train_gpt2.txt"
save_combined_text(dir_path_train, output_file_train)

dir_path_valid = "/kaggle/input/bl-splitted2/valid"
output_file_valid = "/kaggle/working/bengali_literature_valid_gpt2.txt"
save_combined_text(dir_path_valid, output_file_valid)

Combined text saved to: /kaggle/working/bengali_literature_train_gpt2.txt
Combined text saved to: /kaggle/working/bengali_literature_valid_gpt2.txt


In [13]:
from transformers import AutoTokenizer, AutoModelForCausalLM
gpt2_bengali_tokenizer = AutoTokenizer.from_pretrained("flax-community/gpt2-bengali")

In [14]:
print(len(gpt2_bengali_tokenizer.get_vocab()))
gpt2_bengali_tokenizer

50256


GPT2TokenizerFast(name_or_path='flax-community/gpt2-bengali', vocab_size=50256, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [15]:
# new tokenizer
# from tokenizers import (decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer)
# from transformers import GPT2Tokenizer, GPT2TokenizerFast, GPT2Model, GPT2LMHeadModel
# from transformers import TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# new_tokenizer = Tokenizer(models.BPE())
# new_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
# trainer = trainers.BpeTrainer(vocab_size=5000, special_tokens=["<|endoftext|>"])
# train_file = '/kaggle/working/bengali_literature_train_gpt2.txt'
# new_tokenizer.train([train_file], trainer=trainer)
# new_tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
# new_tokenizer.decoder = decoders.ByteLevel()

# new_tokenizer = GPT2TokenizerFast(tokenizer_object=new_tokenizer)
# new_tokenizer.save_pretrained("new_tokenizer_gpt2")
# new_tokenizer

# # gpt2 tokenizer
# gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# print(len(gpt2_tokenizer.get_vocab()))
# gpt2_tokenizer

# # merge the vocabulary for the extended tokenizer
# vocab_tokens = list(new_tokenizer.get_vocab())
# decoded_tokens = [new_tokenizer.decoder.decode([token]) for token in vocab_tokens]
# print(len(vocab_tokens), len(decoded_tokens))
# gpt2_tokenizer.add_tokens(decoded_tokens)
# gpt2_tokenizer.save_pretrained("extended_tokenizer_gpt2")
# print(len(gpt2_tokenizer.get_vocab()))
# gpt2_tokenizer

# # validate the changes
# text = "কফি, গান আর কাজ নিয়ে "
# gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# new_tokenizer = GPT2TokenizerFast.from_pretrained("new_tokenizer_gpt2")
# extended_tokenizer = GPT2TokenizerFast.from_pretrained("extended_tokenizer_gpt2")

# print(len(gpt2_tokenizer.encode(text)))
# print(gpt2_tokenizer.encode(text))
# print(len(new_tokenizer.encode(text)))
# print(new_tokenizer.encode(text))
# print(len(extended_tokenizer.encode(text)))
# print(extended_tokenizer.encode(text))

In [17]:
import logging
import time
from datasets import Dataset, DatasetDict

# Suppress the warning messages
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

# load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("flax-community/gpt2-bengali")
tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.vocab_size)
print(len(tokenizer))

50256
50256


In [18]:
# Fine-tuning data
from datasets import Dataset, DatasetDict

train_file = "/kaggle/working/bengali_literature_train_gpt2.txt"  # Replace with your training data file
valid_file = "/kaggle/working/bengali_literature_valid_gpt2.txt"  # Replace with your validation data file
# Read the text data from the .txt file
with open(train_file, 'r', encoding='utf-8') as f:
    train_data = f.readlines()
with open(valid_file, 'r', encoding='utf-8') as f:
    valid_data = f.readlines()

print(len(train_data), len(valid_data))

901371 9296


In [19]:
# Create a Dataset object from the text data
train_dataset = Dataset.from_dict({"text": train_data})
valid_dataset = Dataset.from_dict({"text": valid_data})

def preprocess_function(examples):
    out = tokenizer(examples["text"])
    # out = tokenizer([" ".join(x) for x in examples["text"]])
    return out

In [20]:
train_dataset

Dataset({
    features: ['text'],
    num_rows: 901371
})

In [21]:
# Apply tokenization to the dataset in batches using map
train_dataset = train_dataset.map(preprocess_function, batched=True, num_proc=4, remove_columns=train_dataset.column_names)
time.sleep(5)
valid_dataset = valid_dataset.map(preprocess_function, batched=True, num_proc=4, remove_columns=valid_dataset.column_names)
tokenized_datasets = DatasetDict({"train": train_dataset, "valid":valid_dataset})


     

#0:   0%|          | 0/226 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/226 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/226 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/226 [00:00<?, ?ba/s]

       

#0:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/3 [00:00<?, ?ba/s]

#1:   0%|          | 0/3 [00:00<?, ?ba/s]

#3:   0%|          | 0/3 [00:00<?, ?ba/s]

In [22]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 901371
})

In [23]:
# Group the tokenized datasets into blocks 
block_size = 128

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy() # First note that we duplicate the inputs for our labels. This is because the model of the 🤗 Transformers library apply the shifting to the right, so we don't need to do it manually.
    return result

In [24]:
# Group the tokenized train dataset
lm_train_dataset = tokenized_datasets['train'].map(group_texts, batched=True, num_proc=4)
time.sleep(2)
# Group the tokenized valid dataset
lm_valid_dataset = tokenized_datasets['valid'].map(group_texts, batched=True, num_proc=4)

lm_dataset = DatasetDict({"train": lm_train_dataset, "valid":lm_valid_dataset})
lm_dataset

     

#0:   0%|          | 0/226 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/226 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/226 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/226 [00:00<?, ?ba/s]

      

#0:   0%|          | 0/3 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/3 [00:00<?, ?ba/s]

#2:   0%|          | 0/3 [00:00<?, ?ba/s]

#3:   0%|          | 0/3 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 373904
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3488
    })
})

In [None]:
lm_dataset

In [26]:
from transformers import DataCollatorForLanguageModeling



In [27]:
# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

out = data_collator([lm_dataset['train'][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


In [28]:
lm_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 373904
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3488
    })
})

In [29]:
# Option 1: Load the GPT-2 base model and resize its embeddings to match the vocabulary size of our custom tokenizer. 
from transformers import GPT2LMHeadModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the GPT-2 base model
model = GPT2LMHeadModel.from_pretrained("flax-community/gpt2-bengali").to(device)

# Resize the model's embeddings to match the vocabulary size of our tokenizer
# def find_multiple(n: int, k: int) -> int:
#     if n % k == 0:
#         return n
#     return n + k - (n % k)

# new_embeddings_size = find_multiple(len(tokenizer), 64)
# model.resize_token_embeddings(new_embeddings_size)

model.safetensors:   0%|          | 0.00/510M [00:00<?, ?B/s]

In [30]:
# # Option 2: freeze certain layers of the model (e.g., except embeddings)
freeze_layers = False

if freeze_layers:
    for name, param in model.named_parameters():
        if 'transformer.wte' in name:
            param.requires_grad = True
        else:
            param.requires_grad = False

In [31]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="custom_bengali_gpt2Bengali/",
    overwrite_output_dir=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=300,
    logging_steps=300,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    weight_decay=0.1,
    warmup_steps=300,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=300,
    push_to_hub=False,
    save_total_limit=2
)

In [32]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["valid"],
)

In [33]:
# Start training
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss,Validation Loss
300,1.8837,2.056279
600,1.7205,2.037768
900,1.6635,2.029567
1200,1.6241,2.032441
1500,1.5906,2.037302
1800,1.533,2.040788
2100,1.5157,2.039753
2400,1.4998,2.041039
2700,1.4858,2.044991
3000,1.4623,2.05998




TrainOutput(global_step=4380, training_loss=1.541053486305829, metrics={'train_runtime': 23339.9904, 'train_samples_per_second': 48.06, 'train_steps_per_second': 0.188, 'total_flos': 7.3239111401472e+16, 'train_loss': 1.541053486305829, 'epoch': 3.0})

In [34]:
# Save the fine-tuned model
trainer.save_model("custom_bengali_gpt2Bengali")

In [37]:
# Load the fine-tuned GPT-2 model for inference
finetuned_model = GPT2LMHeadModel.from_pretrained("custom_bengali_gpt2Bengali").to(device)
finetuned_model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [43]:
# Input text for completion
input_text = "ছায়া হয়ে মিলায়ে যায়ে "
# Tokenize the input text
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
# Generate text completions
max_length = 512
output_text = finetuned_model.generate(input_ids, max_length=max_length, top_k=50, top_p=.90, do_sample=True, num_return_sequences=2)[0]
# Decode the generated token IDs to text
completed_text = tokenizer.decode(output_text, skip_special_tokens=True)

print("Input Text:", input_text)
# print("Completed Text:", completed_text)
for line in completed_text.splitlines():
    print(line)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input Text: ছায়া হয়ে মিলায়ে যায়ে 
ছায়া হয়ে মিলায়ে যায়ে যেথায় আকাশ আলোক ঢালে যেথায় আকাশের পানে চাহিয়া কাটিবে সন্ধে সুখ দুখ দেবার কথা গুলি।  হাসি খেলা দেখিবারে পাই নাই সুখ কিসের মাঝখানে মিছে হাসি খেলা দেখিবারে পাই। বিফল সুখের সাধ জাগে মনে, কে তারে সুখী করে? দূরে প্রবাসের বাতাসে বাঁশির সুর শিথিল সুরে ফিরিতেছে ফিরিতেছে ছুটিয়া। জীবনের পথে সেই ছিল ভাল কিছুদূরে রয়েছে হেথায় একাকী ফুলবনে, তার পরে সেইখানে হারানো ফুল ছুঁড়ে ফেলেছি সুদূরে। ফুলগুলির রূপের রূপালি রঙে, গুলির গুলির লালে গুলির গন্ধে সেই গোলাপের মায়ায়, হাতে হাতে তুলেছিলাম ভাল সায়াসি লাবণ্য। তোমারে ছুঁইয়েছিল মনে জাগার লাজে। তোমারে দিয়েছিল মনে আমার লাজে, সুর না মেখে ফুলের রঙে গাঁথায়ে পুরানো গোলাপ শ্রাঙ্গ


In [51]:
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer

# Load the saved model and tokenizer
model_path = "/kaggle/working/custom_bengali_gpt2Bengali"
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("flax-community/gpt2-bengali")

# Additional kwargs for text generation
generation_kwargs = {
    "max_length": max_length,
    "num_return_sequences": 1,
    "temperature": 0.8,  # Example additional parameter
    "top_k": 50,
    "top_p": 0.90,
    "do_sample": True,
    "num_return_sequences": 2
}

# Create a text generation pipeline with additional kwargs
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, **generation_kwargs)

# Example usage
input_text = "ছায়া হয়ে মিলায়ে যায়ে"
generated_text = text_generator(input_text)[0]['generated_text']

print("Input Text:", input_text)
print("Generated Text:", generated_text)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input Text: ছায়া হয়ে মিলায়ে যায়ে
Generated Text: ছায়া হয়ে মিলায়ে যায়ে দেখিতে পাই মাঝে মাঝে দুই তীরে দুই শিশির-নীরে বায়ুভরে মেঘের মায়াময় কানন-পথে বায়ুভরে স্নিগ্ধ সায়াহ্ন-ঘেরা দুটি মেঘের দুই বিন্দু মিলে একটি মায়াময় বাসর-ঘরে নাহি যেতে দেরি, কেবলি গান গেয়ে গেলেম সারারাতি।  যে গানের সুরে তোমার মন বেজেছিল, তোমার প্রাণ নাহি জানি, সেই গানের সুরে বেদনা গেঁথে স্মৃতির বন্ধন তোমারি চরণে ছিন্ন করিয়াছি প্রাণের মন। কাননের সুধাস্রোতে বায়ুর মৃদু বেগে স্মৃতির ঝংকারে আমার হৃদয় তোমারে বাজালো তীরে দুই স্রোতের মাঝে মিলিতে হবে বিশ্ব-হৃদয়। পুলক-রেণুরে # গাঁথিলে তোমার স্বর তোমার সাথে মিশে নিরন্তর চলার পথে পারাবারে কিবারে বারে তোমার চপল হাতে তাই বারে বারে মুকুলে কুহুরে মুক


In [52]:
!pip install gradio

Collecting gradio
  Obtaining dependency information for gradio from https://files.pythonhosted.org/packages/0d/b8/a21fcefdd25b0e7a0fea866d6bbef09c36764f277c4d65238e6b66dd6532/gradio-4.11.0-py3-none-any.whl.metadata
  Downloading gradio-4.11.0-py3-none-any.whl.metadata (17 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Obtaining dependency information for aiofiles<24.0,>=22.0 from https://files.pythonhosted.org/packages/c5/19/5af6804c4cc0fed83f47bff6e413a98a36618e7d40185cd36e69737f3b0e/aiofiles-23.2.1-py3-none-any.whl.metadata
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.3.1.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting gradio-client==0.7.3 (from gradio)
  Obtaining dependency information for gradio-client==0.7.3 from https://files.pythonhosted.org/packages/78/52/a96eada27a2f711464c4a8c85a6110d46e35034cd2108640980c1fa4e8bb/gradio_client-0.7.3-py3-none-any.whl.metadata
  Downl

In [53]:
generation_kwargs = {
    "max_length": max_length,
    "num_return_sequences": 1,
    "temperature": 0.8,  # Example additional parameter
    "top_k": 50,
    "top_p": 0.90,
    "do_sample": True,
    "num_return_sequences": 2
}

In [55]:
!pip install pydantic

Collecting pydantic-core==2.14.5 (from pydantic)
  Obtaining dependency information for pydantic-core==2.14.5 from https://files.pythonhosted.org/packages/7c/f5/3e59681bd53955da311a7f4efbb6315d01006e9d18b8a06b527a22d3d923/pydantic_core-2.14.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading pydantic_core-2.14.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.5 kB)
Downloading pydantic_core-2.14.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: pydantic-core
  Attempting uninstall: pydantic-core
    Found existing installation: pydantic_core 2.14.6
    Uninstalling pydantic_core-2.14.6:
      Successfully uninstalled pydantic_core-2.14.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are install

In [56]:
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer
import gradio as gr

model_path = "/kaggle/working/custom_bengali_gpt2Bengali"
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("flax-community/gpt2-bengali")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, **generation_kwargs)

demo = gr.Interface.from_pipeline(pipe)
demo.launch()

ImportError: cannot import name 'RootModel' from 'pydantic' (/opt/conda/lib/python3.10/site-packages/pydantic/__init__.cpython-310-x86_64-linux-gnu.so)

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
trainer.push_to_hub("Kaizu07/custom_bengali_gpt2Bengali")

NameError: name 'trainer' is not defined