In [1]:
!pip install datasets transformers



In [2]:
from datasets import load_dataset

# Download the transliteration corpus from the Hugging Face Hub.
# The progress output below reports a single `train` split of 5006 examples.
dataset = load_dataset("SKNahin/bengali-transliteration-data")

README.md:   0%|          | 0.00/300 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/333k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5006 [00:00<?, ? examples/s]

In [4]:
def convert_to_unicode(example):
    """Replace the ``'bn'`` field with a list of ``U+XXXX`` code-point labels.

    Every character of ``example['bn']`` is turned into one string made of
    ``U+`` followed by its code point in upper-case hexadecimal, zero-padded
    to at least four digits. The mapping is modified in place and returned.
    """
    code_points = []
    for ch in example['bn']:
        code_points.append("U+" + format(ord(ch), "04X"))
    example['bn'] = code_points
    return example

In [3]:
from transformers import MBartForConditionalGeneration, MBart50Tokenizer

# Checkpoint to fine-tune: the multilingual mBART-50 many-to-many model.
model_name = "facebook/mbart-large-50-many-to-many-mmt"

# Load the matching tokenizer and the pretrained seq2seq weights from the Hub
# (the download log below shows a ~2.44 GB model.safetensors file).
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [5]:
def preprocess_function(examples):
    """Tokenize a batch of romanized/Bengali sentence pairs for seq2seq training.

    Args:
        examples: A batch of dataset rows (as passed by ``Dataset.map`` with
            ``batched=True``) holding the romanized source text under ``"rm"``
            and the Bengali target text under ``"bn"``.

    Returns:
        The tokenizer output for the source text, with the tokenized target
        text stored under ``"labels"``.
    """
    # The romanized source is tagged with the English code as a stand-in
    # (there is no separate code chosen for Roman-script Bengali here);
    # the target side uses the Bengali code.
    tokenizer.src_lang = "en_XX"
    tokenizer.tgt_lang = "bn_IN"

    # `text_target=` tokenizes the labels in target-language mode within the
    # same call. It replaces the deprecated `as_target_tokenizer()` context
    # manager and fills the same `labels` field. `max_length`/`truncation`
    # apply to both the inputs and the targets.
    return tokenizer(
        examples["rm"],
        text_target=examples["bn"],
        max_length=128,
        truncation=True,
    )

In [6]:
# Hold out 10% of the examples for evaluation. A fixed seed makes the split,
# and therefore the reported validation loss, reproducible across re-runs.
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

In [7]:
# Tokenize both splits. With batched=True, preprocess_function receives
# batches of rows rather than one row at a time.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/4505 [00:00<?, ? examples/s]



Map:   0%|          | 0/501 [00:00<?, ? examples/s]

In [8]:
# Display the tokenized splits to confirm the added columns and the row counts.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['bn', 'rm', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4505
    })
    test: Dataset({
        features: ['bn', 'rm', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 501
    })
})

In [9]:
import torch

# Release cached, currently unused GPU memory held by PyTorch's allocator.
# Note: this runs once here, before training starts; it is not invoked after
# each epoch.
torch.cuda.empty_cache()

In [10]:
import os
# Turn off the Weights & Biases integration for the Trainer. The Trainer output
# further down warns that this variable is deprecated in favour of `report_to`.
os.environ["WANDB_DISABLED"] = "true"
# CUDA caching-allocator option, presumably set to reduce memory fragmentation.
# NOTE(review): `torch` is already imported by an earlier cell; confirm the
# setting is still picked up when it is assigned at this point.
os.environ['PYTORCH_CUDA_ALLOC_CONF']="expandable_segments:True"


In [11]:
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer

# Collator that pads each batch of tokenized examples for seq2seq training.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy`; confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    # 8 examples per step * 4 accumulation steps = 32 examples per device
    # for every optimizer update.
    gradient_accumulation_steps=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=500,
    # Keep only the most recent checkpoint on disk.
    save_total_limit=1,
    # Explicitly disable external experiment trackers (e.g. W&B). This is the
    # replacement named by the deprecation warning for WANDB_DISABLED.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [12]:
# Run fine-tuning with the Trainer configured in the previous cell.
# The recorded run below took roughly 2962 s for 700 optimizer steps.
trainer.train()



Epoch,Training Loss,Validation Loss
0,No log,1.682974
2,No log,0.889585
4,No log,0.737432
6,No log,0.678853
8,0.920100,0.666548
9,0.920100,0.664853


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameter

TrainOutput(global_step=700, training_loss=0.7090614863804409, metrics={'train_runtime': 2961.928, 'train_samples_per_second': 15.21, 'train_steps_per_second': 0.236, 'total_flos': 2977331548520448.0, 'train_loss': 0.7090614863804409, 'epoch': 9.929078014184396})

In [14]:
# Delete the Trainer's checkpoint directory (presumably the same location as
# output_dir="./results") to free disk space.
# NOTE(review): the execution counts show this cell was run (14) after the
# save cell below (13), even though it appears first in the notebook.
!rm -rf /kaggle/working/results

  pid, fd = os.forkpty()


In [13]:
# Save the fine-tuned model weights and configuration
trainer.save_model("./banglish-bangla-model")  

# Save the tokenizer into the same directory so the folder can be reloaded
# on its own with `from_pretrained`
tokenizer.save_pretrained("./banglish-bangla-model")

Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


('./banglish-bangla-model/tokenizer_config.json',
 './banglish-bangla-model/special_tokens_map.json',
 './banglish-bangla-model/sentencepiece.bpe.model',
 './banglish-bangla-model/added_tokens.json')

In [15]:
# Bundle the saved model directory into a single archive for download.
# As the listing below shows, entries are stored with the
# `kaggle/working/banglish-bangla-model/` path prefix.
!zip -r banglish-bangla-model.zip /kaggle/working/banglish-bangla-model

  adding: kaggle/working/banglish-bangla-model/ (stored 0%)
  adding: kaggle/working/banglish-bangla-model/generation_config.json (deflated 43%)
  adding: kaggle/working/banglish-bangla-model/tokenizer_config.json (deflated 92%)
  adding: kaggle/working/banglish-bangla-model/model.safetensors (deflated 7%)
  adding: kaggle/working/banglish-bangla-model/config.json (deflated 59%)
  adding: kaggle/working/banglish-bangla-model/sentencepiece.bpe.model (deflated 49%)
  adding: kaggle/working/banglish-bangla-model/special_tokens_map.json (deflated 61%)
  adding: kaggle/working/banglish-bangla-model/training_args.bin (deflated 51%)


In [16]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload the fine-tuned model and tokenizer from disk. This rebinds the
# `tokenizer` and `model` names used during training to the saved copies.
model_path = "/kaggle/working/banglish-bangla-model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [17]:
input_text = "Ajke amar mon valo nei"

# Use the same language setup as training: the romanized input is tagged with
# the English code.
tokenizer.src_lang = "en_XX"
inputs = tokenizer(input_text, return_tensors="pt")

# Generate output from the model. mBART-50 expects the target language to be
# supplied explicitly through `forced_bos_token_id`; without it the output
# language depends on whatever token the model happens to predict first.
outputs = model.generate(
    **inputs,
    forced_bos_token_id=tokenizer.convert_tokens_to_ids("bn_IN"),
)

# Decode the generated tokens to human-readable text
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated text:", output_text)


Generated text: আজকে আমার মনে ভালো নেই


In [18]:
from kaggle_secrets import UserSecretsClient
# Read the ngrok auth token from Kaggle's secret store so that it never
# appears in the notebook source or its saved output.
user_secrets = UserSecretsClient()
ngrok_token = user_secrets.get_secret("NGROK_AUTHTOKEN")

In [19]:
!pip install fastapi nest-asyncio pyngrok uvicorn

Collecting fastapi
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.2-py3-none-any.whl.metadata (8.4 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting starlette<0.42.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.41.3-py3-none-any.whl.metadata (6.0 kB)
Collecting h11>=0.8 (from uvicorn)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading fastapi-0.115.6-py3-none-any.whl (94 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.8/94.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.2-py3-none-any.whl (22 kB)
Downloading uvicorn-0.34.0-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m3.3

In [20]:
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from fastapi.responses import FileResponse
import os

app = FastAPI()

# middlewares
# The API is public and does not use cookies or other browser credentials, so
# credentials are disabled. Combining a wildcard origin with
# `allow_credentials=True` is not a valid CORS configuration and would let any
# site make credentialed requests.
app.add_middleware(
    CORSMiddleware, # https://fastapi.tiangolo.com/tutorial/cors/
    allow_origins=['*'], # wildcard to allow all, more here - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Access-Control-Allow-Origin
    allow_credentials=False, # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Access-Control-Allow-Credentials
    allow_methods=['*'], # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Access-Control-Allow-Methods
    allow_headers=['*'], # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Access-Control-Allow-Headers
)

# Request body schema for the transliteration endpoint.
class Text(BaseModel):
    # The text submitted by the client; it is passed to the tokenizer as-is.
    text: str

# Minimal endpoint that returns a fixed payload; useful for checking that the
# server is reachable.
@app.get('/')
async def root():
    return {'hello': 'world'}


@app.post("/banglish/")
async def generate_bangla(text: Text):
    """Transliterate romanized Bengali ("Banglish") text into Bengali script."""
    # NOTE(review): generation runs directly inside this coroutine, so the
    # event loop is blocked while a request is being processed.
    # Cap the input at the 128-token limit used during training so that a very
    # long request body cannot trigger an unbounded amount of work.
    inputs = tokenizer(
        text.text, return_tensors="pt", truncation=True, max_length=128
    )
    # Generate output from the model, forcing Bengali as the target language
    # (mBART-50 expects the target language via `forced_bos_token_id`).
    outputs = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids("bn_IN"),
    )
    # Decode the generated tokens to human-readable text
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"generated_text": output_text}

FILE_PATH = "/kaggle/working/banglish-bangla-model.zip"
@app.get("/download/")
async def download_file():
    """Send the zipped fine-tuned model as a file download.

    Raises:
        HTTPException: with status 404 when the archive is not present at
            ``FILE_PATH``. (Previously this case answered 200 with an error
            body, which clients could not distinguish from success by status.)
    """
    # Local import: only this handler needs it, and it keeps the cell's
    # import list untouched.
    from fastapi import HTTPException

    # isfile() also rejects a directory at that path, which FileResponse
    # could not serve.
    if not os.path.isfile(FILE_PATH):
        raise HTTPException(status_code=404, detail="File not found")
    return FileResponse(
        FILE_PATH,
        media_type='application/octet-stream',
        filename="banglish-bangla-model.zip",
    )

In [None]:
import nest_asyncio
from pyngrok import ngrok
import uvicorn

# specify a port
port = 8000

# Authenticate with ngrok and open a public tunnel to the local port
ngrok.set_auth_token(ngrok_token)
ngrok_tunnel = ngrok.connect(port)

# where we can visit our fastAPI app
print('Public URL:', ngrok_tunnel.public_url)

# Allow uvicorn to start its event loop inside the notebook's running loop
nest_asyncio.apply()

try:
    # finally run the app (blocks until the server stops or the cell is
    # interrupted)
    uvicorn.run(app, port=port)
finally:
    # Close the public tunnel so the endpoint is not left exposed after the
    # server is gone, and so that re-running this cell does not accumulate
    # open tunnels.
    ngrok.disconnect(ngrok_tunnel.public_url)

                                                                                                    

INFO:     Started server process [40]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


Public URL: https://0fbd-34-132-164-92.ngrok-free.app
INFO:     103.74.84.163:0 - "POST /banglish/ HTTP/1.1" 200 OK
INFO:     103.74.84.163:0 - "OPTIONS /banglish HTTP/1.1" 200 OK
INFO:     103.74.84.163:0 - "POST /banglish HTTP/1.1" 307 Temporary Redirect
INFO:     103.74.84.163:0 - "OPTIONS /banglish/ HTTP/1.1" 200 OK
INFO:     103.74.84.163:0 - "POST /banglish/ HTTP/1.1" 200 OK
INFO:     103.74.84.163:0 - "POST /banglish HTTP/1.1" 307 Temporary Redirect
INFO:     103.74.84.163:0 - "POST /banglish/ HTTP/1.1" 200 OK
INFO:     103.74.84.163:0 - "POST /banglish HTTP/1.1" 307 Temporary Redirect
INFO:     103.74.84.163:0 - "POST /banglish/ HTTP/1.1" 200 OK
INFO:     103.74.84.163:0 - "OPTIONS /banglish HTTP/1.1" 200 OK
INFO:     103.74.84.163:0 - "POST /banglish HTTP/1.1" 307 Temporary Redirect
INFO:     103.74.84.163:0 - "OPTIONS /banglish/ HTTP/1.1" 200 OK
INFO:     103.74.84.163:0 - "POST /banglish/ HTTP/1.1" 200 OK
INFO:     103.74.84.163:0 - "POST /banglish HTTP/1.1" 307 Temporary Re