## Installing packages

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
%pip install transformers accelerate huggingface-hub torch numpy pandas tensorflow transformers



## Importing and testing the mbert model

In [3]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Load model and tokenizer
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# Your Bengali text here
bengali_text = "আমি বাংলায় কথা বলি।"  # Example: "I speak Bengali"

# Set source language to Bengali
tokenizer.src_lang = "bn_IN"  # Bengali language code

# Encode the Bengali text
encoded_bn = tokenizer(bengali_text, return_tensors="pt")

# Generate translation with English as target
generated_tokens = model.generate(
    **encoded_bn,
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]  # Set target language to English
)

# Decode the translation
translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
print(f"Bengali: {bengali_text}")
print(f"English: {translation}")

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

Bengali: আমি বাংলায় কথা বলি।
English: I speak English.


## Defining a prompt template

In [4]:
def banglish_to_bengali(banglish_text):

    # # Load model and tokenizer
    # model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
    # tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

    # Set source language as English
    tokenizer.src_lang = "en_XX"

    # Define the prompt template
    prompt_template = """
                        Translate this Banglish text to Bengali: {banglish_text}
                        Just provide the result, nothing else.
                        Example:
                        Input:
                        Voucher diye ki free kena jabe?
                        Output: ভাউচার দিয়ে কি ফ্রি কেনা যাবে?

                        """

    # Format the prompt with the input text
    prompt = prompt_template.format(banglish_text=banglish_text)

    # Encode the prompt
    encoded_text = tokenizer(prompt, return_tensors="pt")

    # Generate Bengali text with Bengali as target language
    generated_tokens = model.generate(
        **encoded_text,
        forced_bos_token_id=tokenizer.lang_code_to_id["bn_IN"]
    )

    # Decode the Bengali text
    bengali_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    return bengali_text

# Example usage
banglish_text = "scroll kore 20/30 second er video pann nai???"
result = banglish_to_bengali(banglish_text)
print(f"Banglish: {banglish_text}")
print(f"Bengali: {result}")

Banglish: scroll kore 20/30 second er video pann nai???
Bengali: মাত ্ র রেকর ্ ড উৎপাদনের মাধ ্ যমে, অন ্ য কিছুই না । উদাহরণ: আড়াতাড়ি: ভর ্ যাফার diye ki free kena jabe? আড়াতাড়ি: ভাউচার দিয়ে কি zväč কেনা যাবে? NAME OF TRANSLATORS


## Dataset


In [5]:
import pandas as pd

df = pd.read_parquet("hf://datasets/SKNahin/bengali-transliteration-data/data/train-00000-of-00001.parquet")
print(len(df))

5006


In [6]:
df.head()

Unnamed: 0,bn,rm
0,স্ক্রোল করে ২০/৩০ সেকেন্ড এর ভিডিও পান নাই???,scroll kore 20/30 second er video pann nai???
1,ও গুলা টরেন্ট সাইট এ পাবেন,o gula Torrent site e paben
2,ভক্কর চক্কর পোস্ট একটা করলেই এপ্রুভড.… নিশ্চই ...,vokkor chokkor post akta korlei approved…. nis...
3,আমি টেস্ট করেই কোড দিছি…,ami test koreii code disi…
4,"এতো কষ্টের কি আছে সাকিবওয়াপ.টক,সাকিবওয়াপ.মল&এআ...","eto koster ki ache shakibwap.tk,shakibwap.ml&a..."


Cleaning the data

For cleaning, I have ensured all records are within fixed length and ensured no bengali letters in banglish and no english letter in bengali

In [7]:
import pandas as pd
import re
import json

# Step 1: Remove blank records
df.dropna(subset=['bn', 'rm'], inplace=True)
df = df[(df['bn'].str.strip() != '') & (df['rm'].str.strip() != '')]

# # Step 2: Normalize punctuation
# def normalize_punctuation(text):
#     return re.sub(r'[\W_]+', ' ', text).strip()

# df['bn'] = df['bn'].apply(normalize_punctuation)
# df['rm'] = df['rm'].apply(normalize_punctuation)

# Step 3: Remove invalid characters
# bn: Remove rows with English letters
# rm: Remove rows with Bengali letters
df = df[~df['bn'].str.contains(r'[a-zA-Z]', regex=True)]
df = df[~df['rm'].str.contains(r'[\u0980-\u09FF]', regex=True)]

# Step 4: Remove excessively long or short sentences
bn_lengths = df['bn'].str.len()
rm_lengths = df['rm'].str.len()
length_mean = (bn_lengths.mean() + rm_lengths.mean()) / 2
length_std = (bn_lengths.std() + rm_lengths.std()) / 2

min_length = max(1, length_mean - 2 * length_std)
max_length = length_mean + 2 * length_std

df = df[(bn_lengths >= min_length) & (bn_lengths <= max_length)]
df = df[(rm_lengths >= min_length) & (rm_lengths <= max_length)]

# Reset index after cleaning
df.reset_index(drop=True, inplace=True)


# Print dataset size
print(f"Total records: {len(df)}")

Total records: 4746


  df = df[(rm_lengths >= min_length) & (rm_lengths <= max_length)]


## Change the data format into proper question answer

In [8]:
import json

# Export dataset to JSONL format
data = []
for i in range(len(df)):
    input_text = f"Question: Convert this Banglish text into Bengali: {df['rm'].iloc[i]}?"
    output_text = f"Answer: {df['bn'].iloc[i]}"

    data.append({
        "instruction": "You are a bot who is expert in transliteration tasks. Transliterate the input Banglish text to Bengali.",
        "input": input_text,
        "output": output_text
    })

with open("data.jsonl", "w") as f:
    for item in data:
        f.write(json.dumps(item) + "\n")

# Display sample JSONL records
with open("data.jsonl", "r") as f:
    for i, line in enumerate(f):
        if i < 10:
            sample_data = json.loads(line)
            print(sample_data)

{'instruction': 'You are a bot who is expert in transliteration tasks. Transliterate the input Banglish text to Bengali.', 'input': 'Question: Convert this Banglish text into Bengali: scroll kore 20/30 second er video pann nai????', 'output': 'Answer: স্ক্রোল করে ২০/৩০ সেকেন্ড এর ভিডিও পান নাই???'}
{'instruction': 'You are a bot who is expert in transliteration tasks. Transliterate the input Banglish text to Bengali.', 'input': 'Question: Convert this Banglish text into Bengali: o gula Torrent site e paben?', 'output': 'Answer: ও গুলা টরেন্ট সাইট এ পাবেন'}
{'instruction': 'You are a bot who is expert in transliteration tasks. Transliterate the input Banglish text to Bengali.', 'input': 'Question: Convert this Banglish text into Bengali: vokkor chokkor post akta korlei approved…. nishchoi ghabla ache?', 'output': 'Answer: ভক্কর চক্কর পোস্ট একটা করলেই এপ্রুভড.… নিশ্চই  ঘাবলা আছে'}
{'instruction': 'You are a bot who is expert in transliteration tasks. Transliterate the input Banglish text

## Split the data into tokens

In [9]:
model_name = "facebook/mbart-large-50-many-to-many-mmt"

In [10]:
dataset_path = "data.jsonl"
use_hf = True

training_config = {
    "model": {
        "pretrained_name": model_name,
        "max_length" : 2048
    },
    "datasets": {
        "use_hf": use_hf,
        "path": dataset_path
    },
    "verbose": True
}

In [11]:
%pip install datasets


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [12]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("json", data_files="data.jsonl")

# Split the dataset into train and test (80:20)
train_test_split = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# print(train_dataset["input"])
# print(train_dataset["output"])
print(train_dataset)
print(test_dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 3796
})
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 950
})


In [13]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name, max_length=128)


Delete non type objects

In [14]:
def clean_dataset(dataset):
    # Remove or replace examples with None values
    return dataset.filter(
        lambda example: example["instruction"] is not None and
                        example["input"] is not None and
                        example["output"] is not None
    )

cleaned_train = clean_dataset(train_dataset)
cleaned_test = clean_dataset(test_dataset)

print(cleaned_train)
print(cleaned_test)

# print the data

for i in range(10):
    print(cleaned_train[i])
    print(cleaned_test[i])

# print the data type
print(type(cleaned_train))
print(type(cleaned_test))


Filter:   0%|          | 0/3796 [00:00<?, ? examples/s]

Filter:   0%|          | 0/950 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 3796
})
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 950
})
{'instruction': 'You are a bot who is expert in transliteration tasks. Transliterate the input Banglish text to Bengali.', 'input': 'Question: Convert this Banglish text into Bengali: dekhi try kore?', 'output': 'Answer: দেখি ট্রাই করে'}
{'instruction': 'You are a bot who is expert in transliteration tasks. Transliterate the input Banglish text to Bengali.', 'input': 'Question: Convert this Banglish text into Bengali: 25+1 usd manea ki????', 'output': 'Answer: ২৫+১ ইউএসডি মানে কি??? '}
{'instruction': 'You are a bot who is expert in transliteration tasks. Transliterate the input Banglish text to Bengali.', 'input': 'Question: Convert this Banglish text into Bengali: hahahaha?', 'output': 'Answer: হাহাহাহা'}
{'instruction': 'You are a bot who is expert in transliteration tasks. Transliterate the input Banglish text to Bengali.

In [15]:
def tokenize_function(examples, max_length=512):
    """
    Tokenize examples for fine-tuning with instruction, input, and output format.
    """
    # Format prompts by combining instruction and input
    prompts = []
    for instr, inp in zip(examples['instruction'], examples['input']):
        # Handle None values
        instruction = str(instr) if instr is not None else ""
        input_text = str(inp) if inp is not None else ""

        if input_text.strip():
            prompt = f"Instruction: {instruction}\nInput: {input_text}\nOutput:"
        else:
            prompt = f"Instruction: {instruction}\nOutput:"
        prompts.append(prompt)

    # Prepare outputs
    targets = [str(output) if output is not None else "" for output in examples['output']]

    # Tokenize inputs
    model_inputs = tokenizer(
        prompts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors=None
    )

    # Tokenize targets
    labels = tokenizer(
        targets,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors=None
    )

    # Replace padding token id with -100 for loss calculation
    model_inputs["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in label]
        for label in labels["input_ids"]
    ]

    return model_inputs


# Apply tokenization
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3796 [00:00<?, ? examples/s]

Map:   0%|          | 0/950 [00:00<?, ? examples/s]

In [16]:
for i in range(1, 10):
  print (tokenized_train[i])

{'instruction': 'You are a bot who is expert in transliteration tasks. Transliterate the input Banglish text to Bengali.', 'input': 'Question: Convert this Banglish text into Bengali: hahahaha?', 'output': 'Answer: হাহাহাহা', 'input_ids': [250004, 72022, 10763, 12, 2583, 621, 10, 13820, 2750, 83, 26808, 23, 3900, 39798, 2320, 66211, 7, 5, 11062, 39798, 2182, 70, 107730, 26387, 15227, 7986, 47, 151303, 5, 360, 7077, 12, 68185, 12, 1657, 11549, 903, 26387, 15227, 7986, 3934, 151303, 12, 256, 71514, 32, 13538, 7077, 12, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

## Training Setup

In [17]:
max_steps = 30

In [18]:
trained_model_name = f"banglish_{max_steps}_steps"
output_dir = trained_model_name

The parameters were mostly kept as the conventional values used, it was not possible for me to come up with the optimal values within time, hence these values.

no of steps were kept 30 as I found it to be limiting value before GPU memory runs out.

In [19]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs
  num_train_epochs=10,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=20,

  # Batch size for training
  per_device_train_batch_size=1,

  # Directory to save model checkpoints
  output_dir=output_dir,

  # Other arguments
  overwrite_output_dir=False, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=120, # Number of update steps between two evaluations
  save_steps=120, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  evaluation_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False
)



In [20]:
import logging

# Create a logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)  # Set the desired logging level

# Create a handler (e.g., to output to console)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# Create a formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)

# Add the handler to the logger
logger.addHandler(ch)

switch to gpu if available

In [21]:
from transformers import AutoModelForCausalLM
from transformers import Trainer
import torch

base_model = base_model = AutoModelForCausalLM.from_pretrained(model_name)

device_count = torch.cuda.device_count()
if device_count > 0:
    logger.debug("Select GPU device")
    device = torch.device("cuda")
else:
    logger.debug("Select CPU device")
    device = torch.device("cpu")

base_model.to(device)

Some weights of MBartForCausalLM were not initialized from the model checkpoint at facebook/mbart-large-50-many-to-many-mmt and are newly initialized: ['lm_head.weight', 'model.decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2024-12-21 15:27:45,853 - __main__ - DEBUG - Select GPU device
DEBUG:__main__:Select GPU device


MBartForCausalLM(
  (model): MBartDecoderWrapper(
    (decoder): MBartDecoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartDecoderLayer(
          (self_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (encoder_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj)

In [22]:
model_flops = (
  base_model.floating_point_ops(
    {
       "input_ids": torch.zeros(
           (1, training_config["model"]["max_length"])
      )
    }
  )
  * training_args.gradient_accumulation_steps
)

print(base_model)
print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

MBartForCausalLM(
  (model): MBartDecoderWrapper(
    (decoder): MBartDecoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartDecoderLayer(
          (self_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (encoder_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj)

In [23]:
train_dataset = tokenized_train
test_dataset = tokenized_test

In [24]:
trainer = Trainer(
    model=base_model,
    # model_flops=model_flops,
    # total_steps=max_steps,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [25]:
training_output = trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss




In [26]:
save_dir = f'{output_dir}/final'

trainer.save_model(save_dir)
print("Saved model to:", save_dir)

Saved model to: banglish_30_steps/final


In [27]:
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)

In [28]:
finetuned_slightly_model.to(device)

MBartForCausalLM(
  (model): MBartDecoderWrapper(
    (decoder): MBartDecoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartDecoderLayer(
          (self_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (encoder_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj)

model.safetensors:   0%|          | 0.00/1.83G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shadabtanjeed/mbert-banglish-to-bangla/commit/97a1f9eb2848b2d3fb576917e8252394652cf277', commit_message='Upload tokenizer', commit_description='', oid='97a1f9eb2848b2d3fb576917e8252394652cf277', pr_url=None, repo_url=RepoUrl('https://huggingface.co/shadabtanjeed/mbert-banglish-to-bangla', endpoint='https://huggingface.co', repo_type='model', repo_id='shadabtanjeed/mbert-banglish-to-bangla'), pr_revision=None, pr_num=None)

In [32]:
from google.colab import files
files.download('/content/banglish_30_steps')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [33]:
from google.colab import files
files.download('/content/data.jsonl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>