In [1]:
from datasets import load_dataset

# Make sure you have logged in using:
# huggingface-cli login
# to access this dataset if it requires authentication.

# Load the BPCC dataset with the specific config
ds = load_dataset("thenlpresearcher/iitb_eng_mar_dataset")

# Inspect available splits
print(ds)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 237175
    })
})


In [2]:
# Check the first example from the training split
mar_ds = ds['train']

# Optional: quick check
print(mar_ds.column_names)
print(mar_ds[0])

['src_lang', 'tgt_lang', 'src', 'tgt']
{'src_lang': 'eng_Latn', 'tgt_lang': 'mar_Deva', 'src': 'For example, let’s rename Sheet 2 as “Dump”.', 'tgt': 'उदाहरणार्थ Sheet 2 चे नाव बदलून “Dump” करू.'}


In [3]:
print(mar_ds[0])

# Optional: see column names
print(mar_ds.column_names)

{'src_lang': 'eng_Latn', 'tgt_lang': 'mar_Deva', 'src': 'For example, let’s rename Sheet 2 as “Dump”.', 'tgt': 'उदाहरणार्थ Sheet 2 चे नाव बदलून “Dump” करू.'}
['src_lang', 'tgt_lang', 'src', 'tgt']


In [4]:
import string
from datasets import Dataset
from tqdm import tqdm

punct_table = str.maketrans("", "", string.punctuation)

all_rows = []

for ex in tqdm(mar_ds):
    # original
#     all_rows.append({
#         "src_lang": ex["src_lang"],
#         "tgt_lang": ex["tgt_lang"],
#         "src": ex["src"],
#         "tgt": ex["tgt"]
#     })
    # punctuation removed
    all_rows.append({
        "src_lang": ex["src_lang"],
        "tgt_lang": ex["tgt_lang"],
        "src": ex["src"].translate(punct_table),
        "tgt": ex["tgt"]
    })

# Create a new Dataset
mar_ds_expanded = Dataset.from_list(all_rows)

# Inspect
print(mar_ds_expanded[0])
print(mar_ds_expanded[1])
print("Total rows:", len(mar_ds_expanded))

100%|████████████████████████████| 237175/237175 [00:08<00:00, 29177.01it/s]


{'src_lang': 'eng_Latn', 'tgt_lang': 'mar_Deva', 'src': 'For example let’s rename Sheet 2 as “Dump”', 'tgt': 'उदाहरणार्थ Sheet 2 चे नाव बदलून “Dump” करू.'}
{'src_lang': 'eng_Latn', 'tgt_lang': 'mar_Deva', 'src': 'The Options dialogbox appears', 'tgt': 'Options चा डायलॉग बॉक्स उघडेल.'}
Total rows: 237175


In [5]:
!huggingface-cli login --token {hf_token}

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `llm_finetuning` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `llm_finetuning`


In [6]:
from huggingface_hub import HfApi, Repository
from datasets import DatasetDict

# If you have multiple splits, wrap in DatasetDict
ds_dict = DatasetDict({
    "marathi_punct": mar_ds_expanded
})

# Push to HF
ds_dict.push_to_hub("thenlpresearcher/iitb_marathi_without_punct", private=True)

Uploading the dataset shards:   0%|                   | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|           | 0/238 [00:00<?, ?ba/s][A
Creating parquet from Arrow format:  33%|▎| 79/238 [00:00<00:00, 786.13ba/s][A
Creating parquet from Arrow format:  66%|▋| 158/238 [00:00<00:00, 706.45ba/s[A
Creating parquet from Arrow format: 100%|█| 238/238 [00:00<00:00, 665.17ba/s[A
Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.
Uploading the dataset shards: 100%|███████████| 1/1 [00:06<00:00,  6.54s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/thenlpresearcher/iitb_marathi_without_punct/commit/578ca0285552f06699692843ca0efa4d8f8286ac', commit_message='Upload dataset', commit_description='', oid='578ca0285552f06699692843ca0efa4d8f8286ac', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/thenlpresearcher/iitb_marathi_without_punct', endpoint='https://huggingface.co', repo_type='dataset', repo_id='thenlpresearcher/iitb_marathi_without_punct'), pr_revision=None, pr_num=None)

### Made validation and test sets

In [7]:
!huggingface-cli login --token {hf_token}

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `llm_finetuning` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `llm_finetuning`


In [8]:
import datasets
from datasets import load_dataset


mar_ds = datasets.load_dataset("thenlpresearcher/iitb_marathi_without_punct")

Downloading readme: 100%|██████████████████| 408/408 [00:00<00:00, 1.93kB/s]
Downloading data: 100%|████████████████| 38.0M/38.0M [00:01<00:00, 19.0MB/s]
Generating marathi_punct split: 100%|█| 237175/237175 [00:00<00:00, 681080.9


In [9]:
mar_ds

DatasetDict({
    marathi_punct: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 237175
    })
})

In [10]:
from datasets import DatasetDict

# Assuming your DatasetDict is called `dataset_dict`
dataset = mar_ds['marathi_punct']

# Shuffle the dataset
dataset = dataset.shuffle(seed=42)

# Define split ratios
train_frac = 0.8
val_frac = 0.1
test_frac = 0.1

# Compute sizes
total = len(dataset)
train_size = int(train_frac * total)
val_size = int(val_frac * total)
test_size = total - train_size - val_size  # ensure all samples are used

# Split the dataset
train_dataset = dataset.select(range(0, train_size))
val_dataset = dataset.select(range(train_size, train_size + val_size))
test_dataset = dataset.select(range(train_size + val_size, total))

# Combine into a DatasetDict
split_dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 189740
    })
    validation: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 23717
    })
    test: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 23718
    })
})


In [11]:
split_dataset.push_to_hub(
    "thenlpresearcher/iitb_marathi_without_punct", 
    private=True
)

Uploading the dataset shards:   0%|                   | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|           | 0/190 [00:00<?, ?ba/s][A
Creating parquet from Arrow format:   2%|   | 3/190 [00:00<00:06, 27.44ba/s][A
Creating parquet from Arrow format:   4%|   | 7/190 [00:00<00:05, 31.79ba/s][A
Creating parquet from Arrow format:   6%|  | 11/190 [00:00<00:05, 34.74ba/s][A
Creating parquet from Arrow format:   8%|▏ | 15/190 [00:00<00:04, 36.26ba/s][A
Creating parquet from Arrow format:  10%|▏ | 19/190 [00:00<00:04, 37.16ba/s][A
Creating parquet from Arrow format:  12%|▏ | 23/190 [00:00<00:04, 37.71ba/s][A
Creating parquet from Arrow format:  14%|▎ | 27/190 [00:00<00:04, 38.18ba/s][A
Creating parquet from Arrow format:  16%|▎ | 31/190 [00:00<00:04, 38.48ba/s][A
Creating parquet from Arrow format:  18%|▎ | 35/190 [00:00<00:04, 38.69ba/s][A
Creating parquet from Arrow format:  21%|▍ | 39/190 [00:01<00:03, 38.81ba/s][A
Creating parquet from Arrow format:  23%|▍ 

CommitInfo(commit_url='https://huggingface.co/datasets/thenlpresearcher/iitb_marathi_without_punct/commit/05a1cdc591b6e4789f9cc196856541d8c95b4587', commit_message='Upload dataset', commit_description='', oid='05a1cdc591b6e4789f9cc196856541d8c95b4587', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/thenlpresearcher/iitb_marathi_without_punct', endpoint='https://huggingface.co', repo_type='dataset', repo_id='thenlpresearcher/iitb_marathi_without_punct'), pr_revision=None, pr_num=None)