In [1]:
from datasets import load_dataset

# Make sure you have logged in using:
# huggingface-cli login
# to access this dataset if it requires authentication.

# Load the `thenlpresearcher/iitb_eng_mar_dataset` repo from the Hugging Face Hub.
# No config name is passed, so whatever configuration the repo resolves to by
# default is used; the result is bound to `ds`.
ds = load_dataset("thenlpresearcher/iitb_eng_mar_dataset")

# Inspect available splits (printing `ds` lists each split with its features and row count)
print(ds)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 237175
    })
})


In [2]:
# Keep a handle on the `train` split; later cells read it through `mar_ds`.
mar_ds = ds["train"]

# Sanity check: column names first, then the first row, one per line.
print(mar_ds.column_names, mar_ds[0], sep="\n")

['src_lang', 'tgt_lang', 'src', 'tgt']
{'src_lang': 'eng_Latn', 'tgt_lang': 'mar_Deva', 'src': 'For example, let’s rename Sheet 2 as “Dump”.', 'tgt': 'उदाहरणार्थ Sheet 2 चे नाव बदलून “Dump” करू.'}


In [3]:
# Show the first row followed by the column names, one per line.
print(mar_ds[0], mar_ds.column_names, sep="\n")

{'src_lang': 'eng_Latn', 'tgt_lang': 'mar_Deva', 'src': 'For example, let’s rename Sheet 2 as “Dump”.', 'tgt': 'उदाहरणार्थ Sheet 2 चे नाव बदलून “Dump” करू.'}
['src_lang', 'tgt_lang', 'src', 'tgt']


In [4]:
import string
from datasets import Dataset
from tqdm import tqdm

# Deletion table for the characters in `string.punctuation`. Nothing active in
# this cell uses it; only the disabled variant described inside the loop would.
punct_table = str.maketrans(dict.fromkeys(string.punctuation))

all_rows = []

for ex in tqdm(mar_ds):
    # Copy the four columns of the current row unchanged.
    all_rows.append({field: ex[field] for field in ("src_lang", "tgt_lang", "src", "tgt")})
    # Disabled variant (was commented out here): append a second row per example
    # whose `src` is passed through `punct_table` while `tgt` stays as-is.
    # With it off, exactly one row is collected per input row.

# Materialise the collected rows as a new Dataset.
mar_ds_expanded = Dataset.from_list(all_rows)

# Show the first two rows and the total row count.
print(mar_ds_expanded[0], mar_ds_expanded[1], sep="\n")
print("Total rows:", len(mar_ds_expanded))

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 237175/237175 [00:11<00:00, 20618.27it/s]


{'src_lang': 'eng_Latn', 'tgt_lang': 'mar_Deva', 'src': 'For example, let’s rename Sheet 2 as “Dump”.', 'tgt': 'उदाहरणार्थ Sheet 2 चे नाव बदलून “Dump” करू.'}
{'src_lang': 'eng_Latn', 'tgt_lang': 'mar_Deva', 'src': 'The Options dialog-box appears.', 'tgt': 'Options चा डायलॉग बॉक्स उघडेल.'}
Total rows: 237175


In [7]:
# Log in to the Hugging Face Hub through the CLI. IPython expands `{hf_token}`
# to the value of the Python variable `hf_token` before running the shell command.
# NOTE(review): `hf_token` is not assigned in any cell visible here, so it must
# already exist in the kernel namespace — confirm it is defined (ideally read
# from the environment rather than typed into the notebook) before a fresh run.
!huggingface-cli login --token {hf_token}

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `llm_finetuning` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `llm_finetuning`


In [5]:
from huggingface_hub import HfApi, Repository
from datasets import DatasetDict

# Wrap the copied rows in a DatasetDict that holds one split, "marathi_punct".
ds_dict = DatasetDict()
ds_dict["marathi_punct"] = mar_ds_expanded

# Upload to the Hub with private=True; the call's return value is the cell output.
ds_dict.push_to_hub("thenlpresearcher/iitb_marathi_orig_punct", private=True)

Uploading the dataset shards:   0%|                                                                                      | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|                                                                              | 0/238 [00:00<?, ?ba/s][A
Creating parquet from Arrow format:  29%|████████████████████                                                | 70/238 [00:00<00:00, 691.75ba/s][A
Creating parquet from Arrow format:  59%|███████████████████████████████████████▍                           | 140/238 [00:00<00:00, 549.83ba/s][A
Creating parquet from Arrow format: 100%|███████████████████████████████████████████████████████████████████| 238/238 [00:00<00:00, 485.95ba/s][A
Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.
Uploading the dataset shards: 100%|██████████████████████████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.98s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/thenlpresearcher/iitb_marathi_orig_punct/commit/9cec70dc89be3e47b61ab30d4006fb7fc5968813', commit_message='Upload dataset', commit_description='', oid='9cec70dc89be3e47b61ab30d4006fb7fc5968813', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/thenlpresearcher/iitb_marathi_orig_punct', endpoint='https://huggingface.co', repo_type='dataset', repo_id='thenlpresearcher/iitb_marathi_orig_punct'), pr_revision=None, pr_num=None)

### Create train, validation, and test splits

In [6]:
# Log in to the Hugging Face Hub through the CLI before reloading and re-uploading
# the dataset. IPython expands `{hf_token}` to the value of the Python variable
# `hf_token` before running the shell command.
# NOTE(review): `hf_token` is not assigned in any cell visible here — confirm it
# is defined in the kernel before this cell runs.
!huggingface-cli login --token {hf_token}

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `llm_finetuning` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `llm_finetuning`


In [7]:
import datasets
from datasets import load_dataset


# Load the published repo back from the Hub. Note that `mar_ds` is rebound here:
# it now refers to the whole loaded object (indexed by split name below), not to
# a single split as in the earlier cells.
mar_ds = load_dataset("thenlpresearcher/iitb_marathi_orig_punct")

Downloading readme: 100%|█████████████████████████████████████████████████████████████████████████████████████| 408/408 [00:00<00:00, 1.90kB/s]
Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████| 38.5M/38.5M [00:02<00:00, 14.8MB/s]
Generating marathi_punct split: 100%|███████████████████████████████████████████████████████| 237175/237175 [00:00<00:00, 262938.54 examples/s]


In [8]:
# Display `mar_ds` as the cell output to see which splits and columns it contains.
mar_ds

DatasetDict({
    marathi_punct: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 237175
    })
})

In [9]:
from datasets import DatasetDict

# Take the "marathi_punct" split of `mar_ds` and shuffle it with a fixed seed.
dataset = mar_ds['marathi_punct'].shuffle(seed=42)

# Split ratios. Only the train and validation fractions enter the size arithmetic
# below; the test split simply receives the rows that remain.
train_frac, val_frac, test_frac = 0.8, 0.1, 0.1

# Row counts: int() truncates, and the remainder goes to test so every row is used.
total = len(dataset)
train_size = int(train_frac * total)
val_size = int(val_frac * total)
test_size = total - (train_size + val_size)

# Slice the shuffled rows into three contiguous index ranges:
# [0, train_size), [train_size, val_end), [val_end, total).
val_end = train_size + val_size
train_dataset, val_dataset, test_dataset = (
    dataset.select(range(lo, hi))
    for lo, hi in ((0, train_size), (train_size, val_end), (val_end, total))
)

# Bundle the three pieces under the conventional split names.
split_dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})

print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 189740
    })
    validation: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 23717
    })
    test: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 23718
    })
})


In [10]:
# Upload the train/validation/test DatasetDict to the Hub with private=True.
# NOTE(review): this is the same repo id that an earlier cell loads and reads the
# "marathi_punct" split from — confirm whether this upload replaces that split,
# since a fresh top-to-bottom re-run of the load/split cells would then break.
split_dataset.push_to_hub("thenlpresearcher/iitb_marathi_orig_punct", private=True)

Uploading the dataset shards:   0%|                                                                                      | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|                                                                              | 0/190 [00:00<?, ?ba/s][A
Creating parquet from Arrow format:   1%|▋                                                                     | 2/190 [00:00<00:13, 13.65ba/s][A
Creating parquet from Arrow format:   2%|█▍                                                                    | 4/190 [00:00<00:11, 16.43ba/s][A
Creating parquet from Arrow format:   3%|██▏                                                                   | 6/190 [00:00<00:10, 17.79ba/s][A
Creating parquet from Arrow format:   4%|██▉                                                                   | 8/190 [00:00<00:09, 18.49ba/s][A
Creating parquet from Arrow format:   6%|███▉                                                                 | 11/190 [0

CommitInfo(commit_url='https://huggingface.co/datasets/thenlpresearcher/iitb_marathi_orig_punct/commit/b5bbb821630b649be7b7598947ee52057b2e0b16', commit_message='Upload dataset', commit_description='', oid='b5bbb821630b649be7b7598947ee52057b2e0b16', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/thenlpresearcher/iitb_marathi_orig_punct', endpoint='https://huggingface.co', repo_type='dataset', repo_id='thenlpresearcher/iitb_marathi_orig_punct'), pr_revision=None, pr_num=None)