In [1]:
from datasets import load_dataset

# Make sure you have logged in using:
# huggingface-cli login
# to access this dataset if it requires authentication.

# Load the BPCC dataset with the specific config
ds = load_dataset("thenlpresearcher/iitb_eng_mar_dataset")

# Inspect available splits
print(ds)

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|█████████████████████████████████████████████████████████████████████████████████████| 384/384 [00:00<00:00, 1.79kB/s]
Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████| 38.5M/38.5M [00:02<00:00, 19.3MB/s]
Generating train split: 100%|███████████████████████████████████████████████████████████████| 237175/237175 [00:00<00:00, 246532.57 examples/s]

DatasetDict({
    train: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 237175
    })
})





In [4]:
# Check the first example from the training split
mar_ds = ds['train']

# Optional: quick check
print(mar_ds.column_names)
print(mar_ds[0])

['src_lang', 'tgt_lang', 'src', 'tgt']
{'src_lang': 'eng_Latn', 'tgt_lang': 'mar_Deva', 'src': 'For example, let’s rename Sheet 2 as “Dump”.', 'tgt': 'उदाहरणार्थ Sheet 2 चे नाव बदलून “Dump” करू.'}


In [5]:
print(mar_ds[0])

# Optional: see column names
print(mar_ds.column_names)

{'src_lang': 'eng_Latn', 'tgt_lang': 'mar_Deva', 'src': 'For example, let’s rename Sheet 2 as “Dump”.', 'tgt': 'उदाहरणार्थ Sheet 2 चे नाव बदलून “Dump” करू.'}
['src_lang', 'tgt_lang', 'src', 'tgt']


In [6]:
import string
from datasets import Dataset
from tqdm import tqdm

punct_table = str.maketrans("", "", string.punctuation)

all_rows = []

for ex in tqdm(mar_ds):
    # original
    all_rows.append({
        "src_lang": ex["src_lang"],
        "tgt_lang": ex["tgt_lang"],
        "src": ex["src"],
        "tgt": ex["tgt"]
    })
    # punctuation removed
    all_rows.append({
        "src_lang": ex["src_lang"],
        "tgt_lang": ex["tgt_lang"],
        "src": ex["src"].translate(punct_table),
        "tgt": ex["tgt"]
    })

# Create a new Dataset
mar_ds_expanded = Dataset.from_list(all_rows)

# Inspect
print(mar_ds_expanded[0])
print(mar_ds_expanded[1])
print("Total rows:", len(mar_ds_expanded))

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 237175/237175 [00:12<00:00, 18450.07it/s]


{'src_lang': 'eng_Latn', 'tgt_lang': 'mar_Deva', 'src': 'For example, let’s rename Sheet 2 as “Dump”.', 'tgt': 'उदाहरणार्थ Sheet 2 चे नाव बदलून “Dump” करू.'}
{'src_lang': 'eng_Latn', 'tgt_lang': 'mar_Deva', 'src': 'For example let’s rename Sheet 2 as “Dump”', 'tgt': 'उदाहरणार्थ Sheet 2 चे नाव बदलून “Dump” करू.'}
Total rows: 474350


In [7]:
!huggingface-cli login --token {hf_token}

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `llm_finetuning` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `llm_finetuning`


In [8]:
from huggingface_hub import HfApi, Repository
from datasets import DatasetDict

# If you have multiple splits, wrap in DatasetDict
ds_dict = DatasetDict({
    "marathi_punct": mar_ds_expanded
})

# Push to HF
ds_dict.push_to_hub("thenlpresearcher/iitb_marathi_punct_variants", private=True)

Uploading the dataset shards:   0%|                                                                                      | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|                                                                              | 0/475 [00:00<?, ?ba/s][A
Creating parquet from Arrow format:  11%|███████▎                                                            | 51/475 [00:00<00:00, 505.98ba/s][A
Creating parquet from Arrow format:  21%|██████████████▍                                                    | 102/475 [00:00<00:00, 466.89ba/s][A
Creating parquet from Arrow format:  36%|███████████████████████▉                                           | 170/475 [00:00<00:00, 556.21ba/s][A
Creating parquet from Arrow format:  49%|████████████████████████████████▊                                  | 233/475 [00:00<00:00, 582.92ba/s][A
Creating parquet from Arrow format:  67%|████████████████████████████████████████████▉                      | 319/475 [00

CommitInfo(commit_url='https://huggingface.co/datasets/thenlpresearcher/iitb_marathi_punct_variants/commit/d594c32e21b1c7cbfee4073dad4fef5372fe935c', commit_message='Upload dataset', commit_description='', oid='d594c32e21b1c7cbfee4073dad4fef5372fe935c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/thenlpresearcher/iitb_marathi_punct_variants', endpoint='https://huggingface.co', repo_type='dataset', repo_id='thenlpresearcher/iitb_marathi_punct_variants'), pr_revision=None, pr_num=None)

### Made validation and test sets

In [9]:
!huggingface-cli login --token {hf_token}

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `llm_finetuning` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `llm_finetuning`


In [11]:
import datasets
from datasets import load_dataset


mar_ds = datasets.load_dataset("thenlpresearcher/iitb_marathi_punct_variants")

Downloading readme: 100%|█████████████████████████████████████████████████████████████████████████████████████| 410/410 [00:00<00:00, 1.96kB/s]
Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████| 43.7M/43.7M [00:02<00:00, 21.2MB/s]
Generating marathi_punct split: 100%|███████████████████████████████████████████████████████| 474350/474350 [00:01<00:00, 321976.08 examples/s]


In [12]:
mar_ds

DatasetDict({
    marathi_punct: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 474350
    })
})

In [13]:
from datasets import DatasetDict

# Assuming your DatasetDict is called `dataset_dict`
dataset = mar_ds['marathi_punct']

# Shuffle the dataset
dataset = dataset.shuffle(seed=42)

# Define split ratios
train_frac = 0.8
val_frac = 0.1
test_frac = 0.1

# Compute sizes
total = len(dataset)
train_size = int(train_frac * total)
val_size = int(val_frac * total)
test_size = total - train_size - val_size  # ensure all samples are used

# Split the dataset
train_dataset = dataset.select(range(0, train_size))
val_dataset = dataset.select(range(train_size, train_size + val_size))
test_dataset = dataset.select(range(train_size + val_size, total))

# Combine into a DatasetDict
split_dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 379480
    })
    validation: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 47435
    })
    test: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 47435
    })
})


In [14]:
split_dataset.push_to_hub(
    "thenlpresearcher/iitb_marathi_punct_variants", 
    private=True
)

Uploading the dataset shards:   0%|                                                                                      | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|                                                                              | 0/380 [00:00<?, ?ba/s][A
Creating parquet from Arrow format:   1%|▎                                                                     | 2/380 [00:00<00:26, 14.45ba/s][A
Creating parquet from Arrow format:   1%|▋                                                                     | 4/380 [00:00<00:21, 17.17ba/s][A
Creating parquet from Arrow format:   2%|█▎                                                                    | 7/380 [00:00<00:19, 18.97ba/s][A
Creating parquet from Arrow format:   2%|█▋                                                                    | 9/380 [00:00<00:24, 14.99ba/s][A
Creating parquet from Arrow format:   3%|█▉                                                                   | 11/380 [0

CommitInfo(commit_url='https://huggingface.co/datasets/thenlpresearcher/iitb_marathi_punct_variants/commit/9861dc6a91996a6e3f8f8d677810153b5bccf66b', commit_message='Upload dataset', commit_description='', oid='9861dc6a91996a6e3f8f8d677810153b5bccf66b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/thenlpresearcher/iitb_marathi_punct_variants', endpoint='https://huggingface.co', repo_type='dataset', repo_id='thenlpresearcher/iitb_marathi_punct_variants'), pr_revision=None, pr_num=None)