In [1]:
from datasets import load_dataset

# Make sure you have logged in using:
# huggingface-cli login
# to access this dataset if it requires authentication.

# Load the English–Marathi parallel dataset from the Hugging Face Hub.
# No config name is passed to load_dataset here.
ds = load_dataset("thenlpresearcher/iitb_eng_mar_dataset")

# Inspect available splits
print(ds)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 237175
    })
})


In [2]:
# Pull out the "train" split so the following cells work on a single Dataset.
mar_ds = ds["train"]

# Sanity check: show the column names, then the first translation pair.
print(mar_ds.column_names)
print(mar_ds[0])

['src_lang', 'tgt_lang', 'src', 'tgt']
{'src_lang': 'eng_Latn', 'tgt_lang': 'mar_Deva', 'src': 'For example, let’s rename Sheet 2 as “Dump”.', 'tgt': 'उदाहरणार्थ Sheet 2 चे नाव बदलून “Dump” करू.'}


In [3]:
# NOTE(review): this cell repeats the two checks from the previous cell
# (first example and column names) in the opposite order; it can likely be
# dropped.
print(mar_ds[0])

# Optional: see column names
print(mar_ds.column_names)

{'src_lang': 'eng_Latn', 'tgt_lang': 'mar_Deva', 'src': 'For example, let’s rename Sheet 2 as “Dump”.', 'tgt': 'उदाहरणार्थ Sheet 2 चे नाव बदलून “Dump” करू.'}
['src_lang', 'tgt_lang', 'src', 'tgt']


In [4]:
import string
import unicodedata

from datasets import Dataset
from tqdm import tqdm

# ASCII punctuation stays in the removal set so that every character the
# previous `string.punctuation` table stripped is still stripped. Some of
# those (e.g. "$", "+", "<") are classified by Unicode as symbols rather than
# punctuation and would otherwise be missed by the category check below.
ASCII_PUNCT = frozenset(string.punctuation)


def strip_punctuation(text: str) -> str:
    """Return ``text`` with ASCII and Unicode punctuation characters removed.

    A character is dropped when it is in ``string.punctuation`` or when its
    Unicode general category starts with "P" (Pc, Pd, Ps, Pe, Pi, Pf, Po).
    The second rule covers typographic characters such as ’ “ ” – that the
    ASCII-only table left in place.

    Args:
        text: The sentence to clean.

    Returns:
        The sentence without punctuation; all other characters, including
        whitespace, are kept in their original order.
    """
    return "".join(
        ch
        for ch in text
        if ch not in ASCII_PUNCT and not unicodedata.category(ch).startswith("P")
    )


all_rows = []

# Alternate by position: even-indexed examples keep the source sentence as-is,
# odd-indexed examples get a punctuation-free source. The target sentence is
# never modified. Using the index (instead of a flag flipped inside the loop)
# keeps the cell free of mutable state.
for idx, ex in enumerate(tqdm(mar_ds)):
    src_text = ex["src"] if idx % 2 == 0 else strip_punctuation(ex["src"])
    all_rows.append({
        "src_lang": ex["src_lang"],
        "tgt_lang": ex["tgt_lang"],
        "src": src_text,
        "tgt": ex["tgt"],
    })

# Create a new Dataset. Despite the name, the row count is unchanged: each
# input example yields exactly one output row.
mar_ds_expanded = Dataset.from_list(all_rows)
assert len(mar_ds_expanded) == len(mar_ds), "every input row must yield one output row"

# Inspect: row 0 keeps punctuation, row 1 has it removed.
print(mar_ds_expanded[0])
print(mar_ds_expanded[1])
print("Total rows:", len(mar_ds_expanded))

100%|████████████████████████████| 237175/237175 [00:08<00:00, 27559.72it/s]


{'src_lang': 'eng_Latn', 'tgt_lang': 'mar_Deva', 'src': 'For example, let’s rename Sheet 2 as “Dump”.', 'tgt': 'उदाहरणार्थ Sheet 2 चे नाव बदलून “Dump” करू.'}
{'src_lang': 'eng_Latn', 'tgt_lang': 'mar_Deva', 'src': 'The Options dialogbox appears', 'tgt': 'Options चा डायलॉग बॉक्स उघडेल.'}
Total rows: 237175


In [5]:
import os
from getpass import getpass

from huggingface_hub import login

# `hf_token` was previously expected to exist in the kernel without being
# defined anywhere in this notebook, and was interpolated into a shell command
# line. Read it from the HF_TOKEN environment variable instead, falling back
# to an interactive prompt that does not echo the value.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `llm_finetuning` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `llm_finetuning`


In [6]:
# NOTE(review): HfApi and Repository are not referenced in this cell.
# Repository is deprecated in recent huggingface_hub releases, so this import
# may fail after an upgrade — confirm whether it is still needed.
from huggingface_hub import HfApi, Repository
from datasets import DatasetDict

# Wrap the single processed dataset in a DatasetDict so that it is uploaded
# as a split named "marathi_punct".
ds_dict = DatasetDict({
    "marathi_punct": mar_ds_expanded
})

# Push to the Hugging Face Hub as a private dataset repository.
ds_dict.push_to_hub("thenlpresearcher/shalaka_iitb_marathi_punct", private=True)

Uploading the dataset shards:   0%|                   | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|           | 0/238 [00:00<?, ?ba/s][A
Creating parquet from Arrow format:  33%|▎| 79/238 [00:00<00:00, 781.10ba/s][A
Creating parquet from Arrow format:  66%|▋| 158/238 [00:00<00:00, 722.16ba/s[A
Creating parquet from Arrow format: 100%|█| 238/238 [00:00<00:00, 661.81ba/s[A
Processing Files (0 / 0): |                    |  0.00B /  0.00B            
Processing Files (0 / 1):   3%|▍               | 1.05MB / 38.4MB,  752kB/s  [A
Processing Files (0 / 1):   4%|▋               | 1.58MB / 38.4MB,  790kB/s  [A
Processing Files (0 / 1):   5%|▉               | 2.11MB / 38.4MB,  958kB/s  [A
Processing Files (0 / 1):   8%|█▎              | 3.16MB / 38.4MB, 1.32MB/s  [A
Processing Files (0 / 1):  11%|█▊              | 4.21MB / 38.4MB, 1.62MB/s  [A
Processing Files (0 / 1):  12%|█▉              | 4.74MB / 38.4MB, 1.69MB/s  [A
Processing Files (0 / 1):  16%|██▋            

CommitInfo(commit_url='https://huggingface.co/datasets/thenlpresearcher/shalaka_iitb_marathi_punct/commit/fc43236983dee8a6dacaca80e37b65408b520e2c', commit_message='Upload dataset', commit_description='', oid='fc43236983dee8a6dacaca80e37b65408b520e2c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/thenlpresearcher/shalaka_iitb_marathi_punct', endpoint='https://huggingface.co', repo_type='dataset', repo_id='thenlpresearcher/shalaka_iitb_marathi_punct'), pr_revision=None, pr_num=None)

### Create train, validation, and test splits

In [7]:
import os
from getpass import getpass

from huggingface_hub import login

# Log in again for this part of the notebook. The token is read from the
# HF_TOKEN environment variable, or requested through a non-echoing prompt,
# rather than relying on an `hf_token` variable that is not defined anywhere
# in the notebook and would otherwise be placed on a shell command line.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `llm_finetuning` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `llm_finetuning`


In [8]:
import datasets
from datasets import load_dataset


# Reload the dataset that was pushed to the Hub. This rebinds `mar_ds`, which
# earlier held a single split, to the object returned by load_dataset.
mar_ds = load_dataset("thenlpresearcher/shalaka_iitb_marathi_punct")

Generating marathi_punct split: 100%|█| 237175/237175 [00:00<00:00, 478580.4


In [9]:
# Display the reloaded dataset (split names, features, and row counts).
mar_ds

DatasetDict({
    marathi_punct: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 237175
    })
})

In [10]:
from datasets import DatasetDict

# `mar_ds` is the DatasetDict reloaded from the Hub; take its only split.
dataset = mar_ds['marathi_punct']

# Shuffle with a fixed seed so the split is reproducible.
dataset = dataset.shuffle(seed=42)

# Define split ratios. They must add up to 1; the test split receives the
# remainder after integer truncation, so every row is assigned to a split.
train_frac = 0.8
val_frac = 0.1
test_frac = 0.1
assert abs(train_frac + val_frac + test_frac - 1.0) < 1e-9, "split fractions must sum to 1"

# Compute sizes
total = len(dataset)
train_size = int(train_frac * total)
val_size = int(val_frac * total)
test_size = total - train_size - val_size  # ensure all samples are used

# Split the shuffled dataset into three contiguous, non-overlapping ranges.
val_end = train_size + val_size
train_dataset = dataset.select(range(0, train_size))
val_dataset = dataset.select(range(train_size, val_end))
test_dataset = dataset.select(range(val_end, total))

# The three ranges must partition the dataset exactly.
assert len(test_dataset) == test_size
assert len(train_dataset) + len(val_dataset) + len(test_dataset) == total

# Combine into a DatasetDict
split_dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 189740
    })
    validation: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 23717
    })
    test: Dataset({
        features: ['src_lang', 'tgt_lang', 'src', 'tgt'],
        num_rows: 23718
    })
})


In [11]:
# Upload the train/validation/test splits as a private dataset.
# NOTE(review): this is the same repository the earlier cell loads the
# "marathi_punct" split from — confirm that split is still loadable after this
# push, otherwise a fresh Restart & Run All may fail at that lookup.
split_dataset.push_to_hub("thenlpresearcher/shalaka_iitb_marathi_punct", private=True)

Uploading the dataset shards:   0%|                   | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|           | 0/190 [00:00<?, ?ba/s][A
Creating parquet from Arrow format:   2%|   | 4/190 [00:00<00:05, 36.06ba/s][A
Creating parquet from Arrow format:   4%|▏  | 8/190 [00:00<00:04, 37.29ba/s][A
Creating parquet from Arrow format:   6%|▏ | 12/190 [00:00<00:04, 36.94ba/s][A
Creating parquet from Arrow format:   8%|▏ | 16/190 [00:00<00:04, 37.83ba/s][A
Creating parquet from Arrow format:  11%|▏ | 20/190 [00:00<00:04, 38.54ba/s][A
Creating parquet from Arrow format:  13%|▎ | 24/190 [00:00<00:04, 38.89ba/s][A
Creating parquet from Arrow format:  15%|▎ | 28/190 [00:00<00:04, 39.20ba/s][A
Creating parquet from Arrow format:  17%|▎ | 33/190 [00:00<00:03, 39.55ba/s][A
Creating parquet from Arrow format:  19%|▍ | 37/190 [00:00<00:03, 39.64ba/s][A
Creating parquet from Arrow format:  22%|▍ | 41/190 [00:01<00:03, 39.74ba/s][A
Creating parquet from Arrow format:  24%|▍ 

CommitInfo(commit_url='https://huggingface.co/datasets/thenlpresearcher/shalaka_iitb_marathi_punct/commit/2ff26a7643d37d597244363f2d50632506de8979', commit_message='Upload dataset', commit_description='', oid='2ff26a7643d37d597244363f2d50632506de8979', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/thenlpresearcher/shalaka_iitb_marathi_punct', endpoint='https://huggingface.co', repo_type='dataset', repo_id='thenlpresearcher/shalaka_iitb_marathi_punct'), pr_revision=None, pr_num=None)