In [2]:
from datasets import load_dataset
import pandas as pd
import numpy as np

# Dataset repository identifiers. Each one is handed to process() at the
# bottom of this cell, which forwards it unchanged to load_dataset().
punct_restor = "thenlpresearcher/english_punctuation_restoration"
fine_tune = "thenlpresearcher/iitb_marathi_punct_variants"
test_data = "thenlpresearcher/test_data_human_validated_eng_mar"

def load_all_splits(repo):
    """Load every split of a dataset repository, tolerating load failures.

    Args:
        repo: Repository identifier passed unchanged to ``load_dataset``.

    Returns:
        Whatever ``load_dataset(repo)`` returns, or ``None`` if loading raised
        an exception. In the failure case a message naming the repo and the
        underlying error is printed.
    """
    try:
        return load_dataset(repo)
    # Catch Exception (not a bare ``except``) so that KeyboardInterrupt and
    # SystemExit still stop the cell instead of being reported as a load error.
    except Exception as exc:
        # Include the reason; otherwise a typo, a missing dependency and a
        # network failure are indistinguishable in the notebook output.
        print(f"Could not load dataset: {repo} ({type(exc).__name__}: {exc})")
        return None

def find_text_column(ds):
    """Pick the column of ``ds`` to treat as the text column.

    The preferred names are tried in a fixed priority order; the first one
    present in ``ds.column_names`` wins. If none is present, the first entry
    of ``ds.column_names`` is returned.

    Args:
        ds: Object exposing a ``column_names`` sequence.

    Returns:
        The selected column name.
    """
    available = ds.column_names
    preferred = ("text", "sentence", "src", "tgt", "sent", "sent_written")
    # Fallback is the first column when no preferred name matches.
    return next((name for name in preferred if name in available), available[0])

def compute_stats_for_split(dataset_dict, repo_name):
    """Print whitespace-token length statistics for every split of a dataset.

    For each ``(split, ds)`` pair the text column is chosen with
    ``find_text_column``; every value in it is converted with ``str`` and split
    on whitespace. A one-row table with the sample count and the mean / min /
    max token count is then printed under a header naming the repo and split.

    Args:
        dataset_dict: Mapping of split name -> split object. Each split must
            support ``ds[column_name]`` lookup.
        repo_name: Label used in the printed header and the ``dataset`` column.

    Returns:
        None. The statistics are only printed.
    """
    for split, ds in dataset_dict.items():
        text_col = find_text_column(ds)
        texts = ds[text_col]

        # Length of a sample = number of whitespace-separated tokens.
        lengths = [len(str(t).split()) for t in texts]

        if lengths:
            avg_length = np.mean(lengths).round(2)
            min_length = np.min(lengths)
            max_length = np.max(lengths)
        else:
            # np.min / np.max raise ValueError on an empty sequence, which
            # would abort the loop and hide the remaining splits. Report NaN
            # for an empty split instead.
            avg_length = min_length = max_length = np.nan

        stats = pd.DataFrame([{
            "dataset": repo_name,
            "split": split,
            "num_samples": len(texts),
            "avg_length": avg_length,
            "min_length": min_length,
            "max_length": max_length
        }])

        print(f"\n===== {repo_name} :: {split.upper()} =====")
        print(stats.to_string(index=False))

def process(repo):
    """Load ``repo`` and print per-split statistics for it.

    Nothing is printed by this function itself; if ``load_all_splits`` returns
    a falsy value (``None`` or an empty mapping) the statistics step is skipped.

    Args:
        repo: Repository identifier, also used as the label in the output.
    """
    loaded = load_all_splits(repo)
    if not loaded:
        return
    compute_stats_for_split(loaded, repo)

# Report statistics for each dataset repo, in the same order as before.
for _repo_id in (punct_restor, fine_tune, test_data):
    process(_repo_id)


===== thenlpresearcher/english_punctuation_restoration =====
                                         dataset      split  num_instances
thenlpresearcher/english_punctuation_restoration      train         206112
thenlpresearcher/english_punctuation_restoration validation            888
thenlpresearcher/english_punctuation_restoration       test           8079

===== thenlpresearcher/iitb_marathi_punct_variants =====
                                     dataset      split  num_instances
thenlpresearcher/iitb_marathi_punct_variants      train         379480
thenlpresearcher/iitb_marathi_punct_variants validation          47435
thenlpresearcher/iitb_marathi_punct_variants       test          47435

===== thenlpresearcher/test_data_human_validated_eng_mar =====
                                           dataset split  num_instances
thenlpresearcher/test_data_human_validated_eng_mar  test             54


In [None]:
from datasets import load_dataset
import pandas as pd

# Dataset repository identifiers; each is passed to print_split_sizes() at
# the bottom of this cell.
punct_restor = "thenlpresearcher/english_punctuation_restoration"
fine_tune = "thenlpresearcher/iitb_marathi_punct_variants"
test_data = "thenlpresearcher/test_data_human_validated_eng_mar"

def print_split_sizes(repo):
    """Print a table with the number of rows in every split of ``repo``.

    Args:
        repo: Repository identifier passed unchanged to ``load_dataset``; also
            used in the printed header and the ``dataset`` column.

    Returns:
        None. If loading raises an exception, a message naming the repo and
        the error is printed and no table is produced.
    """
    try:
        ds = load_dataset(repo)
    # Catch Exception (not a bare ``except``) so that KeyboardInterrupt and
    # SystemExit still stop the cell instead of being reported as a load error.
    except Exception as exc:
        print(f"Could not load {repo} ({type(exc).__name__}: {exc})")
        return

    # One record per split; the frame is built once from the full list.
    rows = [
        {"dataset": repo, "split": split, "num_instances": len(ds[split])}
        for split in ds.keys()
    ]

    df = pd.DataFrame(rows)
    print(f"\n===== {repo} =====")
    print(df.to_string(index=False))

# Print the split-size table for each dataset repo, in the same order as before.
for _repo_id in (punct_restor, fine_tune, test_data):
    print_split_sizes(_repo_id)

In [3]:
from datasets import load_dataset
import pandas as pd

# Count how many examples of the "test" split fall under each punct_type value.
repo = "thenlpresearcher/test_data_human_validated_eng_mar"

ds = load_dataset(repo)["test"]

# Use the library's own converter rather than handing the split object to the
# pandas constructor, which has to treat it as a generic iterable of rows.
df = ds.to_pandas()

# groupby(...).size() gives one count per distinct punct_type (sorted by key);
# reset_index turns it back into a two-column frame.
grouped = df.groupby("punct_type").size().reset_index(name="num_instances")

print(grouped)

      punct_type  num_instances
0          Colon              3
1          Comma             38
2        Em Dash              1
3         Hyphen              3
4    Parenthesis              2
5  Question Mark              1
6         Quotes              2
7     Semi Colon              2
8          Slash              2
