In [None]:
!pip install datasets huggingface_hub


In [None]:
# --------------Load the original dataset while keeping the train-test split--------------------------------------

from datasets import load_dataset, DatasetDict
from huggingface_hub import HfApi


# Load the train and test splits separately and bundle them into a single
# DatasetDict, so the original split boundaries are preserved.
original_dataset = DatasetDict(
    {
        split_name: load_dataset("originalusername/originaldatasetname", split=split_name)
        for split_name in ("train", "test")
    }
)



  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#----------------------Push the dataset to your Hugging Face repository-------------------------
hf_username = "yourusername"  # Change if needed
new_dataset_name = "datasetname"  # Change if needed
dataset_repo = f"{hf_username}/{new_dataset_name}"

# Create the dataset repo in your account.
# exist_ok=True keeps this cell re-runnable: without it, create_repo raises
# when the repository already exists (e.g. on a second run of the notebook).
api = HfApi()
api.create_repo(repo_id=dataset_repo, repo_type="dataset", exist_ok=True)

# Push both splits of the loaded dataset to your Hugging Face repository
original_dataset.push_to_hub(dataset_repo)


In [None]:
#----------------------Delete specific training files from your Hugging Face repository to reduce size-------------------------
from huggingface_hub import delete_file
import time

import os

# Target repository and the shard files to remove from it.
# The file names are built for a train split stored as 61 shards; shards
# 40 through 60 (inclusive) are the ones deleted here.
repo_id = "yourusername/datasetname"  # Your dataset repository
file_paths = [f"data/train-{i:05d}-of-00061.parquet" for i in range(40, 61)]

# Never paste an access token into the notebook: it ends up in saved files and
# version control. Read it from the HF_TOKEN environment variable instead.
# When the variable is unset this is None, and huggingface_hub falls back to
# the locally stored login (huggingface-cli login / notebook_login).
# Tokens are created at https://huggingface.co/settings/tokens
hf_token = os.environ.get("HF_TOKEN")

# Delete the files one at a time. A failure on one file is reported and the
# loop carries on with the next, so a missing shard does not abort the rest.
for file_path in file_paths:
    try:
        print(f"Deleting {file_path} ...")
        delete_file(
            path_in_repo=file_path,
            repo_id=repo_id,
            repo_type="dataset",  # Ensure we specify it as a dataset repository
            token=hf_token
        )
        time.sleep(1)  # Delay to avoid rate limiting
        print(f"Deleted {file_path} successfully!")
    except Exception as e:
        print(f"Failed to delete {file_path}: {e}")

print("All specified files have been processed.")


Deleting data/train-00040-of-00061.parquet ...
Deleted data/train-00040-of-00061.parquet successfully!
Deleting data/train-00041-of-00061.parquet ...
Deleted data/train-00041-of-00061.parquet successfully!
Deleting data/train-00042-of-00061.parquet ...
Deleted data/train-00042-of-00061.parquet successfully!
Deleting data/train-00043-of-00061.parquet ...
Deleted data/train-00043-of-00061.parquet successfully!
Deleting data/train-00044-of-00061.parquet ...
Deleted data/train-00044-of-00061.parquet successfully!
Deleting data/train-00045-of-00061.parquet ...
Deleted data/train-00045-of-00061.parquet successfully!
Deleting data/train-00046-of-00061.parquet ...
Deleted data/train-00046-of-00061.parquet successfully!
Deleting data/train-00047-of-00061.parquet ...
Deleted data/train-00047-of-00061.parquet successfully!
Deleting data/train-00048-of-00061.parquet ...
Deleted data/train-00048-of-00061.parquet successfully!
Deleting data/train-00049-of-00061.parquet ...
Deleted data/train-00049-o

In [None]:
#----------------------Filtering long sentences in your dataset-------------------------
from datasets import DatasetDict, load_dataset
from transformers import WhisperProcessor

# Fetch the train and test splits of the re-hosted dataset into one DatasetDict
bangla_dataset = DatasetDict(
    {
        split_name: load_dataset("yourusername/datasetname", split=split_name)
        for split_name in ("train", "test")
    }
)

# The Whisper processor includes the tokenizer; keep a direct handle to it
# because only the tokenizer is needed to measure transcript lengths.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
tokenizer = processor.tokenizer

def filter_long_sentences(example, max_tokens=448):
    """Return True when the example's sentence tokenizes to at most ``max_tokens`` ids.

    Written as a predicate for ``Dataset.filter``: rows for which this returns
    False are dropped. The length is measured with the module-level
    ``tokenizer`` and truncation disabled, so the real token count is compared.

    Args:
        example: A dataset row (mapping) with a ``"sentence"`` text field.
        max_tokens: Inclusive upper bound on the number of token ids.
            Defaults to 448 (presumably Whisper's maximum label length;
            TODO confirm against the model config).

    Returns:
        bool: True if the tokenized sentence has ``max_tokens`` ids or fewer.
    """
    token_ids = tokenizer(example["sentence"], truncation=False)["input_ids"]
    return len(token_ids) <= max_tokens

# Keep only the test examples whose sentence passes the length predicate
filtered_test_dataset = bangla_dataset["test"].filter(function=filter_long_sentences)

# Print a summary of the filtered split
print(filtered_test_dataset)

In [None]:
#---------------------------------Save the filtered dataset to disk---------------------------------
from huggingface_hub import HfApi, HfFolder
import os

# Save the filtered test split to a local "filtered_test_dataset" directory.
# NOTE(review): Dataset.save_to_disk writes Arrow data files plus metadata,
# not Parquet files — confirm this is the format you intend to upload.
filtered_test_dataset.save_to_disk("filtered_test_dataset")

#-------------------Then manually upload the files to your Hugging Face repo------------------------


In [None]:
#-----------------------Alternatively, you can upload the dataset directly to your Hugging Face repo-------------------------
# from huggingface_hub import HfApi, HfFolder

# # Define repo ID and target folder
# repo_id = "yourusername/datasetname"  # Change this to your repository
# target_folder = "data"  # Directly save in 'data' folder

# # Save dataset directly in the 'data' folder
# filtered_test_dataset.save_to_disk(target_folder)

# # Authenticate if needed
# if not HfFolder.get_token():
#     from huggingface_hub import notebook_login
#     notebook_login()

# # Upload dataset to the 'data' folder inside the Hugging Face repo
# api = HfApi()
# api.upload_folder(
#     folder_path=target_folder,  # Upload the 'data' folder directly
#     repo_id=repo_id,
#     repo_type="dataset",
#     path_in_repo="data"  # Ensures files are placed inside 'data/'
# )

# print(f"Dataset uploaded to: https://huggingface.co/datasets/{repo_id}/tree/main/data")
