In [None]:
! pip install --quiet datasets
! pip install --quiet tabulate

In [None]:
import pandas as pd
from tabulate import tabulate
from collections import Counter
from datasets import load_dataset, DatasetDict

# from huggingface_hub import notebook_login
# notebook_login()

# Widen pandas' text rendering so printed DataFrames are not truncated.
for _option_name, _option_value in (
    ('display.max_columns', None),   # no limit on the number of columns shown
    ('display.width', 1000),         # characters per output line before wrapping
    ('display.max_colwidth', 100),   # characters shown per cell before eliding
):
    pd.set_option(_option_name, _option_value)


In [None]:
# --- Notebook configuration -------------------------------------------------
# Local folder handed to the `imagefolder` loader as `data_dir`.
dataset_path = './dataset/rice-leaf-disease-augmented'

# Hub repository id the finished splits are pushed to.
hub_path = 'cvmil/rice-leaf-disease-augmented'

# Fraction of all samples assigned to the training split; the remainder is
# divided evenly between validation and test.
train_split_ratio = 0.8

In [None]:
# Read the image folder; the loader returns a mapping of splits, of which only
# the 'train' entry is used from here on.
loaded_splits = load_dataset('imagefolder', data_dir=dataset_path)
dataset = loaded_splits['train']
print(f"Dataset: {dataset}")

In [None]:
# Fixed seed for both shuffles below, so that re-running the notebook (and
# re-uploading) always yields the same train/validation/test membership.
split_seed = 42

# Step 1: carve off the training portion; everything left over is held out
# for validation + test. Stratifying on 'label' is requested for both steps.
train_test_split = dataset.train_test_split(
    train_size=train_split_ratio,
    stratify_by_column='label',
    seed=split_seed,
)

# Step 2: cut the held-out portion in half. The 'train' side of this second
# split becomes validation, the 'test' side becomes the final test set.
test_valid_split = train_test_split['test'].train_test_split(
    test_size=0.5,
    stratify_by_column='label',
    seed=split_seed,
)

# Combine all the splits into a new DatasetDict
split_dataset = DatasetDict({
    'train': train_test_split['train'],
    'validation': test_valid_split['train'],
    'test': test_valid_split['test']
})

# Sanity check: the three splits together must account for every sample.
assert sum(len(split) for split in split_dataset.values()) == len(dataset), \
    "train/validation/test sizes do not add up to the full dataset"

# Print the structure of the new dataset
print(split_dataset)


In [None]:
# Label metadata comes from the 'label' feature of the training split.
label_feature = split_dataset['train'].features['label']
label_mapping = label_feature.int2str
labels = label_feature.names

# Tally how many samples of each label id appear in every split.
train_counts, validation_counts, test_counts = (
    Counter(split_dataset[split_name]['label'])
    for split_name in ('train', 'validation', 'test')
)

# One row per label id. A Counter reports 0 for ids it never saw, so classes
# absent from a split show up as 0 without an explicit membership check.
label_ids = list(range(len(labels)))
data = {
    'ID': label_ids,
    'Label': labels,
    'Training': [train_counts[label_id] for label_id in label_ids],
    'Validation': [validation_counts[label_id] for label_id in label_ids],
    'Test': [test_counts[label_id] for label_id in label_ids],
}
df = pd.DataFrame(data)

# Render the table as a bordered text grid, without the DataFrame index column.
print(tabulate(df, headers='keys', tablefmt='grid', showindex=False))

In [None]:
# Publish all three splits to the Hugging Face Hub as a private repository.
# NOTE(review): presumably needs an authenticated session (see the commented-out
# notebook_login() in the imports cell) - confirm before running.
split_dataset.push_to_hub(repo_id=hub_path, private=True)