# Split dataset and upload to Hugging Face
* This notebook splits the dataset into training, validation and test sets.
* It then uploads the resulting splits to the Hugging Face Hub.

In [36]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from huggingface_hub import notebook_login

In [37]:
# Dataset identifier: selects which cleaned CSV is read here and is reused
# later in the notebook as the repository name when pushing to the Hub.
name = "high_vs_med_and_low_priority"
file_name = f"csv/clean_{name}.csv"

# Load the cleaned data and show pandas' truncated preview of it.
df = pd.read_csv(filepath_or_buffer=file_name)
df

Unnamed: 0,text_clean,labels,class
0,autolog out after some time frame is your feat...,0,high_priority
1,image picker for sourceimplement an android im...,0,high_priority
2,fix video page listitem hovering behaviour whe...,0,high_priority
3,escape shuttle reaches ludicrous speed descrip...,0,high_priority
4,binder doesnt load notebooks outside of the un...,0,high_priority
...,...,...,...
419837,include motd feature in ci testscurrent messag...,1,medium_and_low_priority
419838,checkbox to disable tweeting displayed for non...,1,medium_and_low_priority
419839,add additional labels to service monitorsis yo...,1,medium_and_low_priority
419840,no mic button in the keyboard for hvr build co...,1,medium_and_low_priority


In [38]:
# Class balance check: number of rows per priority class.
# Only the last expression of a cell is rendered, so build just the one
# frame that is actually displayed.
value_counts_df = df['class'].value_counts().to_frame()
value_counts_df

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
high_priority,210100
medium_and_low_priority,209742


In [39]:
# Split dataframe into three parts: training, validation and testing.
# Currently 80/10/10 split.
def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=42):
    """Shuffle ``df`` and cut it into train, validation and test frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Rows are selected by position, so any index
        (default, filtered, string-labelled, ...) is accepted.
    train_percent : float, default 0.8
        Fraction of rows assigned to the training split.
    validate_percent : float, default 0.1
        Fraction of rows assigned to the validation split. Whatever remains
        after the training and validation rows goes to the test split.
    seed : int or None, default 42
        Seed for the shuffle. The same seed always yields the same splits.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validate, test)``; the three frames are disjoint, together
        contain every row of ``df`` exactly once, and keep the original
        index labels of their rows.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    # Small tolerance so that pairs such as 0.35 + 0.65 are not rejected
    # because of floating-point rounding.
    if train_percent < 0 or validate_percent < 0 or train_percent + validate_percent > 1 + 1e-9:
        raise ValueError(
            "train_percent and validate_percent must be non-negative and sum to at most 1"
        )

    df_length = len(df)

    # Shuffle row *positions* (0..n-1), not index labels: the result is fed
    # to .iloc below, which is purely positional. A local RandomState gives
    # the same stream as np.random.seed(seed) without reseeding NumPy's
    # global generator for the rest of the notebook.
    rng = np.random.RandomState(seed)
    perm = rng.permutation(df_length)

    # Number of rows in the training set
    train_end = int(train_percent * df_length)
    # Position where the validation set ends
    validate_end = int(validate_percent * df_length) + train_end

    # From start to train_end
    train = df.iloc[perm[:train_end]]
    # From train_end to validate_end
    validate = df.iloc[perm[train_end:validate_end]]
    # From validate_end to the last row in the dataframe
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [40]:
# Apply the default 80/10/10 shuffle-and-split to the full dataframe.
train, validate, test = train_validate_test_split(df)

In [41]:
# Wrap each pandas split in a Hugging Face Dataset.
# NOTE(review): the shuffled pandas index travels along as an extra
# `__index_level_0__` feature (visible in the schema printed below);
# `preserve_index=False` would drop it -- confirm nothing downstream reads
# that column before changing it.
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(validate)
test_ds = Dataset.from_pandas(test)

# Bundle the splits under the names they will carry on the Hub.
ds = DatasetDict({"test": test_ds, "train": tds, "validate": vds})

ds

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'labels', 'class', '__index_level_0__'],
        num_rows: 41985
    })
    train: Dataset({
        features: ['text_clean', 'labels', 'class', '__index_level_0__'],
        num_rows: 335873
    })
    validate: Dataset({
        features: ['text_clean', 'labels', 'class', '__index_level_0__'],
        num_rows: 41984
    })
})

In [42]:
# Uncomment and run the line below if you are not yet logged in to the Hugging Face Hub.
#notebook_login()

In [43]:
# Upload every split to the Hugging Face Hub, using the dataset name as the repo id.
ds.push_to_hub(repo_id=name)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/42 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/336 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/42 [00:00<?, ?ba/s]