# Split dataset and upload to huggingface
* This notebook will spilt the dataset into: training, validate and test set.
* Then upload the dataset to huggingface 

In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from huggingface_hub import notebook_login

In [2]:
name = "low_vs_random"
file_name = f"csv/clean_{name}.csv"
df = pd.read_csv(file_name)
df

Unnamed: 0,text_clean,labels,class
0,zindex issue using classic blockselecting a li...,0,low_priority
1,indicate the language of a code blocktell us a...,0,low_priority
2,use one attribute name for header title namema...,0,low_priority
3,document levels of validation in vldtermas thi...,0,low_priority
4,support puml files in boilerplatessee this com...,0,low_priority
...,...,...,...
245108,monitoring allow the user to define abnormal s...,1,not_priority
245109,run binbash dockerentrypointsh smw step is fai...,1,not_priority
245110,null links on grscicoll pagination in other la...,1,not_priority
245111,todo migrate to strapi or wordpress idkim on t...,1,not_priority


In [3]:
df.labels.value_counts().to_frame()[:50]
value_counts_df = df['class'].value_counts().to_frame()
value_counts_df

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
low_priority,126285
not_priority,118828


In [4]:
# Split dataframe into three parts: training, validation and testing.
# Currently 80/10/10 split.
def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=42):
    np.random.seed(seed)
    # Shuffle index of dataframe
    perm = np.random.permutation(df.index)
    
    df_length = len(df.index)
    
    # Number of row in training set
    train_end = int(train_percent * df_length)
    # Number of rows in validate set
    validate_end = int(validate_percent * df_length) + train_end
    
    # From start to train end
    train = df.iloc[perm[:train_end]]
    # From train_end to validate_end
    validate = df.iloc[perm[train_end:validate_end]]
    # From validate to the last row in dataframe.
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [5]:
# Split dataframe into three parts: training, validation and testing.
train , validate , test = train_validate_test_split(df)

In [6]:
# Convert from Pandas DataFrame to Hugging Face datasets
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(validate)
test_ds = Dataset.from_pandas(test)

ds = DatasetDict()

ds["test"] = test_ds
ds["train"] = tds
ds["validate"] = vds

ds

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'labels', 'class', '__index_level_0__'],
        num_rows: 24512
    })
    train: Dataset({
        features: ['text_clean', 'labels', 'class', '__index_level_0__'],
        num_rows: 196090
    })
    validate: Dataset({
        features: ['text_clean', 'labels', 'class', '__index_level_0__'],
        num_rows: 24511
    })
})

In [42]:
# Uncomment this if not logged in.
#notebook_login()

In [7]:
# Push to Hugging Face Hub
ds.push_to_hub(name)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/197 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]