# Split dataset and upload to huggingface
* This notebook splits the dataset into training, validation, and test sets.
* It then uploads the resulting splits to the Hugging Face Hub.

In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from huggingface_hub import notebook_login

In [2]:
# `name` identifies the dataset variant; it determines the CSV file read here
# and is reused later as the repository id when pushing to the Hub.
name = "highest_high_vs_rest_5_levels"
file_name = f"clean_{name}.csv"

# Read the cleaned CSV into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer=file_name)
df

Unnamed: 0,text_clean,label
0,sourceforge submitter bruceb first bug so bear...,0
1,pete has modified all quickstarts to use uniqu...,1
2,get this api currently gives the following err...,1
3,currently fsnamesystem operations are protecte...,0
4,i promote commits of my repositories the modal...,1
...,...,...
347189,currently the sourcecoordinator may invoke sou...,1
347190,update unifiedpush java sender to support ios ...,0
347191,the exception occures in ra during rollback an...,1
347192,,1


In [3]:
# Collect the rows whose cleaned text is missing so they can be inspected.
null_rows = df.loc[df['text_clean'].isna()]
null_rows

Unnamed: 0,text_clean,label
86,,0
92,,1
140,,0
167,,1
285,,1
...,...,...
347113,,0
347126,,0
347131,,1
347137,,0


In [4]:
# Discard rows without cleaned text, then renumber the index so it runs
# contiguously from 0 again (the old index labels are dropped, not kept as a column).
df = df.dropna(subset=['text_clean']).reset_index(drop=True)
df

Unnamed: 0,text_clean,label
0,sourceforge submitter bruceb first bug so bear...,0
1,pete has modified all quickstarts to use uniqu...,1
2,get this api currently gives the following err...,1
3,currently fsnamesystem operations are protecte...,0
4,i promote commits of my repositories the modal...,1
...,...,...
340471,context i store a file in an ftp directory and...,1
340472,currently the sourcecoordinator may invoke sou...,1
340473,update unifiedpush java sender to support ios ...,0
340474,the exception occures in ra during rollback an...,1


In [5]:
# Sanity check: list any rows that still have missing cleaned text.
# An empty frame here means no null `text_clean` values remain.
null_rows = df.loc[df['text_clean'].isna()]
null_rows

Unnamed: 0,text_clean,label


In [6]:
# Class balance: number of rows for each label value, as a one-column frame.
# (The earlier duplicate value_counts expression was computed and thrown away,
# since only a cell's last expression is displayed, so it has been removed.)
value_counts_df = df['label'].value_counts().to_frame()
value_counts_df

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,171045
0,169431


In [7]:
# Split dataframe into three parts: training, validation and testing.
# Default is an 80/10/10 split.
def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=42):
    """Shuffle the rows of ``df`` and split them into train/validation/test frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Any index is accepted; rows are selected by position.
    train_percent : float, default 0.8
        Fraction of rows placed in the training frame.
    validate_percent : float, default 0.1
        Fraction of rows placed in the validation frame. Whatever remains
        after the training and validation rows goes to the test frame.
    seed : int or None, default 42
        Seed for the shuffle. The same seed and row count always produce
        the same split.

    Returns
    -------
    tuple of (train, validate, test) pandas.DataFrame
        Three disjoint frames that together contain every row of ``df``.
        Each keeps the original index labels of its rows.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    if train_percent < 0 or validate_percent < 0:
        raise ValueError("train_percent and validate_percent must be non-negative")
    # Small tolerance so sums such as 0.7 + 0.3 are not rejected by float rounding.
    if train_percent + validate_percent > 1 + 1e-9:
        raise ValueError("train_percent + validate_percent must not exceed 1")

    df_length = len(df)

    # Shuffle row *positions* (0..n-1), not index labels: the result is fed to
    # the positional indexer `iloc`, so permuting `df.index` was only correct
    # when the index happened to be 0..n-1. A local RandomState gives the same
    # permutation as seeding the global generator, without resetting the
    # process-wide NumPy random state as a side effect.
    perm = np.random.RandomState(seed).permutation(df_length)

    # Position where the training rows end.
    train_end = int(train_percent * df_length)
    # Position where the validation rows end.
    validate_end = int(validate_percent * df_length) + train_end

    # From start to train_end.
    train = df.iloc[perm[:train_end]]
    # From train_end to validate_end.
    validate = df.iloc[perm[train_end:validate_end]]
    # From validate_end to the last row; absorbs any rounding remainder.
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [8]:
# Produce the three splits using the function's default fractions and seed.
train, validate, test = train_validate_test_split(df)

In [9]:
# Report each split's share of the full frame as a percentage.
for split_name, split_df in (("Training", train), ("Validation", validate), ("Test", test)):
    print(f"{split_name} set size: {len(split_df)/len(df):.2%}")

Training set size: 80.00%
Validation set size: 10.00%
Test set size: 10.00%


In [10]:
# Convert each pandas DataFrame split into a Hugging Face Dataset.
# preserve_index=False keeps the pandas index out of the dataset, so no
# `__index_level_0__` column is created and none has to be removed afterwards
# (removing it unconditionally fails when the column does not exist).
tds = Dataset.from_pandas(train, preserve_index=False)
vds = Dataset.from_pandas(validate, preserve_index=False)
test_ds = Dataset.from_pandas(test, preserve_index=False)

# Bundle the splits under the same names, in the same order, as before.
ds = DatasetDict({
    "test": test_ds,
    "train": tds,
    "validate": vds,
})

ds

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 34049
    })
    train: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 272380
    })
    validate: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 34047
    })
})

In [11]:
# Uncomment the next line to log in if you are not already authenticated with the Hugging Face Hub.
#notebook_login()

In [12]:
# Upload every split in `ds` to the Hugging Face Hub, using `name` as the repository id.
ds.push_to_hub(repo_id=name)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/35 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/273 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/35 [00:00<?, ?ba/s]