# Split dataset and upload to Hugging Face
* This notebook splits the dataset into training, validation and test sets.
* It then uploads the resulting splits to the Hugging Face Hub.

In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from huggingface_hub import notebook_login

In [2]:
# Dataset identifier; it selects the CSV file to read here.
name = "highest_vs_rest_balanced_jira"
file_name = f"../csv/{name}.csv"

# Read the whole CSV into a dataframe and show it.
df = pd.read_csv(filepath_or_buffer=file_name)
df

Unnamed: 0,class,text_clean,label,class_original
0,highest,when we do range query on simple keys it does ...,1,Highest
1,highest,unhandledpromiserejectionwarning unhandled pro...,1,Highest
2,rest,the fabricunittestdaily branch failing intermi...,0,Medium
3,rest,as a system operator i want to receive alerts ...,0,Medium
4,rest,there is no support in datasourcetransactionma...,0,Medium
...,...,...,...,...
110711,rest,once i have imported a widget into the store i...,0,Low
110712,rest,the spec defines an axiom of a ie singleton li...,0,Medium
110713,rest,to have an history on master of all csvs setup...,0,Medium
110714,highest,there are still some changes expected to be me...,1,Highest


In [3]:
# Select the rows whose cleaned text is missing so they can be inspected.
null_rows = df.loc[df['text_clean'].isnull()]
null_rows

Unnamed: 0,class,text_clean,label,class_original


In [4]:
# Drop rows that have no cleaned text, then renumber the index from 0.
df = (
    df
    .dropna(subset=['text_clean'])
    .reset_index(drop=True)
)
df

Unnamed: 0,class,text_clean,label,class_original
0,highest,when we do range query on simple keys it does ...,1,Highest
1,highest,unhandledpromiserejectionwarning unhandled pro...,1,Highest
2,rest,the fabricunittestdaily branch failing intermi...,0,Medium
3,rest,as a system operator i want to receive alerts ...,0,Medium
4,rest,there is no support in datasourcetransactionma...,0,Medium
...,...,...,...,...
110711,rest,once i have imported a widget into the store i...,0,Low
110712,rest,the spec defines an axiom of a ie singleton li...,0,Medium
110713,rest,to have an history on master of all csvs setup...,0,Medium
110714,highest,there are still some changes expected to be me...,1,Highest


In [5]:
# Look again for rows with missing cleaned text; an empty frame means none remain.
null_rows = df.loc[df['text_clean'].isnull()]
null_rows

Unnamed: 0,class,text_clean,label,class_original


In [6]:
# Keep only the two columns that are uploaded: the text and its label.
df = df.loc[:, ['text_clean', 'label']]
df

Unnamed: 0,text_clean,label
0,when we do range query on simple keys it does ...,1
1,unhandledpromiserejectionwarning unhandled pro...,1
2,the fabricunittestdaily branch failing intermi...,0
3,as a system operator i want to receive alerts ...,0
4,there is no support in datasourcetransactionma...,0
...,...,...
110711,once i have imported a widget into the store i...,0
110712,the spec defines an axiom of a ie singleton li...,0
110713,to have an history on master of all csvs setup...,0
110714,there are still some changes expected to be me...,1


In [7]:
# Count the rows per label value to check how balanced the classes are.
# (The earlier duplicate of this computation was discarded unused and has been removed.)
value_counts_df = df['label'].value_counts().to_frame()
value_counts_df

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,55358
0,55358


In [8]:
# Split dataframe into three parts: training, validation and testing.
# The defaults give an 80/10/10 split.
def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=42):
    """Shuffle the rows of ``df`` and split them into train/validate/test frames.

    Rows are shuffled by *position*, so the split works for any index
    (non-contiguous, non-integer, or with duplicates), not only a 0..n-1 one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. It is not modified.
    train_percent : float
        Fraction of rows placed in the training frame.
    validate_percent : float
        Fraction of rows placed in the validation frame.
    seed : int or None
        Seed for the shuffle. The same seed always produces the same split.

    Returns
    -------
    tuple of (train, validate, test) DataFrames
        ``test`` receives every row left over after the first two frames, so
        it also absorbs the rounding remainder.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    # Small tolerance so that float rounding in the sum does not reject valid input.
    if train_percent < 0 or validate_percent < 0 or train_percent + validate_percent > 1 + 1e-9:
        raise ValueError(
            "train_percent and validate_percent must be non-negative and sum to at most 1"
        )

    # A local generator keeps the global NumPy random state untouched. The legacy
    # RandomState is used so a given seed yields the same shuffle as the seeded
    # global np.random functions.
    rng = np.random.RandomState(seed)

    n_rows = len(df)
    # Shuffle row positions (not index labels), because .iloc below is positional.
    order = rng.permutation(n_rows)

    # Position one past the last training row.
    train_end = int(train_percent * n_rows)
    # Position one past the last validation row.
    validate_end = int(validate_percent * n_rows) + train_end

    train = df.iloc[order[:train_end]]
    validate = df.iloc[order[train_end:validate_end]]
    # Everything that remains goes to the test frame.
    test = df.iloc[order[validate_end:]]
    return train, validate, test

In [9]:
# Run the split with its default fractions and seed, and unpack the three frames.
train, validate, test = train_validate_test_split(df)

In [10]:
# Report each split's share of the full dataframe, one line per split.
print(
    f"Training set size: {len(train)/len(df):.2%}",
    f"Validation set size: {len(validate)/len(df):.2%}",
    f"Test set size: {len(test)/len(df):.2%}",
    sep="\n",
)

Training set size: 80.00%
Validation set size: 10.00%
Test set size: 10.00%


In [11]:
# Convert each pandas split into a Hugging Face Dataset.
# preserve_index=False leaves the (shuffled) pandas index out of the conversion,
# so no "__index_level_0__" column is produced and none has to be removed
# afterwards -- removing it explicitly fails whenever that column is absent.
tds = Dataset.from_pandas(train, preserve_index=False)
vds = Dataset.from_pandas(validate, preserve_index=False)
test_ds = Dataset.from_pandas(test, preserve_index=False)

# Bundle the three splits under the same keys, in the same order, as before.
ds = DatasetDict(
    {
        "test": test_ds,
        "train": tds,
        "validate": vds,
    }
)

ds

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 11073
    })
    train: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 88572
    })
    validate: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 11071
    })
})

In [12]:
# Uncomment and run the line below if you are not yet logged in to the Hugging Face Hub.
#notebook_login()

In [13]:
# Upload every split in `ds` to the Hugging Face Hub, using `name` as the repository id.
ds.push_to_hub(repo_id=name)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/89 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]