# Load the dependencies and the helper functions

In [1]:
import pandas as pd
from Label_Bot.label_bot import utils

# Load the full dataset
More versions of the dataset can be found at **http://cs-people.bu.edu/giorgos/labelbot**.

In [2]:
# Location under which the dataset files are published.
base_url = 'http://cs-people.bu.edu/giorgos/labelbot/'

# Full address of the `github.pkl` dataset file.
url = f'{base_url}github.pkl'

In [3]:
# Download and unpickle the dataset into a DataFrame.
# NOTE(security): unpickling can execute arbitrary code and `url` is a plain
# http:// address, so only run this against a source you trust.
df = pd.read_pickle(url)

In [4]:
# Display, for each label combination, the fraction and number of examples in `df`.
utils.get_labels_stats(df)

Unnamed: 0,Labels Present,Fraction,Examples
0,Bug,0.440106,1397248
1,Question,0.085609,271792
2,Enhancement,0.380621,1208396
3,"Bug, Question",0.000461,1463
4,"Bug, Enhancement",0.00106,3364
5,"Question, Enhancement",0.00025,795
6,"Bug, Question, Enhancement",1e-06,4
7,Total,1.0,3174798


# Prepare the dataset for Masked Language Modeling

You can skip this section if you're not interested in Language Modeling

In [5]:
# Keep only the first part returned by the split. `train_frac=0.125` presumably
# makes that part 12.5% of the rows -- TODO confirm against utils.split_train_test.
# NOTE: `df` is both the input and the output here, so re-running this cell
# splits the already reduced frame again.
# The unused part is bound to a regular name and deleted right away instead of
# being assigned to `_`: that would keep the discarded frame alive and shadow
# IPython's "last output" variable.
df, unused_split = utils.split_train_test(df, train_frac=0.125, save=False)
del unused_split

In [6]:
# Split again with the helper's default fraction: the first part stays in `df`,
# the second part is kept as `mlm_df` for language modeling.
df, mlm_df = utils.split_train_test(df, save=False)

In [7]:
# Split the language-modeling frame three ways (validation=True), passing only
# the 'title' and 'body' columns as `to_keep`.
mlm_df_train, mlm_df_val, mlm_df_test = utils.split_train_test(
    mlm_df,
    save=False,
    validation=True,
    to_keep=['title', 'body'],
)

In [11]:
# Hand each language-modeling split to utils.df_to_txt, in train / val / test order.
for split_label, split_frame in (
    ('train', mlm_df_train),
    ('val', mlm_df_val),
    ('test', mlm_df_test),
):
    utils.df_to_txt(split_frame, name=split_label)

# Drop every reference to the language-modeling frames, including the loop
# variables (split_frame still points at the last split).
del mlm_df, mlm_df_train, mlm_df_val, mlm_df_test, split_label, split_frame

# Prepare the dataset for classification

## Sample the dataset
We randomly keep only 20,000 examples from each class. Because the dataframe that corresponds to combinations of classes is too small, we keep all of its examples. 
<br>
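Keep in mind that for the classification dataset we use only datapoints that haven't been seen by our language model, meaning that we don't sample from the 400k datapoints of the language modeling dataset. This holds only if the Masked Language Modeling section above was run first, since that is where those datapoints are split off from `df`; if you skipped it, `df` is still the full dataset.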

In [5]:
# Number of examples requested from each class when sampling.
sample_size = 20_000
# Short tag derived from the sample size, e.g. '20k' for 20,000.
name = f'{sample_size // 1000}k'

In [7]:
# Sample the dataset with n=sample_size; the helper returns one frame per single
# label plus one frame for label combinations (unpacked in that order).
bugs_df, questions_df, enhancements_df, combinations_df = utils.sample_df(
    df,
    n=sample_size,
    to_keep=['title', 'body', 'label_bug', 'label_question', 'label_enhancement'],
)

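## Split into train, validation and test sets
By default the split is done 70/30. Because we pass `validation=True`, a validation set is produced as well; in the run recorded below each class ends up with 12,600 / 1,400 / 6,000 examples (63% / 7% / 30%).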

In [8]:
# Columns passed as `to_keep` to the split calls that follow.
to_keep = [
    'title',
    'body',
    'label_bug',
    'label_question',
    'label_enhancement',
]

In [9]:
def _split_with_validation(frame):
    """Run `utils.split_train_test` on `frame` with the options shared by all classes.

    Every class is split with the same arguments (`save=False`,
    `validation=True`, `to_keep=to_keep`), so they are fixed here once instead
    of being repeated at each call site.

    Returns whatever `utils.split_train_test` returns for these options; the
    callers below unpack it as (train, val, test).
    """
    return utils.split_train_test(frame,
                                  save=False,
                                  validation=True,
                                  to_keep=to_keep)


# Same call order as before: bugs, questions, enhancements, combinations.
bugs_train, bugs_val, bugs_test = _split_with_validation(bugs_df)

questions_train, questions_val, questions_test = _split_with_validation(questions_df)

enhancements_train, enhancements_val, enhancements_test = _split_with_validation(enhancements_df)

combinations_train, combinations_val, combinations_test = _split_with_validation(combinations_df)

In [10]:
# Stack the per-class pieces row-wise into one frame per split; the index is
# renumbered (ignore_index=True) so it runs continuously over the combined rows.
train_df, val_df, test_df = (
    pd.concat(pieces, axis=0, ignore_index=True)
    for pieces in (
        (bugs_train, questions_train, enhancements_train, combinations_train),
        (bugs_val, questions_val, enhancements_val, combinations_val),
        (bugs_test, questions_test, enhancements_test, combinations_test),
    )
)

## Shuffle Dataframes

In [11]:
# Fixed seed so that, for the same input frames, the shuffled row order is the
# same on every run (without it `sample` draws a different permutation each time).
SHUFFLE_SEED = 42

# frac=1 returns every row, in random order.
train_df = train_df.sample(frac=1, random_state=SHUFFLE_SEED)
val_df = val_df.sample(frac=1, random_state=SHUFFLE_SEED)
test_df = test_df.sample(frac=1, random_state=SHUFFLE_SEED)

## Check the distribution of classes in the train, validation and test sets

In [12]:
# Label statistics for the source frame and for each split, computed in that order.
df_stats, train_stats, val_stats, test_stats = (
    utils.get_labels_stats(frame)
    for frame in (df, train_df, val_df, test_df)
)

# Place the four tables side by side under a top-level column header per table.
pd.concat(
    [df_stats, train_stats, val_stats, test_stats],
    axis=1,
    keys=['Original Dataset', 'Train', 'Val', 'Test'],
)

Unnamed: 0_level_0,Original Dataset,Original Dataset,Original Dataset,Train,Train,Train,Val,Val,Val,Test,Test,Test
Unnamed: 0_level_1,Labels Present,Fraction,Examples,Labels Present,Fraction,Examples,Labels Present,Fraction,Examples,Labels Present,Fraction,Examples
0,Bug,0.440106,1397248,Bug,0.304753,12600,Bug,0.304812,1400,Bug,0.304754,6000
1,Question,0.085609,271792,Question,0.304753,12600,Question,0.304812,1400,Question,0.304754,6000
2,Enhancement,0.380621,1208396,Enhancement,0.304753,12600,Enhancement,0.304812,1400,Enhancement,0.304754,6000
3,"Bug, Question",0.000461,1463,"Bug, Question",0.022518,931,"Bug, Question",0.021337,98,"Bug, Question",0.022044,434
4,"Bug, Enhancement",0.00106,3364,"Bug, Enhancement",0.05101,2109,"Bug, Enhancement",0.053995,248,"Bug, Enhancement",0.051148,1007
5,"Question, Enhancement",0.00025,795,"Question, Enhancement",0.01219,504,"Question, Enhancement",0.010233,47,"Question, Enhancement",0.012393,244
6,"Bug, Question, Enhancement",1e-06,4,"Bug, Question, Enhancement",2.4e-05,1,"Bug, Question, Enhancement",0.0,0,"Bug, Question, Enhancement",0.000152,3
7,Total,1.0,3174798,Total,1.0,41345,Total,1.0,4593,Total,1.0,19688


## Save the train, validation and test sets

In [13]:
# Pickle each split into the working directory; `name` tags the file with the
# sample size.
split_outputs = (
    (train_df, f'train_{name}.pkl'),
    (val_df, f'val_{name}.pkl'),
    (test_df, f'test_{name}.pkl'),
)
for split_frame, pickle_path in split_outputs:
    split_frame.to_pickle(pickle_path)