# 1. Prepare the environment

In [1]:
import pandas as pd
from Label_Bot.label_bot import utils

## 1.1. Load the full dataset
More versions of the dataset can be found at **http://cs-people.bu.edu/giorgos/labelbot**.

In [2]:
# Root location that hosts the published dataset files.
base_url = 'http://cs-people.bu.edu/giorgos/labelbot/'

# Full address of the pickle that is loaded in the next cell.
url = ''.join((base_url, 'github.pkl'))

In [3]:
# Fetch the pickle at `url` and load it (presumably a DataFrame, given how `df`
# is used by the cells that follow).
# NOTE(review): unpickling can execute arbitrary code, and `url` is a plain-HTTP
# address -- only run this cell against a source you trust.
df = pd.read_pickle(url)

In [4]:
# Show the per-label statistics that the project helper computes for the loaded
# dataset (rendered as the cell output).
utils.get_labels_stats(df)

Unnamed: 0,Labels Present,Fraction,Examples
0,Bug,0.43947,1395227
1,Question,0.085425,271206
2,Enhancement,0.444426,1410962
3,"Bug, Question",0.00046,1459
4,"Bug, Enhancement",0.001676,5322
5,"Question, Enhancement",0.00043,1365
6,"Bug, Question, Enhancement",2e-06,5
7,Total,1.0,3174798


<hr></hr>

# 2. Prepare the dataset for Masked Language Modeling

**You should skip this section if you're not interested in Language Modeling!**

In [None]:
# Keep the first part returned by the split helper (requested with
# train_frac=0.125) and discard the remainder.
# NOTE: this rebinds `df`, so re-running the cell splits the already reduced
# frame again.
df, _ = utils.split_train_test(df, train_frac=0.125, save=False)

In [None]:
# Split again using the helper's default fraction: the first part stays in `df`,
# the second part becomes `mlm_df`, the pool used for language modeling below.
df, mlm_df = utils.split_train_test(df, save=False)

In [None]:
# Three-way split of the language-modeling pool; only the text columns are kept.
mlm_df_train, mlm_df_val, mlm_df_test = utils.split_train_test(
    mlm_df,
    save=False,
    validation=True,
    to_keep=['title', 'body'],
)

In [None]:
# Export each language-modeling split through the project helper, in the same
# train -> val -> test order.
mlm_splits = {
    'train': mlm_df_train,
    'val': mlm_df_val,
    'test': mlm_df_test,
}
for split_name, split_df in mlm_splits.items():
    utils.df_to_txt(split_df, name=split_name)

# Drop every reference to the language-modeling frames (including the loop
# helpers above) so their memory can be reclaimed.
del mlm_splits, split_name, split_df
del mlm_df, mlm_df_train, mlm_df_val, mlm_df_test

<hr></hr>

# 3. Prepare the dataset for classification

## 3.1. Sample the dataset
We randomly keep only 90,000 examples from each class. Because the dataframe that corresponds to combinations of classes is too small, we keep all of its examples. If you plan to fine-tune the model on your own, you can use a bigger dataset as well; I used this size due to RAM issues. 

Keep in mind that for the classification dataset, we use only datapoints that haven't been seen by our language model, meaning that we don't sample from the 400k datapoints of the language modeling dataset. 

In [5]:
# Number of examples requested per class when sampling.
sample_size = 90_000

# Short size tag (here '90k') embedded in the output file names.
name = f'{int(sample_size / 1000)}k'

In [6]:
# Draw the per-class samples with the project helper; it returns one frame per
# single label plus one frame for the label combinations.
bugs_df, questions_df, enhancements_df, combinations_df = utils.sample_df(
    df,
    n=sample_size,
    to_keep=['title', 'body', 'label_bug', 'label_question', 'label_enhancement'],
)

## 3.2. Split to train, test and validation sets. 
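By default the data is split 70/30 into train and test; with `validation=True`, a validation set is additionally held out from the training part (63/7/30 overall, as the statistics in section 3.4 show).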

In [7]:
# Columns retained in every split: the two text fields and the three label flags.
to_keep = [
    'title',
    'body',
    'label_bug',
    'label_question',
    'label_enhancement',
]

In [8]:
# Every class frame is split with exactly the same options, so they are stated
# once and reused for all four calls.
split_kwargs = dict(save=False, validation=True, to_keep=to_keep)

bugs_train, bugs_val, bugs_test = utils.split_train_test(
    bugs_df, **split_kwargs)

questions_train, questions_val, questions_test = utils.split_train_test(
    questions_df, **split_kwargs)

enhancements_train, enhancements_val, enhancements_test = utils.split_train_test(
    enhancements_df, **split_kwargs)

combinations_train, combinations_val, combinations_test = utils.split_train_test(
    combinations_df, **split_kwargs)

In [9]:
# Stack the per-class pieces of each split row-wise; `ignore_index=True` gives
# every combined frame a fresh 0..n-1 index.
train_df = pd.concat(
    [bugs_train, questions_train, enhancements_train, combinations_train],
    axis=0,
    ignore_index=True,
)

val_df = pd.concat(
    [bugs_val, questions_val, enhancements_val, combinations_val],
    axis=0,
    ignore_index=True,
)

test_df = pd.concat(
    [bugs_test, questions_test, enhancements_test, combinations_test],
    axis=0,
    ignore_index=True,
)

## 3.3. Shuffle Dataframes

In [10]:
# Shuffle the rows of every split: `frac=1` returns all rows in random order.
# A fixed seed makes the resulting order (and therefore the saved files)
# reproducible for the same input frames.
SHUFFLE_SEED = 42

train_df = train_df.sample(frac=1, random_state=SHUFFLE_SEED)
val_df = val_df.sample(frac=1, random_state=SHUFFLE_SEED)
test_df = test_df.sample(frac=1, random_state=SHUFFLE_SEED)

## 3.4. Check the distribution of classes in the train, validation and test sets

In [11]:
# Label statistics for the dataset the samples were drawn from (`df`) and for
# each of the three splits, all computed with the same project helper.
df_stats = utils.get_labels_stats(df)
train_stats = utils.get_labels_stats(train_df)
val_stats = utils.get_labels_stats(val_df)
test_stats = utils.get_labels_stats(test_df)

# Put the four tables side by side, each under its own top-level column label.
# `pd.concat(..., axis=1)` already returns a DataFrame, so it is displayed
# directly without an extra `pd.DataFrame(...)` wrapper.
pd.concat(
    [df_stats, train_stats, val_stats, test_stats],
    axis=1,
    keys=['Original Dataset', 'Train', 'Val', 'Test'],
)

Unnamed: 0_level_0,Original Dataset,Original Dataset,Original Dataset,Train,Train,Train,Val,Val,Val,Test,Test,Test
Unnamed: 0_level_1,Labels Present,Fraction,Examples,Labels Present,Fraction,Examples,Labels Present,Fraction,Examples,Labels Present,Fraction,Examples
0,Bug,0.43947,1395227,Bug,0.323565,56700,Bug,0.323573,6299,Bug,0.323563,27001
1,Question,0.085425,271206,Question,0.323565,56700,Question,0.323573,6299,Question,0.323563,27001
2,Enhancement,0.444426,1410962,Enhancement,0.323565,56700,Enhancement,0.323573,6299,Enhancement,0.323563,27001
3,"Bug, Question",0.00046,1459,"Bug, Question",0.005227,916,"Bug, Question",0.005496,107,"Bug, Question",0.005225,436
4,"Bug, Enhancement",0.001676,5322,"Bug, Enhancement",0.019214,3367,"Bug, Enhancement",0.018801,366,"Bug, Enhancement",0.019042,1589
5,"Question, Enhancement",0.00043,1365,"Question, Enhancement",0.004845,849,"Question, Enhancement",0.004983,97,"Question, Enhancement",0.005021,419
6,"Bug, Question, Enhancement",2e-06,5,"Bug, Question, Enhancement",1.7e-05,3,"Bug, Question, Enhancement",0.0,0,"Bug, Question, Enhancement",2.4e-05,2
7,Total,1.0,3174798,Total,1.0,175235,Total,1.0,19467,Total,1.0,83449


## 3.5. Save the train, validation and test sets

In [12]:
# Write each split to its own pickle in the working directory; the file names
# carry the sample-size tag stored in `name`.
pickle_targets = {
    f'train_{name}.pkl': train_df,
    f'val_{name}.pkl': val_df,
    f'test_{name}.pkl': test_df,
}
for out_path, split_df in pickle_targets.items():
    split_df.to_pickle(out_path)