# Produce Keyword Prediction Evaluation Dataset
Copyright (C) 2021 ServiceNow, Inc.

The keyword prediction evaluation dataset is produced from the GEOSCAN metadata. 
This notebook assumes that the `Metadata Analysis.ipynb` notebook has already been run and has produced the following two parquet files:
* '/nrcan_p2/data/01_raw/20201006/geoscan/GEOSCAN-extract-20200211144755.xml_processed_Feb29.parquet'
* '/nrcan_p2/data/01_raw/20201006/geoscan/EAIDown.xml_processed_Feb29.parquet'

It proceeds in two steps: 
1. Create the various versions of the dataset for
    * PAIRING vs MULTICLASS tasks
    * 5, 10, 30 keyword classes ("subjects")
    * using text from the title or description columns
    * preprocessed using various cleaning pipelines
2. Create static splits of the above datasets for 5-fold cross validation

In [1]:
%load_ext autoreload
%autoreload 2

In [9]:
import sys
sys.path.append('..')
#import nrcan_p2.metadata_processing.read_metadata as read_metadata

## General parameters

In [2]:
# Pick an output folder: the generated keyword-prediction dataset files
# are written directly under this directory.
output_folder = '/nrcan_p2/data/03_primary/keyword_prediction/'

In [3]:
# Names of the dataframe columns used as input text (title and description).
# These columns have already been processed correctly, so they are used as-is.
TITLE_COLUMN = 'title_merged' 
DESC_COLUMN = 'desc_en_en_50_3000'
#TITLE_AND_DESC_COLUMN = ''

In [4]:
# Subject (keyword) columns: names of the dataframe columns holding the
# keyword labels for the 5-, 30- and 10-class variants of the task.
#SUBJECT_G200 = 'subject_g200' # applicable to small and large
#SUBJECT_NS200 = 'subject_ns200'
SUBJECT_5 = 'subject_5'
SUBJECT_30 = 'subject_30'
SUBJECT_10 = 'subject_desc_t10'

## Load the data

In [5]:
import pandas as pd
# Despite its name, `output_large` is the path of an *input* file here: the
# processed parquet for the GEOSCAN extract (the "large" dataset).
output_large = '/nrcan_p2/data/01_raw/20201006/geoscan/GEOSCAN-extract-20200211144755.xml_processed_Feb29.parquet'
df_s_large = pd.read_parquet(output_large)

In [6]:
import pandas as pd
# Despite its name, `output_small` is the path of an *input* file here: the
# processed parquet for the EAIDown extract (the "small" dataset).
output_small = '/nrcan_p2/data/01_raw/20201006/geoscan/EAIDown.xml_processed_Feb29.parquet'
df_s = pd.read_parquet(output_small)

## Produce the datasets

In [10]:
import nrcan_p2.evaluation.keyword_prediction as kp
import json
import pathlib

# Source dataframe selector: 'small' uses df_s, any other value uses df_s_large.
DATASET='small'

# One entry per dataset variant:
# (model family, pre-processing pipeline name, post-processing pipeline name).
# None means that no pipeline is passed for that stage.
PIPELINE_SETS = [
    ('BERT', 'PIPELINE_BERT_80', None),
    ('BERT', 'PIPELINE_BERT_90', None),
    ('BERT', 'PIPELINE_BERT_PLUS', None),
    ('BERT', 'SIMPLE_PIPELINE_BERT_3', None),
    ('BERT', None, None),
    ('GLOVE', 'PIPELINE_GLOVE_80', 'POSTPIPE_GLOVE'),
    ('GLOVE', 'PIPELINE_GLOVE_90', 'POSTPIPE_GLOVE'),
    ('GLOVE', 'PIPELINE_GLOVE_PLUS', 'POSTPIPE_GLOVE'),
    ('GLOVE', 'SIMPLE_PIPELINE_GLOVE_3', 'POSTPIPE_GLOVE'),
]

# These settings do not depend on any loop variable, so they are fixed once.
dff = df_s if DATASET == 'small' else df_s_large
subject_cols = [SUBJECT_5, SUBJECT_30, SUBJECT_10]
n_neg_sample = None  # no negative-sample count is passed for either task

for CURR_TITLE_COL in (TITLE_COLUMN, DESC_COLUMN):
    for TYPE in ('PAIRING', 'MULTICLASS'):
        is_pairing = TYPE == 'PAIRING'
        DO_NOT_DROP = TYPE == 'MULTICLASS'
        # MULTICLASS files carry a '_nodrop' marker in their name.
        nodrop_tag = '_nodrop' if DO_NOT_DROP else ''

        for MODEL_TYPE, PREPIPE, POSTPIPE in PIPELINE_SETS:
            # Sanity check: pipeline names must mention the model family they belong to.
            assert PREPIPE is None or MODEL_TYPE in PREPIPE
            assert POSTPIPE is None or MODEL_TYPE in POSTPIPE

            for CURR_SUBJECT_COL in subject_cols:
                # Build the output file name; only PAIRING encodes the sample count.
                file_suffix = f'{TYPE}_{DATASET}_{CURR_SUBJECT_COL}_{CURR_TITLE_COL}_{PREPIPE}_{POSTPIPE}'
                if is_pairing:
                    file_suffix += f'_n{n_neg_sample}'
                output_file = pathlib.Path(output_folder) / f'{file_suffix}{nodrop_tag}-Feb29.parquet'

                print(output_file)
                if output_file.exists():
                    # Never regenerate a dataset that is already on disk.
                    print('...skipping, file exists')
                    continue

                print(dff.shape)

                # PAIRING needs both keyword and text present; MULTICLASS keeps
                # rows whose keyword column is null and only requires the text.
                if is_pairing:
                    required_cols = [CURR_SUBJECT_COL, CURR_TITLE_COL]
                else:
                    required_cols = [CURR_TITLE_COL]
                df_s_test = dff.dropna(subset=required_cols)
                print(df_s_test.shape)

                df_s_test = kp.produce_keyword_classification_dataset_from_df(
                    df_s_test,
                    pre_pipeline=PREPIPE,
                    post_pipeline=POSTPIPE,
                    cat_pre_pipeline=None,
                    cat_post_pipeline=None,
                    text_column=CURR_TITLE_COL,
                    text_column_is_list=False,
                    text_col_processing=None,
                    keyword_col=CURR_SUBJECT_COL,
                    n_categories=None,
                    task=TYPE,
                    n_negative_sample=n_neg_sample,
                    do_not_drop=DO_NOT_DROP,
                )
                print(df_s_test.shape)

                # Keep only the columns that are written to disk. PAIRING uses the
                # same layout for both model families; MULTICLASS differs by family.
                if is_pairing:
                    df_s_test = df_s_test[['label', 'keyword_text', 'cat']]
                elif MODEL_TYPE == "BERT":
                    df_s_test = df_s_test[['keyword_text', 'keyword_cat']].rename(columns={'keyword_cat': 'label'})
                else:
                    df_s_test = df_s_test.filter(regex='keyword_text|cat_')

                df_s_test.to_parquet(output_file)

/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_5_title_merged_PIPELINE_BERT_80_None_nNone-Feb29.parquet
...skipping, file exists
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_title_merged_PIPELINE_BERT_80_None_nNone-Feb29.parquet
...skipping, file exists
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_desc_t10_title_merged_PIPELINE_BERT_80_None_nNone-Feb29.parquet
...skipping, file exists
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_5_title_merged_PIPELINE_BERT_90_None_nNone-Feb29.parquet
...skipping, file exists
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_title_merged_PIPELINE_BERT_90_None_nNone-Feb29.parquet
...skipping, file exists
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_desc_t10_title_merged_PIPELINE_BERT_90_None_nNone-Feb29.parquet
...skipping, file exists
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_5_title_merged_PIPELINE

  return func(self, *args, **kwargs)


Dropping...
(2463, 51)
(2463, 51)
Dropping negative samples...
(2463, 52)
(2459, 52)
(12296, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_80_None_nNone-Feb29.parquet
(12316, 49)
(3221, 49)


  return func(self, *args, **kwargs)


Dropping...
(3221, 51)
(3221, 51)
Dropping negative samples...
(3221, 52)
(3221, 52)
(96646, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_80_None_nNone-Feb29.parquet
(12316, 49)
(2892, 49)


  return func(self, *args, **kwargs)


Dropping...
(2892, 51)
(2892, 51)
Dropping negative samples...
(2892, 52)
(2890, 52)
(28905, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_90_None_nNone-Feb29.parquet
(12316, 49)
(2463, 49)


  return func(self, *args, **kwargs)


Dropping...
(2463, 51)
(2463, 51)
Dropping negative samples...
(2463, 52)
(2459, 52)
(12296, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_90_None_nNone-Feb29.parquet
(12316, 49)
(3221, 49)


  return func(self, *args, **kwargs)


Dropping...
(3221, 51)
(3221, 51)
Dropping negative samples...
(3221, 52)
(3221, 52)
(96646, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_90_None_nNone-Feb29.parquet
(12316, 49)
(2892, 49)


  return func(self, *args, **kwargs)


Dropping...
(2892, 51)
(2892, 51)
Dropping negative samples...
(2892, 52)
(2890, 52)
(28905, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nNone-Feb29.parquet
(12316, 49)
(2463, 49)


  return func(self, *args, **kwargs)
  return rm_dbl_space(dfcol.str.replace(r"([+=\[\]\(\)\/\-*:])", ' \\1 '))
  return rm_dbl_space(dfcol.str.replace(r"\S*[0-9]deg[0-9NSEW]\S*", " "))


Dropping...
(2463, 51)
(2463, 51)
Dropping negative samples...
(2463, 52)
(2459, 52)
(12296, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nNone-Feb29.parquet
(12316, 49)
(3221, 49)


  return func(self, *args, **kwargs)
  return rm_dbl_space(dfcol.str.replace(r"([+=\[\]\(\)\/\-*:])", ' \\1 '))
  return rm_dbl_space(dfcol.str.replace(r"\S*[0-9]deg[0-9NSEW]\S*", " "))


Dropping...
(3221, 51)
(3221, 51)
Dropping negative samples...
(3221, 52)
(3221, 52)
(96646, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nNone-Feb29.parquet
(12316, 49)
(2892, 49)


  return func(self, *args, **kwargs)
  return rm_dbl_space(dfcol.str.replace(r"([+=\[\]\(\)\/\-*:])", ' \\1 '))
  return rm_dbl_space(dfcol.str.replace(r"\S*[0-9]deg[0-9NSEW]\S*", " "))


Dropping...
(2892, 51)
(2892, 51)
Dropping negative samples...
(2892, 52)
(2890, 52)
(28905, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_5_desc_en_en_50_3000_SIMPLE_PIPELINE_BERT_3_None_nNone-Feb29.parquet
(12316, 49)
(2463, 49)
Dropping...
(2463, 51)
(2463, 51)
Dropping negative samples...
(2463, 52)
(2459, 52)
(12296, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_BERT_3_None_nNone-Feb29.parquet
(12316, 49)
(3221, 49)
Dropping...
(3221, 51)
(3221, 51)
Dropping negative samples...
(3221, 52)
(3221, 52)
(96646, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_SIMPLE_PIPELINE_BERT_3_None_nNone-Feb29.parquet
(12316, 49)
(2892, 49)
Dropping...
(2892, 51)
(2892, 51)
Dropping negative samples...
(2892, 52)
(2890, 52)
(28905, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_5_desc_en_en_50_3000_None_None_nNone-Feb29.parquet
(12316, 4

  return func(self, *args, **kwargs)


Dropping...
(2463, 51)
(2463, 51)
Dropping negative samples...
(2463, 52)
(2459, 52)
(12296, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nNone-Feb29.parquet
(12316, 49)
(3221, 49)


  return func(self, *args, **kwargs)


Dropping...
(3221, 51)
(3221, 51)
Dropping negative samples...
(3221, 52)
(3221, 52)
(96646, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nNone-Feb29.parquet
(12316, 49)
(2892, 49)


  return func(self, *args, **kwargs)


Dropping...
(2892, 51)
(2892, 51)
Dropping negative samples...
(2892, 52)
(2890, 52)
(28905, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29.parquet
(12316, 49)
(2463, 49)


  return func(self, *args, **kwargs)


Dropping...
(2463, 51)
(2463, 51)
Dropping negative samples...
(2463, 52)
(2459, 52)
(12296, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29.parquet
(12316, 49)
(3221, 49)


  return func(self, *args, **kwargs)


Dropping...
(3221, 51)
(3221, 51)
Dropping negative samples...
(3221, 52)
(3221, 52)
(96646, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29.parquet
(12316, 49)
(2892, 49)


  return func(self, *args, **kwargs)


Dropping...
(2892, 51)
(2892, 51)
Dropping negative samples...
(2892, 52)
(2890, 52)
(28905, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29.parquet
(12316, 49)
(2463, 49)


  return func(self, *args, **kwargs)
  return rm_dbl_space(dfcol.str.replace(r"([+=\[\]\(\)\/\-*:])", ' \\1 '))
  return rm_dbl_space(dfcol.str.replace(r"\S*[0-9]deg[0-9NSEW]\S*", " "))


Dropping...
(2463, 51)
(2463, 51)
Dropping negative samples...
(2463, 52)
(2459, 52)
(12296, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29.parquet
(12316, 49)
(3221, 49)


  return func(self, *args, **kwargs)
  return rm_dbl_space(dfcol.str.replace(r"([+=\[\]\(\)\/\-*:])", ' \\1 '))
  return rm_dbl_space(dfcol.str.replace(r"\S*[0-9]deg[0-9NSEW]\S*", " "))


Dropping...
(3221, 51)
(3221, 51)
Dropping negative samples...
(3221, 52)
(3221, 52)
(96646, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29.parquet
(12316, 49)
(2892, 49)


  return func(self, *args, **kwargs)
  return rm_dbl_space(dfcol.str.replace(r"([+=\[\]\(\)\/\-*:])", ' \\1 '))
  return rm_dbl_space(dfcol.str.replace(r"\S*[0-9]deg[0-9NSEW]\S*", " "))


Dropping...
(2892, 51)
(2892, 51)
Dropping negative samples...
(2892, 52)
(2890, 52)
(28905, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_5_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29.parquet
(12316, 49)
(2463, 49)
Dropping...
(2463, 51)
(2463, 51)
Dropping negative samples...
(2463, 52)
(2459, 52)
(12296, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29.parquet
(12316, 49)
(3221, 49)
Dropping...
(3221, 51)
(3221, 51)
Dropping negative samples...
(3221, 52)
(3221, 52)
(96646, 52)
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29.parquet
(12316, 49)
(2892, 49)
Dropping...
(2892, 51)
(2892, 51)
Dropping negative samples...
(2892, 52)
(2890, 52)
(28905, 52)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIP

  return func(self, *args, **kwargs)


(1054, 56)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_80_None_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)


  return func(self, *args, **kwargs)


(1054, 81)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_80_None_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)


  return func(self, *args, **kwargs)


(1054, 61)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_90_None_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)


  return func(self, *args, **kwargs)


(1054, 56)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_90_None_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)


  return func(self, *args, **kwargs)


(1054, 81)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_90_None_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)


  return func(self, *args, **kwargs)


(1054, 61)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)


  return func(self, *args, **kwargs)
  return rm_dbl_space(dfcol.str.replace(r"([+=\[\]\(\)\/\-*:])", ' \\1 '))
  return rm_dbl_space(dfcol.str.replace(r"\S*[0-9]deg[0-9NSEW]\S*", " "))


(1054, 56)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)


  return func(self, *args, **kwargs)
  return rm_dbl_space(dfcol.str.replace(r"([+=\[\]\(\)\/\-*:])", ' \\1 '))
  return rm_dbl_space(dfcol.str.replace(r"\S*[0-9]deg[0-9NSEW]\S*", " "))


(1054, 81)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)


  return func(self, *args, **kwargs)
  return rm_dbl_space(dfcol.str.replace(r"([+=\[\]\(\)\/\-*:])", ' \\1 '))
  return rm_dbl_space(dfcol.str.replace(r"\S*[0-9]deg[0-9NSEW]\S*", " "))


(1054, 61)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_5_desc_en_en_50_3000_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)
(1054, 56)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)
(1054, 81)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)
(1054, 61)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_5_desc_en_en_50_3000_None_None_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)
(1054, 56)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_30_desc_en_en_50_3000_None_None_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)
(1054, 81)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_None_None_nodrop-Feb29.parquet
(12316, 49)
(3500

  return func(self, *args, **kwargs)


(1054, 56)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)


  return func(self, *args, **kwargs)


(1054, 81)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)


  return func(self, *args, **kwargs)


(1054, 61)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)


  return func(self, *args, **kwargs)


(1054, 56)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)


  return func(self, *args, **kwargs)


(1054, 81)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)


  return func(self, *args, **kwargs)


(1054, 61)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)


  return func(self, *args, **kwargs)
  return rm_dbl_space(dfcol.str.replace(r"([+=\[\]\(\)\/\-*:])", ' \\1 '))
  return rm_dbl_space(dfcol.str.replace(r"\S*[0-9]deg[0-9NSEW]\S*", " "))


(1054, 56)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)


  return func(self, *args, **kwargs)
  return rm_dbl_space(dfcol.str.replace(r"([+=\[\]\(\)\/\-*:])", ' \\1 '))
  return rm_dbl_space(dfcol.str.replace(r"\S*[0-9]deg[0-9NSEW]\S*", " "))


(1054, 81)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)


  return func(self, *args, **kwargs)
  return rm_dbl_space(dfcol.str.replace(r"([+=\[\]\(\)\/\-*:])", ' \\1 '))
  return rm_dbl_space(dfcol.str.replace(r"\S*[0-9]deg[0-9NSEW]\S*", " "))


(1054, 61)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_5_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)
(1054, 56)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)
(1054, 81)
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29.parquet
(12316, 49)
(3500, 49)
(1054, 61)


In [10]:
# Keyword frequencies in `subject_desc_t10`, restricted to rows where the
# column is non-null. explode() gives every element of a list-like entry its
# own row before counting (assumes the column holds list-like values — TODO confirm).
df_s[~df_s.subject_desc_t10.isnull()].subject_desc_t10.explode().value_counts(dropna=False)

geophysics                         2723
stratigraphy                       2254
Quaternary                         2123
Precambrian                        1917
structural geology                 1888
surficial geology/geomorphology    1874
geochemistry                       1755
lithology                          1632
Cenozoic                           1485
Paleozoic                          1353
Name: subject_desc_t10, dtype: int64

In [9]:
# Distribution of the `cat` column of `df_s_test`, NaN included.
# NOTE(review): `df_s_test` is whatever an earlier cell last assigned to it;
# confirm which dataset variant that is before interpreting the counts.
df_s_test.cat.value_counts(dropna=False)
#df_s_test[df_s_test.cat.isnull()]

Precambrian                        7919
Quaternary                         7917
lithology                          7917
Cenozoic                           7916
geophysics                         7912
Paleozoic                          7912
geochemistry                       7911
surficial geology/geomorphology    7911
structural geology                 7910
stratigraphy                       7910
NaN                                   7
Name: cat, dtype: int64

## Create the dataset splits

In [11]:
import nrcan_p2.evaluation.keyword_prediction as kp
import pathlib

# Root directory handed to kp.produce_dataset_splits for every dataset.
output_dir = '/nrcan_p2/data/03_primary/keyword_prediction/splits'

# Pipeline suffixes as they appear in the dataset file names, per model family.
glove_pipes = ['SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE',
               'PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE',
               'PIPELINE_GLOVE_80_POSTPIPE_GLOVE',
               'PIPELINE_GLOVE_90_POSTPIPE_GLOVE']
bert_pipes = ['SIMPLE_PIPELINE_BERT_3_None',
              'PIPELINE_BERT_PLUS_None',
              'PIPELINE_BERT_80_None',
              'PIPELINE_BERT_90_None',
              'None_None']

n_neg_sample = None

# Each pipeline suffix together with the model family it belongs to,
# GLOVE entries first and BERT entries second.
pipes_with_family = [(name, 'GLOVE') for name in glove_pipes] + [(name, 'BERT') for name in bert_pipes]

# Enumerate every candidate dataset file; the two lists stay index-aligned.
input_files = []
model_types = []
for task in ['PAIRING', 'MULTICLASS']:
    # PAIRING file names end with the sample count, MULTICLASS ones with '_nodrop'.
    if task == 'MULTICLASS':
        tail = '_nodrop'
    elif task == 'PAIRING':
        tail = f'_n{n_neg_sample}'
    for size in ['small']:
        for subj in ['subject_30', 'subject_5', 'subject_desc_t10']:
            for text in ['title_merged', 'desc_en_en', 'desc_en_en_50_3000']:
                for pipe, model_type in pipes_with_family:
                    input_file = f'/nrcan_p2/data/03_primary/keyword_prediction/{task}_{size}_{subj}_{text}_{pipe}{tail}-Feb29.parquet'
                    model_types.append(model_type)
                    input_files.append(input_file)

for model_type, input_df_file in zip(model_types, input_files):
    print('---')
    print(input_df_file)

    input_df_file = pathlib.Path(input_df_file)
    if not input_df_file.exists():
        # Not every enumerated combination was generated; skip the missing ones.
        print('...dne')
        continue

    # The split output is named after the dataset file, and the task is the
    # first underscore-separated token of that name.
    output_name = input_df_file.stem
    task = output_name.partition('_')[0]

    for shown in (output_dir, input_df_file, output_name, task, model_type):
        print(shown)

    try:
        kp.produce_dataset_splits(
            output_dir=output_dir,
            input_df_file=input_df_file,
            output_name=output_name,
            task=task,
            model_type=model_type,
        )
    except Exception as e:
        # Best effort: report the failure and carry on with the next dataset.
        print(e)
        

---
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29.parquet
/nrcan_p2/data/03_primary/keyword_prediction/splits
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29.parquet
PAIRING_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29
PAIRING
GLOVE
ERROR: output dir /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29 already exists
---
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_title_merged_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29.parquet
/nrcan_p2/data/03_primary/keyword_prediction/splits
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_title_merged_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29.parquet
PAIRING_small_subject_30_title_merged_PIPELINE_GLOV

(96646, 3)
(96646, 3)
Split 0
Dropping nas from the training set...
(19321, 3)
(19321, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_0/train.csv...
(19321, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_0/valid.csv...
(77325, 3)
Split 1
Dropping nas from the training set...
(19324, 3)
(19324, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_1/train.csv...
(19324, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_1/valid.csv...
(77322, 3)
Split 2
Dropping nas from the training set...
(19324, 3)
(19324, 3)
Writing /nrcan_p2/data/03_

Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29/split_0/valid.csv...
(77325, 3)
Split 1
Dropping nas from the training set...
(19324, 3)
(19324, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29/split_1/train.csv...
(19324, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29/split_1/valid.csv...
(77322, 3)
Split 2
Dropping nas from the training set...
(19324, 3)
(19324, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29/split_2/train.csv...
(19324, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNo

Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_80_None_nNone-Feb29/split_2/train.csv...
(19324, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_80_None_nNone-Feb29/split_2/valid.csv...
(77322, 3)
Split 3
Dropping nas from the training set...
(19325, 3)
(19325, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_80_None_nNone-Feb29/split_3/train.csv...
(19325, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_80_None_nNone-Feb29/split_3/valid.csv...
(77321, 3)
Split 4
Dropping nas from the training set...
(19322, 3)
(19322, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_80_None_nNone-Feb29/split_4/train.csv...
(19322, 3)
Writing /nrcan

Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_5_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_0/train.csv...
(2455, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_5_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_0/valid.csv...
(9841, 3)
Split 1
Dropping nas from the training set...
(2455, 3)
(2455, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_5_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_1/train.csv...
(2455, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_5_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_1/valid.csv...
(9841, 3)
Split 2
Dropping nas from the training set...
(2455, 3)
(2455, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_5_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_P

Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29/split_1/valid.csv...
(9841, 3)
Split 2
Dropping nas from the training set...
(2455, 3)
(2455, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29/split_2/train.csv...
(2455, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29/split_2/valid.csv...
(9841, 3)
Split 3
Dropping nas from the training set...
(2455, 3)
(2455, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29/split_3/train.csv...
(2455, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29/spli

Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_80_None_nNone-Feb29/split_3/valid.csv...
(9841, 3)
Split 4
Dropping nas from the training set...
(2455, 3)
(2455, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_80_None_nNone-Feb29/split_4/train.csv...
(2455, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_80_None_nNone-Feb29/split_4/valid.csv...
(9841, 3)
---
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_90_None_nNone-Feb29.parquet
/nrcan_p2/data/03_primary/keyword_prediction/splits
/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_90_None_nNone-Feb29.parquet
PAIRING_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_90_None_nNone-Feb29
PAIRING
BERT
(12296, 3)
(12296, 3)
Sp

(5781, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_0/train.csv...
(5781, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_0/valid.csv...
(23124, 3)
Split 1
Dropping nas from the training set...
(5780, 3)
(5780, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_1/train.csv...
(5780, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_1/valid.csv...
(23125, 3)
Split 2
Dropping nas from the training set...
(5782, 3)
(5782, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_desc_t

Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29/split_0/valid.csv...
(23124, 3)
Split 1
Dropping nas from the training set...
(5780, 3)
(5780, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29/split_1/train.csv...
(5780, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29/split_1/valid.csv...
(23125, 3)
Split 2
Dropping nas from the training set...
(5782, 3)
(5782, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29/split_2/train.csv...
(5782, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLO

Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_80_None_nNone-Feb29/split_1/valid.csv...
(23125, 3)
Split 2
Dropping nas from the training set...
(5782, 3)
(5782, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_80_None_nNone-Feb29/split_2/train.csv...
(5782, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_80_None_nNone-Feb29/split_2/valid.csv...
(23123, 3)
Split 3
Dropping nas from the training set...
(5780, 3)
(5780, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_80_None_nNone-Feb29/split_3/train.csv...
(5780, 3)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_80_None_nNone-Feb29/split_3/valid.csv...
(

(1054, 31)
Split 0
Dropping nas from the training set...
(843, 31)
(843, 31)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29/split_0/train.csv...
(843, 31)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29/split_0/valid.csv...
(211, 31)
Split 1
Dropping nas from the training set...
(843, 31)
(843, 31)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29/split_1/train.csv...
(843, 31)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29/split_1/valid.csv...
(211, 31)
Split 2
Dropping nas from the training set...
(843, 31)
(843, 31)
Writing /nrcan_p2/data/03_prima



Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29/split_3/valid.csv...
(211, 31)
Split 4
Dropping nas from the training set...
(844, 31)
(844, 31)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29/split_4/train.csv...
(844, 31)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29/split_4/valid.csv...
(210, 31)
---
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29.parquet
/nrcan_p2/data/03_primary/keyword_prediction/splits
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29.parquet
MULTICLASS_s



Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29/split_2/train.csv...
(843, 31)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29/split_2/valid.csv...
(211, 31)
Split 3
Dropping nas from the training set...
(843, 31)
(843, 31)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29/split_3/train.csv...
(843, 31)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29/split_3/valid.csv...
(211, 31)
Split 4
Dropping nas from the training set...
(844, 31)
(844, 31)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_PL



(793, 31)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodrop-Feb29/split_1/train.csv...
(793, 31)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodrop-Feb29/split_1/valid.csv...
(211, 31)
Split 2
Dropping nas from the training set...
(843, 31)
(795, 31)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodrop-Feb29/split_2/train.csv...
(795, 31)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodrop-Feb29/split_2/valid.csv...
(211, 31)
Split 3
Dropping nas from the training set...
(843, 31)
(791, 31)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_



Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodrop-Feb29/split_4/valid.csv...
(210, 31)
---
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29.parquet
/nrcan_p2/data/03_primary/keyword_prediction/splits
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29.parquet
MULTICLASS_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29
MULTICLASS
BERT
(1054, 2)
(1054, 2)
Split 0
Dropping nas from the training set...
(843, 2)
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_0/train.json...
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_SIMP



Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_4/train.json...
(844, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_4/valid.json...
(210, 2)
---
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nodrop-Feb29.parquet
/nrcan_p2/data/03_primary/keyword_prediction/splits
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nodrop-Feb29.parquet
MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nodrop-Feb29
MULTICLASS
BERT
(1054, 2)
(1054, 2)
Split 0
Dropping nas from the training set...
(843, 2)
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_PLUS_No



Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_3/valid.json...
(211, 2)
Split 4
Dropping nas from the training set...
(844, 2)
(844, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_4/train.json...
(844, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_4/valid.json...
(210, 2)
---
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_80_None_nodrop-Feb29.parquet
/nrcan_p2/data/03_primary/keyword_prediction/splits
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_80_None_nodrop-Feb29.parquet
MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_80_None_nodrop-Feb29
MUL




(843, 2)
(793, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_80_None_nodrop-Feb29/split_1/train.json...
(793, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_80_None_nodrop-Feb29/split_1/valid.json...
(211, 2)
Split 2
Dropping nas from the training set...
(843, 2)
(795, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_80_None_nodrop-Feb29/split_2/train.json...
(795, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_80_None_nodrop-Feb29/split_2/valid.json...
(211, 2)
Split 3
Dropping nas from the training set...
(843, 2)
(791, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_80_None_nodrop-Feb29/split_3/train.json.



Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_90_None_nodrop-Feb29/split_1/train.json...
(569, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_90_None_nodrop-Feb29/split_1/valid.json...
(211, 2)
Split 2
Dropping nas from the training set...
(843, 2)
(565, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_90_None_nodrop-Feb29/split_2/train.json...
(565, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_90_None_nodrop-Feb29/split_2/valid.json...
(211, 2)
Split 3
Dropping nas from the training set...
(843, 2)
(565, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_50_3000_PIPELINE_BERT_90_None_nodrop-Feb29/split_3/train.json...
(565, 2)
Writing




Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29/split_0/train.csv...
(843, 6)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29/split_0/valid.csv...
(211, 6)
Split 1
Dropping nas from the training set...
(843, 6)
(843, 6)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29/split_1/train.csv...
(843, 6)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29/split_1/valid.csv...
(211, 6)
Split 2
Dropping nas from the training set...
(843, 6)
(843, 6)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_SIMPLE_PIPELI



(843, 6)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29/split_3/train.csv...
(843, 6)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29/split_3/valid.csv...
(211, 6)
Split 4
Dropping nas from the training set...
(844, 6)
(844, 6)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29/split_4/train.csv...
(844, 6)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29/split_4/valid.csv...
(210, 6)
---
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodrop-Feb29.parquet
/nrcan_p2/data/03_primary/keyword_predicti



Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodrop-Feb29/split_1/train.csv...
(793, 6)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodrop-Feb29/split_1/valid.csv...
(211, 6)
Split 2
Dropping nas from the training set...
(843, 6)
(795, 6)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodrop-Feb29/split_2/train.csv...
(795, 6)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodrop-Feb29/split_2/valid.csv...
(211, 6)
Split 3
Dropping nas from the training set...
(843, 6)
(791, 6)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodro



Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodrop-Feb29/split_0/train.csv...
(522, 6)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodrop-Feb29/split_0/valid.csv...
(211, 6)
Split 1
Dropping nas from the training set...
(843, 6)
(569, 6)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodrop-Feb29/split_1/train.csv...
(569, 6)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodrop-Feb29/split_1/valid.csv...
(211, 6)
Split 2
Dropping nas from the training set...
(843, 6)
(565, 6)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodro



(1054, 2)
(1054, 2)
Split 0
Dropping nas from the training set...
(843, 2)
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_0/train.json...
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_0/valid.json...
(211, 2)
Split 1
Dropping nas from the training set...
(843, 2)
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_1/train.json...
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_1/valid.json...
(211, 2)
Split 2
Dropping nas from the training set...
(843, 2)
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_



(1054, 2)
(1054, 2)
Split 0
Dropping nas from the training set...
(843, 2)
(784, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_80_None_nodrop-Feb29/split_0/train.json...
(784, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_80_None_nodrop-Feb29/split_0/valid.json...
(211, 2)
Split 1
Dropping nas from the training set...
(843, 2)
(793, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_80_None_nodrop-Feb29/split_1/train.json...
(793, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_80_None_nodrop-Feb29/split_1/valid.json...
(211, 2)
Split 2
Dropping nas from the training set...
(843, 2)
(795, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50



(1054, 2)
(1054, 2)
Split 0
Dropping nas from the training set...
(843, 2)
(522, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_90_None_nodrop-Feb29/split_0/train.json...
(522, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_90_None_nodrop-Feb29/split_0/valid.json...
(211, 2)
Split 1
Dropping nas from the training set...
(843, 2)
(569, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_90_None_nodrop-Feb29/split_1/train.json...
(569, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_PIPELINE_BERT_90_None_nodrop-Feb29/split_1/valid.json...
(211, 2)
Split 2
Dropping nas from the training set...
(843, 2)
(565, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50



Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_None_None_nodrop-Feb29/split_0/train.json...
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_None_None_nodrop-Feb29/split_0/valid.json...
(211, 2)
Split 1
Dropping nas from the training set...
(843, 2)
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_None_None_nodrop-Feb29/split_1/train.json...
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_None_None_nodrop-Feb29/split_1/valid.json...
(211, 2)
Split 2
Dropping nas from the training set...
(843, 2)
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_5_desc_en_en_50_3000_None_None_nodrop-Feb29/split_2/train.json...
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_s

ERROR: output dir /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodrop-Feb29 already exists
---
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_desc_t10_desc_en_en_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodrop-Feb29.parquet
/nrcan_p2/data/03_primary/keyword_prediction/splits
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_desc_t10_desc_en_en_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodrop-Feb29.parquet
MULTICLASS_small_subject_desc_t10_desc_en_en_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodrop-Feb29
MULTICLASS
GLOVE
ERROR: output dir /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodrop-Feb29 already exists
---
/nrcan_p2/data/03_primary/keyword_prediction/MULTICLASS_small_subject_desc_t10_desc_en_en_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29.parquet
/nrcan_p2/data/03_primary/keyword_prediction/splits
/



Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29/split_2/train.csv...
(843, 11)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29/split_2/valid.csv...
(211, 11)
Split 3
Dropping nas from the training set...
(843, 11)
(843, 11)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29/split_3/train.csv...
(843, 11)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29/split_3/valid.csv...
(211, 11)
Split 4
Dropping nas from the training set...
(844, 11)
(844, 11)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject



Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29/split_1/train.csv...
(843, 11)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29/split_1/valid.csv...
(211, 11)
Split 2
Dropping nas from the training set...
(843, 11)
(843, 11)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29/split_2/train.csv...
(843, 11)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29/split_2/valid.csv...
(211, 11)
Split 3
Dropping nas from the training set...
(843, 11)
(843, 11)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_e



Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodrop-Feb29/split_2/valid.csv...
(211, 11)
Split 3
Dropping nas from the training set...
(843, 11)
(791, 11)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodrop-Feb29/split_3/train.csv...
(791, 11)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodrop-Feb29/split_3/valid.csv...
(211, 11)
Split 4
Dropping nas from the training set...
(844, 11)
(809, 11)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodrop-Feb29/split_4/train.csv...
(809, 11)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_



Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodrop-Feb29/split_2/train.csv...
(565, 11)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodrop-Feb29/split_2/valid.csv...
(211, 11)
Split 3
Dropping nas from the training set...
(843, 11)
(565, 11)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodrop-Feb29/split_3/train.csv...
(565, 11)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodrop-Feb29/split_3/valid.csv...
(211, 11)
Split 4
Dropping nas from the training set...
(844, 11)
(591, 11)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_



Dropping nas from the training set...
(843, 2)
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_2/train.json...
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_2/valid.json...
(211, 2)
Split 3
Dropping nas from the training set...
(843, 2)
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_3/train.json...
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_3/valid.json...
(211, 2)
Split 4
Dropping nas from the training set...
(844, 2)
(844, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small



Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_1/train.json...
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_1/valid.json...
(211, 2)
Split 2
Dropping nas from the training set...
(843, 2)
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_2/train.json...
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_2/valid.json...
(211, 2)
Split 3
Dropping nas from the training set...
(843, 2)
(843, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_PLUS_None_nodrop-Feb2



Dropping nas from the training set...
(843, 2)
(793, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_80_None_nodrop-Feb29/split_1/train.json...
(793, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_80_None_nodrop-Feb29/split_1/valid.json...
(211, 2)
Split 2
Dropping nas from the training set...
(843, 2)
(795, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_80_None_nodrop-Feb29/split_2/train.json...
(795, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_en_en_50_3000_PIPELINE_BERT_80_None_nodrop-Feb29/split_2/valid.json...
(211, 2)
Split 3
Dropping nas from the training set...
(843, 2)
(791, 2)
Writing /nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_desc_t10_desc_e



## Investigate the datasets and produce some statistics

### Investigate the PAIRING datasets

In [62]:
# Echo the list of dataset file paths held in `input_files` for a visual check.
input_files

['/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29.parquet',
 '/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_title_merged_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29.parquet',
 '/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_title_merged_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nNone-Feb29.parquet',
 '/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_title_merged_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29.parquet',
 '/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_title_merged_SIMPLE_PIPELINE_BERT_3_None_nNone-Feb29.parquet',
 '/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nNone-Feb29.parquet',
 '/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_title_merged_PIPELINE_BERT_80_None_nNone-Feb29.parquet',
 '/nrcan_p2/data/03_primary/keyword_p

In [40]:
import pandas as pd
import tqdm

# A file is summarised only when it has exactly these three columns;
# files with any other schema are skipped silently.
pairing_columns = sorted(['label', 'keyword_text', 'cat'])

# Per-file summaries are collected in lists and concatenated after the loop.
label_frames = []
category_frames = []
size_frames = []

progress = tqdm.tqdm(zip(model_types, input_files), total=len(input_files))
for model_type, input_df_file in progress:
    fname = pathlib.Path(input_df_file).name
    data = pd.read_parquet(input_df_file)

    if sorted(data.columns) != pairing_columns:
        continue

    # Single row per file: raw `label` counts followed by `label` proportions.
    label_counts = data.label.value_counts().to_frame().transpose()
    label_shares = data.label.value_counts(normalize=True).to_frame().transpose()
    label_summary = pd.concat([label_counts, label_shares], axis=1)
    label_summary['file'] = fname
    label_frames.append(label_summary)

    # One row per distinct `cat` value: its count and its proportion.
    category_summary = pd.concat(
        [data.cat.value_counts(), data.cat.value_counts(normalize=True)],
        axis=1,
    )
    category_summary['file'] = fname
    category_frames.append(category_summary)

    # Single row per file: number of examples (rows) and number of columns.
    n_examples, n_cols = data.shape
    size_summary = pd.DataFrame({'examples': n_examples, 'cols': n_cols}, index=[0])
    size_summary['file'] = fname
    size_frames.append(size_summary)

pairing_label_dist = pd.concat(label_frames)
pairing_column_dist = pd.concat(category_frames)
pairing_size_dist = pd.concat(size_frames)

for summary in (pairing_label_dist, pairing_column_dist, pairing_size_dist):
    display(summary)

100%|██████████| 162/162 [00:10<00:00, 15.39it/s]


Unnamed: 0,0,1,0.1,1.1,file
label,240929,37632,0.864906,0.135094,PAIRING_small_subject_30_title_merged_SIMPLE_P...
label,240929,37632,0.864906,0.135094,PAIRING_small_subject_30_title_merged_PIPELINE...
label,240929,37632,0.864906,0.135094,PAIRING_small_subject_30_title_merged_PIPELINE...
label,240929,37632,0.864906,0.135094,PAIRING_small_subject_30_title_merged_PIPELINE...
label,240929,37632,0.864906,0.135094,PAIRING_small_subject_30_title_merged_SIMPLE_P...
...,...,...,...,...,...
label,22486,6419,0.777928,0.222072,PAIRING_small_subject_desc_t10_desc_en_en_50_3...
label,22486,6419,0.777928,0.222072,PAIRING_small_subject_desc_t10_desc_en_en_50_3...
label,22486,6419,0.777928,0.222072,PAIRING_small_subject_desc_t10_desc_en_en_50_3...
label,22486,6419,0.777928,0.222072,PAIRING_small_subject_desc_t10_desc_en_en_50_3...


Unnamed: 0,cat,cat.1,file
Precambrian,9292,0.033357,PAIRING_small_subject_30_title_merged_SIMPLE_P...
Quaternary,9290,0.033350,PAIRING_small_subject_30_title_merged_SIMPLE_P...
Ordovician,9290,0.033350,PAIRING_small_subject_30_title_merged_SIMPLE_P...
lithology,9290,0.033350,PAIRING_small_subject_30_title_merged_SIMPLE_P...
Devonian,9289,0.033346,PAIRING_small_subject_30_title_merged_SIMPLE_P...
...,...,...,...
stratigraphy,2890,0.099983,PAIRING_small_subject_desc_t10_desc_en_en_50_3...
geochemistry,2890,0.099983,PAIRING_small_subject_desc_t10_desc_en_en_50_3...
Paleozoic,2890,0.099983,PAIRING_small_subject_desc_t10_desc_en_en_50_3...
structural geology,2890,0.099983,PAIRING_small_subject_desc_t10_desc_en_en_50_3...


Unnamed: 0,examples,cols,file
0,278561,3,PAIRING_small_subject_30_title_merged_SIMPLE_P...
0,278561,3,PAIRING_small_subject_30_title_merged_PIPELINE...
0,278561,3,PAIRING_small_subject_30_title_merged_PIPELINE...
0,278561,3,PAIRING_small_subject_30_title_merged_PIPELINE...
0,278561,3,PAIRING_small_subject_30_title_merged_SIMPLE_P...
...,...,...,...
0,28905,3,PAIRING_small_subject_desc_t10_desc_en_en_50_3...
0,28905,3,PAIRING_small_subject_desc_t10_desc_en_en_50_3...
0,28905,3,PAIRING_small_subject_desc_t10_desc_en_en_50_3...
0,28905,3,PAIRING_small_subject_desc_t10_desc_en_en_50_3...


In [44]:
# Save the PAIRING summaries as CSV (paths are relative to the working
# directory) and render them. File names are shown in full for everything in
# this cell; row truncation is lifted only for the two per-file tables.
unlimited_rows = ('max_rows', None)
with pd.option_context('max_colwidth', None):
    with pd.option_context(*unlimited_rows):
        # replace the existing row labels with the file name
        x = pairing_label_dist.set_index('file', append=True).droplevel(0)
        x.to_csv('PAIRING_label_distribution.csv')
        display(x)

    display(pairing_column_dist)

    with pd.option_context(*unlimited_rows):
        x = pairing_size_dist.set_index('file')
        x.to_csv('PAIRING_size_distribution.csv')
        display(x)

Unnamed: 0_level_0,0,1,0,1
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PAIRING_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29.parquet,240929,37632,0.864906,0.135094
PAIRING_small_subject_30_title_merged_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29.parquet,240929,37632,0.864906,0.135094
PAIRING_small_subject_30_title_merged_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nNone-Feb29.parquet,240929,37632,0.864906,0.135094
PAIRING_small_subject_30_title_merged_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29.parquet,240929,37632,0.864906,0.135094
PAIRING_small_subject_30_title_merged_SIMPLE_PIPELINE_BERT_3_None_nNone-Feb29.parquet,240929,37632,0.864906,0.135094
PAIRING_small_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nNone-Feb29.parquet,240929,37632,0.864906,0.135094
PAIRING_small_subject_30_title_merged_PIPELINE_BERT_80_None_nNone-Feb29.parquet,240929,37632,0.864906,0.135094
PAIRING_small_subject_30_title_merged_PIPELINE_BERT_90_None_nNone-Feb29.parquet,240929,37632,0.864906,0.135094
PAIRING_small_subject_30_title_merged_None_None_nNone-Feb29.parquet,240929,37632,0.864906,0.135094
PAIRING_small_subject_30_desc_en_en_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29.parquet,88563,13815,0.865059,0.134941


Unnamed: 0,cat,cat.1,file
Precambrian,9292,0.033357,PAIRING_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29.parquet
Quaternary,9290,0.033350,PAIRING_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29.parquet
Ordovician,9290,0.033350,PAIRING_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29.parquet
lithology,9290,0.033350,PAIRING_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29.parquet
Devonian,9289,0.033346,PAIRING_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29.parquet
...,...,...,...
stratigraphy,2890,0.099983,PAIRING_small_subject_desc_t10_desc_en_en_50_3000_None_None_nNone-Feb29.parquet
geochemistry,2890,0.099983,PAIRING_small_subject_desc_t10_desc_en_en_50_3000_None_None_nNone-Feb29.parquet
Paleozoic,2890,0.099983,PAIRING_small_subject_desc_t10_desc_en_en_50_3000_None_None_nNone-Feb29.parquet
structural geology,2890,0.099983,PAIRING_small_subject_desc_t10_desc_en_en_50_3000_None_None_nNone-Feb29.parquet


Unnamed: 0_level_0,examples,cols
file,Unnamed: 1_level_1,Unnamed: 2_level_1
PAIRING_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29.parquet,278561,3
PAIRING_small_subject_30_title_merged_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29.parquet,278561,3
PAIRING_small_subject_30_title_merged_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nNone-Feb29.parquet,278561,3
PAIRING_small_subject_30_title_merged_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29.parquet,278561,3
PAIRING_small_subject_30_title_merged_SIMPLE_PIPELINE_BERT_3_None_nNone-Feb29.parquet,278561,3
PAIRING_small_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nNone-Feb29.parquet,278561,3
PAIRING_small_subject_30_title_merged_PIPELINE_BERT_80_None_nNone-Feb29.parquet,278561,3
PAIRING_small_subject_30_title_merged_PIPELINE_BERT_90_None_nNone-Feb29.parquet,278561,3
PAIRING_small_subject_30_title_merged_None_None_nNone-Feb29.parquet,278561,3
PAIRING_small_subject_30_desc_en_en_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29.parquet,102378,3


### Investigate the MULTICLASS datasets...

In [59]:
import pandas as pd
import tqdm

# Per-file summaries of the MULTICLASS datasets:
#   multiclass_label_dist - one row per (file, category column) holding the
#                           counts of each value followed by their fractions
#   multiclass_size_dist  - one row per file with its number of rows and columns
multiclass_label_dist = []
multiclass_size_dist = []
for model_type,input_df_file in tqdm.tqdm(zip(model_types,input_files), total=len(input_files)):
    fname = pathlib.Path(input_df_file).name

    # Only MULTICLASS files are summarised here, so skip everything else
    # before paying for the parquet read.
    if 'MULTICLASS' not in fname:
        continue
    data = pd.read_parquet(input_df_file)

    # Files whose only columns are keyword_text and label get one 0/1
    # indicator column per distinct label value.
    if sorted(data.columns) == sorted(['keyword_text', 'label']):
        vc = data.label.explode().value_counts()
        for item in vc.index:
            data[f'cat_{item}'] = data.label.apply(lambda x: 1 if item in x else 0)

    data_cats = []
    for cat in data.filter(regex='cat'):
        # Label each row with its column name explicitly. pandas >= 2.0 names
        # the value_counts() result 'count' / 'proportion' (and names its index
        # after the column), which would otherwise replace the category as the
        # row label and misalign the counts with the fractions.
        counts = data[cat].value_counts(normalize=False).rename(cat).rename_axis(None)
        fractions = data[cat].value_counts(normalize=True).rename(cat).rename_axis(None)
        x = pd.concat([counts.to_frame().transpose(),
                       fractions.to_frame().transpose()], axis=1)
        data_cats.append(x)

    x = pd.concat(data_cats)
    x['file'] = fname
    multiclass_label_dist.append(x)

    x = pd.DataFrame({'examples': data.shape[0], 'cols': data.shape[1]}, index=[0])
    x['file'] = fname
    multiclass_size_dist.append(x)

multiclass_label_dist = pd.concat(multiclass_label_dist)
multiclass_size_dist = pd.concat(multiclass_size_dist)

display(multiclass_label_dist)
display(multiclass_size_dist)

100%|██████████| 162/162 [00:33<00:00,  4.90it/s]


Unnamed: 0,0,1,0.1,1.1,file
cat_geophysics,7700,2280,0.771543,0.228457,MULTICLASS_small_subject_30_title_merged_SIMPL...
cat_stratigraphy,8093,1887,0.810922,0.189078,MULTICLASS_small_subject_30_title_merged_SIMPL...
cat_Quaternary,8207,1773,0.822345,0.177655,MULTICLASS_small_subject_30_title_merged_SIMPL...
cat_surficial geology/geomorphology,8465,1515,0.848196,0.151804,MULTICLASS_small_subject_30_title_merged_SIMPL...
cat_Precambrian,8446,1534,0.846293,0.153707,MULTICLASS_small_subject_30_title_merged_SIMPL...
...,...,...,...,...,...
cat_Quaternary,926,128,0.878558,0.121442,MULTICLASS_small_subject_desc_t10_desc_en_en_5...
cat_lithology,946,108,0.897533,0.102467,MULTICLASS_small_subject_desc_t10_desc_en_en_5...
cat_surficial geology/geomorphology,965,89,0.915560,0.084440,MULTICLASS_small_subject_desc_t10_desc_en_en_5...
cat_Paleozoic,968,86,0.918406,0.081594,MULTICLASS_small_subject_desc_t10_desc_en_en_5...


Unnamed: 0,examples,cols,file
0,9980,31,MULTICLASS_small_subject_30_title_merged_SIMPL...
0,9980,31,MULTICLASS_small_subject_30_title_merged_PIPEL...
0,9980,31,MULTICLASS_small_subject_30_title_merged_PIPEL...
0,9980,31,MULTICLASS_small_subject_30_title_merged_PIPEL...
0,9980,32,MULTICLASS_small_subject_30_title_merged_SIMPL...
...,...,...,...
0,1054,12,MULTICLASS_small_subject_desc_t10_desc_en_en_5...
0,1054,12,MULTICLASS_small_subject_desc_t10_desc_en_en_5...
0,1054,12,MULTICLASS_small_subject_desc_t10_desc_en_en_5...
0,1054,12,MULTICLASS_small_subject_desc_t10_desc_en_en_5...


In [61]:
# Save the MULTICLASS summaries as CSV (paths are relative to the working
# directory) and render both tables without truncating rows or file names.
with pd.option_context('max_colwidth', None, 'max_rows', None):
    # keep the existing row labels and add the file name as an extra index level
    x = multiclass_label_dist.set_index('file', append=True)
    x.to_csv('MULTICLASS_label_distribution.csv')
    display(x)

    x = multiclass_size_dist.set_index('file')
    x.to_csv('MULTICLASS_size_distribution.csv')
    display(x)

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,0,1
Unnamed: 0_level_1,file,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cat_geophysics,MULTICLASS_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29.parquet,7700,2280,0.771543,0.228457
cat_stratigraphy,MULTICLASS_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29.parquet,8093,1887,0.810922,0.189078
cat_Quaternary,MULTICLASS_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29.parquet,8207,1773,0.822345,0.177655
cat_surficial geology/geomorphology,MULTICLASS_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29.parquet,8465,1515,0.848196,0.151804
cat_Precambrian,MULTICLASS_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29.parquet,8446,1534,0.846293,0.153707
cat_structural geology,MULTICLASS_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29.parquet,8433,1547,0.84499,0.15501
cat_geochemistry,MULTICLASS_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29.parquet,8465,1515,0.848196,0.151804
cat_lithology,MULTICLASS_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29.parquet,8579,1401,0.859619,0.140381
cat_economic geology,MULTICLASS_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29.parquet,8630,1350,0.864729,0.135271
cat_Cenozoic,MULTICLASS_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29.parquet,8748,1232,0.876553,0.123447


Unnamed: 0_level_0,examples,cols
file,Unnamed: 1_level_1,Unnamed: 2_level_1
MULTICLASS_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29.parquet,9980,31
MULTICLASS_small_subject_30_title_merged_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29.parquet,9980,31
MULTICLASS_small_subject_30_title_merged_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodrop-Feb29.parquet,9980,31
MULTICLASS_small_subject_30_title_merged_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodrop-Feb29.parquet,9980,31
MULTICLASS_small_subject_30_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29.parquet,9980,32
MULTICLASS_small_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29.parquet,9980,32
MULTICLASS_small_subject_30_title_merged_PIPELINE_BERT_80_None_nodrop-Feb29.parquet,9980,32
MULTICLASS_small_subject_30_title_merged_PIPELINE_BERT_90_None_nodrop-Feb29.parquet,9980,32
MULTICLASS_small_subject_30_title_merged_None_None_nodrop-Feb29.parquet,9980,32
MULTICLASS_small_subject_30_desc_en_en_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29.parquet,1174,31


In [54]:
# One row per column of `data` whose name matches 'cat': the counts of each
# value followed by their fractions.
data_cats = []
for cat in data.filter(regex='cat'):
    # Label each row with its column name explicitly. pandas >= 2.0 names the
    # value_counts() result 'count' / 'proportion' (and names its index after
    # the column), which would otherwise replace the category as the row label
    # and misalign the counts with the fractions.
    counts = data[cat].value_counts(normalize=False).rename(cat).rename_axis(None)
    fractions = data[cat].value_counts(normalize=True).rename(cat).rename_axis(None)
    x = pd.concat([counts.to_frame().transpose(),
                   fractions.to_frame().transpose()], axis=1)
    data_cats.append(x)

pd.concat(data_cats)


Unnamed: 0,0,1,0.1,1.1
cat_geophysics,7700,2280,0.771543,0.228457
cat_stratigraphy,8093,1887,0.810922,0.189078
cat_Quaternary,8207,1773,0.822345,0.177655
cat_surficial geology/geomorphology,8465,1515,0.848196,0.151804
cat_Precambrian,8446,1534,0.846293,0.153707
cat_structural geology,8433,1547,0.84499,0.15501
cat_geochemistry,8465,1515,0.848196,0.151804
cat_lithology,8579,1401,0.859619,0.140381
cat_economic geology,8630,1350,0.864729,0.135271
cat_Cenozoic,8748,1232,0.876553,0.123447


## Investigate the split versions of the dataset

In [57]:
import json
import pandas as pd

# Record the number of rows of every existing dataset file (split == 'parent')
# and of every .json/.csv file found in its split_0 .. split_4 directories.
# `datums` ends up as a DataFrame with the columns size, ifile and split.
output_dir = '/nrcan_p2/data/03_primary/keyword_prediction/'
datums = []
for input_file in input_files:
    output_name = pathlib.Path(input_file).stem

    if not pathlib.Path(input_file).exists():
        continue
    input_data = pd.read_parquet(input_file)
    datums.append({'size': input_data.shape[0], 'ifile': str(input_file), 'split': 'parent'})

    # the splits are looked up under <output_dir>/splits/<dataset stem>/split_<i>/
    name = pathlib.Path(output_dir) / 'splits' / output_name
    print(name)
    for i in range(0,5):
        subdir = name / f'split_{i}'
        if not subdir.exists():
            continue
        for file in subdir.iterdir():
            if not file.exists():
                continue
            if file.suffix == '.json':
                data = pd.read_json(file, orient='table')
            elif file.suffix == '.csv':
                data = pd.read_csv(file)
            else:
                # Neither of the two formats read above: skip the entry rather
                # than record the size of whatever `data` held from the
                # previous file.
                continue

            datums.append({'size': data.shape[0], 'ifile': str(file), 'split':i})

datums=pd.DataFrame(datums)

/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nNone-Feb29
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nNone-Feb29
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_SIMPLE_PIPELINE_BERT_3_None_nNone-Feb29
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nNone-Feb29
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_PIPELINE_BERT_80_None_nNone-Feb29
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subjec

/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nodrop-Feb29
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nodrop-Feb29
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_PIPELINE_GLOVE_80_POSTPIPE_GLOVE_nodrop-Feb29
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_PIPELINE_GLOVE_90_POSTPIPE_GLOVE_nodrop-Feb29
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_PIPELINE_BERT_PLUS_None_nodrop-Feb29
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_small_subject_30_desc_en_en_PIPELINE_BERT_80_None_nodrop-Feb29
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTIC

In [58]:
# Break the '_'-separated dataset name of every row into descriptive columns:
# task, dsize, subj (subject column), textcol (text column) and rest3 (the
# tokens after the text column, without the final token).
def _n_textcol_tokens(tokens):
    """Return how many leading tokens of `tokens` spell the text column name."""
    if tokens[1] in ['merged']:
        return 2  # title_merged
    if tokens[:5] == ['desc', 'en', 'en', '50', '3000']:
        # desc_en_en_50_3000: without this case '50' and '3000' would be
        # reported as part of rest3 instead of the text column
        return 5
    return 3  # desc_en_en


is_parent = datums.split == 'parent'
path_parts = datums.ifile.str.split('/')

# Parent rows carry the dataset name in the file name itself; split rows carry
# it in the directory two levels above the file (<dataset>/split_<i>/<file>).
datums['parts'] = path_parts.str[-1].where(is_parent, path_parts.str[-3]).str.split('_')
# file name inside the split directory; left empty for parent rows
datums['tv'] = path_parts.str[-1].where(~is_parent)

datums['task'] = datums.parts.str[0]
datums['dsize'] = datums.parts.str[1]
datums['rest'] = datums.parts.str[2:]
# the subject column is two tokens long when its second token is 5 or 30
# (subject_5, subject_30) and three tokens long otherwise (subject_desc_t10)
datums['subj'] = datums.rest.apply(lambda x: x[0:2] if x[1] in ['5', '30'] else x[0:3])
datums['rest2'] = datums.rest.apply(lambda x: x[2:] if x[1] in ['5', '30'] else x[3:])
datums['textcol'] = datums.rest2.apply(lambda x: x[:_n_textcol_tokens(x)])
datums['rest3'] = datums.rest2.apply(lambda x: x[_n_textcol_tokens(x):-1])
datums

Unnamed: 0,size,ifile,split,parts,tv,task,dsize,rest,subj,rest2,textcol,rest3
0,278561,/nrcan_p2/data/03_primary/keyword_prediction/P...,parent,"[PAIRING, small, subject, 30, title, merged, S...",,PAIRING,small,"[subject, 30, title, merged, SIMPLE, PIPELINE,...","[subject, 30]","[title, merged, SIMPLE, PIPELINE, GLOVE, 3, PO...","[title, merged]","[SIMPLE, PIPELINE, GLOVE, 3, POSTPIPE, GLOVE]"
1,55634,/nrcan_p2/data/03_primary/keyword_prediction/s...,0,"[PAIRING, small, subject, 30, title, merged, S...",train.csv,PAIRING,small,"[subject, 30, title, merged, SIMPLE, PIPELINE,...","[subject, 30]","[title, merged, SIMPLE, PIPELINE, GLOVE, 3, PO...","[title, merged]","[SIMPLE, PIPELINE, GLOVE, 3, POSTPIPE, GLOVE]"
2,222867,/nrcan_p2/data/03_primary/keyword_prediction/s...,0,"[PAIRING, small, subject, 30, title, merged, S...",valid.csv,PAIRING,small,"[subject, 30, title, merged, SIMPLE, PIPELINE,...","[subject, 30]","[title, merged, SIMPLE, PIPELINE, GLOVE, 3, PO...","[title, merged]","[SIMPLE, PIPELINE, GLOVE, 3, POSTPIPE, GLOVE]"
3,55607,/nrcan_p2/data/03_primary/keyword_prediction/s...,1,"[PAIRING, small, subject, 30, title, merged, S...",train.csv,PAIRING,small,"[subject, 30, title, merged, SIMPLE, PIPELINE,...","[subject, 30]","[title, merged, SIMPLE, PIPELINE, GLOVE, 3, PO...","[title, merged]","[SIMPLE, PIPELINE, GLOVE, 3, POSTPIPE, GLOVE]"
4,222864,/nrcan_p2/data/03_primary/keyword_prediction/s...,1,"[PAIRING, small, subject, 30, title, merged, S...",valid.csv,PAIRING,small,"[subject, 30, title, merged, SIMPLE, PIPELINE,...","[subject, 30]","[title, merged, SIMPLE, PIPELINE, GLOVE, 3, PO...","[title, merged]","[SIMPLE, PIPELINE, GLOVE, 3, POSTPIPE, GLOVE]"
...,...,...,...,...,...,...,...,...,...,...,...,...
943,235,/nrcan_p2/data/03_primary/keyword_prediction/s...,2,"[MULTICLASS, small, subject, desc, t10, desc, ...",valid.json,MULTICLASS,small,"[subject, desc, t10, desc, en, en, None, None,...","[subject, desc, t10]","[desc, en, en, None, None, nodrop-Feb29]","[desc, en, en]","[None, None]"
944,939,/nrcan_p2/data/03_primary/keyword_prediction/s...,3,"[MULTICLASS, small, subject, desc, t10, desc, ...",train.json,MULTICLASS,small,"[subject, desc, t10, desc, en, en, None, None,...","[subject, desc, t10]","[desc, en, en, None, None, nodrop-Feb29]","[desc, en, en]","[None, None]"
945,235,/nrcan_p2/data/03_primary/keyword_prediction/s...,3,"[MULTICLASS, small, subject, desc, t10, desc, ...",valid.json,MULTICLASS,small,"[subject, desc, t10, desc, en, en, None, None,...","[subject, desc, t10]","[desc, en, en, None, None, nodrop-Feb29]","[desc, en, en]","[None, None]"
946,940,/nrcan_p2/data/03_primary/keyword_prediction/s...,4,"[MULTICLASS, small, subject, desc, t10, desc, ...",train.json,MULTICLASS,small,"[subject, desc, t10, desc, en, en, None, None,...","[subject, desc, t10]","[desc, en, en, None, None, nodrop-Feb29]","[desc, en, en]","[None, None]"


In [59]:
# Descriptive columns of the full ("parent") dataset files only.
parent_rows = datums.split == 'parent'
datums.loc[parent_rows].filter(regex='size|split|task|dsize|tv|subj|textcol|rest3')

Unnamed: 0,size,split,tv,task,dsize,subj,textcol,rest3
0,278561,parent,,PAIRING,small,"[subject, 30]","[title, merged]","[SIMPLE, PIPELINE, GLOVE, 3, POSTPIPE, GLOVE]"
11,278561,parent,,PAIRING,small,"[subject, 30]","[title, merged]","[PIPELINE, GLOVE, PLUS, POSTPIPE, GLOVE]"
22,278561,parent,,PAIRING,small,"[subject, 30]","[title, merged]","[PIPELINE, GLOVE, 80, POSTPIPE, GLOVE]"
33,278561,parent,,PAIRING,small,"[subject, 30]","[title, merged]","[PIPELINE, GLOVE, 90, POSTPIPE, GLOVE]"
44,278561,parent,,PAIRING,small,"[subject, 30]","[title, merged]","[SIMPLE, PIPELINE, BERT, 3, None]"
...,...,...,...,...,...,...,...,...
893,1174,parent,,MULTICLASS,small,"[subject, desc, t10]","[desc, en, en]","[SIMPLE, PIPELINE, BERT, 3, None]"
904,1174,parent,,MULTICLASS,small,"[subject, desc, t10]","[desc, en, en]","[PIPELINE, BERT, PLUS, None]"
915,1174,parent,,MULTICLASS,small,"[subject, desc, t10]","[desc, en, en]","[PIPELINE, BERT, 80, None]"
926,1174,parent,,MULTICLASS,small,"[subject, desc, t10]","[desc, en, en]","[PIPELINE, BERT, 90, None]"


In [None]:
# Descriptive columns of the rows whose split is 'parent'.
datums.loc[lambda frame: frame.split == 'parent'].filter(
    regex='size|split|task|dsize|tv|subj|textcol|rest3'
)

In [48]:
# Descriptive columns of the valid.csv file of every PAIRING split, with no
# limit on the number of rows shown.
is_pairing_valid = (
    (datums.split != 'parent')
    & (datums.task == 'PAIRING')
    & (datums.tv == 'valid.csv')
)
with pd.option_context('max_rows', None):
    display(datums.loc[is_pairing_valid].filter(regex='size|split|task|dsize|tv|subj|textcol|rest3'))

Unnamed: 0,size,split,task,dsize,tv,subj,textcol,rest3
2,222867,0,PAIRING,small,valid.csv,"[subject, 30]","[title, merged]","[SIMPLE, PIPELINE, GLOVE, 3, POSTPIPE, GLOVE]"
4,222864,1,PAIRING,small,valid.csv,"[subject, 30]","[title, merged]","[SIMPLE, PIPELINE, GLOVE, 3, POSTPIPE, GLOVE]"
6,222867,2,PAIRING,small,valid.csv,"[subject, 30]","[title, merged]","[SIMPLE, PIPELINE, GLOVE, 3, POSTPIPE, GLOVE]"
8,222857,3,PAIRING,small,valid.csv,"[subject, 30]","[title, merged]","[SIMPLE, PIPELINE, GLOVE, 3, POSTPIPE, GLOVE]"
10,222864,4,PAIRING,small,valid.csv,"[subject, 30]","[title, merged]","[SIMPLE, PIPELINE, GLOVE, 3, POSTPIPE, GLOVE]"
13,222867,0,PAIRING,small,valid.csv,"[subject, 30]","[title, merged]","[PIPELINE, GLOVE, PLUS, POSTPIPE, GLOVE]"
15,222864,1,PAIRING,small,valid.csv,"[subject, 30]","[title, merged]","[PIPELINE, GLOVE, PLUS, POSTPIPE, GLOVE]"
17,222867,2,PAIRING,small,valid.csv,"[subject, 30]","[title, merged]","[PIPELINE, GLOVE, PLUS, POSTPIPE, GLOVE]"
19,222857,3,PAIRING,small,valid.csv,"[subject, 30]","[title, merged]","[PIPELINE, GLOVE, PLUS, POSTPIPE, GLOVE]"
21,222864,4,PAIRING,small,valid.csv,"[subject, 30]","[title, merged]","[PIPELINE, GLOVE, PLUS, POSTPIPE, GLOVE]"


In [45]:
# Descriptive columns of every MULTICLASS split file; the default row limit of
# the display applies.
is_multiclass_split = (datums.split != 'parent') & (datums.task == 'MULTICLASS')
display(datums.loc[is_multiclass_split].filter(regex='size|split|task|dsize|tv|subj|textcol|rest3'))

Unnamed: 0,size,split,task,dsize,tv,subj,textcol,rest3
267,7968,0,MULTICLASS,small,train.json,"[subject, 30]","[title, merged]","[SIMPLE, PIPELINE, BERT, 3, None]"
268,1996,0,MULTICLASS,small,valid.json,"[subject, 30]","[title, merged]","[SIMPLE, PIPELINE, BERT, 3, None]"
269,7972,1,MULTICLASS,small,train.json,"[subject, 30]","[title, merged]","[SIMPLE, PIPELINE, BERT, 3, None]"
270,1996,1,MULTICLASS,small,valid.json,"[subject, 30]","[title, merged]","[SIMPLE, PIPELINE, BERT, 3, None]"
271,7971,2,MULTICLASS,small,train.json,"[subject, 30]","[title, merged]","[SIMPLE, PIPELINE, BERT, 3, None]"
...,...,...,...,...,...,...,...,...
433,12124,2,MULTICLASS,large,valid.json,"[subject, desc, t10]","[title, merged]","[PIPELINE, BERT, PLUS, None]"
434,47642,3,MULTICLASS,large,train.json,"[subject, desc, t10]","[title, merged]","[PIPELINE, BERT, PLUS, None]"
435,12124,3,MULTICLASS,large,valid.json,"[subject, desc, t10]","[title, merged]","[PIPELINE, BERT, PLUS, None]"
436,47620,4,MULTICLASS,large,train.json,"[subject, desc, t10]","[title, merged]","[PIPELINE, BERT, PLUS, None]"


In [28]:
# Print the shapes of two PAIRING_small_subject_30_desc_en_en dataset files:
# the None_None one and the PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE one.
unprocessed_path = '/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_desc_en_en_None_None_nNone-Feb29.parquet'
glove_plus_path = '/nrcan_p2/data/03_primary/keyword_prediction/PAIRING_small_subject_30_desc_en_en_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29.parquet'

a, b = (pd.read_parquet(path) for path in (unprocessed_path, glove_plus_path))

for frame in (a, b):
    print(frame.shape)

(102378, 3)
(102378, 3)


In [25]:
# For every split, show the number of `datums` rows per input file.
# Grouping by the plain column name (not a one-element list) keeps `split` a
# scalar: pandas >= 2.0 yields 1-tuples such as (0,) for a list grouper.
# NOTE(review): this needs an `infile` column, while the cell that builds
# `datums` from the split files stores the path under `ifile` (with a
# precomputed `size`) - confirm which layout this cell is meant to run against.
for split, group in datums.groupby('split'):
    print(split)
    display(group.groupby('infile').size().to_frame())

0


Unnamed: 0_level_0,0
infile,Unnamed: 1_level_1
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_None_None_nNone-Feb29/split_0/train.csv,20466
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_None_None_nNone-Feb29/split_0/valid.csv,81912
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29/split_0/train.csv,20433
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29/split_0/valid.csv,81825
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_0/train.csv,20468
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_0/valid.csv,81880
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_None_None_nNone-Feb29/split_0/train.csv,55694
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_None_None_nNone-Feb29/split_0/valid.csv,222867
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nNone-Feb29/split_0/train.csv,55394
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nNone-Feb29/split_0/valid.csv,221665


1


Unnamed: 0_level_0,0
infile,Unnamed: 1_level_1
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_None_None_nNone-Feb29/split_1/train.csv,20463
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_None_None_nNone-Feb29/split_1/valid.csv,81915
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29/split_1/train.csv,20433
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29/split_1/valid.csv,81825
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_1/train.csv,20460
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_1/valid.csv,81888
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_None_None_nNone-Feb29/split_1/train.csv,55697
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_None_None_nNone-Feb29/split_1/valid.csv,222864
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nNone-Feb29/split_1/train.csv,55386
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nNone-Feb29/split_1/valid.csv,221673


2


Unnamed: 0_level_0,0
infile,Unnamed: 1_level_1
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_None_None_nNone-Feb29/split_2/train.csv,20460
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_None_None_nNone-Feb29/split_2/valid.csv,81918
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29/split_2/train.csv,20430
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29/split_2/valid.csv,81828
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_2/train.csv,20460
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_2/valid.csv,81888
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_None_None_nNone-Feb29/split_2/train.csv,55694
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_None_None_nNone-Feb29/split_2/valid.csv,222867
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nNone-Feb29/split_2/train.csv,55393
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nNone-Feb29/split_2/valid.csv,221666


3


Unnamed: 0_level_0,0
infile,Unnamed: 1_level_1
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_None_None_nNone-Feb29/split_3/train.csv,20463
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_None_None_nNone-Feb29/split_3/valid.csv,81915
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29/split_3/train.csv,20432
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29/split_3/valid.csv,81826
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_3/train.csv,20463
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_3/valid.csv,81885
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_None_None_nNone-Feb29/split_3/train.csv,55704
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_None_None_nNone-Feb29/split_3/valid.csv,222857
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nNone-Feb29/split_3/train.csv,55388
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nNone-Feb29/split_3/valid.csv,221671


4


Unnamed: 0_level_0,0
infile,Unnamed: 1_level_1
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_None_None_nNone-Feb29/split_4/train.csv,20462
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_None_None_nNone-Feb29/split_4/valid.csv,81916
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29/split_4/train.csv,20437
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_PIPELINE_GLOVE_PLUS_POSTPIPE_GLOVE_nNone-Feb29/split_4/valid.csv,81821
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_4/train.csv,20463
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_desc_en_en_SIMPLE_PIPELINE_GLOVE_3_POSTPIPE_GLOVE_nNone-Feb29/split_4/valid.csv,81885
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_None_None_nNone-Feb29/split_4/train.csv,55697
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_None_None_nNone-Feb29/split_4/valid.csv,222864
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nNone-Feb29/split_4/train.csv,55387
/nrcan_p2/data/03_primary/keyword_prediction/splits/PAIRING_small_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nNone-Feb29/split_4/valid.csv,221672


In [39]:
# For every split, show the number of `datums` rows per input file.
# Grouping by the plain column name (not a one-element list) keeps `split` a
# scalar: pandas >= 2.0 yields 1-tuples such as (0,) for a list grouper.
# NOTE(review): this needs an `infile` column, while the cell that builds
# `datums` from the split files stores the path under `ifile` (with a
# precomputed `size`) - confirm which layout this cell is meant to run against.
for split, group in datums.groupby('split'):
    print(split)
    display(group.groupby('infile').size().to_frame())

0


Unnamed: 0_level_0,0
infile,Unnamed: 1_level_1
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_0/train.json,48496
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_0/valid.json,12125
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_0/train.json,48496
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_0/valid.json,12125
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_0/train.json,48496
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_0/valid.json,12125
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_0/train.json,48496
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_0/valid.json,12125
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_desc_t10_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_0/train.json,48496
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_desc_t10_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_0/valid.json,12125


1


Unnamed: 0_level_0,0
infile,Unnamed: 1_level_1
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_1/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_1/valid.json,12124
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_1/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_1/valid.json,12124
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_1/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_1/valid.json,12124
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_1/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_1/valid.json,12124
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_desc_t10_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_1/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_desc_t10_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_1/valid.json,12124


2


Unnamed: 0_level_0,0
infile,Unnamed: 1_level_1
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_2/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_2/valid.json,12124
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_2/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_2/valid.json,12124
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_2/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_2/valid.json,12124
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_2/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_2/valid.json,12124
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_desc_t10_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_2/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_desc_t10_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_2/valid.json,12124


3


Unnamed: 0_level_0,0
infile,Unnamed: 1_level_1
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_3/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_3/valid.json,12124
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_3/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_3/valid.json,12124
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_3/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_3/valid.json,12124
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_3/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_3/valid.json,12124
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_desc_t10_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_3/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_desc_t10_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_3/valid.json,12124


4


Unnamed: 0_level_0,0
infile,Unnamed: 1_level_1
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_4/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_4/valid.json,12124
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_4/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_30_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_4/valid.json,12124
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_4/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_4/valid.json,12124
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_4/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_5_title_merged_SIMPLE_PIPELINE_BERT_3_None_nodrop-Feb29/split_4/valid.json,12124
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_desc_t10_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_4/train.json,48497
/nrcan_p2/data/03_primary/keyword_prediction/splits/MULTICLASS_large_subject_desc_t10_title_merged_PIPELINE_BERT_PLUS_None_nodrop-Feb29/split_4/valid.json,12124
