In [1]:
import pandas as pd
import numpy as np
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load only `train_t2t_data_v2.csv` from the `indonlp/nusa_t2t` dataset and keep
# its `train` split (the split a bare `data_files` string is exposed under here).
dset_v2 = datasets.load_dataset(
    path='indonlp/nusa_t2t',
    data_files='train_t2t_data_v2.csv',
)['train']

In [3]:
# Load only `nusaparagraph_data.csv` from the `indonlp/nusa_t2t` dataset and keep
# its `train` split.
nusa_paragraph_dset = datasets.load_dataset(
    path='indonlp/nusa_t2t',
    data_files='nusaparagraph_data.csv',
)['train']

In [17]:
%%time
# Materialise the v2 dataset as a pandas DataFrame (the timed, expensive step),
# then show how many rows each `subset_name` contributes.
df = dset_v2.to_pandas()
df.groupby(by='subset_name').size()

CPU times: user 9.3 s, sys: 2.75 s, total: 12.1 s
Wall time: 12 s


subset_name
indo_puisi            144460
wikihow                97848
wikipedia_ace          22270
wikipedia_ban          92778
wikipedia_bjn          55585
wikipedia_bug            633
wikipedia_gor          62076
wikipedia_id         6069267
wikipedia_jv          375368
wikipedia_map-bms      64048
wikipedia_min        1431747
wikipedia_ms         2094812
wikipedia_nia           8865
wikipedia_su          465517
wikipedia_tet           7262
dtype: int64

In [10]:
# Identity prompts: read the CSV, rename its Prompt/Answer columns to the
# input/output names used by the other frames, and tag every row with constant
# metadata columns.
idf = (
    pd.read_csv('v2_data/identity_prompt.csv')
    .rename(columns={'Prompt': 'input', 'Answer': 'output'})
    .assign(
        dataset_name='nusa_t2t_v2',
        subset_name='identity_prompt',
        prompt_id='identity_prompt',
        template_name='identity_prompt',
        dataset_key='identity_prompt',
    )
)

# Upsampling: every row is repeated 500 times (copies of a row stay adjacent),
# then the index is rebuilt as a fresh 0..n-1 range.
idf = idf.loc[np.repeat(idf.index, 500)].reset_index(drop=True)

In [11]:
# Safety prompts: read the CSV, rename its Prompt/Answer columns to the
# input/output names used by the other frames, and tag every row with constant
# metadata columns.
sdf = (
    pd.read_csv('v2_data/safety_prompt.csv')
    .rename(columns={'Prompt': 'input', 'Answer': 'output'})
    .assign(
        dataset_name='nusa_t2t_v2',
        subset_name='safety_prompt',
        prompt_id='safety_prompt',
        template_name='safety_prompt',
        dataset_key='safety_prompt',
    )
)

# Upsampling: every row is repeated 500 times (copies of a row stay adjacent),
# then the index is rebuilt as a fresh 0..n-1 range.
sdf = sdf.loc[np.repeat(sdf.index, 500)].reset_index(drop=True)

In [12]:
# Dolly instruction data (presumably machine-translated with NLLB-200 3.3B,
# judging by the file name -- TODO confirm). Only `subset_name` is set here; the
# remaining columns come straight from the CSV.
ddf = (
    pd.read_csv('v2_data/databricks-dolly-15k_mt_nllb-200-3.3B.csv')
    .assign(subset_name='dolly')
)

# Upsampling: every row is repeated 5 times, then the index is rebuilt.
ddf = ddf.loc[np.repeat(ddf.index, 5)].reset_index(drop=True)

In [24]:
# Sanity check before concatenation: (rows, columns) of the three upsampled
# frames, followed by the row counts of the two loaded datasets.
(
    sdf.shape,
    idf.shape,
    ddf.shape,
    len(dset_v2),
    len(nusa_paragraph_dset),
)

((93500, 7), (62500, 7), (75055, 7), 10992536, 1586799)

In [25]:
# Wrap the three upsampled pandas frames as Datasets (safety, identity, dolly --
# in that order) and append them after the two datasets loaded earlier.
pandas_parts = [datasets.Dataset.from_pandas(frame) for frame in (sdf, idf, ddf)]
dset_v2_aug = datasets.concatenate_datasets(
    [dset_v2, nusa_paragraph_dset, *pandas_parts]
)

In [27]:
# Persist the augmented dataset locally so it can be reloaded without
# rebuilding it from the source files.
dset_v2_aug.save_to_disk(dataset_path='./cache/dset_v2_aug')

Saving the dataset (23/23 shards): 100%|██████████| 12810390/12810390 [00:12<00:00, 1067305.45 examples/s]


In [5]:
# Reload the augmented dataset from the local cache directory written by
# `save_to_disk`.
dset_v2_aug = datasets.load_from_disk(dataset_path='./cache/dset_v2_aug')

In [7]:
# Display the dataset summary (feature names and row count) as the cell output.
dset_v2_aug

Dataset({
    features: ['dataset_name', 'subset_name', 'prompt_id', 'template_name', 'dataset_key', 'input', 'output'],
    num_rows: 12810390
})

In [None]:
# Upload the augmented dataset to the `indonlp/nusa_t2t_v2` repository.
dset_v2_aug.push_to_hub(repo_id='indonlp/nusa_t2t_v2')

Uploading the dataset shards:   0%|          | 0/23 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|          | 0/557 [00:00<?, ?ba/s][A
Creating parquet from Arrow format:   3%|▎         | 18/557 [00:00<00:03, 172.63ba/s][A
Creating parquet from Arrow format:   9%|▊         | 48/557 [00:00<00:02, 245.75ba/s][A
Creating parquet from Arrow format:  15%|█▍        | 81/557 [00:00<00:01, 280.89ba/s][A
Creating parquet from Arrow format:  20%|██        | 113/557 [00:00<00:01, 294.35ba/s][A
Creating parquet from Arrow format:  26%|██▌       | 145/557 [00:00<00:01, 302.82ba/s][A
Creating parquet from Arrow format:  36%|███▌      | 199/557 [00:00<00:00, 382.56ba/s][A
Creating parquet from Arrow format:  47%|████▋     | 260/557 [00:00<00:00, 456.20ba/s][A
Creating parquet from Arrow format:  61%|██████    | 339/557 [00:00<00:00, 561.65ba/s][A
Creating parquet from Arrow format:  76%|███████▌  | 424/557 [00:00<00:00, 650.39ba/s][A
Creating parquet from Arrow format: 100%|████

## XLM Style Resampling (We don't use it)

In [108]:
# Stack the v2 frame with the identity-prompt and dolly frames. The
# safety-prompt frame (`sdf`) is not part of this concat.
v2_df = pd.concat([df, idf, ddf])

# Share of all rows held by each subset, largest first.
v2_df.groupby('subset_name').size().sort_values(ascending=False).div(len(v2_df))

In [279]:
# Empirical distribution: share of all rows held by each subset, largest first.
v2_dist_df = v2_df.groupby('subset_name').size().sort_values(ascending=False) / len(v2_df)

# Target distribution: square-root-smoothed shares, renormalised to sum to 1.
# The 0.5 exponent flattens the distribution, so small subsets gain share.
# (Fixed: this line previously read the undefined name `v2_dist`, which raises
# NameError on a fresh kernel; the value computed above is `v2_dist_df`.)
sr_df = (v2_dist_df ** 0.5) / (v2_dist_df ** 0.5).sum()

# Ratio of target share to empirical share for each subset.
adj_sr_df = sr_df / v2_dist_df

# Round each ratio up to a whole number to get a per-subset repeat count.
multiplier_df = np.ceil(adj_sr_df)

In [283]:
# Build the resampled frame by stacking each subset's rows `repeats` times.
dfs = []
for name, multiplier in multiplier_df.items():
    # `identity_prompt` is pinned to 1000 copies; every other subset uses its
    # computed multiplier, with a floor of one copy.
    repeats = 1000 if name == 'identity_prompt' else max(int(multiplier), 1)
    subset_rows = v2_df.loc[v2_df['subset_name'] == name]
    dfs.extend([subset_rows] * repeats)
adj_v2_df = pd.concat(dfs)

In [291]:
# Share of the resampled frame held by each subset, largest first.
adj_v2_df.groupby('subset_name').size().div(len(adj_v2_df)).sort_values(ascending=False)

subset_name
wikipedia_id         0.368688
wikipedia_min        0.173948
wikipedia_ms         0.127253
wikipedia_jv         0.068407
wikipedia_su         0.056557
indo_puisi           0.035102
wikihow              0.029720
wikipedia_ban        0.028180
wikipedia_map-bms    0.023344
wikipedia_gor        0.022625
wikipedia_bjn        0.020260
wikipedia_ace        0.012175
dolly                0.010031
identity_prompt      0.007593
wikipedia_nia        0.007539
wikipedia_tet        0.006617
wikipedia_bug        0.001961
dtype: float64