In [22]:
import os

from datasets import load_from_disk, load_dataset, DatasetDict

### Downloading dataset from HF

#### For training of Fluency, Coherence, Clarity we use wanyu/IteraTeR_v2
#### For val/test of Fluency, Coherence, Clarity we use wanyu/IteraTeR_human_sent

In [8]:
# Fetch the IteraTeR_v2 dataset and keep only its 'train' split.
dataset_name = 'wanyu/IteraTeR_v2'
iterater_v2_splits = load_dataset(dataset_name)
ds_train = iterater_v2_splits['train']
ds_train

Dataset({
    features: ['before_sent', 'after_sent', 'before_sent_with_intent', 'labels', 'confidence', 'doc_id', 'revision_depth'],
    num_rows: 293929
})

In [9]:
# Fetch the human-annotated sentence-level dataset and keep its 'validation' split.
dataset_name = 'wanyu/IteraTeR_human_sent'
human_sent_splits = load_dataset(dataset_name)
ds_val = human_sent_splits['validation']
ds_val

Dataset({
    features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'doc_id', 'revision_depth'],
    num_rows: 400
})

In [10]:
# Fetch the human-annotated sentence-level dataset and keep its 'test' split.
dataset_name = 'wanyu/IteraTeR_human_sent'
human_sent_splits = load_dataset(dataset_name)
ds_test = human_sent_splits['test']
ds_test

Dataset({
    features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'doc_id', 'revision_depth'],
    num_rows: 364
})

#### Exploring Fluency data

In [12]:
# Keep only training rows labelled 'fluency' whose `after_sent` has length > 1.
# One pass over `ds_train` is enough: the predicate already contains the label
# check, so a separate label-only filter beforehand would add nothing.
ds_train_fluency = ds_train.filter(
    lambda ex: ex['labels'] == 'fluency' and len(ex["after_sent"]) > 1
)

ds_train_fluency

Filter: 100%|██████████| 126895/126895 [00:03<00:00, 41313.12 examples/s]


Dataset({
    features: ['before_sent', 'after_sent', 'before_sent_with_intent', 'labels', 'confidence', 'doc_id', 'revision_depth'],
    num_rows: 126892
})

In [13]:
# Validation rows labelled 'fluency' whose `after_sent` has length > 1.
ds_val_fluency = ds_val.filter(
    lambda row: row['labels'] == 'fluency' and len(row["after_sent"]) > 1
)
ds_val_fluency

Filter: 100%|██████████| 400/400 [00:00<00:00, 2549.67 examples/s]


Dataset({
    features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'doc_id', 'revision_depth'],
    num_rows: 115
})

In [14]:
# Test rows labelled 'fluency' whose `after_sent` has length > 1.
ds_test_fluency = ds_test.filter(
    lambda row: row['labels'] == 'fluency' and len(row["after_sent"]) > 1
)
ds_test_fluency

Filter: 100%|██████████| 364/364 [00:00<00:00, 1602.00 examples/s]


Dataset({
    features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'doc_id', 'revision_depth'],
    num_rows: 88
})

#### Exploring Clarity data

In [15]:
# Training rows labelled 'clarity' whose `after_sent` has length > 1.
ds_train_clarity = ds_train.filter(
    lambda row: row['labels'] == 'clarity' and len(row["after_sent"]) > 1
)
ds_train_clarity

Filter: 100%|██████████| 293929/293929 [00:01<00:00, 160628.50 examples/s]


Dataset({
    features: ['before_sent', 'after_sent', 'before_sent_with_intent', 'labels', 'confidence', 'doc_id', 'revision_depth'],
    num_rows: 119300
})

In [16]:
# Validation rows labelled 'clarity' whose `after_sent` has length > 1.
ds_val_clarity = ds_val.filter(
    lambda row: row['labels'] == 'clarity' and len(row["after_sent"]) > 1
)
ds_val_clarity

Filter: 100%|██████████| 400/400 [00:00<00:00, 4782.26 examples/s]


Dataset({
    features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'doc_id', 'revision_depth'],
    num_rows: 157
})

In [17]:
# Test rows labelled 'clarity' whose `after_sent` has length > 1.
ds_test_clarity = ds_test.filter(
    lambda row: row['labels'] == 'clarity' and len(row["after_sent"]) > 1
)
ds_test_clarity

Filter: 100%|██████████| 364/364 [00:00<00:00, 6953.70 examples/s]


Dataset({
    features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'doc_id', 'revision_depth'],
    num_rows: 185
})

#### Exploring Coherence data

In [18]:
# Training rows labelled 'coherence' whose `after_sent` has length > 1.
ds_train_coherence = ds_train.filter(
    lambda row: row['labels'] == 'coherence' and len(row["after_sent"]) > 1
)
ds_train_coherence

Filter: 100%|██████████| 293929/293929 [00:01<00:00, 161893.27 examples/s]


Dataset({
    features: ['before_sent', 'after_sent', 'before_sent_with_intent', 'labels', 'confidence', 'doc_id', 'revision_depth'],
    num_rows: 38450
})

In [19]:
# Validation rows labelled 'coherence' whose `after_sent` has length > 1.
ds_val_coherence = ds_val.filter(
    lambda row: row['labels'] == 'coherence' and len(row["after_sent"]) > 1
)
ds_val_coherence

Filter: 100%|██████████| 400/400 [00:00<00:00, 12955.28 examples/s]


Dataset({
    features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'doc_id', 'revision_depth'],
    num_rows: 41
})

In [20]:
# Test rows labelled 'coherence' whose `after_sent` has length > 1.
ds_test_coherence = ds_test.filter(
    lambda row: row['labels'] == 'coherence' and len(row["after_sent"]) > 1
)
ds_test_coherence

Filter: 100%|██████████| 364/364 [00:00<00:00, 10810.21 examples/s]


Dataset({
    features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'doc_id', 'revision_depth'],
    num_rows: 35
})

### Saving data to disk

In [27]:
# Root directory under which each task's DatasetDict is written.
# Set the ITERATER_DATASETS_DIR environment variable to save somewhere else;
# when it is unset or empty, the cluster scratch location below is used.
# os.path.join(..., '') guarantees a trailing separator, which matters because
# the save cells build their target with plain concatenation (dir_path + name).
dir_path = os.path.join(
    os.environ.get('ITERATER_DATASETS_DIR')
    or '/scratch3/workspace/wenlongzhao_umass_edu-reason/dev_kedar/Small-LLM-Reasoning/datasets/',
    '',
)

In [23]:
# Bundle the three fluency subsets under the split names 'train', 'val', 'test'.
# NOTE: per the displayed schemas, the train split carries an extra
# 'confidence' column that the val/test splits do not have.
ds_fluency = DatasetDict(
    train=ds_train_fluency,
    val=ds_val_fluency,
    test=ds_test_fluency,
)
ds_fluency
    

DatasetDict({
    train: Dataset({
        features: ['before_sent', 'after_sent', 'before_sent_with_intent', 'labels', 'confidence', 'doc_id', 'revision_depth'],
        num_rows: 126892
    })
    val: Dataset({
        features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'doc_id', 'revision_depth'],
        num_rows: 115
    })
    test: Dataset({
        features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'doc_id', 'revision_depth'],
        num_rows: 88
    })
})

In [28]:
# Write all fluency splits to the 'fluency' folder under dir_path.
ds_fluency.save_to_disk(f"{dir_path}fluency")

Saving the dataset (1/1 shards): 100%|██████████| 126892/126892 [00:00<00:00, 136717.23 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 115/115 [00:00<00:00, 26590.13 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 88/88 [00:00<00:00, 23061.47 examples/s]


In [29]:
# Bundle the three coherence subsets under the split names 'train', 'val', 'test'.
# NOTE: per the displayed schemas, the train split carries an extra
# 'confidence' column that the val/test splits do not have.
ds_coherence = DatasetDict(
    train=ds_train_coherence,
    val=ds_val_coherence,
    test=ds_test_coherence,
)
ds_coherence
    

DatasetDict({
    train: Dataset({
        features: ['before_sent', 'after_sent', 'before_sent_with_intent', 'labels', 'confidence', 'doc_id', 'revision_depth'],
        num_rows: 38450
    })
    val: Dataset({
        features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'doc_id', 'revision_depth'],
        num_rows: 41
    })
    test: Dataset({
        features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'doc_id', 'revision_depth'],
        num_rows: 35
    })
})

In [30]:
# Write all coherence splits to the 'coherence' folder under dir_path.
ds_coherence.save_to_disk(f"{dir_path}coherence")

Saving the dataset (1/1 shards): 100%|██████████| 38450/38450 [00:00<00:00, 131235.08 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 41/41 [00:00<00:00, 9754.20 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 35/35 [00:00<00:00, 7026.31 examples/s]


In [31]:
# Bundle the three clarity subsets under the split names 'train', 'val', 'test'.
# NOTE: per the displayed schemas, the train split carries an extra
# 'confidence' column that the val/test splits do not have.
ds_clarity = DatasetDict(
    train=ds_train_clarity,
    val=ds_val_clarity,
    test=ds_test_clarity,
)
ds_clarity
    

DatasetDict({
    train: Dataset({
        features: ['before_sent', 'after_sent', 'before_sent_with_intent', 'labels', 'confidence', 'doc_id', 'revision_depth'],
        num_rows: 119300
    })
    val: Dataset({
        features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'doc_id', 'revision_depth'],
        num_rows: 157
    })
    test: Dataset({
        features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'doc_id', 'revision_depth'],
        num_rows: 185
    })
})

In [32]:
# Write all clarity splits to the 'clarity' folder under dir_path.
ds_clarity.save_to_disk(f"{dir_path}clarity")

Saving the dataset (1/1 shards): 100%|██████████| 119300/119300 [00:00<00:00, 129072.72 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 157/157 [00:00<00:00, 30896.90 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 185/185 [00:00<00:00, 27511.92 examples/s]
