In [10]:
import math
import os

import datasets
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [2]:
# Notebook-level constants.
FROM = 500  # NOTE(review): not referenced by any cell visible here -- confirm purpose

# Marker strings; the padding spaces are part of the values.
SEP_TOKEN = " <SEP> "
EOS_TOKEN = " <EOS>"

# Characters that curate_text surrounds with spaces so they become separate tokens.
punctuations = list("!,.?:;")

In [3]:
# Tag label -> integer id. Labels are numbered consecutively from 3 to 8;
# ids 0-2 are not assigned by this table.
mapping = dict(zip(('C', 'M', 'N', 'O', 'OB', 'W'), range(3, 9)))

In [4]:
def curate_text(text):
    """Space-separate punctuation and normalise whitespace in ``text``.

    Newlines become spaces, every character listed in the module-level
    ``punctuations`` is padded with a space on each side, and runs of
    whitespace are collapsed to a single space with the ends trimmed.

    Parameters
    ----------
    text : str or any
        The raw text. Anything that is not a string (e.g. ``None`` or a
        float) is treated as missing.

    Returns
    -------
    str
        The curated text, or ``''`` when ``text`` is not a string.
    """
    # isinstance rather than `type(text) == str`, so that str subclasses
    # (numpy.str_, for instance) are curated instead of silently becoming ''.
    if not isinstance(text, str):
        return ''

    text = text.replace('\n', ' ')
    for punctuation in punctuations:
        text = text.replace(punctuation, ' ' + punctuation + ' ')

    # split() with no argument already drops leading/trailing whitespace and
    # collapses internal runs, so no separate strip() is needed.
    return " ".join(text.split())

In [5]:
def create_source(row):
    """Return the whitespace-delimited tokens of ``row['Dialogue']`` as a list."""
    dialogue_tokens = row['Dialogue'].split()
    return dialogue_tokens

In [6]:
def create_tgt_sum(row):
    """Return the whitespace-delimited tokens of ``row['Generated Summary']`` as a list."""
    summary_tokens = row['Generated Summary'].split()
    return summary_tokens

In [7]:
def create_tags(row):
    """Return the whitespace-delimited entries of ``row['Annotations']`` as a list."""
    annotations = row['Annotations']
    tag_list = annotations.split()
    return tag_list

In [8]:
def token_count(tokens):
    """Return the number of items in ``tokens`` (whatever ``len`` reports)."""
    n_tokens = len(tokens)
    return n_tokens

def map_tag_ids(tags):
    """Translate tag labels to their integer ids using the module-level ``mapping``.

    Returns a new list in the same order as ``tags``. A label missing from
    ``mapping`` raises ``KeyError``.
    """
    tag_ids = []
    for label in tags:
        tag_ids.append(mapping[label])
    return tag_ids

In [9]:
def split_stratified_into_train_val_test(df_input, frac_train=0.8, frac_val=0.1, frac_test=0.1):
    """Randomly split ``df_input`` into train, validation and test frames.

    NOTE: despite the function name, no ``stratify`` argument is passed to
    ``train_test_split``; the result is a plain shuffled split with a fixed
    seed (``random_state=42``).

    Parameters
    ----------
    df_input : DataFrame-like
        The data to split; must support ``len()`` and be accepted by
        ``train_test_split``.
    frac_train, frac_val, frac_test : float
        Share of rows for each part. They must sum to 1.0 (within
        floating-point tolerance).

    Returns
    -------
    tuple
        ``(df_train, df_val, df_test)``, each with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the three fractions do not add up to 1.0.
    """
    # Compare with a tolerance: in binary floating point 0.7 + 0.2 + 0.1 is
    # 0.9999999999999999, which an exact `!= 1.0` check would wrongly reject.
    if not math.isclose(frac_train + frac_val + frac_test, 1.0, abs_tol=1e-9):
        raise ValueError('fractions %f, %f, %f do not add up to 1.0' %
                         (frac_train, frac_val, frac_test))

    # First cut: train versus everything else.
    df_train, df_temp = train_test_split(
        df_input, test_size=(1.0 - frac_train), random_state=42, shuffle=True
    )

    # Second cut: divide the remainder between val and test. The test share
    # is expressed relative to the remainder, not to the full input.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test = train_test_split(
        df_temp, test_size=relative_frac_test, random_state=42, shuffle=True
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    # Return re-indexed copies instead of mutating the split results in place.
    return (
        df_train.reset_index(drop=True),
        df_val.reset_index(drop=True),
        df_test.reset_index(drop=True),
    )

In [44]:
def curate_samsum(example):
    """Tokenise one SAMSum example in place and attach placeholder tags.

    ``example['dialogue']`` and ``example['summary']`` are passed through
    ``curate_text`` and replaced by their whitespace-split token lists.
    ``example['tags']`` gets one ``'O'`` per summary token, and
    ``example['tag_ids']`` the corresponding ids from ``mapping``.

    The same ``example`` object is modified and returned.
    """
    example['dialogue'] = curate_text(example['dialogue']).split()

    summary_tokens = curate_text(example['summary']).split()
    example['summary'] = summary_tokens

    # Every summary token receives the same 'O' label.
    tags = ['O'] * len(summary_tokens)
    example['tags'] = tags
    example['tag_ids'] = [mapping[label] for label in tags]

    return example

In [40]:
# Load the "samsum" dataset via datasets.load_dataset.
# NOTE(review): the name `df` suggests a DataFrame, but the mapped result displayed
# further down is a DatasetDict (train/test/validation) -- presumably `df` is one too.
df = load_dataset("samsum")

In [45]:
# Run curate_samsum on one example at a time (batched=False); the progress output
# below shows one pass per split.
curated_samsum = df.map(curate_samsum, batched=False)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [46]:
# Spot-check: display the first curated training example.
curated_samsum['train'][0]

{'id': '13818513',
 'dialogue': ['Amanda',
  ':',
  'I',
  'baked',
  'cookies',
  '.',
  'Do',
  'you',
  'want',
  'some',
  '?',
  'Jerry',
  ':',
  'Sure',
  '!',
  'Amanda',
  ':',
  "I'll",
  'bring',
  'you',
  'tomorrow',
  ':',
  '-)'],
 'summary': ['Amanda',
  'baked',
  'cookies',
  'and',
  'will',
  'bring',
  'Jerry',
  'some',
  'tomorrow',
  '.'],
 'tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 'tag_ids': [6, 6, 6, 6, 6, 6, 6, 6, 6, 6]}

In [47]:
# Show the column names and types of the curated train split.
curated_samsum['train'].features

{'id': Value(dtype='string', id=None),
 'dialogue': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'summary': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'tag_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [52]:
# Display the splits with their columns and row counts.
curated_samsum

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'tags', 'tag_ids'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'tags', 'tag_ids'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'tags', 'tag_ids'],
        num_rows: 818
    })
})

In [51]:
# Shell escape: print GPU status. No other visible cell depends on this output.
!nvidia-smi

Fri Nov 24 21:22:55 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:A1:00.0 Off |                    0 |
| N/A   42C    P0             260W / 300W |  17821MiB / 81920MiB |     98%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [53]:
# SECURITY: a Hugging Face access token was previously hardcoded in this cell.
# Anything committed in a notebook must be treated as leaked, so revoke that token
# and supply a fresh one through the HF_TOKEN environment variable instead.
# When the variable is unset, token is None; push_to_hub is documented to fall back
# to the locally cached login in that case -- confirm for the installed datasets version.
token = os.environ.get("HF_TOKEN")
curated_samsum.push_to_hub('pvisnrt/special_samsum', token=token)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [54]:
# Display the DatasetDict once more after the push; the summary matches the earlier display.
curated_samsum

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'tags', 'tag_ids'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'tags', 'tag_ids'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'tags', 'tag_ids'],
        num_rows: 818
    })
})