In [1]:
import os

import datasets
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Positional offset of the first CSV row to keep (the frame is sliced with .iloc[FROM:]).
FROM = 500

# Marker strings; note the surrounding spaces are part of the values.
SEP_TOKEN = " <SEP> "
EOS_TOKEN = " <EOS>"

# Punctuation marks that get padded with spaces during text curation.
punctuations = list("!,.?:;")

In [3]:
def curate_text(text):
    """Normalise a raw text cell into single-space-separated tokens.

    Newlines become spaces, every mark listed in the module-level
    ``punctuations`` is surrounded by spaces so it splits off as its own
    token, and runs of whitespace are collapsed to one space.

    Args:
        text: The raw cell value. Anything that is not a string (for example
            the float NaN pandas uses for missing cells) is treated as empty.

    Returns:
        The curated string, or ``''`` for non-string input.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which would otherwise be blanked out.
    if not isinstance(text, str):
        return ''

    spaced = text.replace('\n', ' ')
    for mark in punctuations:
        spaced = spaced.replace(mark, ' ' + mark + ' ')

    # split() with no arguments already discards leading/trailing whitespace.
    return " ".join(spaced.split())

In [4]:
def create_source(row):
    """Return the whitespace-separated tokens of the row's 'Dialogue' field."""
    return row['Dialogue'].split()

In [5]:
def create_tgt_sum(row):
    """Return the whitespace-separated tokens of the row's 'Generated Summary' field."""
    return row['Generated Summary'].split()

In [6]:
def create_tags(row):
    """Return the whitespace-separated tag names of the row's 'Annotations' field."""
    annotations = row['Annotations']
    return annotations.split()

In [7]:
def token_count(tokens):
    """Return how many items ``tokens`` contains."""
    n_tokens = len(tokens)
    return n_tokens

def map_tag_ids(tags):
    """Convert tag name(s) to integer ids via the module-level ``tagLabels``.

    ``tagLabels`` must already be defined when this function is called.
    """
    tag_ids = tagLabels.str2int(tags)
    return tag_ids

In [8]:
def split_stratified_into_train_val_test(df_input, frac_train=0.8, frac_val=0.1, frac_test=0.1):
    """Shuffle ``df_input`` and split it into train, validation and test frames.

    NOTE: despite the name, no ``stratify`` argument is passed to
    ``train_test_split``; this is a plain random split with a fixed seed (42).

    Args:
        df_input: The frame to split.
        frac_train: Fraction of rows for the training frame.
        frac_val: Fraction of rows for the validation frame.
        frac_test: Fraction of rows for the test frame.

    Returns:
        A ``(df_train, df_val, df_test)`` tuple, each with a fresh 0-based index.

    Raises:
        ValueError: If the three fractions do not add up to 1.0.
    """
    # Compare with a tolerance: valid inputs such as 0.7 + 0.2 + 0.1 do not sum
    # to exactly 1.0 in binary floating point and were previously rejected.
    if abs((frac_train + frac_val + frac_test) - 1.0) > 1e-9:
        raise ValueError('fractions %f, %f, %f do not add up to 1.0' %
                         (frac_train, frac_val, frac_test))

    # First cut: training rows versus everything else.
    df_train, df_temp = train_test_split(df_input, test_size=(1.0 - frac_train), random_state=42, shuffle=True)

    # Second cut: divide the remainder between validation and test, using the
    # test share expressed relative to the remainder.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test = train_test_split(df_temp, test_size=relative_frac_test, random_state=42, shuffle=True)

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    # Rebind instead of inplace=True: the splits are derived from df_input, and
    # mutating them in place can trigger pandas' SettingWithCopyWarning.
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    return df_train, df_val, df_test

In [9]:
# Load the annotated CSV and keep only the rows from position FROM onward.
# The original row labels are kept (no index reset).
df = pd.read_csv('annotated_capstone_data.csv').iloc[FROM:]

In [10]:
# Run every raw text column through curate_text, overwriting it in place.
for column in ('Dialogue', 'Generated Summary', 'Annotations'):
    df[column] = df[column].apply(curate_text)


In [11]:
# Row-wise: tokenise each dialogue into the 'source' column.
df['source'] = df.apply(create_source, axis='columns')

In [12]:
# Row-wise: tokenise each generated summary into the 'summary_target' column.
df['summary_target'] = df.apply(create_tgt_sum, axis='columns')

In [13]:
# Row-wise: split each annotation string into the 'tags' column.
df['tags'] = df.apply(create_tags, axis='columns')

In [29]:
# Sorted vocabulary of distinct tag names. A set comprehension replaces
# df['tags'].sum(), which concatenates the per-row lists pairwise and is
# therefore quadratic in the number of rows.
tag_labels = sorted({tag for row_tags in df['tags'] for tag in row_tags})
tagLabels = datasets.ClassLabel(num_classes=len(tag_labels), names=tag_labels)

# Integer ids for each row's tags (map_tag_ids reads the tagLabels defined above).
df['tag_ids'] = df['tags'].apply(map_tag_ids)

# One constant id per summary token.
# NOTE(review): the label set printed for ds_features has six names (ids 0-5),
# so 6 matches none of them. Confirm whether the id of 'O'
# (tagLabels.str2int('O')) was intended. Value kept unchanged for now.
GOLD_TAG_ID = 6
df['gold_tags'] = df['summary_target'].apply(lambda tokens: [GOLD_TAG_ID] * len(tokens))
df['gold_tags'] = df['summary_target'].apply(lambda x: [6] * len(list(x)))

In [33]:
def _sequence_of(feature):
    """Wrap ``feature`` in a ``datasets.Sequence`` with length=-1 and no id."""
    return datasets.Sequence(feature=feature, length=-1, id=None)


# Schema for the exported dataset: two string-token columns, the ClassLabel
# tag column, and the int32 gold-tag column.
ds_features = datasets.Features({
    'source': _sequence_of(datasets.Value(dtype='string', id=None)),
    'summary_target': _sequence_of(datasets.Value(dtype='string', id=None)),
    'tags': _sequence_of(tagLabels),
    'gold_tags': _sequence_of(datasets.Value(dtype='int32', id=None)),
})
print(ds_features)

{'source': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'summary_target': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'tags': Sequence(feature=ClassLabel(names=['C', 'M', 'N', 'O', 'OB', 'W'], id=None), length=-1, id=None), 'gold_tags': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None)}


In [31]:
# Keep only the columns that go into the exported dataset.
# The integer 'tag_ids' column is published under the name 'tags'.
dataset_df = pd.DataFrame({
    'source': df['source'],
    'summary_target': df['summary_target'],
    'tags': df['tag_ids'],
    'gold_tags': df['gold_tags'],
})

In [26]:
# Sanity check on the first row: does it have as many tags as summary tokens?
# Positional .iloc[0] replaces the hard-coded label 500, which only existed
# because FROM == 500 and would raise KeyError for any other offset.
# NOTE(review): the recorded output is False, so tags and summary tokens are
# not aligned one-to-one for this row. Confirm that is expected.
len(dataset_df['summary_target'].iloc[0]) == len(dataset_df['tags'].iloc[0])

False

In [34]:
# Split with the function's default fractions (0.8 train / 0.1 val / 0.1 test).
df_train, df_val, df_test = split_stratified_into_train_val_test(dataset_df)

In [35]:
def _to_dataset(frame, split_name):
    """Convert ``frame`` to a ``datasets.Dataset`` using ``ds_features`` and the given split name."""
    return datasets.Dataset.from_pandas(frame, features=ds_features, split=split_name)


train_ds = _to_dataset(df_train, 'train')
val_ds = _to_dataset(df_val, 'validation')
test_ds = _to_dataset(df_test, 'test')

In [37]:

# SECURITY: the access token must not be hard-coded in the notebook. The token
# that was previously committed here should be treated as leaked and revoked.
# Read it from the environment instead. When HF_TOKEN is unset, `token` is None
# and push_to_hub falls back to the credentials saved by `huggingface-cli login`.
token = os.environ.get("HF_TOKEN")
dataset_name = 'samsum'
repo_id = f'pvisnrt/{dataset_name}'

# Each dataset was created with its own split name, so the three pushes go to
# the same repository.
for split_ds in (train_ds, val_ds, test_ds):
    split_ds.push_to_hub(repo_id, token=token)

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 527.85ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.18it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 707.66ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.05s/it]
Downloading metadata: 100%|██████████| 685/685 [00:00<00:00, 3.70MB/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 723.28ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.59it/s]
Downloading metadata: 100%|██████████| 798/798 [00:00<00:00, 4.68MB/s]


## SAMSum: adding a ground-truth (O) tag column

In [None]:
from datasets import load_dataset

# Load the "samsum" dataset from the Hugging Face Hub.
samsum_dataset = load_dataset("samsum")

# Bare expression: displays the splits, their columns and row counts.
samsum_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [None]:


def create_o_sequence(summary):
    """Return one "O" per whitespace-separated token of ``summary``, joined by spaces.

    An empty or whitespace-only summary yields the empty string.
    """
    return ' '.join("O" for _ in summary.split())

# For every split, add a 'gold_tags' column holding the "O" sequence
# that matches the example's summary.
for split, split_ds in samsum_dataset.items():
    samsum_dataset[split] = split_ds.map(
        lambda example: {"gold_tags": create_o_sequence(example["summary"])}
    )



#  push it to the hub


In [None]:
# SECURITY: the access token must not be hard-coded in the notebook. The token
# that was previously committed here should be treated as leaked and revoked.
# When HF_TOKEN is unset, `token` is None and push_to_hub falls back to the
# credentials saved by `huggingface-cli login`.
token = os.environ.get("HF_TOKEN")

# NOTE(review): this targets the same repository id ("pvisnrt/samsum") as the
# earlier push of the CSV-derived splits. Confirm that overwriting is intended.
samsum_dataset.push_to_hub("pvisnrt/samsum", token=token)

Creating parquet from Arrow format: 100%|██████████| 15/15 [00:00<00:00, 305.50ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.78s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 320.74ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.43it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 400.22ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.15it/s]


In [None]:
# Show the first training example to confirm the new 'gold_tags' field is present.
samsum_dataset['train'][0]

{'id': '13818513',
 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.',
 'gold_tags': 'O O O O O O O O O'}