In [1]:
import os

import datasets
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Positional offset used when slicing the annotated CSV (rows before it are dropped).
FROM = 500

# Marker tokens; each literal carries its own surrounding whitespace.
SEP_TOKEN = " <SEP> "
EOS_TOKEN = " <EOS>"

# Punctuation marks that curate_text surrounds with spaces.
punctuations = [
    '!',
    ',',
    '.',
    '?',
    ':',
    ';',
]

In [3]:
def curate_text(text, punctuation_marks=None):
    """Pad punctuation marks with spaces and collapse all whitespace.

    Args:
        text: The value to clean. Anything that is not a string yields ''.
        punctuation_marks: Optional iterable of substrings to surround with
            spaces. When omitted, the module-level ``punctuations`` list is used.

    Returns:
        The cleaned text with tokens separated by single spaces, or '' when
        ``text`` is not a string.
    """
    # isinstance (instead of an exact type comparison) also accepts str subclasses.
    if not isinstance(text, str):
        return ''

    if punctuation_marks is None:
        punctuation_marks = punctuations

    for mark in punctuation_marks:
        text = text.replace(mark, ' ' + mark + ' ')

    # split() without arguments treats any whitespace run (newlines included) as a
    # single separator and ignores leading/trailing whitespace.
    return " ".join(text.split())

In [4]:
def create_source(row):
    """Return the whitespace-separated tokens of ``row['dialogue']``."""
    return row['dialogue'].split()

In [5]:
def create_tgt_sum(row):
    """Return the whitespace-separated tokens of ``row['summary']``."""
    return row['summary'].split()

In [6]:
def create_tags(row):
    """Return the whitespace-separated tags stored in ``row['Annotations']``."""
    annotation_tags = row['Annotations'].split()
    return annotation_tags

In [7]:
def token_count(tokens):
    """Return how many items ``tokens`` contains."""
    n_tokens = len(tokens)
    return n_tokens

def map_tag_ids(tags):
    """Convert tag names to ids by delegating to ``tagLabels.str2int``.

    ``tagLabels`` is a module-level name that must exist when this is called.
    NOTE(review): the only assignment visible in this notebook is commented
    out — confirm it is defined before using this helper.
    """
    tag_ids = tagLabels.str2int(tags)
    return tag_ids

In [8]:
def split_stratified_into_train_val_test(df_input, frac_train=0.8, frac_val=0.1, frac_test=0.1,
                                         random_state=42):
    """Shuffle ``df_input`` and split it into train, validation and test frames.

    Despite the name, no ``stratify`` argument is passed to ``train_test_split``,
    so the split is a plain shuffled split.

    Args:
        df_input: The frame to split.
        frac_train: Fraction of rows for the training frame.
        frac_val: Fraction of rows for the validation frame.
        frac_test: Fraction of rows for the test frame.
        random_state: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        A ``(df_train, df_val, df_test)`` tuple; each frame has a fresh
        0-based index.

    Raises:
        ValueError: If the three fractions do not add up to 1.0.
    """
    import math

    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly 1.0
    # in floating point and would be rejected by an exact equality test.
    if not math.isclose(frac_train + frac_val + frac_test, 1.0, abs_tol=1e-9):
        raise ValueError('fractions %f, %f, %f do not add up to 1.0' %
                         (frac_train, frac_val, frac_test))

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp = train_test_split(
        df_input, test_size=(1.0 - frac_train), random_state=random_state, shuffle=True)

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test = train_test_split(
        df_temp, test_size=relative_frac_test, random_state=random_state, shuffle=True)

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    # Reassign instead of resetting in place: the pieces returned by
    # train_test_split are derived from df_input and should not be mutated.
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    return df_train, df_val, df_test

In [82]:
# Read the annotated capstone CSV and keep only the rows from position FROM onward.
df = pd.read_csv('annotated_capstone_data.csv').iloc[FROM:]

In [83]:
df

Unnamed: 0,ID,Dialogue,Reference Summary,Generated Summary,Annotations,Verified Tags,Missing Information,Redundant Information,Circumstance,Wrong Reference,Negation,Object,Tense,Modality,Score,Model Name
500,100,Ethan: somethin for Scott <file_photo>\nToby: ...,"Ethan, Toby and Marshall are making fun of Sco...",Ethan and Marshall enjoy making fun of Scott.,O O O O O O O O O M,,x,,,,,,,,7.0,BART-Baseline
501,101,Ethan: somethin for Scott <file_photo>\nToby: ...,"Ethan, Toby and Marshall are making fun of Sco...",Ethan and Toby are making fun of Scott.,O O O O O O O O O M,,x,,,,,,,,8.0,T5-Baseline
502,102,"Igor: Shit, I've got so much to do at work and...",Igor has a lot of work on his notice period an...,Igor has a lot of work to do at work. He's on ...,O O O O O O O O O O O O O O O O O O O O O O O ...,,x,x,,,,,,,7.0,T5-Cons
503,103,"Igor: Shit, I've got so much to do at work and...",Igor has a lot of work on his notice period an...,Igor has only two weeks left before he has to ...,O O O O O O O O O O O O O O O O O O O O O O O ...,,x,,,,,x,,,7.0,Pegasus-Baseline
504,104,"Igor: Shit, I've got so much to do at work and...",Igor has a lot of work on his notice period an...,"Igor is demotivated at work, because he has a ...",O O O O O O O O O O O O O O O O O O O O O O O M,,x,,,,,,,,7.0,Pegasus-Cons
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,195,Jack: Cocktails later?\nMay: YES!!!\nMay: You ...,Jack and May will drink cocktails later.\n,Jack and May are going to have some cocktails ...,O O O O O O O O O O O O O,,,,,,,,,,10.0,Pegasus-Baseline
596,196,Jack: Cocktails later?\nMay: YES!!!\nMay: You ...,Jack and May will drink cocktails later.\n,May and Jack will have some cocktails later.,O O O O O O O O O O,,,,,,,,,,10.0,T5-Cons
597,197,Jack: Cocktails later?\nMay: YES!!!\nMay: You ...,Jack and May will drink cocktails later.\n,May and Jack are going for cocktails later.,O O O O O O O O O O,,,,,,,,,,10.0,T5-Baseline
598,198,"Margaret: Honey, buy me some painkiller.\nJack...",Margaret is suffering from a terrible headache...,Jack will buy Margaret some painkiller.,O C C O O O O M,,,,,,,,,x,1.0,T5-Baseline


In [9]:
from datasets import load_dataset
from tqdm import tqdm

ds = load_dataset("samsum")

# Collect one record per example across every split and build the frame once.
# Concatenating a one-row DataFrame per example re-copies the whole frame on
# every iteration, which is quadratic in the number of rows.
records = []
for split in ds.keys():
    for example in tqdm(ds[split]):
        records.append({
            'dialogue': example['dialogue'],
            'summary': example['summary'],
            'gold_tags': '',
        })

df = pd.DataFrame(records, columns=['dialogue', 'summary', 'gold_tags'])



100%|██████████| 14732/14732 [00:05<00:00, 2489.32it/s]
100%|██████████| 819/819 [00:00<00:00, 2192.50it/s]
100%|██████████| 818/818 [00:00<00:00, 2133.49it/s]


In [10]:
# Clean both text columns with curate_text.
for text_column in ('dialogue', 'summary'):
    df[text_column] = df[text_column].apply(curate_text)
# df['Annotations'] = df['Annotations'].apply(curate_text)


In [11]:
# Tokenise each dialogue: create_source splits row['dialogue'] on whitespace.
df['source'] = df.apply(create_source, axis = 1)

In [12]:
# Tokenise each summary: create_tgt_sum splits row['summary'] on whitespace.
df['summary_target'] = df.apply(create_tgt_sum, axis=1)

In [68]:
# df['tags'] = df.apply(create_tags, axis=1)

In [13]:
# tag_labels = np.unique(df['tags'].sum()).tolist()
# tagLabels = datasets.ClassLabel(num_classes=len(tag_labels), names=tag_labels)

# df['tag_ids'] = df['tags'].apply(map_tag_ids)
# Placeholder gold tags: the constant tag id 6 repeated once per summary token.
# NOTE(review): 6 is presumably the id of the "O" tag — confirm against the label set.
df['gold_tags'] = df['summary_target'].apply(lambda summary_tokens: len(list(summary_tokens)) * [6])

In [14]:
def _variable_length_sequence(dtype):
    """Build a variable-length Sequence feature whose items have the given dtype."""
    return datasets.Sequence(feature=datasets.Value(dtype=dtype, id=None), length=-1, id=None)


# Schema of the exported dataset: two token sequences plus one integer tag sequence.
ds_features = datasets.Features({
    'source': _variable_length_sequence('string'),
    'summary_target': _variable_length_sequence('string'),
    # 'tags': datasets.Sequence(feature=tagLabels, length=-1, id=None),
    'gold_tags': _variable_length_sequence('int32'),
})
print(ds_features)

{'source': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'summary_target': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'gold_tags': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None)}


In [15]:
# Keep only the columns that are exported; the 'tags' column (from df['tag_ids'])
# is currently not included.
dataset_df = df[['source', 'summary_target', 'gold_tags']].copy()

In [None]:
# Sanity check: every summary must carry exactly one gold tag per token.
# dataset_df has no 'tags' column (only 'gold_tags'), so the comparison uses
# 'gold_tags' and covers all rows instead of a single hand-picked one.
(dataset_df['summary_target'].map(len) == dataset_df['gold_tags'].map(len)).all()

In [16]:
dataset_df

Unnamed: 0,source,summary_target,gold_tags
0,"[Amanda, :, I, baked, cookies, ., Do, you, wan...","[Amanda, baked, cookies, and, will, bring, Jer...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6]"
1,"[Olivia, :, Who, are, you, voting, for, in, th...","[Olivia, and, Olivier, are, voting, for, liber...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]"
2,"[Tim, :, Hi, ,, what's, up, ?, Kim, :, Bad, mo...","[Kim, may, try, the, pomodoro, technique, reco...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]"
3,"[Edward, :, Rachel, ,, I, think, I'm, in, ove,...","[Edward, thinks, he, is, in, love, with, Bella...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ..."
4,"[Sam, :, hey, overheard, rick, say, something,...","[Sam, is, confused, ,, because, he, overheard,...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ..."
...,...,...,...
16364,"[Carla, :, I've, got, it, ., ., ., Diego, :, w...","[Carla's, date, for, graduation, is, on, June,...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]"
16365,"[Gita, :, Hello, ,, this, is, Beti's, Mum, Git...","[Bev, is, going, on, the, school, trip, with, ...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ..."
16366,"[Julia, :, Greg, just, texted, me, Robert, :, ...","[Greg, cheated, on, Julia, ., He, apologises, ...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ..."
16367,"[Marry, :, I, broke, my, nail, ;, (, Tina, :, ...","[Marry, broke, her, nail, and, has, a, party, ...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ..."


In [17]:
# 80% train / 10% validation / 10% test.
df_train, df_val, df_test = split_stratified_into_train_val_test(
    dataset_df,
    frac_train=0.8,
    frac_val=0.1,
    frac_test=0.1,
)

In [107]:
# Wrap each split frame in a datasets.Dataset that uses the shared feature schema.
train_ds, val_ds, test_ds = (
    datasets.Dataset.from_pandas(split_frame, features=ds_features, split=split_name)
    for split_frame, split_name in (
        (df_train, 'train'),
        (df_val, 'validation'),
        (df_test, 'test'),
    )
)

In [109]:

# Read the Hugging Face access token from the environment instead of hardcoding it:
# a token stored in a notebook is exposed to every reader and should be revoked.
# When HF_TOKEN is unset, token=None is passed through to push_to_hub.
token = os.environ.get("HF_TOKEN")
dataset_name = 'samsum_dataset'
for split_ds in (train_ds, val_ds, test_ds):
    split_ds.push_to_hub(f'pvisnrt/{dataset_name}', token=token)

Creating parquet from Arrow format: 100%|██████████| 14/14 [00:00<00:00, 68.52ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.46s/it]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 59.88ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.09it/s]
Downloading metadata: 100%|██████████| 536/536 [00:00<00:00, 3.49MB/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 61.64ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.06it/s]
Downloading metadata: 100%|██████████| 653/653 [00:00<00:00, 5.53MB/s]


## SAMSum: adding a ground-truth (O) tag column — old version (not used)

In [89]:
from datasets import load_dataset
from tqdm import tqdm

ds = load_dataset("samsum")

# Collect one record per example across every split and build the frame once.
# Concatenating a one-row DataFrame per example re-copies the whole frame on
# every iteration, which is quadratic in the number of rows.
records = []
for split in ds.keys():
    for example in tqdm(ds[split]):
        records.append({
            'dialogue': example['dialogue'],
            'summary': example['summary'],
            'gold_tags': '',
        })

df = pd.DataFrame(records, columns=['dialogue', 'summary', 'gold_tags'])



100%|██████████| 14732/14732 [00:05<00:00, 2548.57it/s]
100%|██████████| 819/819 [00:00<00:00, 2241.32it/s]
100%|██████████| 818/818 [00:00<00:00, 2221.35it/s]


In [90]:
df

Unnamed: 0,dialogue,summary,gold_tags
0,Amanda: I baked cookies. Do you want some?\r\...,Amanda baked cookies and will bring Jerry some...,
1,Olivia: Who are you voting for in this electio...,Olivia and Olivier are voting for liberals in ...,
2,"Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa...",Kim may try the pomodoro technique recommended...,
3,"Edward: Rachel, I think I'm in ove with Bella....",Edward thinks he is in love with Bella. Rachel...,
4,Sam: hey overheard rick say something\r\nSam:...,"Sam is confused, because he overheard Rick com...",
...,...,...,...
16364,Carla: I've got it...\r\nDiego: what?\r\nCarla...,Carla's date for graduation is on June 4th. Di...,
16365,"Gita: Hello, this is Beti's Mum Gita, I wanted...",Bev is going on the school trip with her son. ...,
16366,"Julia: Greg just texted me\r\nRobert: ugh, del...",Greg cheated on Julia. He apologises to her. R...,
16367,"Marry: I broke my nail ;(\r\nTina: oh, no!\r\n...",Marry broke her nail and has a party tomorrow....,


In [91]:
# Clean both text columns with curate_text.
for text_column in ('dialogue', 'summary'):
    df[text_column] = df[text_column].apply(curate_text)

In [93]:
# Tokenise the dialogue into a new 'source' column.
df['source'] = df.apply(create_source, axis = 1)
# Overwrites 'summary' with its token list, so this cell is not safe to re-run:
# create_tgt_sum calls .split() on the value, which a list does not provide.
df['summary'] = df.apply(create_tgt_sum, axis=1)

KeyError: 'Dialogue'

In [None]:


# Function to tokenize summary and create 'O' sequence
def create_o_sequence(summary):
    """Return one "O" per whitespace-separated token of ``summary``, joined by spaces."""
    return ' '.join("O" for _ in summary.split())

def _with_gold_tags(example):
    """Return the extra 'gold_tags' column for one example (an "O" per summary token)."""
    return {"gold_tags": create_o_sequence(example["summary"])}


# Replace every split with a copy that carries the additional 'gold_tags' column.
# NOTE(review): samsum_dataset is not assigned in the visible cells — confirm where it is loaded.
for split in samsum_dataset.keys():
    samsum_dataset[split] = samsum_dataset[split].map(_with_gold_tags)



#  push it to the hub


In [None]:
# Read the Hugging Face access token from the environment instead of hardcoding it:
# a token stored in a notebook is exposed to every reader and should be revoked.
# When HF_TOKEN is unset, token=None is passed through to push_to_hub.
token = os.environ.get("HF_TOKEN")
samsum_dataset.push_to_hub("pvisnrt/samsum", token=token)

Creating parquet from Arrow format: 100%|██████████| 15/15 [00:00<00:00, 305.50ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.78s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 320.74ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.43it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 400.22ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.15it/s]


In [None]:
samsum_dataset['train'][0]

{'id': '13818513',
 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.',
 'gold_tags': 'O O O O O O O O O'}