In [1]:
import os

import datasets
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Positional index of the first CSV row to keep; earlier rows are dropped on load.
FROM = 500

# Marker strings (not referenced by the other cells visible here).
SEP_TOKEN = " <SEP> "
EOS_TOKEN = " <EOS>"

# Characters that curate_text pads with spaces so each becomes its own token.
punctuations = list("!,.?:;")

In [3]:
def curate_text(text, marks=None):
    """Normalise raw text into single-space-separated tokens.

    Newlines become spaces, every punctuation mark is padded with spaces so
    that it splits off as its own token, and runs of whitespace collapse to
    a single space.

    Args:
        text: Value to clean. Anything that is not a ``str`` (for example
            the NaN pandas uses for a missing cell) yields ``''``.
        marks: Optional iterable of punctuation strings to isolate. When
            omitted, the module-level ``punctuations`` list is used.

    Returns:
        str: The cleaned text, or ``''`` for non-string input.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which would otherwise be blanked out.
    if not isinstance(text, str):
        return ''
    if marks is None:
        marks = punctuations

    text = text.replace('\n', ' ')
    for mark in marks:
        text = text.replace(mark, ' ' + mark + ' ')

    # split() with no argument discards leading, trailing and repeated whitespace.
    return " ".join(text.split())

In [4]:
def create_source(row):
    """Tokenise the dialogue text of one record.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) whose 'Dialogue'
            entry is a string.

    Returns:
        list[str]: The dialogue split on whitespace.
    """
    dialogue_tokens = row['Dialogue'].split()
    return dialogue_tokens

In [5]:
def create_tgt_sum(row):
    """Tokenise the generated summary of one record.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) whose
            'Generated Summary' entry is a string.

    Returns:
        list[str]: The summary split on whitespace.
    """
    summary_tokens = row['Generated Summary'].split()
    return summary_tokens

In [6]:
def create_tags(row):
    """Split the annotation string of one record into individual tags.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) whose 'Annotations'
            entry is a string.

    Returns:
        list[str]: The whitespace-separated tag strings.
    """
    annotation_tags = row['Annotations'].split()
    return annotation_tags

In [7]:
def token_count(tokens):
    """Return the number of items in ``tokens`` (any sized container)."""
    n_tokens = len(tokens)
    return n_tokens

def map_tag_ids(tags):
    """Convert tag names to integer ids via the module-level ``tagLabels``.

    ``tagLabels`` is assigned in a later cell, so that cell must have been
    run before this function is called.
    """
    tag_ids = tagLabels.str2int(tags)
    return tag_ids

In [8]:
def split_stratified_into_train_val_test(df_input, frac_train=0.8, frac_val=0.1, frac_test=0.1,
                                         random_state=42):
    """Randomly partition a DataFrame into train, validation and test frames.

    Despite the name, no stratification is performed: rows are shuffled and
    divided purely by proportion.

    Args:
        df_input: DataFrame to split.
        frac_train: Fraction of rows assigned to the training frame.
        frac_val: Fraction of rows assigned to the validation frame.
        frac_test: Fraction of rows assigned to the test frame.
        random_state: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        tuple: ``(df_train, df_val, df_test)``, each with a fresh 0-based index.

    Raises:
        ValueError: If the three fractions do not sum to 1.0 (within a small
            floating-point tolerance).
    """
    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly
    # 1.0 in binary floating point and must not be rejected.
    if abs(frac_train + frac_val + frac_test - 1.0) > 1e-9:
        raise ValueError('fractions %f, %f, %f do not add up to 1.0' %
                         (frac_train, frac_val, frac_test))

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp = train_test_split(
        df_input, test_size=(1.0 - frac_train), random_state=random_state, shuffle=True)

    # Split the temp dataframe into val and test dataframes; the test share is
    # re-expressed relative to the size of the temp frame.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test = train_test_split(
        df_temp, test_size=relative_frac_test, random_state=random_state, shuffle=True)

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    # Return re-indexed frames instead of mutating the split results in place.
    return tuple(part.reset_index(drop=True) for part in (df_train, df_val, df_test))

In [9]:
# Read the annotated CSV and drop the first FROM rows by position.
df = pd.read_csv('annotated_capstone_data.csv').iloc[FROM:]

In [10]:
# Overwrite each text column with its cleaned, space-normalised version.
for text_column in ('Dialogue', 'Generated Summary', 'Annotations'):
    df[text_column] = df[text_column].apply(curate_text)


In [11]:
# Row-wise: tokenise each dialogue into a list of strings.
df['source'] = df.apply(func=create_source, axis='columns')

In [12]:
# Row-wise: tokenise each generated summary into a list of strings.
df['summary_target'] = df.apply(func=create_tgt_sum, axis='columns')

In [13]:
# Row-wise: split each annotation string into a list of tag names.
df['tags'] = df.apply(func=create_tags, axis='columns')

In [14]:
# Build the tag vocabulary as the sorted list of distinct tag strings.
# The set comprehension visits every tag once; summing the column instead
# concatenates the per-row lists pairwise, re-copying the growing result on
# every row, and yields 0 rather than an empty list when there are no rows.
tag_labels = sorted({tag for row_tags in df['tags'] for tag in row_tags})
tagLabels = datasets.ClassLabel(num_classes=len(tag_labels), names=tag_labels)

# Encode each row's tag names as integer class ids.
df['tag_ids'] = df['tags'].apply(map_tag_ids)

In [15]:
# Dataset schema: two variable-length string sequences plus a variable-length
# sequence of class labels drawn from tagLabels. The omitted Sequence/Value
# arguments keep their defaults (length=-1, id=None).
ds_features = datasets.Features(
    {
        'source': datasets.Sequence(feature=datasets.Value(dtype='string')),
        'summary_target': datasets.Sequence(feature=datasets.Value(dtype='string')),
        'tags': datasets.Sequence(feature=tagLabels),
    }
)
print(ds_features)

{'source': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'summary_target': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'tags': Sequence(feature=ClassLabel(names=['C', 'M', 'N', 'O', 'OB', 'W'], id=None), length=-1, id=None)}


In [16]:
# Assemble the export frame; the integer-encoded tags are stored under the
# column name 'tags'.
dataset_df = pd.DataFrame({
    'source': df['source'],
    'summary_target': df['summary_target'],
    'tags': df['tag_ids'],
})

In [17]:
# Show the assembled frame as this cell's output.
dataset_df

Unnamed: 0,source,summary_target,tags
500,"[Ethan, :, somethin, for, Scott, <file_photo>,...","[Ethan, and, Marshall, enjoy, making, fun, of,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 1]"
501,"[Ethan, :, somethin, for, Scott, <file_photo>,...","[Ethan, and, Toby, are, making, fun, of, Scott...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 1]"
502,"[Igor, :, Shit, ,, I've, got, so, much, to, do...","[Igor, has, a, lot, of, work, to, do, at, work...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
503,"[Igor, :, Shit, ,, I've, got, so, much, to, do...","[Igor, has, only, two, weeks, left, before, he...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
504,"[Igor, :, Shit, ,, I've, got, so, much, to, do...","[Igor, is, demotivated, at, work, ,, because, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
...,...,...,...
595,"[Jack, :, Cocktails, later, ?, May, :, YES, !,...","[Jack, and, May, are, going, to, have, some, c...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]"
596,"[Jack, :, Cocktails, later, ?, May, :, YES, !,...","[May, and, Jack, will, have, some, cocktails, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]"
597,"[Jack, :, Cocktails, later, ?, May, :, YES, !,...","[May, and, Jack, are, going, for, cocktails, l...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]"
598,"[Margaret, :, Honey, ,, buy, me, some, painkil...","[Jack, will, buy, Margaret, some, painkiller, .]","[3, 0, 0, 3, 3, 3, 3, 1]"


In [18]:
# Spot-check the row labelled 500: do the summary tokens and the tag ids have
# the same length?
# NOTE(review): the recorded output below is False, i.e. the two lengths differ
# for this row - confirm whether tags are meant to align with summary_target.
len(dataset_df.loc[500, 'summary_target']) == len(dataset_df.loc[500, 'tags'])

False

In [19]:
# Shuffle and split the export frame using the helper's default fractions.
df_train, df_val, df_test = split_stratified_into_train_val_test(df_input=dataset_df)

In [20]:
# Wrap each split frame in a datasets.Dataset with the shared schema, tagging
# it with its split name.
train_ds, val_ds, test_ds = (
    datasets.Dataset.from_pandas(split_frame, features=ds_features, split=split_name)
    for split_frame, split_name in (
        (df_train, 'train'),
        (df_val, 'validation'),
        (df_test, 'test'),
    )
)

In [21]:
# Read the Hub credential from the environment instead of embedding it in the
# notebook. When HF_TOKEN is unset this is None, and push_to_hub then relies on
# the credential stored by `huggingface-cli login`.
# SECURITY: a literal access token used to be written in this cell; treat it
# as leaked and revoke it in the Hugging Face account settings.
token = os.environ.get("HF_TOKEN")

# Single definition of the target repository shared by all three uploads.
HUB_REPO_ID = 'pvisnrt/capstone_hal'

train_ds.push_to_hub(HUB_REPO_ID, token=token)
val_ds.push_to_hub(HUB_REPO_ID, token=token)
test_ds.push_to_hub(HUB_REPO_ID, token=token)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/498 [00:00<?, ?B/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/611 [00:00<?, ?B/s]