In [1]:
import os

import datasets
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Positional offset of the first CSV row to keep (used as df.iloc[FROM:]).
FROM = 500

# Special marker strings; the surrounding spaces are part of the values.
SEP_TOKEN, EOS_TOKEN = " <SEP> ", " <EOS>"

# Punctuation marks that curate_text pads with spaces so each becomes its own token.
punctuations = list("!,.?:;")

In [3]:
# Tag label -> integer id: 'O' maps to 4, every other known label maps to 3.
mapping = {
    tag: (4 if tag == 'O' else 3)
    for tag in ('C', 'M', 'N', 'O', 'OB', 'W')
}

In [4]:
def curate_text(text):
    """Normalise a text cell into single-space-separated tokens.

    Newlines are replaced by spaces, every character listed in the
    module-level ``punctuations`` is padded with spaces so that it ends up
    as its own token, and runs of whitespace are collapsed to one space.

    Args:
        text: The raw cell value. Anything that is not a ``str`` (or a
            ``str`` subclass) is treated as missing.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses, which would otherwise be silently blanked out.
    if not isinstance(text, str):
        return ''

    text = text.replace('\n', ' ')
    for punctuation in punctuations:
        text = text.replace(punctuation, ' ' + punctuation + ' ')

    # split() without arguments discards leading, trailing and repeated whitespace.
    return " ".join(text.split())

In [5]:
def create_source(row):
    """Return the whitespace-separated tokens of the row's 'Dialogue' field."""
    return row['Dialogue'].split()

In [6]:
def create_tgt_sum(row):
    """Return the whitespace-separated tokens of the row's 'Generated Summary' field."""
    return row['Generated Summary'].split()

In [7]:
def create_tags(row):
    """Return the whitespace-separated tag labels of the row's 'Annotations' field."""
    annotations = row['Annotations']
    tag_labels = annotations.split()
    return tag_labels

In [8]:
def token_count(tokens):
    """Return the number of items in ``tokens``."""
    n_tokens = len(tokens)
    return n_tokens

def map_tag_ids(tags):
    """Translate tag labels into integer ids using the module-level ``mapping``.

    Raises:
        KeyError: if a label is not a key of ``mapping``.
    """
    tag_ids = []
    for tag in tags:
        tag_ids.append(mapping[tag])
    return tag_ids

In [9]:
def split_stratified_into_train_val_test(df_input, frac_train=0.8, frac_val=0.1, frac_test=0.1):
    """Randomly split a dataframe into train, validation and test dataframes.

    Note: despite the name, no ``stratify`` argument is passed to
    ``train_test_split``; this is a plain shuffled split with a fixed
    ``random_state`` of 42.

    Args:
        df_input: The dataframe to split.
        frac_train: Fraction of rows for the training set.
        frac_val: Fraction of rows for the validation set.
        frac_test: Fraction of rows for the test set.

    Returns:
        A ``(df_train, df_val, df_test)`` tuple, each with a fresh 0-based index.

    Raises:
        ValueError: if the three fractions do not add up to 1.0.
    """
    # Compare with a tolerance: sums of valid decimal fractions are frequently
    # not exactly 1.0 in binary floating point, and must not be rejected.
    if abs(frac_train + frac_val + frac_test - 1.0) > 1e-9:
        raise ValueError('fractions %f, %f, %f do not add up to 1.0' %
                         (frac_train, frac_val, frac_test))

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp = train_test_split(df_input, test_size=(1.0 - frac_train), random_state=42, shuffle=True)

    # Split the temp dataframe into val and test dataframes; the test share
    # is expressed relative to the size of the temp dataframe.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test = train_test_split(df_temp, test_size=relative_frac_test, random_state=42, shuffle=True)

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    # Re-index without inplace mutation so the frames handed back by
    # train_test_split are never modified behind the caller's back.
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    return df_train, df_val, df_test

In [10]:
# Load the annotated data and keep only the rows from position FROM onwards.
# .copy() makes `df` an independent frame, so the column assignments in the
# following cells modify it directly rather than a slice of the loaded frame.
df = pd.read_csv('annotated_capstone_data.csv').iloc[FROM:].copy()

In [11]:
# Apply the same text clean-up to each of the three text columns.
for column in ('Dialogue', 'Generated Summary', 'Annotations'):
    df[column] = df[column].apply(curate_text)


In [12]:
# Inspect the cleaned annotation strings (space-separated tag labels per row).
df['Annotations']

500                                  O O O O O O O O O M
501                                  O O O O O O O O O M
502    O O O O O O O O O O O O O O O O O O O O O O O ...
503    O O O O O O O O O O O O O O O O O O O O O O O ...
504      O O O O O O O O O O O O O O O O O O O O O O O M
                             ...                        
595                            O O O O O O O O O O O O O
596                                  O O O O O O O O O O
597                                  O O O O O O O O O O
598                                      O C C O O O O M
599                          O O O O O O O C C O O O O M
Name: Annotations, Length: 100, dtype: object

In [13]:
# Tokenised dialogue: one list of whitespace-separated tokens per row.
df = df.assign(source=df.apply(create_source, axis=1))

In [14]:
# Tokenised generated summary: one list of whitespace-separated tokens per row.
df = df.assign(summary_target=df.apply(create_tgt_sum, axis=1))

In [15]:
# Tag labels: one list of whitespace-separated annotation labels per row.
df = df.assign(tags=df.apply(create_tags, axis=1))

In [16]:
# Sorted list of the distinct tag labels present in the data. Built from a set
# instead of df['tags'].sum(), which concatenates the per-row lists one pair
# at a time and therefore scales quadratically with the number of rows.
tag_labels = sorted({tag for tags in df['tags'] for tag in tags})
tagLabels = datasets.Sequence(feature=datasets.Value(dtype='int64'))

# Convert each row's tag labels to integer ids via `mapping`.
df['tag_ids'] = df['tags'].apply(map_tag_ids)

In [17]:
# Inspect the integer tag ids produced by map_tag_ids (one list per row).
df['tag_ids']

500                       [4, 4, 4, 4, 4, 4, 4, 4, 4, 3]
501                       [4, 4, 4, 4, 4, 4, 4, 4, 4, 3]
502    [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...
503    [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...
504    [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...
                             ...                        
595              [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
596                       [4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
597                       [4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
598                             [4, 3, 3, 4, 4, 4, 4, 3]
599           [4, 4, 4, 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 3]
Name: tag_ids, Length: 100, dtype: object

In [23]:
# Schema of the exported dataset: every column is a variable-length sequence,
# of strings for the token columns and of int64 for the tag ids.
_column_dtypes = {'source': 'string', 'summary_target': 'string', 'tags': 'int64'}
ds_features = datasets.Features({
    column: datasets.Sequence(feature=datasets.Value(dtype=dtype, id=None), length=-1, id=None)
    for column, dtype in _column_dtypes.items()
})
print(ds_features)

{'source': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'summary_target': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'tags': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}


In [24]:
# Keep only the columns that are exported; the integer ids are published as 'tags'.
dataset_df = pd.DataFrame({
    'source': df['source'],
    'summary_target': df['summary_target'],
    'tags': df['tag_ids'],
})

In [25]:
# Sanity check: does the row labelled 500 have as many tag ids as summary tokens?
# NOTE(review): the recorded output of this cell is False, i.e. tags and summary
# tokens are not aligned one-to-one for this row — confirm whether that is
# expected before the data is used for token-level training.
len(dataset_df['summary_target'][500]) == len(dataset_df['tags'][500])

False

In [26]:
# Split with the function's default fractions (0.8 train / 0.1 val / 0.1 test).
df_train, df_val, df_test = split_stratified_into_train_val_test(dataset_df)

In [27]:
# Convert each split dataframe into a Hugging Face Dataset carrying its split name.
train_ds, val_ds, test_ds = (
    datasets.Dataset.from_pandas(frame, features=ds_features, split=split_name)
    for frame, split_name in (
        (df_train, 'train'),
        (df_val, 'validation'),
        (df_test, 'test'),
    )
)

In [28]:
# SECURITY: never embed an access token in the notebook. The token is read from
# the HF_TOKEN environment variable; any token that was ever committed with this
# notebook must be considered leaked and should be revoked.
# When HF_TOKEN is unset, None is passed and push_to_hub falls back to the
# token saved locally by `huggingface-cli login`.
token = os.environ.get("HF_TOKEN")
train_ds.push_to_hub('pvisnrt/capstone_hal_binary', token=token)
val_ds.push_to_hub('pvisnrt/capstone_hal_binary', token=token)
test_ds.push_to_hub('pvisnrt/capstone_hal_binary', token=token)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/367 [00:00<?, ?B/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/480 [00:00<?, ?B/s]

In [40]:
# Shell escape: run nvidia-smi to print the current GPU status.
!nvidia-smi

Wed Nov 15 17:11:38 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:A1:00.0 Off |                    0 |
| N/A   33C    P0              63W / 300W |  17234MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    