In [1]:
import pandas
import re, json
import csv

import torch
import torch.nn as nn
from datasets import load_metric,Dataset,DatasetDict
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

import evaluate

import numpy as np
import nltk
import os
import random
from sklearn.model_selection import train_test_split
from typing import List, Optional, Tuple, Union, Dict, Any

In [2]:
data = open('annotated_capstone_data.csv')

In [3]:
reader = csv.DictReader(data)
myList = list()
for dictionary in reader:
    myList.append(dictionary)

In [4]:
dialogues = []
gold_sum = []
generated_sum = []
hal_tags = []

for entry in myList[:100]:
    dialogues.append(entry['Dialogue'].strip())
    gold_sum.append(entry['Reference Summary'].strip())
    generated_sum.append(entry['Generated Summary'].strip())
    hal_tags.append(entry['Annotations'].strip())

In [5]:
train = {'dialogue': dialogues, 'summary':generated_sum, 'tags':hal_tags}

In [6]:
train_hf = Dataset.from_dict(train)

In [7]:
raw_data = DatasetDict({'train':train_hf})

In [8]:
raw_data

DatasetDict({
    train: Dataset({
        features: ['dialogue', 'summary', 'tags'],
        num_rows: 100
    })
})

In [9]:
model_checkpoint = "facebook/bart-large"

In [34]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

In [90]:
sum = generated_sum[1]
tags = hal_tags[1].split(' ')
with tokenizer.as_target_tokenizer():
     labels = tokenizer(sum, max_length=128, truncation=True)

In [91]:
tags

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'W',
 'O',
 'O',
 'O',
 'O']

In [92]:
print(sum)
split_tags = [None]
final = ['<s>']
for i, word in enumerate(sum.split(' ')):
    encoded = tokenizer([word], is_split_into_words=True)
    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'])
    length_of_subtokens = len(tokens[1:-1])
    print(tags[i])
    print("length_of_subtokens: ", length_of_subtokens)
    print("tokens: ",  tokens)
    if length_of_subtokens >= 1:
        split_tags.append(tags[i])
        length_of_subtokens-=1
        while length_of_subtokens >= 1:
            split_tags.append('-100')
            length_of_subtokens-=1            
    
    final.extend(tokens[1:-1])
    print(tokens)
final.append('</s>')
split_tags.append(None)

Amanda can't find Betty's number. Larry called her last time they were at the park together. Amanda will text Larry.
O
length_of_subtokens:  1
tokens:  ['<s>', 'ĠAmanda', '</s>']
['<s>', 'ĠAmanda', '</s>']
O
length_of_subtokens:  2
tokens:  ['<s>', 'Ġcan', "'t", '</s>']
['<s>', 'Ġcan', "'t", '</s>']
O
length_of_subtokens:  1
tokens:  ['<s>', 'Ġfind', '</s>']
['<s>', 'Ġfind', '</s>']
O
length_of_subtokens:  2
tokens:  ['<s>', 'ĠBetty', "'s", '</s>']
['<s>', 'ĠBetty', "'s", '</s>']
O
length_of_subtokens:  2
tokens:  ['<s>', 'Ġnumber', '.', '</s>']
['<s>', 'Ġnumber', '.', '</s>']
O
length_of_subtokens:  1
tokens:  ['<s>', 'ĠLarry', '</s>']
['<s>', 'ĠLarry', '</s>']
O
length_of_subtokens:  1
tokens:  ['<s>', 'Ġcalled', '</s>']
['<s>', 'Ġcalled', '</s>']
O
length_of_subtokens:  1
tokens:  ['<s>', 'Ġher', '</s>']
['<s>', 'Ġher', '</s>']
O
length_of_subtokens:  1
tokens:  ['<s>', 'Ġlast', '</s>']
['<s>', 'Ġlast', '</s>']
O
length_of_subtokens:  1
tokens:  ['<s>', 'Ġtime', '</s>']
['<s>', 'Ġti

In [95]:
split_tags

[None,
 'O',
 'O',
 '-100',
 'O',
 'O',
 '-100',
 'O',
 '-100',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 '-100',
 'O',
 'O',
 'W',
 'O',
 '-100',
 None]

In [93]:
final

['<s>',
 'ĠAmanda',
 'Ġcan',
 "'t",
 'Ġfind',
 'ĠBetty',
 "'s",
 'Ġnumber',
 '.',
 'ĠLarry',
 'Ġcalled',
 'Ġher',
 'Ġlast',
 'Ġtime',
 'Ġthey',
 'Ġwere',
 'Ġat',
 'Ġthe',
 'Ġpark',
 'Ġtogether',
 '.',
 'ĠAmanda',
 'Ġwill',
 'Ġtext',
 'ĠLarry',
 '.',
 '</s>']

In [85]:
' '.join(split_tags)

'O O -100 O O -100 O -100 O O O O O O O O O O O -100 W O O O -100'

In [16]:
# def tokenize_and_align_labels(tokens, tags):
#     print(tokens)
#     print(tags)
#     with tokenizer.as_target_tokenizer():
#         tokenized_inputs = tokenizer(tokens, max_length=128, truncation=True)

#     labels = []
#     # for i, label in enumerate(tags):
#     word_ids = tokenized_inputs.word_ids()  # Map tokens to their respective word.
#     print(word_ids)
#     previous_word_idx = None
#     label_ids = []
#     for word_idx in word_ids:  # Set the special tokens to -100.
#         if word_idx is None:
#             label_ids.append(-100)
#         elif word_idx != previous_word_idx:  # Only label the first token of a given word.
#             label_ids.append(tags[word_idx])
#         else:
#             label_ids.append(-100)
#         previous_word_idx = word_idx
#     print(label_ids)
#         # labels.append(label_ids)

#     tokenized_inputs["labels"] = labels
#     # return tokenized_inputs