In [1]:
import pandas
import re, json
import csv

import torch
import torch.nn as nn
from datasets import load_metric,Dataset,DatasetDict
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

import evaluate

import numpy as np
import nltk
import os
import random
from sklearn.model_selection import train_test_split
from typing import List, Optional, Tuple, Union, Dict, Any

In [2]:
data = open('annotated_capstone_data.csv')

In [3]:
reader = csv.DictReader(data)
myList = list()
for dictionary in reader:
    myList.append(dictionary)

In [4]:
dialogues = []
gold_sum = []
generated_sum = []
hal_tags = []

for entry in myList[:100]:
    dialogues.append(entry['Dialogue'].strip())
    gold_sum.append(entry['Reference Summary'].strip())
    generated_sum.append(entry['Generated Summary'].strip())
    hal_tags.append(entry['Annotations'].strip())

In [5]:
train = {'dialogue': dialogues, 'summary':generated_sum, 'tags':hal_tags}

In [6]:
train_hf = Dataset.from_dict(train)

In [7]:
raw_data = DatasetDict({'train':train_hf})

In [8]:
raw_data

DatasetDict({
    train: Dataset({
        features: ['dialogue', 'summary', 'tags'],
        num_rows: 100
    })
})

In [9]:
model_checkpoint = "facebook/bart-large"

In [17]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

In [11]:
sum = generated_sum[0]
tags = hal_tags[0]
with tokenizer.as_target_tokenizer():
     labels = tokenizer(sum, max_length=128, truncation=True)



In [12]:
labels['input_ids']

[0, 34318, 75, 33, 16666, 18, 346, 4, 10641, 3649, 11029, 7, 1394, 6045, 4, 2]

In [13]:
tokens = tokenizer.convert_ids_to_tokens(labels['input_ids'])
tokens

['<s>',
 'doesn',
 "'t",
 'Ġhave',
 'ĠBetty',
 "'s",
 'Ġnumber',
 '.',
 'ĠAmanda',
 'Ġsuggests',
 'ĠHannah',
 'Ġto',
 'Ġask',
 'ĠLarry',
 '.',
 '</s>']

In [29]:
def tokenize_and_align_labels(sum, tags):
    print(sum)
    tokenized_inputs = tokenizer(sum, truncation=True)

    labels = []
    # for i, label in enumerate(tags):
    word_ids = tokenized_inputs.word_ids(batch_index=0)  # Map tokens to their respective word.
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:  # Set the special tokens to -100.
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:  # Only label the first token of a given word.
            label_ids.append(tags[word_idx])
        else:
            label_ids.append(-100)
        previous_word_idx = word_idx
    print(label_ids)
        # labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    # return tokenized_inputs

In [30]:
tokenize_and_align_labels(sum, tags)

doesn't have Betty's number. Amanda suggests Hannah to ask Larry.
[-100, 'O', ' ', 'O', ' ', 'O', ' ', 'O', ' ', 'O', ' ', 'O', ' ', 'O', ' ', -100]


In [22]:
tags

'O O O O O O O O O O O O M'

In [26]:
tokens

['<s>',
 'doesn',
 "'t",
 'Ġhave',
 'ĠBetty',
 "'s",
 'Ġnumber',
 '.',
 'ĠAmanda',
 'Ġsuggests',
 'ĠHannah',
 'Ġto',
 'Ġask',
 'ĠLarry',
 '.',
 '</s>']