In [6]:
import pandas
import re, json
import csv

import torch
import torch.nn as nn
from datasets import load_metric,Dataset,DatasetDict
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

import evaluate

import numpy as np
import nltk
import os
import random
from sklearn.model_selection import train_test_split
from typing import List, Optional, Tuple, Union, Dict, Any

In [7]:
data = open('annotated_capstone_data.csv')

In [8]:
reader = csv.DictReader(data)
myList = list()
for dictionary in reader:
    myList.append(dictionary)

In [9]:
dialogues = []
gold_sum = []
generated_sum = []
hal_tags = []

for entry in myList[:100]:
    dialogues.append(entry['Dialogue'].strip())
    gold_sum.append(entry['Reference Summary'].strip())
    generated_sum.append(entry['Generated Summary'].strip())
    hal_tags.append(entry['Annotations'].strip())

In [10]:
train = {'dialogue': dialogues, 'summary':generated_sum, 'tags':hal_tags}

In [11]:
train_hf = Dataset.from_dict(train)

In [12]:
raw_data = DatasetDict({'train':train_hf})

In [13]:
raw_data

DatasetDict({
    train: Dataset({
        features: ['dialogue', 'summary', 'tags'],
        num_rows: 100
    })
})

In [14]:
model_checkpoint = "facebook/bart-large"

In [15]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

Downloading (…)lve/main/config.json: 100%|██████████| 1.63k/1.63k [00:00<00:00, 4.41MB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.02G/1.02G [02:51<00:00, 5.94MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 128kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 2.88MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 3.13MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 3.25MB/s]


In [16]:
sum = generated_sum[1]
tags = hal_tags[1].split(' ')
with tokenizer.as_target_tokenizer():
     labels = tokenizer(sum, max_length=128, truncation=True)



In [25]:
print(tags)

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'W', 'O', 'O', 'O', 'O']


In [18]:
print(sum)
split_tags = [None]
final = ['<s>']
for i, word in enumerate(sum.split(' ')):
    encoded = tokenizer([word], is_split_into_words=True)
    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'])
    length_of_subtokens = len(tokens[1:-1])
    print(tags[i])
    print("length_of_subtokens: ", length_of_subtokens)
    print("tokens: ",  tokens)
    if length_of_subtokens >= 1:
        split_tags.append(tags[i])
        length_of_subtokens-=1
        while length_of_subtokens >= 1:
            split_tags.append('-100')
            length_of_subtokens-=1            
    
    final.extend(tokens[1:-1])
    print(tokens)
final.append('</s>')
split_tags.append(None)

Amanda can't find Betty's number. Larry called her last time they were at the park together. Amanda will text Larry.
O
length_of_subtokens:  1
tokens:  ['<s>', 'ĠAmanda', '</s>']
['<s>', 'ĠAmanda', '</s>']
O
length_of_subtokens:  2
tokens:  ['<s>', 'Ġcan', "'t", '</s>']
['<s>', 'Ġcan', "'t", '</s>']
O
length_of_subtokens:  1
tokens:  ['<s>', 'Ġfind', '</s>']
['<s>', 'Ġfind', '</s>']
O
length_of_subtokens:  2
tokens:  ['<s>', 'ĠBetty', "'s", '</s>']
['<s>', 'ĠBetty', "'s", '</s>']
O
length_of_subtokens:  2
tokens:  ['<s>', 'Ġnumber', '.', '</s>']
['<s>', 'Ġnumber', '.', '</s>']
O
length_of_subtokens:  1
tokens:  ['<s>', 'ĠLarry', '</s>']
['<s>', 'ĠLarry', '</s>']
O
length_of_subtokens:  1
tokens:  ['<s>', 'Ġcalled', '</s>']
['<s>', 'Ġcalled', '</s>']
O
length_of_subtokens:  1
tokens:  ['<s>', 'Ġher', '</s>']
['<s>', 'Ġher', '</s>']
O
length_of_subtokens:  1
tokens:  ['<s>', 'Ġlast', '</s>']
['<s>', 'Ġlast', '</s>']
O
length_of_subtokens:  1
tokens:  ['<s>', 'Ġtime', '</s>']
['<s>', 'Ġti

In [23]:
print(split_tags)

[None, 'O', 'O', '-100', 'O', 'O', '-100', 'O', '-100', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '-100', 'O', 'O', 'W', 'O', '-100', None]


In [24]:
print(final)

['<s>', 'ĠAmanda', 'Ġcan', "'t", 'Ġfind', 'ĠBetty', "'s", 'Ġnumber', '.', 'ĠLarry', 'Ġcalled', 'Ġher', 'Ġlast', 'Ġtime', 'Ġthey', 'Ġwere', 'Ġat', 'Ġthe', 'Ġpark', 'Ġtogether', '.', 'ĠAmanda', 'Ġwill', 'Ġtext', 'ĠLarry', '.', '</s>']


In [None]:
# def tokenize_and_align_labels(tokens, tags):
#     print(tokens)
#     print(tags)
#     with tokenizer.as_target_tokenizer():
#         tokenized_inputs = tokenizer(tokens, max_length=128, truncation=True)

#     labels = []
#     # for i, label in enumerate(tags):
#     word_ids = tokenized_inputs.word_ids()  # Map tokens to their respective word.
#     print(word_ids)
#     previous_word_idx = None
#     label_ids = []
#     for word_idx in word_ids:  # Set the special tokens to -100.
#         if word_idx is None:
#             label_ids.append(-100)
#         elif word_idx != previous_word_idx:  # Only label the first token of a given word.
#             label_ids.append(tags[word_idx])
#         else:
#             label_ids.append(-100)
#         previous_word_idx = word_idx
#     print(label_ids)
#         # labels.append(label_ids)

#     tokenized_inputs["labels"] = labels
#     # return tokenized_inputs

In [88]:
import torch
import torch.nn as nn
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

class ExtendedBartModel(nn.Module):
    def __init__(self, model_name):
        num_tags = 6
        super(ExtendedBartModel, self).__init__()
        self.bart = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        # self.token_tagging_layer = nn.Linear(self.bart.config.d_model, num_tags)  
        # Assuming num_tags is the number of possible tags


    def forward(self, input_ids, attention_mask=None):
        bart_outputs = self.bart(input_ids, attention_mask=attention_mask).last_hidden_state
        print("output: ", bart_outputs)
        # token_tags_logits = self.token_tagging_layer(bart_outputs)
        return bart_outputs  
    

    def generate(self, input_ids, attention_mask=None):
        bart_output = self.bart.generate(input_ids, attention_mask=attention_mask)
        print("bart out: ", bart_output)
        # tag 
        return bart_output



In [89]:
model_checkpoint = "facebook/bart-large"
model = ExtendedBartModel(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# freeze the weights of the BART model
for param in model.parameters():
    param.requires_grad = False


In [90]:
summary = generated_sum[1]
tags = hal_tags[1].split(' ')

print(summary)
print(tags)


Amanda can't find Betty's number. Larry called her last time they were at the park together. Amanda will text Larry.
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'W', 'O', 'O', 'O', 'O']


In [91]:
train_hf[21]

{'dialogue': "Will: hey babe, what do you want for dinner tonight?\nEmma:  gah, don't even worry about it tonight\nWill: what do you mean? everything ok?\nEmma: not really, but it's ok, don't worry about cooking though, I'm not hungry\nWill: Well what time will you be home?\nEmma: soon, hopefully\nWill: you sure? Maybe you want me to pick you up?\nEmma: no no it's alright. I'll be home soon, i'll tell you when I get home. \nWill: Alright, love you. \nEmma: love you too.",
 'summary': 'Will will pick Emma up when he gets home.',
 'tags': 'W O C W C O W O O O'}

In [92]:

inputs = tokenizer(
    train_hf[21]['dialogue'], return_tensors="pt", max_length=128, truncation=True)
summary_ids = model.generate(inputs["input_ids"])

summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(summary)




output:  tensor([[    2,     0,     0,     0, 16750,  1916,    18,  1028, 11987,     4,
          2290,    35, 17232, 37502,     6,    99,   109,    47,   236,     2]])
Emma's phone rings. Will: hey babe, what do you want
