In [1]:
#!pip install datasets
#!pip install langchain
#!pip install openai
#!pip install backoff
#!pip install seqeval

In [2]:
import evaluate
seqeval = evaluate.load("seqeval")

def convert_to_iob(d):
    """Prefix every non-'O' tag with 'B-' so the sequences look like IOB labels.

    Args:
        d: A list of tag sequences (list of lists of str), e.g. [['O', 'W'], ['C']].

    Returns:
        A new list of new lists in which every tag other than 'O' has been
        replaced by 'B-' + tag. 'O' tags are kept unchanged.

    The input is not modified. Building fresh lists (instead of rewriting the
    argument in place) keeps the caller's data intact and makes the function
    safe to call more than once on the same sequences; an in-place rewrite
    would turn 'W' into 'B-W' and then into 'B-B-W' on a second call.
    """
    return [
        [tag if tag == 'O' else 'B-' + tag for tag in sequence]
        for sequence in d
    ]


def compute_metrics_seqeval(gold, predictions):
    """Score predicted tag sequences against gold tag sequences with seqeval.

    Args:
        gold: List of gold tag sequences (list of lists of str).
        predictions: List of predicted tag sequences, in the same order as
            ``gold``.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy",
        taken from the metric's "overall_*" entries.
    """
    # Gold is converted before the predictions, then both go to the metric.
    references_iob = convert_to_iob(gold)
    predictions_iob = convert_to_iob(predictions)

    scores = seqeval.compute(predictions=predictions_iob, references=references_iob)

    reported = ("precision", "recall", "f1", "accuracy")
    return {metric: scores["overall_" + metric] for metric in reported}

In [3]:
from datasets import load_dataset
import os
import pandas as pd
import datasets
import json
from tqdm.notebook import tqdm_notebook

from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate, LLMChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
import openai
import backoff  # for exponential backoff

# Markers that enclose the tagged-summary section of a model reply, e.g.
# "Tags- <TG>Adam(W) will(O) ...<TG>". The closing marker is the bare "<TG>".
TAGS_BEGIN = "Tags- <TG>"
TAGS_END = '<TG>'

# Markers that enclose the yes/no missing-information verdict of a model
# reply, e.g. "Missing Information- <MI>Yes<MI>".
MI_BEGIN = "Missing Information- <MI>"
MI_END = "<MI>"

# Retry only the network call. The retry is bounded so that a persistent
# rate-limit condition eventually surfaces as a failed response instead of
# blocking the notebook forever.
@backoff.on_exception(backoff.expo, openai.error.RateLimitError, max_tries=6)
def _chat_with_backoff(chat, messages):
    """Send ``messages`` to ``chat``, retrying with exponential backoff on rate limits."""
    return chat(messages)


def get_dialog_turns(chat, query_text):
    """Ask the chat model to tag one dialogue/summary pair.

    The request is made of the module-level ``system_prompt``, the three
    few-shot pairs (``example1``/``answer1`` ... ``example3``/``answer3``) and
    finally ``query_text`` as the last human message.

    Args:
        chat: Callable chat model; it is called with the list of messages and
            must return an object with a ``content`` attribute.
        query_text: The formatted dialogue/summary query to classify.

    Returns:
        A dict with the keys:
            'text': the model reply, or '' on failure.
            'success': True when the call succeeded, False otherwise.
            'Error': '' on success, otherwise ``str`` of the exception.

    This function never raises. Rate-limit errors are retried by
    ``_chat_with_backoff``; previously the retry decorator sat on this
    function, whose own ``except Exception`` swallowed the rate-limit error
    before the decorator could ever see it, so no retry happened.
    """
    dialog_response = {'Error': ''}
    try:
        messages = [
            SystemMessage(content=system_prompt),
            HumanMessage(content=example1),
            AIMessage(content=answer1),
            HumanMessage(content=example2),
            AIMessage(content=answer2),
            HumanMessage(content=example3),
            AIMessage(content=answer3),
            HumanMessage(content=query_text)
        ]

        dialog_message = _chat_with_backoff(chat, messages)
        dialog_response['text'] = dialog_message.content
        dialog_response['success'] = True
    except Exception as e:
        # Deliberate best-effort: report the failure to the caller instead of
        # aborting the whole run on one bad record.
        dialog_response['text'] = ''
        dialog_response['success'] = False
        dialog_response['Error'] = str(e)

    return dialog_response



# SECURITY: the OpenAI API key must never be written into the notebook source.
# It is expected in the OPENAI_API_KEY environment variable, set before the
# kernel is started.
# NOTE(review): a literal key used to be assigned on this line; since it was
# stored in the file it should be treated as leaked and revoked.
if not os.environ.get('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before running this notebook."
    )

def get_dialog_query(record):
    """Format one dataset record as the query text sent to the model.

    Args:
        record: Mapping with the string entries 'dialogue' and 'summary'.

    Returns:
        A string holding the quoted dialogue on the first line(s) and the
        quoted summary after a newline, using the same 'Dialogue- "..."' /
        'Summary- "..."' layout as the few-shot examples.
    """
    pieces = [
        'Dialogue- "', record['dialogue'], '"',
        '\n Summary- "', record['summary'], '"',
    ]
    return ''.join(pieces)

# Load the dataset used for the experiment and print its splits and features.
# The printed summary shows train/validation/test splits with the columns
# 'dialog_id', 'dialogue', 'summary' and 'gold_tags'.
ds = load_dataset('Deojoandco/capstone_forgpt_without_gold')
print(ds)

# Chat model shared by every request in this notebook. Temperature is 0.0;
# max_tokens=2048 is presumably the cap on the length of each reply.
chat = ChatOpenAI(model_name='gpt-4', temperature=0.0, max_tokens=2048)

# System message sent with every request. It defines the two tasks for the
# model: (1) label each summary token with O / W / C / OB / N, and (2) answer
# whether important dialogue information is missing from the summary.
# NOTE(review): the text is sent to the model verbatim, so it is kept exactly
# as written (including the spelling "Circumstancial"); editing it would change
# the model input and therefore the recorded results.
system_prompt = '''
Given a set of dialogues and its summary, the first task is to do token-level classification. Analyze each token in the summary (not the meaning of the entire sentence or phrase) and label each token based on following guidelines:
O = Not Hallucinated
W =  Wrong person reference, only applies to tokens mentioning humans not present in the dialogue or in cases where the actions taken in the summary sentence are as per the dialogue but by the wrong human.
C = Circumstancial error, applies in the predicate of a sentence when events or facts mentioned in the summary are completely wrong as they never were mentioned in the dialogue.
OB = Object error, only applies to inanimate objects incorrectly mentioned in the summary because a different object is mentioned in the dialogue for similar context.
N = uncommon errors like tense errors 

Once the token-level classification is done, the second task is to determine whether there is any important information from the dialogue missing in the summary. Answer 'yes' if there is any missing information else 'No'. 
'''

# Few-shot demonstrations. Each exampleN is a human turn in the same
# 'Dialogue- "..." / Summary- "..."' layout that get_dialog_query produces, and
# each answerN is the expected model turn: a "Tags- <TG>...<TG>" line with one
# token(TAG) entry per summary token, a short justification, and a
# "Missing Information- <MI>Yes|No<MI>" verdict.
# These strings are sent to the model verbatim, so they are kept exactly as written.

# Example 1: wrong person (W) and wrong object (OB), with missing information.
example1 = '''
Dialogue- "Mary: hey, im kinda broke, lend me a few box
Carter: okay, give me an hour, im at the train station
Mary: cool, thanks"

Summary- "Adam will lend Mary a box."
'''

answer1= '''
Tags- <TG>Adam(W) will(O) lend(O) Mary(O) a(O) box(OB) .(O)<TG>

There is important missing information that Carter will need another 1 hour to reach and lend money. Hence "Yes" for the missing information

Missing Information- <MI>Yes<MI>
'''

# Example 2: circumstantial (C), object (OB) and tense (N) errors, with missing information.
# NOTE(review): answer2 spells the speaker "Ernst" while the dialogue uses "Ernest".
example2 = '''
Dialogue- "Ernest: hey Mike, did you park your car on our street?
Mike: no, took it into garage today
Ernest: ok good
Mike: why?
Ernest: someone just crashed into a red honda looking just like yours
Mike: lol lucky me"

Summary- "Mike took his car to the garage today because it had been hit by another car."
'''
answer2='''
Tags- <TG>Mike(O) took(O) his(O) car(O) to(O) the(O) garage(O) today(O) because(C) it(OB) had(N) been(N) hit(O) by(O) another(O) car(O) .(O)<TG>

There is important missing information that Ernst is relieved as Mike's car is ok. Hence "Yes" for the missing information

Missing Information- <MI>Yes<MI>
'''

# Example 3: a fully faithful summary (all tokens O) with no missing information.
example3 = '''
Dialogue- "Anne: You were right, he was lying to me :/
Irene: Oh no, what happened?
Jane: who? that Mark guy?
Anne: yeah, he told me he's 30, today I saw his passport - he's 40
Irene: You sure it's so important?
Anne: he lied to me Irene"

Summary- "Mark lied to Anne about his age. He's 40 now."
'''

answer3='''
Tags- <TG>Mark(O) lied(O) to(O) Anne(O) about(O) his(O) age(O) .(O) He's(O) 40(O) now(O) .(O)<TG>

There is no important missing information in the summary. Hence "No" for the missing information.

Missing Information- <MI>No<MI>
'''

def _extract_between(text, begin_marker, end_marker):
    """Return the text between ``begin_marker`` and the next ``end_marker``.

    Returns None when either marker is absent. The begin marker is checked
    before its length is added to the index; adding first turns the -1
    "not found" result of ``str.find`` into a valid-looking offset.
    """
    begin_index = text.find(begin_marker)
    if begin_index == -1:
        return None
    content_start = begin_index + len(begin_marker)
    content_end = text.find(end_marker, content_start)
    if content_end == -1:
        return None
    return text[content_start:content_end]


def run_gpt(ds, split):
    """Tag every record of one split with the chat model, score it and upload it.

    For each record the model is queried through ``get_dialog_turns``; the
    reply is parsed for the tag section (``TAGS_BEGIN``/``TAGS_END``) and the
    missing-information verdict (``MI_BEGIN``/``MI_END``). The predicted tag
    sequence is the per-token tags followed by one extra tag for the verdict
    ("M" for yes, "O" otherwise). Records whose predicted tag count equals the
    gold tag count are scored with ``compute_metrics_seqeval``.

    Args:
        ds: Iterable of records (dicts) providing at least 'dialogue',
            'summary' and 'gold_tags' (a whitespace-separated tag string).
        split: Name of the split; used in progress/log output, as the split of
            the uploaded dataset and in the CSV file name.

    Returns:
        None. Side effects: prints statistics, pushes the enriched split to
        the 'capstone_fromgpt_without_gold_v9_all' Hub repository and writes
        'capstone_fromgpt_without_gold_v9_{split}.csv'.

    The Hugging Face token is read from the HF_TOKEN environment variable.
    When it is unset, ``token=None`` is passed and the library is expected to
    fall back to the locally stored login.
    NOTE(review): a literal token used to be embedded here; treat it as leaked
    and revoke it.
    """
    records = []
    gpt_success_count = 0
    gpt_mi_found_count = 0
    gpt_tags_found_count = 0
    token_count_matches = 0

    for record in tqdm_notebook(ds, desc=f'Processing {split}'):
        query_text = get_dialog_query(record)
        dialog_response = get_dialog_turns(chat, query_text)

        record['gpt_success'] = dialog_response['success']
        record['gpt_response'] = dialog_response['text']
        record['gold_tags_tokens_count'] = len(record['gold_tags'].split())

        if dialog_response['success']:
            gpt_success_count += 1
            output_tags = []

            tags_text = _extract_between(record['gpt_response'], TAGS_BEGIN, TAGS_END)
            record['GPT_TAGS_FOUND'] = tags_text is not None
            if tags_text is not None:
                gpt_tags_found_count += 1
                record['gpt_output_tags'] = tags_text

                output_tag_tokens = tags_text.split()
                record['gpt_output_tag_tokens_count'] = len(output_tag_tokens)

                for output_tag_token in output_tag_tokens:
                    # Each entry looks like "token(TAG)"; entries that do not
                    # split into exactly two parts are skipped, which shows up
                    # later as a gold/prediction count mismatch.
                    tag_tokens = output_tag_token.split('(')
                    if len(tag_tokens) == 2:
                        output_tags.append(tag_tokens[1][:-1])  # drop trailing ')'

            mi_text = _extract_between(record['gpt_response'], MI_BEGIN, MI_END)
            record['GPT_MI_FOUND'] = mi_text is not None
            if mi_text is not None:
                gpt_mi_found_count += 1
                # Whitespace around the verdict is ignored so "<MI> Yes <MI>"
                # still counts as yes.
                output_tags.append("M" if mi_text.strip().lower() == "yes" else "O")

            record['gpt_tags_token_count'] = len(output_tags)
            record['gpt_tags'] = ' '.join(output_tags)

            record['tag_token_count_match'] = (
                record['gold_tags_tokens_count'] == record['gpt_tags_token_count']
            )
            if record['tag_token_count_match']:
                token_count_matches += 1
        else:
            # Failed requests still need these flags: the metrics loop below
            # reads 'tag_token_count_match' for every record.
            record['GPT_TAGS_FOUND'] = False
            record['GPT_MI_FOUND'] = False
            record['tag_token_count_match'] = False

        records.append(record)

    df = pd.DataFrame(data=records)
    total = len(records)

    def percentage(count):
        # An empty split reports 0.0 instead of dividing by zero.
        return count * 100 / total if total else 0.0

    print('-'*89)
    print(f'stats for split: {split}')
    print(f'GPT success Count: {gpt_success_count}, Percentage: {percentage(gpt_success_count)}')

    print(f'GPT Tags Found: {gpt_tags_found_count}, Percentage: {percentage(gpt_tags_found_count)}')
    print(f'GPT Missing Information Found: {gpt_mi_found_count}, Percentage: {percentage(gpt_mi_found_count)}')

    print(f'Gold and Prediction token length matching. Count: {token_count_matches}, Percentage: {percentage(token_count_matches)}')

    # Only records whose predicted and gold sequences have the same length can
    # be compared token by token.
    matching_records = [r for r in records if r['tag_token_count_match']]
    length_matching_golds = [r['gold_tags'].split() for r in matching_records]
    length_matching_preds = [r['gpt_tags'].split() for r in matching_records]

    if matching_records:
        score = compute_metrics_seqeval(length_matching_golds, length_matching_preds)
        print(score)
    else:
        print('No records with matching gold/prediction tag counts; skipping metrics.')

    result_ds = datasets.Dataset.from_pandas(df, split=split)

    # push splits to huggingface repo
    print("Uploading to Huggingface")
    result_ds.push_to_hub('capstone_fromgpt_without_gold_v9_all', token=os.environ.get('HF_TOKEN'))
    result_ds.to_csv(f'capstone_fromgpt_without_gold_v9_{split}.csv')
    

DatasetDict({
    train: Dataset({
        features: ['dialog_id', 'dialogue', 'summary', 'gold_tags'],
        num_rows: 76
    })
    validation: Dataset({
        features: ['dialog_id', 'dialogue', 'summary', 'gold_tags'],
        num_rows: 12
    })
    test: Dataset({
        features: ['dialog_id', 'dialogue', 'summary', 'gold_tags'],
        num_rows: 12
    })
})


In [4]:
# Process the three splits one after another: train, validation, then test.
for split_name in ('train', 'validation', 'test'):
    run_gpt(ds[split_name], split_name)

Processing train:   0%|          | 0/76 [00:00<?, ?it/s]

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).


-----------------------------------------------------------------------------------------
stats for split: train
GPT success Count: 76, Percentage: 100.0
GPT Tags Found: 76, Percentage: 100.0
GPT Missing Information Found: 76, Percentage: 100.0
Gold and Prediction token length matching. Count: 70, Percentage: 92.10526315789474
{'precision': 0.6611570247933884, 'recall': 0.6349206349206349, 'f1': 0.6477732793522267, 'accuracy': 0.9525462962962963}
Uploading to Huggingface


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing validation:   0%|          | 0/12 [00:00<?, ?it/s]

-----------------------------------------------------------------------------------------
stats for split: validation
GPT success Count: 12, Percentage: 100.0
GPT Tags Found: 12, Percentage: 100.0
GPT Missing Information Found: 12, Percentage: 100.0
Gold and Prediction token length matching. Count: 12, Percentage: 100.0
{'precision': 0.5490196078431373, 'recall': 0.6363636363636364, 'f1': 0.5894736842105264, 'accuracy': 0.8817891373801917}
Uploading to Huggingface


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing test:   0%|          | 0/12 [00:00<?, ?it/s]

-----------------------------------------------------------------------------------------
stats for split: test
GPT success Count: 12, Percentage: 100.0
GPT Tags Found: 12, Percentage: 100.0
GPT Missing Information Found: 12, Percentage: 100.0
Gold and Prediction token length matching. Count: 12, Percentage: 100.0
{'precision': 0.5625, 'recall': 0.8181818181818182, 'f1': 0.6666666666666666, 'accuracy': 0.9540816326530612}
Uploading to Huggingface


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]