In [None]:
!pip install datasets
!pip install langchain
!pip install openai
!pip install backoff
!pip install seqeval
!pip install evaluate





In [None]:
import evaluate
# Load the "seqeval" metric through the `evaluate` library; the resulting object
# is used by compute_metrics_seqeval below via seqeval.compute(...).
seqeval = evaluate.load("seqeval")

def convert_to_iob(d):
    """Return a copy of ``d`` in which every tag other than 'O' is prefixed with 'B-'.

    Args:
        d: A sequence of tag sequences (e.g. a list of lists of strings).

    Returns:
        A new list of lists with the converted tags. The input is not
        modified, so converting the same data more than once (for example
        when a notebook cell is re-run) never stacks prefixes like 'B-B-W'.
    """
    return [
        [tag if tag == 'O' else 'B-' + tag for tag in sequence]
        for sequence in d
    ]


def compute_metrics_seqeval(gold, predictions):
    """Score predicted tag sequences against gold ones using the seqeval metric.

    Both inputs are passed through ``convert_to_iob`` before being handed to
    ``seqeval.compute``.

    Args:
        gold: Reference tag sequences.
        predictions: Predicted tag sequences.

    Returns:
        dict with the keys 'precision', 'recall', 'f1' and 'accuracy', taken
        from the metric's corresponding ``overall_*`` entries.
    """
    references_iob = convert_to_iob(gold)
    predictions_iob = convert_to_iob(predictions)

    scores = seqeval.compute(predictions=predictions_iob, references=references_iob)

    # (key in the returned dict, key in the seqeval result)
    reported = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {short_name: scores[metric_key] for short_name, metric_key in reported}

In [None]:
from datasets import load_dataset
import os
import pandas as pd
import datasets
import json
from tqdm.notebook import tqdm_notebook

from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate, LLMChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
import openai
import backoff  # for exponential backoff

# Marker that precedes the tag sequence in the prompt's worked examples; the
# processing loop searches each model reply for it and parses what follows.
TAGS_BEGIN = 'Tags: '

@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def get_dialog_turns(chat, prompt, query_text):
    """Send one system + human message pair to the chat model and wrap the reply.

    Args:
        chat: Chat model object; it is called as ``chat(messages)`` and the
            result's ``.content`` attribute is read.
        prompt: Text used as the system message.
        query_text: Text used as the human message.

    Returns:
        dict with three keys:
            'Error'   - '' on success, otherwise ``str(exception)``.
            'text'    - the reply content, or '' on failure.
            'success' - True on success, False on failure.

    Raises:
        openai.error.RateLimitError: propagated out of the function body so the
            ``backoff.on_exception`` decorator can retry the call with
            exponential backoff. The decorator sets no retry limit, so callers
            should not normally observe this exception.
    """
    dialog_response = {'Error': ''}
    try:
        messages = [
            SystemMessage(content=prompt),
            HumanMessage(content=query_text)
        ]

        dialog_message = chat(messages)
        dialog_response['text'] = dialog_message.content
        dialog_response['success'] = True
    except openai.error.RateLimitError:
        # Must not be swallowed by the generic handler below: the decorator can
        # only retry if the exception actually leaves this function.
        raise
    except Exception as e:
        # Any other failure is reported in the result dict so the caller can
        # record it and continue with the next item.
        dialog_response['text'] = ''
        dialog_response['success'] = False
        dialog_response['Error'] = str(e)

    return dialog_response



# The OpenAI API key is a secret and must not be stored in the notebook.
# Use the value already present in the environment; otherwise prompt for it
# without echoing, and keep it only in this process's environment.
# NOTE(review): the key that was previously committed on this line should be
# treated as compromised and revoked.
if not os.environ.get('OPENAI_API_KEY'):
    from getpass import getpass
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

def get_dialog_query(record):
    """Build the per-record message text: the dialogue followed by its summary.

    Args:
        record: Mapping that provides the 'dialogue' and 'summary' entries.

    Returns:
        A string of the form 'Dialog:\\n<dialogue>\\nSummary:\\n<summary>'.
    """
    dialogue = record['dialogue']
    summary = record['summary']
    return f"Dialog:\n{dialogue}\nSummary:\n{summary}"

# Load the evaluation dataset and show its splits/features.
ds = load_dataset('Deojoandco/capstone_forgpt_without_gold')
print(ds)

# Chat model used for tagging: GPT-4 with temperature 0.0 and max_tokens 2048.
chat = ChatOpenAI(
    model_name='gpt-4',
    temperature=0.0,
    max_tokens=2048,
)

# System prompt sent with every request (passed to get_dialog_turns as `prompt`).
# It describes the token-level hallucination tagging task, lists the available
# labels (O, W, C, OB, N, and M/O for the trailing '<EOS>' token) and gives two
# worked examples whose answers start with 'Tags: ' — the same marker the
# processing loop looks for (TAGS_BEGIN) when parsing model replies.
# NOTE(review): the text contains typos ("circumstancial", "is identify").
# They are left untouched because any edit changes what the model is asked and
# would make results incomparable with earlier runs.
prompt = '''
    Below is a dialog between people and its summary. At the end of summary there is an extra '<EOS>' token. Your task is identify how much the summary is hallucinated. The output should be token by token classification whether its hallucinated or not. Following are the available hallucination classification labels for each token. O : Not Hallucinated, W: Wrong person reference, C: circumstancial error, OB: Object error, N: uncommon error like tense errors. At the end you have to identify if there is any missing information in the summary. For the '<EOS>', the possible labels are either 'M' if the summary has missed any information from the dialog else 'O'.  Remember to tag punctuations and not remove them.

Example 1:
Dialog: Jesse : I have an idea that'll cheer u up ! Melvin : What is it ? Jesse : I was thinking about doing something 4 the less fortunate this year . Lee : Gr8 idea ! Anything in mind ? Maxine : So no presents 4 me ? : ( Jesse : U'll get ur presents , no worries ; ) Maxine : Phew ! Was getting a bit worried for a moment ; ) Melvin : Bt what do u have in store ? Jesse : Well , have u heard about the Refuge ? Lee : No . What's that ? Melvin : That's the Christmas foundation to help women and children ? Maxine : I think I've heard of them . So what about them ? Jesse : That's right ! They help women and children who escape from abuse . And every year they post wish lists of such ppl online and I thought that we could choose one and chip in . Melvin : That's a great idea ! Lee : Count me in ! Maxine : Me too . Jesse : Have a look at these 3 lists : <file_other> <file_other> <file_other> Lee : I think the second one would be the easiest to arrange . Maxine : Agree . Melvin : What about number 3 ? A bit ambitious , but if we pull together , we'll manage . Jesse : Actually , I'm in for the 3rd one . Maxine : I think the 2nd list would be better . The items cos more or less the same and we can easily divide it . Melvin : But if we agree to chip in the same amount of money , we can deal with the 3rd one easily . Lee : Come to think of it , the 3rd one is not that bad . A bit of planning and logistics and were good to go . Jesse : So it's settled ? Melvin : Yup . Lee : Sure . Maxine : Fine .

Summary: Jesse , Lee and Maxine will chip in for the Refuge , a Christmas foundation for women and children who escape from abuse . <EOS>

Tags: Jesse(O) ,(O) Lee(O) and(O) Maxine(O) will(O) chip(O) in(O) for(O) the(O) Refuge(O) ,(O) a(O) Christmas(O) foundation(O) for(O) women(O) and(O) children(O) who(O) escape(O) from(O) abuse(O) .(O) <EOS>(O)

Example 2:
Dialog:
Ernest : hey Mike , did you park your car on our street ? Mike : no , took it into garage today Ernest : ok good Mike : why ? Ernest : someone just crashed into a red honda looking just like yours Mike : lol lucky me
Summary:
Mike's car has been damaged beyond repair after being hit by another car . <EOS>

Tags: Mike's(W) car(O) has(O) been(O) damaged(O) beyond(C) repair(C) after(O) being(O) hit(O) by(O) another(O) car(O) .(O) <EOS>(M)

Looking at the example above please look at the below dialog and its summary. Analyse if the summary is hallucinated and output tags for each token in summary.
'''

def _split_tagged_tokens(tagged_text):
    """Split 'tok(TAG) tok(TAG) ...' into parallel token and tag lists.

    Each whitespace-separated piece contributes exactly one token. A piece only
    contributes a tag when it ends in '(TAG)'; the split is made at the LAST
    '(' so tokens that themselves contain '(' are still parsed. The two lists
    can therefore differ in length, and callers must check before pairing them.
    """
    tokens, tags = [], []
    for piece in tagged_text.split():
        token, separator, tag_part = piece.rpartition('(')
        if separator and tag_part.endswith(')'):
            tokens.append(token)
            tags.append(tag_part[:-1])
        else:
            # No well-formed '(TAG)' suffix: keep the piece as a bare token.
            tokens.append(piece)
    return tokens, tags


# Accumulators; re-initialised here so re-running the cell starts from scratch.
records = []
length_matching_golds = []   # gold tag sequences that can be scored
length_matching_preds = []   # predicted tag sequences aligned with the above
GPT_SUCCESS = 0
GPT_OUTPUT_FOUND = 0
GPT_OUTPUT_NOT_FOUND = 0

pbar = tqdm_notebook(ds['train'], desc='Creating Dialog')
for record in pbar:
    query_text = get_dialog_query(record)

    dialog_response = get_dialog_turns(chat, prompt, query_text)

    record['query'] = prompt + '\n' + query_text
    record['gpt_success'] = dialog_response['success']
    record['gpt_response'] = dialog_response['text']

    gold_tag_tokens = record['gold_tags'].split()
    record['gold_tags_tokens_count'] = len(gold_tag_tokens)

    if dialog_response['success']:
        GPT_SUCCESS += 1

        tags_start_index = record['gpt_response'].find(TAGS_BEGIN)
        if tags_start_index != -1:
            record['GPT_OUTPUT_FOUND'] = True
            GPT_OUTPUT_FOUND += 1

            # Everything after the marker is treated as the tagged summary.
            record['gpt_output_tags'] = record['gpt_response'][tags_start_index + len(TAGS_BEGIN):]
            output_tokens, output_tags = _split_tagged_tokens(record['gpt_output_tags'])

            record['gpt_output_tag_tokens'] = len(output_tokens)
            record['summary_gpt_token_count_match'] = record['gold_tags_tokens_count'] == record['gpt_output_tag_tokens']

            record['gpt_output_token_count'] = len(output_tokens)
            record['gpt_output_tag_count'] = len(output_tags)
            record['gpt_output_tags'] = ' '.join(output_tags)

            # Compare counts with counts (the tag *count*, not the joined tag string).
            record['summary_gpt_tags_token_count_match'] = record['gold_tags_tokens_count'] == record['gpt_output_tag_count']

            # Only sequences whose tag count equals the gold count can be scored:
            # the metric needs gold and predicted sequences of equal length.
            if record['summary_gpt_token_count_match'] and record['summary_gpt_tags_token_count_match']:
                length_matching_golds.append(gold_tag_tokens)
                length_matching_preds.append(output_tags)

        else:
            record['GPT_OUTPUT_FOUND'] = False
            GPT_OUTPUT_NOT_FOUND += 1

    records.append(record)

df = pd.DataFrame(data=records)
df.head()

DatasetDict({
    train: Dataset({
        features: ['dialogue', 'summary', 'gold_tags'],
        num_rows: 100
    })
})


Creating Dialog:   0%|          | 0/100 [00:00<?, ?it/s]

Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True
Token match: True


IndexError: list index out of range

In [None]:
def _percentage(count, total):
    """Return ``count`` as a percentage of ``total``; 0.0 when ``total`` is 0."""
    return count * 100 / total if total else 0.0


total_records = len(records)
matching_count = len(length_matching_golds)
not_match_count = total_records - matching_count

print(f'GPT success Count: {GPT_SUCCESS}, Percentage: {_percentage(GPT_SUCCESS, total_records)}')

print(f'Valid GPT output Count: {GPT_OUTPUT_FOUND}, Percentage: {_percentage(GPT_OUTPUT_FOUND, total_records)}')
print(f'Invalid GPT output Count: {GPT_OUTPUT_NOT_FOUND}, Percentage: {_percentage(GPT_OUTPUT_NOT_FOUND, total_records)}')

print(f'Gold and Prediction token length matching. Count: {matching_count}, Percentage: {_percentage(matching_count, total_records)}')

print(f'Gold and Prediction token length not matching. Count: {not_match_count}, Percentage: {_percentage(not_match_count, total_records)}')

# Score copies of the accumulated sequences so this cell can be re-run safely:
# the tag conversion done during scoring must never alter the stored lists
# (otherwise a second run would see already-prefixed tags).
score = compute_metrics_seqeval(
    [list(sequence) for sequence in length_matching_golds],
    [list(sequence) for sequence in length_matching_preds],
)
print(score)

GPT success Count: 100, Percentage: 100.0
Valid GPT output Count: 100, Percentage: 100.0
Invalid GPT output Count: 0, Percentage: 0.0
Gold and Prediction token length matching. Count: 98, Percentage: 98.0
Gold and Prediction token length not matching. Count: 2, Percentage: 2.0
{'precision': 0.16666666666666666, 'recall': 0.3333333333333333, 'f1': 0.2222222222222222, 'accuracy': 0.6428571428571429}


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Convert the per-record results table into a Hugging Face Dataset for upload.
ds = datasets.Dataset.from_pandas(df)

# push splits to huggingface repo
print("Uploading to Huggingface")
# The access token is a secret and must not be stored in the notebook. It is
# read from the HF_TOKEN environment variable; when that is unset, None is
# passed and the library falls back to the locally saved login, if any.
# NOTE(review): the token that was previously committed on this line should be
# treated as compromised and revoked.
ds.push_to_hub('capstone_fromgpt_without_gold', token=os.environ.get('HF_TOKEN'))

Uploading to Huggingface


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]