In [44]:
# from prompts import system_prompt

system_prompt = """You are a linguist analyzing dialogue transcripts for examples of "hedging" or "authority" pragmatic markers. You are given a transcript with the terms highlighted in brackets and capitalized. Provide an answer for every single term in the list of "terms" you are given. Return your answer in the following JSON format:

---BEGIN SAMPLE INPUT---
terms: ["believe", "maybe", "certainly", "best"]
transcript:
Speaker 1: "In the run for the presidency."
Speaker 2: "I knew about one incident. Understand the whole time that he ran for office, I knew that he had had one liaison. It still -- it still tore me up, I mean, personally tore me up. Did I think that one liaison would disqualify him to be the president? You know, we've had great presidents who I would hope one liaison would not have -- have stopped from serving us. That's what I believed. And I believed that until, golly, <MAYBE> long after it made any sense to but, <CERTAINLY> long after -- I mean, long after he was out of the race. And so sometimes I had to, you know, bite my tongue. I talked a lot about his policies, which I still <BELIEVE> were the <BEST> policies and set the standard for the other candidates on a lot of issues -- health care being one of them, but environment and poverty and corporate interference with government. And I really believed that that I could talk about those things and mean every word that I was saying, and have him as an advocate for those issues and meaning that as well."
---END SAMPLE INPUT---

---BEGIN SAMPLE RESPONSE---
    {{
        "believe": "hedge",
        "maybe": "hedge",
        "certainly": "authority",
        "best": "none"
    }}
---END SAMPLE RESPONSE---"""

In [2]:
import json
from openai import OpenAI
from pyauth import openaikey

client = OpenAI(api_key=openaikey)

sample_text = {
        "transcript_id": "CNN-235715",
        "statement_id": "640bf44d-6500-4a11-a473-4195d48baf31",
        "matched_terms": {
            "about": "none",
            "know": "none"
        },
        "previous_statement": "Actually this is",
        "statement": "The Vines that you've posted show some incredible sights and sounds, rockets, shaking buildings, ambulances in the night. What do you want people to <KNOW> <ABOUT> life inside Gaza?"
    }




In [55]:
def parse_json(json_response, verbose=False):
    matched_terms_list = []
    for matched_term in json_response["matched_terms"]:
        matched_terms_list.append(matched_term)

    string = f"""terms: {matched_terms_list}"\nTranscript:\nSpeaker 1: "{json_response["previous_statement"]}"\nSpeaker 2: "{json_response["statement"]}" """
    
    transcript_id = json_response["transcript_id"]
    statement_id = json_response["statement_id"]
    if verbose:
        print("Parsing", statement_id)
    previous_statement = json_response["previous_statement"]
    statement = json_response["statement"]
    
    if verbose:
        print("Parsed JSON:", transcript_id, statement_id, matched_terms_list, previous_statement, statement)

    return transcript_id, statement_id, json_response["matched_terms"], matched_terms_list, previous_statement, statement, matched_terms_list, string


def get_json_response(string, model="gpt-3.5-turbo", system_prompt=system_prompt, verbose=False):
    response = client.chat.completions.create(
        model=model,
        temperature=0,
        response_format={ "type": "json_object" },
        messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": string}
        ]
    )
    completion = response.choices[0].message.content
    parsed_response = json.loads(completion)
    if verbose:
        print("Received response from model.")

    return parsed_response

def update_grade_json_gpt(json_example, model = "gpt-3.5-turbo", verbose=False):
    transcript_id, statement_id, matched_terms, matched_terms_list, previous_statement, statement, matched_terms_list, string = parse_json(json_example)
    parsed_response = get_json_response(string)
    output_matched_items = {}
    # print("Matched Terms:",matched_terms_list)
    for item in matched_terms_list:
        if verbose:
            print("Matching item:", item)
            print("   Correct Response:", matched_terms[item])
            print("   Model Response:", parsed_response[item])
        item_vals = {
            "correct": matched_terms[item],
            model : parsed_response[item]
        }
        output_matched_items[item] = item_vals

    benchmark_output = {
        "transcript_id": transcript_id,
        "statement_id": statement_id,
        "matched_terms": output_matched_items,
        "previous_statement": previous_statement,
        "statement": statement
    }
    if verbose:
        print("Completed grading:", benchmark_output)    
    return benchmark_output

def insert_new_row_json(json_example, model = "gpt-3.5-turbo", verbose=False):
    transcript_id, statement_id, matched_terms, matched_terms_list, previous_statement, statement, matched_terms_list, string = parse_json(json_example)
    parsed_response = get_json_response(string)
    output_matched_items = {}
    # print("Matched Terms:",matched_terms_list)
    for item in matched_terms_list:
        if verbose:
            print("Matching item:", item)
            print("   Correct Response:", matched_terms[item])
            print("   Model Response:", parsed_response[item])
        item_vals = {
            "correct": matched_terms[item],
            model : parsed_response[item]
        }
        output_matched_items[item] = item_vals

    benchmark_output = {
        "transcript_id": transcript_id,
        "statement_id": statement_id,
        "matched_terms": output_matched_items,
        "previous_statement": previous_statement,
        "statement": statement
    }
    if verbose:
        print("Completed grading:", benchmark_output)    
    return benchmark_output


# x = update_grade_json_gpt(text)
# print(x)

#### Get GPT3.5 Responses

Update code to also incorporate 4, gemini, etc. Need to have the code update if something already exists

In [83]:
import json

# Load the JSON data from a file
file_path = 'data/human_annotated_model_response.json'
with open(file_path, 'r') as file:
    ground_truth = json.load(file)

print("Loaded", len(ground_truth), "examples from", file_path)

# output_dict = {f"{{sample['statement_id']}": sample for sample in ground_truth}
output_dict = {f"{sample['transcript_id']}_{sample['statement_id']}": sample for sample in ground_truth}

models_list = ["correct", "gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo"]
tally = {model: 0 for model in models_list}
i = 0
for sample in ground_truth:  # Adjust this to process more samples if needed
    transcript_id, statement_id, matched_terms, matched_terms_list, previous_statement, statement, matched_terms_list, string = parse_json(sample)
    
    # print("Matched_terms", matched_terms)

    # Iterate through only the first matched term
    first_term, models = next(iter(matched_terms.items()))
    
    # Iterate through each model name and its matched value
    for model_name in models_list:
        if model_name in models:
            # print("Skipping model", model_name)
            tally[model_name] += 1
        else:
            response = get_json_response(string, model=model_name)
            for item in matched_terms:
                matched_terms[item][model_name] = response[item]
            tally[model_name] += 1
    sample_output = {
        "transcript_id": transcript_id,
        "statement_id": statement_id,
        "matched_terms": matched_terms,
        "previous_statement": previous_statement,
        "statement": statement
    }
    output_dict[f"{transcript_id}_{statement_id}"] = sample_output
    i+=1
    if i % 10 == 0:
        output = list(output_dict.values())

        with open(file_path, 'w') as file:
            json.dump(output, file, indent=4, ensure_ascii=False)
        print(f"Processed and saved {i} samples")

output = list(output_dict.values())

with open(file_path, 'w') as file:
    json.dump(output, file, indent=4, ensure_ascii=False)

# Print the tally
print("\nModel Tally:")
for model, count in tally.items():
    print(f" {model}: {count}")

Loaded 249 examples from data/human_annotated_model_response.json
Processed and saved 10 samples
Processed and saved 20 samples
Processed and saved 30 samples
Processed and saved 40 samples
Processed and saved 50 samples
Processed and saved 60 samples
Processed and saved 70 samples
Processed and saved 80 samples
Processed and saved 90 samples
Processed and saved 100 samples
Processed and saved 110 samples
Processed and saved 120 samples
Processed and saved 130 samples
Processed and saved 140 samples
Processed and saved 150 samples
Processed and saved 160 samples
Processed and saved 170 samples
Processed and saved 180 samples
Processed and saved 190 samples
Processed and saved 200 samples
Processed and saved 210 samples
Processed and saved 220 samples
Processed and saved 230 samples
Processed and saved 240 samples

Model Tally:
 correct: 249
 gpt-3.5-turbo: 249
 gpt-4o: 249
 gpt-4-turbo: 249


In [5]:

# with open(file_path, 'w') as file:
#     json.dump(model_responses, file, indent=4, ensure_ascii=False)

In [84]:
import json

# Load the JSON data from a file
file_path = 'data/human_annotated_model_response.json'
with open(file_path, 'r') as file:
    data_list = json.load(file)

# Initialize a dictionary to hold the confusion matrix

model_list = ['gpt-3.5-turbo', 'gpt-4-turbo', "gpt-4o"]

for model in model_list:
    # Populate confusion matrix from the JSON data
    confusion_matrix = {}
    for entry in data_list:
        matched_terms = entry["matched_terms"]
        for term, details in matched_terms.items():
            correct_answer = details["correct"]
            model_answer = details[model]

            if correct_answer not in confusion_matrix:
                confusion_matrix[correct_answer] = {}

            if model_answer not in confusion_matrix[correct_answer]:
                confusion_matrix[correct_answer][model_answer] = 0

            confusion_matrix[correct_answer][model_answer] += 1

    # Calculate precision, recall, F1, and accuracy for each class

    print("\n\nResults for Model:", model)
    precisions = []
    recalls = []
    f1_scores = []
    accuracies = []

    total_correct = 0
    total_instances = 0

    for correct_class, predictions in confusion_matrix.items():
        true_positive = predictions.get(correct_class, 0)
        false_positive = sum(predictions.get(pred, 0) for pred in predictions if pred != correct_class)
        false_negative = sum(confusion_matrix.get(pred, {}).get(correct_class, 0) for pred in confusion_matrix if pred != correct_class)
        true_negative = total_instances - (true_positive + false_positive + false_negative)  # Not used directly

        precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
        recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        accuracy = true_positive / (true_positive + false_positive + false_negative) if (true_positive + false_positive + false_negative) > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        accuracies.append(accuracy)
        
        total_correct += true_positive
        total_instances += sum(predictions.values())

        print(f" * Class: {correct_class}")
        print(f"   Precision: {precision:.4f}")
        print(f"   Recall: {recall:.4f}")
        print(f"   F1 Score: {f1:.4f}")
        print(f"   Accuracy: {accuracy:.4f}")
        print()

    print("Overall Model Performance:")
    print(f"Average Precision: {sum(precisions) / len(precisions):.4f}")
    print(f"Average Recall: {sum(recalls) / len(recalls):.4f}")
    print(f"Average F1 Score: {sum(f1_scores) / len(f1_scores):.4f}")
    print(f"Average Accuracy: {sum(accuracies) / len(accuracies):.4f}")
    print(f"Total Accuracy: {total_correct / total_instances:.4f}")




Results for Model: gpt-3.5-turbo
 * Class: none
   Precision: 0.5345
   Recall: 0.9029
   F1 Score: 0.6715
   Accuracy: 0.5054

 * Class: hedge
   Precision: 0.6540
   Recall: 0.6603
   F1 Score: 0.6571
   Accuracy: 0.4894

 * Class: authority
   Precision: 0.8571
   Recall: 0.2176
   F1 Score: 0.3471
   Accuracy: 0.2100

Overall Model Performance:
Average Precision: 0.6819
Average Recall: 0.5936
Average F1 Score: 0.5586
Average Accuracy: 0.4016
Total Accuracy: 0.6020


Results for Model: gpt-4-turbo
 * Class: none
   Precision: 0.7787
   Recall: 0.9377
   F1 Score: 0.8509
   Accuracy: 0.7404

 * Class: hedge
   Precision: 0.9242
   Recall: 0.7358
   F1 Score: 0.8193
   Accuracy: 0.6940

 * Class: authority
   Precision: 0.7143
   Recall: 0.6481
   F1 Score: 0.6796
   Accuracy: 0.5147

Overall Model Performance:
Average Precision: 0.8057
Average Recall: 0.7739
Average F1 Score: 0.7833
Average Accuracy: 0.6497
Total Accuracy: 0.8240


Results for Model: gpt-4o
 * Class: none
   Precis

In [63]:
import json

# Load the JSON data from a file
file_path = 'data/human_annotated_model_response.json'
model = "gpt-4"  # Example new model

# Read the existing data from the file
try:
    with open(file_path, 'r') as file:
        model_responses = json.load(file)  # Load existing data into a Python data structure
        print("File Found")
except FileNotFoundError:
    model_responses = []  # Assuming the file might not exist and the root is a list
    print("File Created")

existing_ids = {item['statement_id'] for item in model_responses}
print("existing_ids:", existing_ids)

# Function to update grade JSON using GPT model
def update_grade_json_gpt(json_example, model="gpt-4", verbose=True):
    transcript_id, statement_id, matched_terms, previous_statement, statement, matched_terms_list, string = parse_json(json_example, verbose)
    parsed_response = get_json_response(string, model=model, verbose=verbose)
    output_matched_items = {}

    for item in matched_terms_list:
        if verbose:
            print("Matching item:", item)
            print("   Correct Response:", matched_terms[item])
            print("   Model Response:", parsed_response[item])
        item_vals = {
            "correct": matched_terms[item]["correct"],
            model: parsed_response[item]
        }
        output_matched_items[item] = item_vals

    # Merge the existing matched_terms with the new model results
    for term, values in output_matched_items.items():
        if term in matched_terms:
            matched_terms[term][model] = values[model]
        else:
            matched_terms[term] = values

    benchmark_output = {
        "transcript_id": transcript_id,
        "statement_id": statement_id,
        "matched_terms": matched_terms,
        "previous_statement": previous_statement,
        "statement": statement
    }
    if verbose:
        print("Completed grading:", benchmark_output)
    return benchmark_output


# Iterate through examples in ground_truth
for example in ground_truth[:1]:
    statement_id = example['statement_id']

    # Check if statement_id is already in the data
    if statement_id in existing_ids:
        # Find the corresponding entry in model_responses
        print(f"Statement ID {statement_id} found in data. Checking for model results")
        matched_entry = next(item for item in model_responses if item['statement_id'] == statement_id)

        # Update the existing entry with the new model results
        updated_data = update_grade_json_gpt(matched_entry, model=model)
        model_responses = [updated_data if item['statement_id'] == statement_id else item for item in model_responses]
    else:
        print(f"Statement ID {statement_id} not in data. Updating")
        new_data = update_grade_json_gpt(example, model=model)
        model_responses.append(new_data)
        existing_ids.add(statement_id)  # Add new statement_id to the set

# Save the updated model responses back to the file
with open(file_path, 'w') as outfile:
    json.dump(model_responses, outfile, indent=2)


File Found
existing_ids: {'00bf56cc-fe9c-4d47-a819-4c9afc2f752e', '02fc136b-ee5e-48ee-9f49-a23393726098', '62d61c24-0dd4-4eee-9b39-5411249b7eed', '163388f2-1595-42d0-b05c-133fdf728565', 'dfd8e585-a87f-4f2a-abb6-bfa9a524faee', '7e3f2290-f887-4c2b-b197-f2ebf2fdb1ca', '982c73d7-30e3-48fe-8190-588e49c92d1c', '81af1662-0e4f-48be-8a6d-43f88d61e057', '96b7405f-271e-4a81-8c32-c8f2002bcfa2', 'b3ce21d5-64b0-4d6a-99f0-6a4220ef242b', 'c6c25597-79ab-4b73-b03a-382b1d458dce', '6fc74628-4118-4440-b1bb-e270006ba947', '4cc981b3-c351-4539-ba7f-fb4d4f5b8e94', '84be6cab-37c1-4322-87cf-3419ad31d53f', 'ba25fea4-9b70-4969-91e5-32daabe0ac11', '627a7126-325b-4299-98d4-ef1d119bf740', '640bf44d-6500-4a11-a473-4195d48baf31', '764918f9-d387-4693-924a-3c0feb09c51e', '1f0054fd-2145-4ace-9b93-4b647ed610b6', '586f558e-edc0-4a87-a4a7-a3e2e5b94206', '70f315e2-8b16-44a4-8d37-9222180ea992', '52ccdd93-9217-4ce4-8a9f-4db59a04fb63', 'be90174f-d9ae-4768-80fa-61e61354b93e', '5887e546-168a-474f-ae38-14e7aead8ce6', '9dd5620e-e89f

ValueError: too many values to unpack (expected 7)