In [1]:
from prompts import system_prompt

In [2]:
import json
from openai import OpenAI
from pyauth import openaikey

client = OpenAI(api_key=openaikey)

sample_text = {
        "transcript_id": "CNN-235715",
        "statement_id": "640bf44d-6500-4a11-a473-4195d48baf31",
        "matched_terms": {
            "about": "none",
            "know": "none"
        },
        "previous_statement": "Actually this is",
        "statement": "The Vines that you've posted show some incredible sights and sounds, rockets, shaking buildings, ambulances in the night. What do you want people to <KNOW> <ABOUT> life inside Gaza?"
    }

# transcript_id, statement_id, matched_terms, matched_terms_list, previous_statement, statement, matched_terms_list, string = parse_json(sample_text)

In [3]:
# # # print(system_prompt)
# # # print(string)

# google_prompt = system_prompt + "\n-----BEGIN INPUT-----\n" + string
# print(google_prompt)

In [3]:
import pathlib
import textwrap
import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown

from pyauth import gemini_api_key
genai.configure(api_key= gemini_api_key)

def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

model = genai.GenerativeModel('gemini-1.5-flash',
                              generation_config={"response_mime_type": "application/json"})

# prompt = google_prompt

# response = model.generate_content(prompt)
# print(response.text)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# import anthropic
# from pyauth import claudeapikey

# client = anthropic.Client(api_key=claudeapikey)

# response = client.messages.create(
#     model="claude-2.1",
#     max_tokens=50,
#     system=system_prompt, # <-- system prompt
#     messages=[
#         {"role": "user", "content": string} # <-- user prompt
#     ]
# )

# print(response.content)

[TextBlock(text='terms: [\'about\', \'know\']\ntranscript:\nSpeaker 1: "Actually this is"\nSpeaker 2: "The Vines that you\'ve posted show some incredible sights and sounds, rockets, shaking buildings, ambulances in the', type='text')]


In [99]:
# print(response)

Message(id='msg_01RoFtC87msrxzRQYrxxaB8j', content=[TextBlock(text="Unfortunately there are no terms provided in brackets in the transcript. The terms list contains 'about' and 'know', but those words do not appear bracketed in capital letters in the transcript. So there are no terms I can analyze and provide answers for. Please provide a transcript with bracketed and capitalized terms that match the list of terms you want me to analyze.", type='text')], model='claude-2.1', role='assistant', stop_reason='end_turn', stop_sequence=None, type='message', usage=Usage(input_tokens=482, output_tokens=78))


In [4]:
import time 
def parse_json(json_response, verbose=False):
    matched_terms_list = []
    for matched_term in json_response["matched_terms"]:
        matched_terms_list.append(matched_term)

    string = f"""terms: {matched_terms_list}"\nTranscript:\nSpeaker 1: "{json_response["previous_statement"]}"\nSpeaker 2: "{json_response["statement"]}" """
    
    transcript_id = json_response["transcript_id"]
    statement_id = json_response["statement_id"]
    if verbose:
        print("Parsing", statement_id)
    previous_statement = json_response["previous_statement"]
    statement = json_response["statement"]
    
    if verbose:
        print("Parsed JSON:", transcript_id, statement_id, matched_terms_list, previous_statement, statement)
    matched_terms = json_response["matched_terms"]

    return transcript_id, statement_id, matched_terms, matched_terms_list, previous_statement, statement, matched_terms_list, string




def get_json_response_gemini(string, model_name='gemini-1.5-flash', system_prompt=system_prompt, verbose=False):
    model = genai.GenerativeModel(model_name,
                              generation_config={"response_mime_type": "application/json"})

    prompt = system_prompt + "\n-----BEGIN INPUT-----\n" + string

    response = model.generate_content(prompt).text
    # print(response)
    try:
        parsed_response = json.loads(response)
        if verbose:
            print("Received response from model.")

        return parsed_response
    except json.JSONDecodeError:
        print("Error decoding JSON response.")
    except Exception as e:
        print("Error:", e)
        time.sleep(60)

def get_json_response(string, model="gpt-3.5-turbo", system_prompt=system_prompt, verbose=False):
    response = client.chat.completions.create(
        model=model,
        temperature=0,
        response_format={ "type": "json_object" },
        messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": string}
        ]
    )
    completion = response.choices[0].message.content
    try:
        parsed_response = json.loads(completion)
        if verbose:
            print("Received response from model.")
        return parsed_response
    
    except json.JSONDecodeError:
        print("Error decoding JSON response.")
    except Exception as e:
        print("Error:", e)

def update_grade_json_gpt(json_example, model = "gpt-3.5-turbo", verbose=False):
    transcript_id, statement_id, matched_terms, matched_terms_list, previous_statement, statement, matched_terms_list, string = parse_json(json_example)
    parsed_response = get_json_response(string)
    output_matched_items = {}
    # print("Matched Terms:",matched_terms_list)
    for item in matched_terms_list:
        if verbose:
            print("Matching item:", item)
            print("   Correct Response:", matched_terms[item])
            print("   Model Response:", parsed_response[item])
        item_vals = {
            "correct": matched_terms[item],
            model : parsed_response[item]
        }
        output_matched_items[item] = item_vals

    benchmark_output = {
        "transcript_id": transcript_id,
        "statement_id": statement_id,
        "matched_terms": output_matched_items,
        "previous_statement": previous_statement,
        "statement": statement
    }
    if verbose:
        print("Completed grading:", benchmark_output)    
    return benchmark_output

def insert_new_row_json(json_example, model = "gpt-3.5-turbo", verbose=False):
    transcript_id, statement_id, matched_terms, matched_terms_list, previous_statement, statement, matched_terms_list, string = parse_json(json_example)
    parsed_response = get_json_response(string)
    output_matched_items = {}
    # print("Matched Terms:",matched_terms_list)
    for item in matched_terms_list:
        if verbose:
            print("Matching item:", item)
            print("   Correct Response:", matched_terms[item])
            print("   Model Response:", parsed_response[item])
        item_vals = {
            "correct": matched_terms[item],
            model : parsed_response[item]
        }
        output_matched_items[item] = item_vals

    benchmark_output = {
        "transcript_id": transcript_id,
        "statement_id": statement_id,
        "matched_terms": output_matched_items,
        "previous_statement": previous_statement,
        "statement": statement
    }
    if verbose:
        print("Completed grading:", benchmark_output)    
    return benchmark_output


# x = update_grade_json_gpt(text)
# print(x)

#### Get GPT3.5 Responses

Update code to also incorporate 4, gemini, etc. Need to have the code update if something already exists

In [5]:
models_list = ["correct", "gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gemini-1.5-flash", "gemini-1.5-pro"]

In [6]:

import time
import json

# Load the JSON data from a file
file_path = 'data/human_annotated_model_response.json'
with open(file_path, 'r') as file:
    ground_truth = json.load(file)

print("Loaded", len(ground_truth), "examples from", file_path)

# output_dict = {f"{{sample['statement_id']}": sample for sample in ground_truth}
output_dict = {f"{sample['transcript_id']}_{sample['statement_id']}": sample for sample in ground_truth}

tally = {model: 0 for model in models_list}

i = 0
j = 0
for sample in ground_truth:  # Adjust this to process more samples if needed
    j+=1
    transcript_id, statement_id, matched_terms, matched_terms_list, previous_statement, statement, matched_terms_list, string = parse_json(sample)
    
    # print("Matched_terms", matched_terms)

    # Iterate through only the first matched term
    first_term, models = next(iter(matched_terms.items()))
    
    # Iterate through each model name and its matched value
    for model_name in models_list:
        if model_name in models:
            # print("Skipping model", model_name)
            tally[model_name] += 1
        else:
            # response = get_json_response(string, model=model_name)
            if model_name == "correct":
                asdfas = 0
            elif "gpt" in model_name:
                try:
                    response = get_json_response(string, model=model_name)
                    # print(response)
                    for item in matched_terms:
                        matched_terms[item][model_name] = response[item]
                    tally[model_name] += 1
                except Exception as e:
                    print(f"Error on {model_name}: {e}")
            elif "gemini" in model_name:
                try:
                    response = get_json_response_gemini(string, model_name=model_name)
                    for item in matched_terms:
                        matched_terms[item][model_name] = response[item]
                    tally[model_name] += 1
                    i+=1
                except Exception as e:
                    print(f"Error on {model_name}: {e}")
                    # print(f"Error on {model_name}: {e}, on string: {string}")
            else:
                "Model Not Recognized"

    sample_output = {
        "transcript_id": transcript_id,
        "statement_id": statement_id,
        "matched_terms": matched_terms,
        "previous_statement": previous_statement,
        "statement": statement
    }
    output_dict[f"{transcript_id}_{statement_id}"] = sample_output
    if (i+1) % 15 == 0:
        output = list(output_dict.values())

        with open(file_path, 'w') as file:
            json.dump(output, file, indent=4, ensure_ascii=False)
        print(f"Processed and saved {j} samples, with {i} Gemini samples")
        # time.sleep(60)
    elif j % 100 == 0:
        output = list(output_dict.values())

        with open(file_path, 'w') as file:
            json.dump(output, file, indent=4, ensure_ascii=False)
        print(f"Processed and saved {j} samples, with {i} Gemini samples")

output = list(output_dict.values())

with open(file_path, 'w') as file:
    json.dump(output, file, indent=4, ensure_ascii=False)

# Print the tally
print("\nModel Tally:")
for model, count in tally.items():
    print(f" {model}: {count}")

Loaded 741 examples from data/human_annotated_model_response.json
Processed and saved 100 samples, with 0 Gemini samples
Processed and saved 200 samples, with 0 Gemini samples
Processed and saved 300 samples, with 0 Gemini samples
Processed and saved 400 samples, with 0 Gemini samples
Processed and saved 500 samples, with 0 Gemini samples
Processed and saved 600 samples, with 2 Gemini samples
Processed and saved 606 samples, with 14 Gemini samples
Processed and saved 621 samples, with 44 Gemini samples
Processed and saved 636 samples, with 74 Gemini samples
Processed and saved 651 samples, with 104 Gemini samples
Processed and saved 666 samples, with 134 Gemini samples
Processed and saved 681 samples, with 164 Gemini samples
Processed and saved 696 samples, with 194 Gemini samples
Processed and saved 700 samples, with 202 Gemini samples
Processed and saved 711 samples, with 224 Gemini samples
Processed and saved 726 samples, with 254 Gemini samples
Processed and saved 741 samples, with

In [5]:

# with open(file_path, 'w') as file:
#     json.dump(model_responses, file, indent=4, ensure_ascii=False)

In [7]:
import json

# Load the JSON data from a file
file_path = 'data/human_annotated_model_response.json'
with open(file_path, 'r') as file:
    data_list = json.load(file)

# Initialize a dictionary to hold the confusion matrix

for model in models_list:
    # Populate confusion matrix from the JSON data
    confusion_matrix = {}
    for entry in data_list:
        matched_terms = entry["matched_terms"]
        for term, details in matched_terms.items():
            correct_answer = details["correct"]
            model_answer = details[model]

            if correct_answer not in confusion_matrix:
                confusion_matrix[correct_answer] = {}

            if model_answer not in confusion_matrix[correct_answer]:
                confusion_matrix[correct_answer][model_answer] = 0

            confusion_matrix[correct_answer][model_answer] += 1

    # Calculate precision, recall, F1, and accuracy for each class
    print("###"*20)

    print("Results for Model:", model)
    precisions = []
    recalls = []
    f1_scores = []
    accuracies = []

    total_correct = 0
    total_instances = 0

    for correct_class, predictions in confusion_matrix.items():
        true_positive = predictions.get(correct_class, 0)
        false_positive = sum(predictions.get(pred, 0) for pred in predictions if pred != correct_class)
        false_negative = sum(confusion_matrix.get(pred, {}).get(correct_class, 0) for pred in confusion_matrix if pred != correct_class)
        true_negative = total_instances - (true_positive + false_positive + false_negative)  # Not used directly

        precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
        recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        accuracy = true_positive / (true_positive + false_positive + false_negative) if (true_positive + false_positive + false_negative) > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        accuracies.append(accuracy)
        
        total_correct += true_positive
        total_instances += sum(predictions.values())

        print(f" * Class: {correct_class}")
        print(f"   F1 Score: {f1:.4f}")
        print(f"   Precision: {precision:.4f}")
        print(f"   Recall: {recall:.4f}")
        print(f"   Accuracy: {accuracy:.4f}")
        print()

    print("Overall Model Performance:")
    print(f"Average F1 Score: {sum(f1_scores) / len(f1_scores):.4f}")
    print(f"Average Precision: {sum(precisions) / len(precisions):.4f}")
    print(f"Average Recall: {sum(recalls) / len(recalls):.4f}")
    print(f"Total Accuracy: {total_correct / total_instances:.4f}")
    print("\n\n")


############################################################
Results for Model: correct
 * Class: none
   F1 Score: 1.0000
   Precision: 1.0000
   Recall: 1.0000
   Accuracy: 1.0000

 * Class: hedge
   F1 Score: 1.0000
   Precision: 1.0000
   Recall: 1.0000
   Accuracy: 1.0000

 * Class: authority
   F1 Score: 1.0000
   Precision: 1.0000
   Recall: 1.0000
   Accuracy: 1.0000

Overall Model Performance:
Average F1 Score: 1.0000
Average Precision: 1.0000
Average Recall: 1.0000
Total Accuracy: 1.0000



############################################################
Results for Model: gpt-3.5-turbo
 * Class: none
   F1 Score: 0.5026
   Precision: 0.3462
   Recall: 0.9170
   Accuracy: 0.3356

 * Class: hedge
   F1 Score: 0.6774
   Precision: 0.7664
   Recall: 0.6068
   Accuracy: 0.5121

 * Class: authority
   F1 Score: 0.4880
   Precision: 0.8918
   Recall: 0.3359
   Accuracy: 0.3228

Overall Model Performance:
Average F1 Score: 0.5560
Average Precision: 0.6681
Average Recall: 0.6199
Total Ac