In [1]:
import os
import json
from openai import OpenAI
from pydantic import BaseModel, Field
from IPython.display import HTML, display
import pandas as pd

In [None]:
# Read the API key from the environment so the secret is never stored in the notebook.
key = os.environ.get("OPENAI_API_KEY")
if not key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before starting the notebook kernel."
    )
client = OpenAI(api_key=key)

## Prompts

In [3]:
import importlib
import prompts
# NOTE(review): the star import presumably supplies intro_prompt, output_format and the
# criteria_prompt* strings used in this notebook — confirm against prompts.py.
from prompts import *
# Join the introduction and the output-format instructions into one string, separated by a newline.
intro_and_output = "\n".join([intro_prompt, output_format])
# Render the combined prompt as HTML in the notebook.
display(HTML(intro_and_output))

# Functions

In [4]:
# Directory that is scanned for input documents.
folder = "files"

# Alphabetically ordered names of the .txt / .md files in that directory
# (the extension match is case-insensitive).
plain_text_files = sorted(
    entry
    for entry in os.listdir(folder)
    if entry.lower().endswith((".txt", ".md"))
)

In [5]:
def responses_call(document, intro_prompt, criteria_prompt, model='gpt-4o', temperature=0.1):
    """
    Send one document to the OpenAI Responses API and return the reply text.

    Parameters
    ----------
    document : str
        Text of the paper; appended after "Here is the paper:" in the user message.
    intro_prompt : str
        Passed as the `instructions` argument of the request.
    criteria_prompt : str
        Sent as the first text part of the user message.
    model : str, optional
        Model name for the request. Defaults to 'gpt-4o'.
    temperature : float, optional
        Sampling temperature for the request. Defaults to 0.1.

    Returns
    -------
    str
        The `output_text` attribute of the API response.
    """
    # A single user message with two text parts: the criteria first, then the paper.
    user_message = {
        "role": "user",
        "content": [
            {"type": "input_text", "text": f"{criteria_prompt}\n"},
            {"type": "input_text", "text": f"\nHere is the paper:\n{document}"},
        ],
    }
    # `client` is the module-level OpenAI client created earlier in the notebook.
    response = client.responses.create(
        model=model,
        temperature=temperature,
        instructions=intro_prompt,
        input=[user_message],
    )
    return response.output_text

In [6]:
def extract_responses(intro_prompt, criteria_prompt):
    """
    Run the model on every file in `plain_text_files` and collect the replies.

    Each file is read from `folder` as UTF-8 and passed to `responses_call`
    together with the two prompts.

    Returns a list of dicts, one per file in listing order, each with the keys
    'file_name' and 'response_text'.
    """
    responses = []
    for file_name in plain_text_files:
        with open(os.path.join(folder, file_name), "r", encoding="utf-8") as f:
            document = f.read()
        response_text = responses_call(
            document, intro_prompt=intro_prompt, criteria_prompt=criteria_prompt
        )
        responses.append({
            'file_name': file_name,
            'response_text': response_text,
        })
    return responses

In [7]:
def add_results_to_dataframe(responses, criteria_name, answers_path='airs_answers.csv'):
    """
    Add model results for one criterion to the global `test_results` dataframe
    and include the human answers read from a CSV file.

    Parameters
    ----------
    responses : list of dict
        Items with the keys 'file_name' and 'response_text'. 'response_text' is
        parsed as JSON and its 'Reply' value is stored ('N/A' when the key is absent).
    criteria_name : str
        Column to write in `test_results`; also the column read from the answers CSV.
        The human answers are stored under f'{criteria_name}_human'.
    answers_path : str, optional
        CSV file with an 'id' column and a column named `criteria_name`.
        Defaults to 'airs_answers.csv'.

    Returns
    -------
    pandas.DataFrame
        The updated global `test_results`.

    Raises
    ------
    KeyError
        If the answers CSV has no column named `criteria_name`.
    """
    global test_results

    # Ids are 1-based positions in `responses`; they are the join key for both
    # the model replies and the human answers.
    reply_by_id = {
        position: json.loads(resp['response_text']).get('Reply', 'N/A')
        for position, resp in enumerate(responses, start=1)
    }

    # Load and validate the human answers before touching the global dataframe,
    # so a bad CSV does not leave `test_results` half-updated.
    human_answers = pd.read_csv(answers_path)
    if criteria_name not in human_answers.columns:
        raise KeyError(
            f"Column '{criteria_name}' not found in {answers_path}"
        )
    human_by_id = dict(zip(human_answers['id'], human_answers[criteria_name]))

    try:
        test_results
    except NameError:
        # First call in this kernel: create the dataframe from the responses.
        test_results = pd.DataFrame({
            'id': list(reply_by_id),
            'names': [resp['file_name'] for resp in responses],
        })

    # Assigning replaces any existing columns of the same name.
    human_column_name = f'{criteria_name}_human'
    test_results[criteria_name] = test_results['id'].map(reply_by_id)
    test_results[human_column_name] = test_results['id'].map(human_by_id)

    # Show only the columns relevant to this criterion.
    relevant_columns = ['names', criteria_name, human_column_name]
    display(test_results[relevant_columns])

    return test_results

In [8]:
def display_responses(responses):
    """
    Display responses with filename, explanation, and result.

    Each item's 'response_text' is parsed as JSON; its 'Explanation' and 'Reply'
    values are shown ('N/A' when a key is absent). The file name and the model
    text are HTML-escaped before rendering, so characters such as '<' or '&'
    appear literally instead of being interpreted as markup.
    """
    from html import escape  # local import so this cell does not depend on the import cell

    for i, resp in enumerate(responses, 1):
        response_data = json.loads(resp['response_text'])
        # str() guards against non-string JSON values (e.g. a list or number).
        file_name = escape(str(resp['file_name']), quote=False)
        explanation = escape(str(response_data.get('Explanation', 'N/A')), quote=False)
        result = escape(str(response_data.get('Reply', 'N/A')), quote=False)

        # Display each part
        display(HTML(f"<h3>Response {i}</h3>"))
        display(HTML(f"<strong>File Name:</strong> {file_name}"))
        display(HTML(f"<strong>Explanation:</strong> {explanation}"))
        display(HTML(f"<strong>Result:</strong> {result}"))
        display(HTML("<hr>"))


# Criteria 1

## 1.2

### V1

In [9]:
# Show the first prompt for criterion 1.2, then run it over every document.
display(HTML(criteria_prompt12))
responses12 = extract_responses(intro_prompt=intro_and_output, criteria_prompt=criteria_prompt12)
# Store the replies next to the human answers and display the comparison table.
test_results = add_results_to_dataframe(responses12, 'criteria_1_2')

Unnamed: 0,names,criteria_1_2,criteria_1_2_human
0,"Allcott, H (2011).md",**Yes**,no
1,"Andor, Gerster, Peters, Schmidt, 2020.md",Yes,no
2,"Asensio, OI; Delmas, MA (2016).md",Yes,yes
3,"Ayres, Raseman, Shih, 2012.md",Yes,no
4,"Bager, S; Mundaca, L (2017).md",No,no
5,"Becker, LJ; Rabinowitz, VC; Seligman, C.md",No,no
6,"Carroll, J; Lyons, S; Denny, E (2014).md",Yes,no
7,"Houde, S; Todd, A; Sudarshan, A; Flora, JA; Ar...",**Yes**,no
8,"Matsukawa, I. (2018).md",**Yes**,no
9,"Nguyen, TTK; Shimada, K; Ochi, Y; Matsumoto, T...",Yes,no


In [10]:
# Show file name, explanation and result for each response collected above.
display_responses(responses12)

### V2

In [11]:
display(HTML(criteria_prompt12_v2))
# errors='ignore' keeps this cell re-runnable: if a previous run dropped the columns
# and then failed before re-adding them, a plain drop would raise KeyError.
test_results = test_results.drop(
    columns=['criteria_1_2', 'criteria_1_2_human'],
    errors='ignore',
)
responses12 = extract_responses(
    intro_prompt=intro_and_output,
    criteria_prompt=criteria_prompt12_v2,
)
test_results = add_results_to_dataframe(responses12, 'criteria_1_2')

Unnamed: 0,names,criteria_1_2,criteria_1_2_human
0,"Allcott, H (2011).md",No,no
1,"Andor, Gerster, Peters, Schmidt, 2020.md",No,no
2,"Asensio, OI; Delmas, MA (2016).md",**Yes**,yes
3,"Ayres, Raseman, Shih, 2012.md",**Yes**,no
4,"Bager, S; Mundaca, L (2017).md",No,no
5,"Becker, LJ; Rabinowitz, VC; Seligman, C.md",No,no
6,"Carroll, J; Lyons, S; Denny, E (2014).md",**No**,no
7,"Houde, S; Todd, A; Sudarshan, A; Flora, JA; Ar...",No,no
8,"Matsukawa, I. (2018).md",No,no
9,"Nguyen, TTK; Shimada, K; Ochi, Y; Matsumoto, T...",No,no


In [12]:
# Show file name, explanation and result for each response collected above.
display_responses(responses12)

### V3

In [13]:
display(HTML(criteria_prompt12_v3))
# No drop is needed here: add_results_to_dataframe assigns both criteria_1_2 columns,
# which replaces the values left by the previous run.
responses12 = extract_responses(
    intro_prompt = intro_and_output, 
    criteria_prompt=criteria_prompt12_v3
    )
test_results = add_results_to_dataframe(responses12, 'criteria_1_2')

Unnamed: 0,names,criteria_1_2,criteria_1_2_human
0,"Allcott, H (2011).md",No,no
1,"Andor, Gerster, Peters, Schmidt, 2020.md",No,no
2,"Asensio, OI; Delmas, MA (2016).md",Yes,yes
3,"Ayres, Raseman, Shih, 2012.md",Yes,no
4,"Bager, S; Mundaca, L (2017).md",No,no
5,"Becker, LJ; Rabinowitz, VC; Seligman, C.md",No,no
6,"Carroll, J; Lyons, S; Denny, E (2014).md",No,no
7,"Houde, S; Todd, A; Sudarshan, A; Flora, JA; Ar...",No,no
8,"Matsukawa, I. (2018).md",No,no
9,"Nguyen, TTK; Shimada, K; Ochi, Y; Matsumoto, T...",No,no


In [14]:
# Show file name, explanation and result for each response collected above.
display_responses(responses12)

## 1.3

### V1

In [16]:
# Show the first prompt for criterion 1.3, then run it over every document.
display(HTML(criteria_prompt13))
responses13 = extract_responses(intro_prompt=intro_and_output, criteria_prompt=criteria_prompt13)
# Store the replies next to the human answers and display the comparison table.
test_results = add_results_to_dataframe(responses13, 'criteria_1_3')

Unnamed: 0,names,criteria_1_3,criteria_1_3_human
0,"Allcott, H (2011).md",**Yes**,yes
1,"Andor, Gerster, Peters, Schmidt, 2020.md",No,no
2,"Asensio, OI; Delmas, MA (2016).md",**Yes**,no
3,"Ayres, Raseman, Shih, 2012.md",Yes,no
4,"Bager, S; Mundaca, L (2017).md",No,no
5,"Becker, LJ; Rabinowitz, VC; Seligman, C.md",No,yes
6,"Carroll, J; Lyons, S; Denny, E (2014).md",**Yes**,yes
7,"Houde, S; Todd, A; Sudarshan, A; Flora, JA; Ar...",Yes,no
8,"Matsukawa, I. (2018).md",**Yes**,yes
9,"Nguyen, TTK; Shimada, K; Ochi, Y; Matsumoto, T...",**Yes**,yes


In [17]:
# Show file name, explanation and result for each response collected above.
display_responses(responses13)

### V2

In [18]:
display(HTML(criteria_prompt13_v2))
# errors='ignore' keeps this cell re-runnable: if a previous run dropped the columns
# and then failed before re-adding them, a plain drop would raise KeyError.
test_results = test_results.drop(
    columns=['criteria_1_3', 'criteria_1_3_human'],
    errors='ignore',
)
responses13 = extract_responses(
    intro_prompt=intro_and_output,
    criteria_prompt=criteria_prompt13_v2,
)
test_results = add_results_to_dataframe(responses13, 'criteria_1_3')

Unnamed: 0,names,criteria_1_3,criteria_1_3_human
0,"Allcott, H (2011).md",**No**,yes
1,"Andor, Gerster, Peters, Schmidt, 2020.md",No,no
2,"Asensio, OI; Delmas, MA (2016).md",No,no
3,"Ayres, Raseman, Shih, 2012.md",Yes,no
4,"Bager, S; Mundaca, L (2017).md",No,no
5,"Becker, LJ; Rabinowitz, VC; Seligman, C.md",No,yes
6,"Carroll, J; Lyons, S; Denny, E (2014).md",**No**,yes
7,"Houde, S; Todd, A; Sudarshan, A; Flora, JA; Ar...",No,no
8,"Matsukawa, I. (2018).md",No,yes
9,"Nguyen, TTK; Shimada, K; Ochi, Y; Matsumoto, T...",No,yes


In [19]:
# Show file name, explanation and result for each response collected above.
display_responses(responses13)

### V3

In [20]:
display(HTML(criteria_prompt13_v3))
# errors='ignore' keeps this cell re-runnable: if a previous run dropped the columns
# and then failed before re-adding them, a plain drop would raise KeyError.
test_results = test_results.drop(
    columns=['criteria_1_3', 'criteria_1_3_human'],
    errors='ignore',
)
responses13 = extract_responses(
    intro_prompt=intro_and_output,
    criteria_prompt=criteria_prompt13_v3,
)
test_results = add_results_to_dataframe(responses13, 'criteria_1_3')

Unnamed: 0,names,criteria_1_3,criteria_1_3_human
0,"Allcott, H (2011).md",No,yes
1,"Andor, Gerster, Peters, Schmidt, 2020.md",**No**,no
2,"Asensio, OI; Delmas, MA (2016).md",No,no
3,"Ayres, Raseman, Shih, 2012.md",No,no
4,"Bager, S; Mundaca, L (2017).md",No,no
5,"Becker, LJ; Rabinowitz, VC; Seligman, C.md",No,yes
6,"Carroll, J; Lyons, S; Denny, E (2014).md",No,yes
7,"Houde, S; Todd, A; Sudarshan, A; Flora, JA; Ar...",No,no
8,"Matsukawa, I. (2018).md",**No**,yes
9,"Nguyen, TTK; Shimada, K; Ochi, Y; Matsumoto, T...",No,yes


In [21]:
# Show file name, explanation and result for each response collected above.
display_responses(responses13)