In [11]:
import pandas as pd
import os
from pathlib import Path
import json
import jsonlines
from pprint import pprint
import tiktoken
import openai
import xmltodict as xmd
from dotenv import load_dotenv

load_dotenv()


True

In [111]:
def process_xml(path: Path, dataset: str):
    '''
    process_xml : Converts xml file from scientsbank or beetle dataset to a pandas DataFrame
    ----------
    Parameters
    path : pathlib.Path
        The path to the xml file which you want to convert
    dataset : str
        The dataset being processed - either "scientsbank" or "beetle" (otehrwise raises ValueError)

    Returns
    -------
    df : pandas.DataFrame
        The dataframe containing the data from that xml file
    '''

    dataset = dataset.lower()

    if dataset not in ['scientsbank', 'beetle']:
        raise ValueError(f'\"{dataset}\" is not a valid value for \'dataset\' input. Use either \"sciEntsBank\" or \"beetle\".')
    with open(path, 'r') as f:
        data = f.read()
    master = xmd.parse(data)

    q_id = master['question']['@id']; q_text = master['question']['questionText']; q_module =  master['question']['@module']

    
    reference_answers = master['question']['referenceAnswers']['referenceAnswer']

    
    reference_processed = [None, None, None]
    if dataset == 'scientsbank':
        best_answers = [reference_answers]
        good_answers = []
        minimal_answers = []




    elif type(reference_answers) == dict:
        best_answers = [reference_answers if reference_answers['@category'] == 'BEST' else []]
        good_answers = [reference_answers if reference_answers['@category'] == 'GOOD' else []]
        minimal_answers = [reference_answers if reference_answers['@category'] == 'MINIMAL' else []]




    elif type(reference_answers) in [set, list]:
        best_answers = [a for a in reference_answers if a['@category'] == 'BEST']
        good_answers = [a for a in reference_answers if a['@category'] == 'GOOD']
        minimal_answers = [a for a in reference_answers if a['@category'] == 'MINIMAL']

        

    
  
    reference_processed = [best_answers, good_answers, minimal_answers]
    answers = master['question']['studentAnswers']['studentAnswer']
    data = []
    for answer in answers:
        data.append(
            [q_id, q_text, q_module] + reference_processed + [answer['@id'], answer['#text'], answer['@accuracy']]
        )
    
    columns = ['question_id', 'question_text', 'module', 'best_answers', 'good_answers', 'minimal_answers', 'answer_id', 'answer_text', 'accuracy']
    df = pd.DataFrame(data=data, columns=columns)
    return df

    
def process_dir(path : Path, dataset, write_filepath):
    if dataset.lower() not in ['scientsbank', 'beetle']:
        raise ValueError(f'\"{dataset}\" is not a valid value for \'dataset\' input. Use either \"sciEntsBank\" or \"beetle\".')
    files = os.listdir(path)
    dfs = {}
    for file in files:
        dfs[file] = process_xml(path / file, dataset)
    

    joined = pd.concat(dfs.values(), ignore_index=True)
    return joined

def get_prompt(df : pd.DataFrame, ways : int, dataset : str, model : str, file_name : str | Path, tune : bool = False, student_A_count : int = 1, student_B_count : int =0, student_C_count : int = 0, best_ref_count : int = 0, good_ref_count : int = 0, minimal_ref_count : int = 0):
    ''''
    get_rows - converts a DataFrame into a format for OpenAI finetuning API, writes to jsonl
    ----------
    Parameters
    df : pandas.DataFrame
        The dataframe to be used
    ways : int
        The number of possible outputs for grading (MUST BE 2 OR 3)
    dataset : str
        Which dataset is being used (MUST BE \"scientsbank\" or \"beetle\")
    model : str
        Which model is being used (MUST BE \"gpt-3.5\", or \"gpt4\", or \"davinci-003\")
    filename : str or pathlib.Path
        The path/filename to write the final file to
    tune : bool
        Optional - True if this is to fine-tune, false otherwise. Default is false. Controls where or not there is response from the model. 
    student_A_count : int
        Optional - How many of the correct student answers to use. min==1. - -1 means the max possible.
    student_B_count : int
        Optional - How many of the incorrect student answers to use. -1 means the max possible.
    student_C_count : int
        Optional - How many of the contradictory student answers to use. -1 means the max possible.
    best_ref_count : int
        Optional - How many of the best reference answers to use. 
    good_ref_count : int
        Optional - How many of the good reference answers to use
    minimal_ref_count : int
        Optional - How many of the minimal reference answers to use

    NOTES:
    - student_A, student_B, student_C counts and best_ref, good_ref, and minimal_ref counts, -1 means the max
    
    '''
    dataset = dataset.lower()
    model = model.lower()

    if dataset not in ['scientsbank', 'beetle']:
        raise ValueError(f'\"{dataset}\" is not a valid value for \'dataset\' input. Use either \"sciEntsBank\" or \"beetle\".')
    if ways not in [2,3]:
        raise ValueError(f'\'{str(ways)}\' is not a 2-way or a 3-way classification. Use either the integers 2 or 3.')
    if model not in ['gpt-3.5', 'gpt4', 'davinci-003']:
        raise ValueError(f'\'{model}\' is not a valid model. Use \"gpt-3.5\", \"gpt4\", or \"davinci-003\".')
    
    questions = list(set(df['question_id']))

    checker = lambda a : 1 if any(a) else 0

    grade_scale_dict = {
        2 : 'correct or incorrect',
        3 : 'correct, incorrect, or contradictory; contradictory only in the case that the answer contradicts the provided correct answers'
    }

    grade_sample_dict = {
        2 : 'correct or incorrect',
        3 : 'correct, incorrect, or contradictory'
    }

    count_dict = {
        2 : [student_A_count, student_B_count],
        3 : [student_A_count, student_B_count, student_C_count]
    }

    role = 'This asistant is a chatbot designed to assess students\' short answer responses on an exam.'

    messages = []
    data = []
    

    for j,question in enumerate(questions):
        answers = df[df['question_id'] == question]

        best_ref_count = min(best_ref_count, checker(answers['best_answers']),  len(answers['best_answers'].iloc[0]))
        good_ref_count = min(good_ref_count, checker(answers['good_answers']), len(answers['good_answers'].iloc[0]))
        minimal_ref_count = min(minimal_ref_count, checker(answers['minimal_answers']), len(answers['minimal_answers'].iloc[0]))

        best_answers = answers['best_answers'].iloc[0][:best_ref_count]
        good_answers = answers['good_answers'].iloc[0][:good_ref_count]
        min_answers = answers['minimal_answers'].iloc[0][:minimal_ref_count]

        module_check = lambda module, dataset: f' in {module}' if dataset == 'beetle' else ''

        any_check = lambda ans: ans if any(ans) else ''
        any_sep_check = lambda check, rl: rl if any(check) else ''

        best_answer_str = [f' Best_answer_{str(i+1)} - "{str(ans["#text"])}, "' for i,ans in enumerate(best_answers)]
        best_str = [f' BEST - what the optimal answer would look like: '] + best_answer_str + ['.']
        best_str = any_sep_check(best_answer_str, best_str)
        best_str = ''.join(best_str)

        good_answer_str = [f'Good_answer_{str(i+1)} - "{ans["#text"]}, "' for i,ans in enumerate(good_answers)]
        good_str = [f' GOOD - a sufficient answer: '] + good_answer_str + ['.']
        good_str = any_sep_check(good_answer_str, good_str)
        good_str = ''.join(good_str)

        minimal_answer_str = [f'Minimal_answer_{str(i+1)} - "{ans["#text"]}, "' for i,ans in enumerate(min_answers)]
        minimal_str = [f' Minimal - answers that are not correct: '] + minimal_answer_str + ['.']
        minimal_str = any_sep_check(minimal_answer_str, minimal_str)
        minimal_str = ''.join(minimal_str)


        student_A_ans = answers[answers['accuracy'] == 'correct']
        student_B_ans = answers[answers['accuracy'] == 'incorrect']
        student_C_ans = answers[answers['accuracy'] == 'contradictory']


        student_A_count = len(student_A_ans) if student_A_count == -1 else student_A_count
        student_B_count = len(student_B_ans) if student_B_count == -1 else student_B_count
        student_C_count = len(student_C_ans) if student_C_count == -1 else student_C_count
        

        student_A_ans = student_A_ans[:student_A_count]
        student_B_ans = student_B_ans[:student_B_count]
        student_C_ans = student_C_ans[:student_C_count]

        final_sans = pd.concat([student_A_ans, student_B_ans, student_C_ans]).sample(frac=1).reset_index()


        final_ans_li = []

        for i,ans in final_sans.iterrows(): 
            final_ans_li.append(f'student_answer_{str(i+1)} - "{ans["answer_text"]}"')


        final_ans_li.append('.')

        final_ans_str = ', '.join(final_ans_li)

        final_sam_li = []

        for i,ans in final_sans.iterrows(): 
            final_sam_li.append(f'student_answer_{str(i+1)} - "{grade_sample_dict[ways]}"')
        final_sam_li.append('.')

        final_sam_str = ', '.join(final_sam_li)

        final_sol_li = []

        for i,ans in final_sans.iterrows(): 
            final_sol_li.append(f'student_answer_{str(i+1)} - "{ans["accuracy"]}"')
        final_sol_li.append('.')

        final_sol_str = ', '.join(final_sol_li)

        beginning_text = 'Suppose you are an educator, specifically, a K-12 teacher, focusing in science.'
        module_text = f' You are grading an exam which aims to assess students\' understanding{module_check(answers["module"].iloc[0], dataset)}.'
        questionText = f' This is the question they have been asked: "{answers["question_text"].iloc[0]}". You should assess the student responses on the following scale: {grade_scale_dict[ways]}.'
        ref_intro_text = f' You can gain a better understanding of the task through the following reference responses.'
        ref_mid_text = f' They are classified in the following {any(best_answers) + any(good_answers) + any(min_answers)} category(s): '
        ref_cat_list = ["BEST" if any(best_answers) else "", "GOOD" if any(good_answers) else "", "MINIMAL" if any(min_answers) else ""]
        ref_cat_list = list(filter(None, ref_cat_list))
        ref_cat_text = ' ,'.join(ref_cat_list)
        ref_end_text = f' {ref_cat_text}.{best_str}{good_str}{minimal_str}'
        task_intro_text = f' Based on these reference answers, could you grade the following {sum(count_dict[ways])} student responses.'
        task_mid_text = f' Each number represents a different student\'s response to the same question: {final_ans_str}'
        task_end_text = f' Please respond in the following format: {final_sam_li}'
        
        prompt_text = beginning_text + module_text + questionText + ref_intro_text + ref_mid_text + ref_end_text + task_intro_text + task_mid_text + task_end_text
        answer_text = f'Sure! Here are the grades that these students recieved: {final_sol_str}.'

        prompt_text = prompt_text.replace(r'\\', '', -1); answer_text = answer_text.replace(r'\"', '', -1)
        if model == 'davinci-003':
            if tune:
                messages.append({{"prompt" : prompt_text, "completion" : answer_text}})
            else: 
                messages.append({{"prompt" : prompt_text}})

        else:
            if tune:
                messages.append(
                    {
                        "messages" : [
                            {"role" : "system", "content" : role},
                            {"role" : "user", "content" : prompt_text},
                            {"role" : "assistant", "content" : answer_text}
                        ]
                    }
                )
            else:
                messages.append(
                    {
                        "messages" : [
                            {"role" : "system", "content" : role},
                            {"role" : "user", "content" : prompt_text},
                        ]
                    }
                )
        
        
        
    
    
    
    # I should probably make a system for storing objects (the dicts) so they aren't volatile - its fucking annoying and I can't recover the model responses
    
    return messages
    
    

### DataFrames:
1. Dataset one
2. Prompts + responses and everything
3. Non-volatile responses + identifiers

Need to code:
- Add DF #2 and #3
- Create storing code 
- Create function for recovering model responses from csv and into DF #2 and DF #3
- Create function for getting response from GPT over prompt df
    - Asyn
- Repetitive storing per iteration of loop (make sure no bad overwriting)

In [112]:
base_path = Path(os.path.abspath('../'))

dataset_path = base_path / 'datasets' 

semeval_path = dataset_path / 'cleaning' / 'SemEval-2013-task7'

semeval3_path = dataset_path / 'semeval-2013-task7' / 'semeval-3way'

training_path = semeval3_path / 'training'

scients2way_path = training_path / '2way' / 'sciEntsbank'
beetle2way_path = training_path / '2way' / 'beetle'
scients3way_path = training_path / '3way' / 'sciEntsbank'
beetle3way_path = training_path / '3way' / 'beetle'
two_scientsbank = process_dir(scients2way_path, 'scientsbank', 'training_2way_scientsbank.csv')
two_beetle = process_dir(beetle2way_path, 'beetle', 'training_2way_beetle.csv')
three_scientsbank = process_dir(scients3way_path, 'scientsbank', 'training_3way_scientsbank.csv')
three_beetle = process_dir(beetle3way_path, 'beetle', 'training_3way_beetle.csv')


'''
two_scientsbank = get_prompt(two_scientsbank, 2, 'scientsbank', 'gpt-3.5', True, 3, 3, 0, 1)
two_beetle = get_prompt(two_beetle, 2, 'beetle', 'gpt-3.5', True, 3, 3, 0, 2, 1, 2)
three_scientsbank = get_prompt(three_scientsbank, 2, 'scientsbank', 'gpt-3.5', True, 3, 3, 2, 1)
three_beetle = get_prompt(three_beetle, 2, 'beetle', 'gpt-3.5', True, 3, 3, 2, 2, 1, 2)
'''


'''

class DataSet():
    def __init__(self, train : bool, dataset : str, ways : int, path : str | Path):
        self.train = train
        self.dataset = dataset
        self.ways = ways
        self.path = path


semeval = {
    'test' : {
        '2beetle' : {
            'path' : semeval3_path / 'test'
            'data' : 
            'prompt' : 
        },
        '3beetle' : {
            'path' : 
            'data' : 
            'prompt' : 
        },
        '2scientsbank' : {
            'path' : 
            'data' : 
            'prompt' : 
        },
        '3scientsbank' : {
            'path' : 
            'data' : 
            'prompt' : 
        },
    },
    'train' : {
        '2beetle' : {
            'path' : 
            'data' : 
            'prompt' : 
        },
        '3beetle' : {
            'path' : 
            'data' : 
            'prompt' : 
        },
        '2scientsbank' : {
            'path' : 
            'data' : 
            'prompt' : 
        },
        '3scientsbank' : {
            'path' : 
            'data' : 
            'prompt' : 
        },
    },
}

'''


"\n\nclass DataSet():\n    def __init__(self, train : bool, dataset : str, ways : int, path : str | Path):\n        self.train = train\n        self.dataset = dataset\n        self.ways = ways\n        self.path = path\n\n\nsemeval = {\n    'test' : {\n        '2beetle' : {\n            'path' : semeval3_path / 'test'\n            'data' : \n            'prompt' : \n        },\n        '3beetle' : {\n            'path' : \n            'data' : \n            'prompt' : \n        },\n        '2scientsbank' : {\n            'path' : \n            'data' : \n            'prompt' : \n        },\n        '3scientsbank' : {\n            'path' : \n            'data' : \n            'prompt' : \n        },\n    },\n    'train' : {\n        '2beetle' : {\n            'path' : \n            'data' : \n            'prompt' : \n        },\n        '3beetle' : {\n            'path' : \n            'data' : \n            'prompt' : \n        },\n        '2scientsbank' : {\n            'path' : \n   

In [113]:
two_scientsbank.head()

Unnamed: 0,question_id,question_text,module,best_answers,good_answers,minimal_answers,answer_id,answer_text,accuracy
0,ST_59,Elena has a male lizard that has lived for sev...,ST,"[{'@id': 'ST_59-a1', '#text': 'Elena should in...",[],[],ST.59.382.1,Elena should add shelter.,correct
1,ST_59,Elena has a male lizard that has lived for sev...,ST,"[{'@id': 'ST_59-a1', '#text': 'Elena should in...",[],[],ST.59.386.1,She needs shelter.,correct
2,ST_59,Elena has a male lizard that has lived for sev...,ST,"[{'@id': 'ST_59-a1', '#text': 'Elena should in...",[],[],ST.59.389.1,Elena has to put homes inside of the habitat.,correct
3,ST_59,Elena has a male lizard that has lived for sev...,ST,"[{'@id': 'ST_59-a1', '#text': 'Elena should in...",[],[],ST.59.393.1,She needs to add another house or shelter and ...,correct
4,ST_59,Elena has a male lizard that has lived for sev...,ST,"[{'@id': 'ST_59-a1', '#text': 'Elena should in...",[],[],ST.59.396.1,Elena should include another home.,correct


In [114]:
twoscients_prompts_test = get_prompt(two_scientsbank, 2, 'scientsbank', 'gpt-3.5', 'scientsbank2waytrain', False, 3, 3, 0, 1)

twoscients_prompts_test[0]

{'messages': [{'role': 'system',
   'content': "This asistant is a chatbot designed to assess students' short answer responses on an exam."},
  {'role': 'user',
   'content': 'Suppose you are an educator, specifically, a K-12 teacher, focusing in science. You are grading an exam which aims to assess students\' understanding. This is the question they have been asked: "A solution is a type of mixture. What makes it different from other mixtures?". You should assess the student responses on the following scale: correct or incorrect. You can gain a better understanding of the task through the following reference responses. They are classified in the following 1 category(s):  BEST. BEST - what the optimal answer would look like:  Best_answer_1 - "A solution is a mixture formed when a solid dissolves in a liquid., ". Based on these reference answers, could you grade the following 6 student responses. Each number represents a different student\'s response to the same question: student_answer

In [39]:
client = openai.OpenAI()


In [60]:
twoscients_prompts_test[0]['messages'][1]['content']

'Suppose you are an educator, specifically, a K-12 teacher, focusing in science. You are grading an exam which aims to assess students\' understanding. This is the question they have been asked: "A solution is a type of mixture. What makes it different from other mixtures?". You should assess the student responses on the following scale: correct or incorrect. You can gain a better understanding of the task through the following reference responses. They are classified in the following 1 category(s):  BEST. BEST - what the optimal answer would look like:  Best_answer_1 - "A solution is a mixture formed when a solid dissolves in a liquid., ". Based on these reference answers, could you grade the following 6 student responses. Each number represents a different student\'s response to the same question: student_answer_820 - "It is one material that dissolves into the other. Making a clear mixture. Although it could be colored, it has to be see through.", student_answer_822 - "It dissolves 

In [48]:
response = client.chat.completions.create(
    model='gpt-4-1106-preview',
    messages=twoscients_prompts_test[0]['messages']
)

In [52]:
print(response.choices[0].message.content)

Based on the optimal answer provided and the nature of the question, the key aspects we're looking for in a student's response to determine if it's correct are:

1. Mention of a solution being a mixture where one substance (solid) dissolves into another (typically a liquid).
2. The characteristic of the solution being clear or see-through, although it may be colored.

Here are the assessments for each student response:

student_answer_820 - Correct. The response indicates that one material dissolves into another, creating a clear mixture, which captures the essence of a solution.

student_answer_822 - Correct. The student points out that a solution involves a solid dissolving into a liquid and the result is see-through, aligning with the fundamental properties of a solution.

student_answer_825 - Correct. This answer correctly states that a solution is a mixture where a material dissolves and the resulting mixture is see-through.

student_answer_819 - Incorrect. While it is true that a