In [1]:
import pandas as pd
import os
from pathlib import Path
import json
import jsonlines
import tiktoken
import openai
import xmltodict as xmd
from dotenv import load_dotenv
import pickle
import asyncio
from random import randint
import importlib
import io
import datetime
from sklearn.metrics import precision_recall_fscore_support
load_dotenv()
import pyperclip
import numpy as np

# Custom file imports
import dataset_util
import dataset

importlib.reload(dataset)
importlib.reload(dataset_util)
pd.set_option('max_colwidth', 100)



In [3]:
def calc_f1_multi_df(dfs : [pd.DataFrame], model : str, prompt_style : str):
    avg_way_dict = {
        2 : 'binary', 
        3 : 'weighted',
    }
    avg_label_dict = {
        2 : 'correct', 
        3 : 1
    }
    non_label_dict = {
        2 : None,
        3 : ['correct', 'incorrect', 'contradictory']
    }
    ground_truths = [x for df in dfs for x in df.prompts['accuracy'].tolist()]
    model_predicts = [x for df in dfs for x in df.prompts[f'{prompt_style}_{model}_answer'].tolist()]
    print(set(model_predicts) - set(ground_truths))
    # return precision_recall_fscore_support(ground_truths, model_predicts, pos_label=avg_label_dict[dfs[0].ways],labels=non_label_dict[dfs[0].ways], average=avg_way_dict[dfs[0].ways], zero_division=np.nan)[2]
    return precision_recall_fscore_support(ground_truths, model_predicts, average='weighted')[2]

In [4]:
base_path = Path(os.path.abspath('../'))
dataset_path = base_path / 'datasets' 
semeval_path = dataset_path / 'cleaning' / 'SemEval-2013-task7'
semeval3_path = dataset_path / 'semeval-2013-task7' / 'semeval-3way'
training_path = semeval3_path / 'training'
testing_path = semeval3_path / 'test'

In [5]:
# DEFINE TESTING SET AND MAKE PROMPTS
testing_sets = [
    dataset.DataSet(testing_path / '2way' / 'beetle' / 'test-unseen-answers', False, 'beetle', 2, test_state='UA'), 
    dataset.DataSet(testing_path / '2way' / 'beetle' / 'test-unseen-questions', False, 'beetle', 2, test_state='UQ'), 
    dataset.DataSet(testing_path / '2way' / 'sciEntsBank' / 'test-unseen-answers', False, 'scientsbank', 2, test_state='UA'), 
    dataset.DataSet(testing_path / '2way' / 'sciEntsBank' / 'test-unseen-questions', False, 'scientsbank', 2, test_state='UQ'), 
    dataset.DataSet(testing_path / '2way' / 'sciEntsBank' / 'test-unseen-domains', False, 'scientsbank', 2, test_state='UD'), 
    dataset.DataSet(testing_path / '3way' / 'beetle' / 'test-unseen-answers', False, 'beetle', 3, test_state='UA'), 
    dataset.DataSet(testing_path / '3way' / 'beetle' / 'test-unseen-questions' , False, 'beetle', 3, test_state='UQ'), 
    dataset.DataSet(testing_path / '3way' / 'sciEntsBank' / 'test-unseen-answers', False, 'scientsbank', 3, test_state='UA'), 
    dataset.DataSet(testing_path / '3way' / 'sciEntsBank' / 'test-unseen-questions', False, 'scientsbank', 3, test_state='UQ'), 
    dataset.DataSet(testing_path / '3way' / 'sciEntsBank' / 'test-unseen-domains', False, 'scientsbank', 3, test_state='UD'), 
    ]




training_sets = [
    dataset.DataSet(training_path / '2way' / 'beetle', True, 'beetle', 2),
    dataset.DataSet(training_path / '2way' / 'sciEntsBank', True, 'sciEntsBank', 2),
    dataset.DataSet(training_path / '3way' / 'beetle', True, 'beetle', 2),
    dataset.DataSet(training_path / '3way' / 'sciEntsBank', True, 'sciEntsBank', 2),
]


tokens = []
for i, setd in enumerate(testing_sets):
    setd.make_prompts('gpt-3.5-turbo', 'kortemeyer', 3, 3, 2,2,1,2)
    tokens.append(setd.count_tokens('kortemeyer', 'gpt-3.5-turbo'))
for i, setd in enumerate(training_sets):
    setd.make_prompts('gpt-3.5-turbo', 'kortemeyer', 3, 3, 2,2,1,2)

print((sum(tokens)))



455152


In [5]:
client = openai.AsyncOpenAI()
non_client = openai.OpenAI()

three_epoch_id = os.getenv("THREE_EPOCH_MODEL")
two_epoch_id = os.getenv("TWO_EPOCH_MODEL")
training_file = os.getenv("TRAINING_FILE")
valid_file = os.getenv("TESTING_FILE")

In [6]:
async def evaluate_model(client, model : str, prompt_style: str, rate_limit : int, sets):
    for i,ds in enumerate(sets):
        print(f'{datetime.datetime.now()}: {i} - Starting')
        await ds.gpt_async(client, model, prompt_style, rate_limit)
        print(f'{datetime.datetime.now()}: {i} - Completed')
        print(f'{datetime.datetime.now()}: Sleeping after {i}')
        await asyncio.sleep(65)
        print(f'{datetime.datetime.now()}: Awake from sleep after {i}')

In [None]:
# Evaluate gpt-3.5 turbo

await evaluate_model(client, 'gpt-3.5-turbo-1106', 'kortemeyer', 70000, testing_sets)

In [None]:
# Evaluate gpt-4

await evaluate_model(client, 'gpt-4', 'kortemeyer', 35000, testing_sets)

In [None]:
# Evaluate three_epoch model

await evaluate_model(client, three_epoch_id, 'kortemeyer', 72000, testing_sets)

In [None]:
# Evaluate two_epoch model

await evaluate_model(client, two_epoch_id, 'kortemeyer', 60000, testing_sets)

In [8]:
# Reload datasets for GPT-4, GPT-3.5, and Finetuned

for i, ds in enumerate(testing_sets):
    for j, model in enumerate(['gpt-4', 'gpt-3.5-turbo-1106', three_epoch_id, two_epoch_id]):
        ds.load_processed(model, 'kortemeyer')

In [9]:
# Code that regenerated results on specific places where the token limit was surpassed

async def revaluate_answer(testing_set : dataset.DataSet, model, prompt_style):
    loop = asyncio.get_event_loop()
    tasks = []
    lens = []
    
    problems = testing_set.prompts[testing_set.prompts[f'{prompt_style}_{model}_answer'].isna()]
    print(problems.shape)
    questions = problems['question_id'].drop_duplicates()

    for quest in questions:

        current_problem = problems[problems['question_id'] == quest]
        display(HTML(current_problem.isin(problems).to_html()))
        lens.append(current_problem.shape[0])

        prmptse = list(dataset_util.kortemeyer_prompt(current_problem, testing_set.ways, testing_set.dataset, (-1,-1,-1), (-1,-1,-1), False)[0:2])

        tasks.append(asyncio.create_task(testing_set.single_prompt(client, [{'role' : 'system', 'content' : prmptse[0]}, {'role' : 'user', 'content' : prmptse[1]}], model, prompt_style, current_problem['question_id'].iloc[0])))

    for task in tasks:
        await task
    testing_set.prompts.drop(columns=[f'{prompt_style}_{model}_answer', f'{prompt_style}_{model}_correct'], inplace=True)

    testing_set.process_responses(model, prompt_style)

    new_problems = testing_set.prompts[testing_set.prompts[f'{prompt_style}_{model}_answer'].isna()]
    print(new_problems.shape)
    return new_problems, lens

In [None]:
# Checking Cell
for i, ds in enumerate(testing_sets):
    counts = ds.prompts.count()
    if len(counts.value_counts()) != 1:
            print(f'Dataset {i}, {ds.dataset} {ds.ways}way {ds.test_state} had the following counts:')
            print(counts)
    for model in ['gpt-4', 'gpt-3.5-turbo-1106', three_epoch_id, two_epoch_id]:
        values = ds.prompts[f'kortemeyer_{model}_answer'].value_counts()
        if len(values) != ds.ways:
            print(f'Dataset {i}, {ds.dataset} {ds.ways}way {ds.test_state}, on {model}, had the following value counts:')
            print(values)
            print()

In [22]:
col = [['SCIENTSBANK'] * 2 + ['BEETLE'] * 2, ['2-way','3-way','2-way','3-way']]
inds = ['Questions', 'Student Answers']

order_training_sets = [training_sets[1], training_sets[3], training_sets[0], training_sets[2]]

for ds in order_training_sets:
    print(f'{ds.dataset} - {ds.ways}')

def testing_unique_count(ds):
    li = []

    for qid in ds.prompts['question_id'].unique():
        li.append(min(ds.prompts[ds.prompts['question_id'] == qid].shape[0], 15))
    return sum(li)

data = [
    [len(ds.prompts['question_id'].unique()) for ds in order_training_sets],
    [testing_unique_count(ds) for ds in order_training_sets],
]

training_counts = pd.DataFrame(data=data, index=inds, columns=col)
training_counts.to_clipboard()

scientsbank - 2
scientsbank - 2
beetle - 2
beetle - 2


In [20]:
def hello(ds):
    li = []

    for qid in ds.prompts['question_id'].unique():
        li.append(min(ds.prompts[ds.prompts['question_id'] == qid].shape[0], 15))
        v = ds.prompts[ds.prompts['question_id'] == qid].shape[0]
        print(v) if v <=16 else '' 
    return sum(li)
hello(training_sets[0])

705

In [23]:
col = [['SCIENTSBANK'] * 6 + ['BEETLE'] * 4, ['2-way'] * 3 + ['3-way'] * 3 + ['2-way'] * 2 + ['3-way'] * 2, ['UA', 'UQ', 'UD'] * 2 + ['UA', 'UQ'] * 2]
inds = ['Questions', 'Student Answers']

order_testing_sets = testing_sets[2:5] + testing_sets[7:] + testing_sets[:2] + testing_sets[5:7]


data = [
    [len(ds.prompts['question_id'].unique()) for ds in order_testing_sets],
    [ds.prompts.shape[0] for ds in order_testing_sets],
]

testing_counts = pd.DataFrame(data=data, index=inds, columns=col)
testing_counts.to_clipboard()

In [45]:
col = [['SCIENTSBANK'] * 6 + ['BEETLE'] * 4, ['2-way'] * 3 + ['3-way'] * 3 + ['2-way'] * 2 + ['3-way'] * 2, ['UA', 'UQ', 'UD'] * 2 + ['UA', 'UQ'] * 2]
inds = ['FT-2_Epochs: GPT-3.5', 'FT-3_Epochs: GPT-3.5',  'GPT-3.5', 'GPT-4']

order_testing_sets = testing_sets[2:5] + testing_sets[7:] + testing_sets[:2] + testing_sets[5:7]


data = [
    [ds.model_f1_score(two_epoch_id, 'kortemeyer') for ds in order_testing_sets],
    [ds.model_f1_score(three_epoch_id, 'kortemeyer') for ds in order_testing_sets],
    [pd.NA, calc_f1_multi_df(testing_sets[2:5], 'gpt-3.5-turbo-1106', 'kortemeyer'), pd.NA, 
     pd.NA, calc_f1_multi_df(testing_sets[7:], 'gpt-3.5-turbo-1106', 'kortemeyer'), pd.NA, 
     pd.NA, calc_f1_multi_df(testing_sets[:2], 'gpt-3.5-turbo-1106', 'kortemeyer'), 
     pd.NA, calc_f1_multi_df(testing_sets[5:7], 'gpt-3.5-turbo-1106', 'kortemeyer'), 
    ],
    [pd.NA, calc_f1_multi_df(testing_sets[2:5], 'gpt-4', 'kortemeyer'), pd.NA, 
     pd.NA, calc_f1_multi_df(testing_sets[7:], 'gpt-4', 'kortemeyer'), pd.NA, 
     pd.NA, calc_f1_multi_df(testing_sets[:2], 'gpt-4', 'kortemeyer'), 
     pd.NA, calc_f1_multi_df(testing_sets[5:7], 'gpt-4', 'kortemeyer'), 
    ],
]

results = pd.DataFrame(data=data, index=inds, columns=col)

In [75]:
results.head()


Unnamed: 0_level_0,SCIENTSBANK,SCIENTSBANK,SCIENTSBANK,SCIENTSBANK,SCIENTSBANK,SCIENTSBANK,BEETLE,BEETLE,BEETLE,BEETLE
Unnamed: 0_level_1,2-way,2-way,2-way,3-way,3-way,3-way,2-way,2-way,3-way,3-way
Unnamed: 0_level_2,UA,UQ,UD,UA,UQ,UD,UA,UQ,UA,UQ
FT-2_Epochs: GPT-3.5,0.783133,0.736301,0.717856,0.796875,0.783151,0.707384,0.742991,0.717029,0.712329,0.705602
FT-3_Epochs: GPT-3.5,0.767635,0.719449,0.693975,0.765531,0.746544,0.724348,0.736318,0.631579,0.730769,0.648045
GPT-3.5,,0.663121,,,0.644678,,,0.56148,,0.583514
GPT-4,,0.758691,,,0.742158,,,0.644295,,0.681452


In [49]:
results.to_pickle('results.pickle')

In [50]:
rn_results = results.replace(pd.NA, 0).round(2).replace(0, pd.NA)
rn_results

Unnamed: 0_level_0,SCIENTSBANK,SCIENTSBANK,SCIENTSBANK,SCIENTSBANK,SCIENTSBANK,SCIENTSBANK,BEETLE,BEETLE,BEETLE,BEETLE
Unnamed: 0_level_1,2-way,2-way,2-way,3-way,3-way,3-way,2-way,2-way,3-way,3-way
Unnamed: 0_level_2,UA,UQ,UD,UA,UQ,UD,UA,UQ,UA,UQ
FT-2_Epochs: GPT-3.5,0.78,0.74,0.72,0.8,0.78,0.71,0.74,0.72,0.71,0.71
FT-3_Epochs: GPT-3.5,0.77,0.72,0.69,0.77,0.75,0.72,0.74,0.63,0.73,0.65
GPT-3.5,,0.66,,,0.64,,,0.56,,0.58
GPT-4,,0.76,,,0.74,,,0.64,,0.68


In [None]:
col = ['agregate']
inds = ['FT-2_Epochs: GPT-3.5', 'FT-3_Epochs: GPT-3.5',  'GPT-3.5', 'GPT-4']



order_testing_sets = testing_sets[2:5] + testing_sets[7:] + testing_sets[:2] + testing_sets[5:7]


data = [
    calc_f1_multi_df(testing_sets, two_epoch_id, 'kortemeyer'),
    calc_f1_multi_df(testing_sets, three_epoch_id, 'kortemeyer'),
    calc_f1_multi_df(testing_sets, 'gpt-3.5-turbo-1106', 'kortemeyer'), 
    calc_f1_multi_df(testing_sets, 'gpt-4', 'kortemeyer'), 
]
    




ag_results = pd.DataFrame(data=data, index=inds, columns=col)

In [33]:
ag_results.to_pickle('ag_results.pickle')

In [None]:
col = [['SCIENTSBANK'] * 6 + ['BEETLE'] * 4, ['2-way'] * 3 + ['3-way'] * 3 + ['2-way'] * 2 + ['3-way'] * 2, ['UA', 'UQ', 'UD'] * 2 + ['UA', 'UQ'] * 2]
inds = ['FT-2_Epochs: GPT-3.5', 'FT-3_Epochs: GPT-3.5',  'GPT-3.5', 'GPT-4']



order_testing_sets = testing_sets[2:5] + testing_sets[7:] + testing_sets[:2] + testing_sets[5:7]


data = [
    [ds.model_f1_score(two_epoch_id, 'kortemeyer') for ds in order_testing_sets],
    [ds.model_f1_score(three_epoch_id, 'kortemeyer') for ds in order_testing_sets],
    [ds.model_f1_score('gpt-3.5-turbo-1106', 'kortemeyer') for ds in order_testing_sets],
    [ds.model_f1_score('gpt-4', 'kortemeyer') for ds in order_testing_sets],
]
    




noag_results = pd.DataFrame(data=data, index=inds, columns=col)

In [34]:
noag_results.to_pickle('noag_results.pickle')

In [None]:
non_client.fine_tuning.jobs.create(
  training_file=training_file, 
  validation_file=valid_file,
  model="gpt-3.5-turbo-1106",
  hyperparameters={
    "n_epochs":2
  },
)

In [180]:

train_msgs = []
valid_msgs = []
for tset in training_sets:
    new_msgs = tset.tune_messages('kortemeyer')
    train_msgs = train_msgs + new_msgs[len(new_msgs) // 10:]
    valid_msgs = valid_msgs + new_msgs[:len(new_msgs) // 10]

with jsonlines.open('semeval-kortemeyer-tuning-v1.jsonl', mode='w') as writer:
        writer.write_all(train_msgs)
with jsonlines.open('semeval-kortemeyer-valid-v1.jsonl', mode='w') as writer:
        writer.write_all(valid_msgs)