In [88]:
import pandas as pd
import os
from pathlib import Path
import json
import jsonlines
import tiktoken
import openai
import xmltodict as xmd
from dotenv import load_dotenv
import pickle
import asyncio
from random import randint
import importlib
import io
import datetime
from sklearn.metrics import precision_recall_fscore_support
load_dotenv()
import pyperclip
import numpy as np

# Custom file imports
import dataset_util
import dataset

# Re-import the local helper modules so that edits made to them on disk are
# picked up without restarting the kernel.
importlib.reload(dataset)
importlib.reload(dataset_util)
# Let pandas show up to 100 characters per cell before truncating the text.
pd.set_option('max_colwidth', 100)



In [69]:
def calc_f1_multi_df(dfs : "list[dataset.DataSet]", model : str, prompt_style : str):
    """Return the weighted F1 score of one model pooled over several datasets.

    The gold labels and the model's predicted labels of every dataset in
    ``dfs`` are concatenated (in the order given) and scored as one sample.

    Args:
        dfs: Objects exposing a ``prompts`` table that has an ``'accuracy'``
            column (gold labels) and a ``'{prompt_style}_{model}_answer'``
            column (predicted labels).
        model: Model identifier used in the prediction column name.
        prompt_style: Prompt style used in the prediction column name.

    Returns:
        The F-score element (index 2) of
        ``precision_recall_fscore_support(..., average='weighted')``.
    """
    answer_column = f'{prompt_style}_{model}_answer'
    ground_truths = [label for ds in dfs for label in ds.prompts['accuracy'].tolist()]
    model_predicts = [label for ds in dfs for label in ds.prompts[answer_column].tolist()]
    # Diagnostic: predicted labels that never occur among the gold labels
    # (an empty set means every predicted label is a known class).
    print(set(model_predicts) - set(ground_truths))
    return precision_recall_fscore_support(ground_truths, model_predicts, average='weighted')[2]

In [3]:
# Directory layout, resolved relative to the parent of the current working
# directory.
base_path = Path(os.path.abspath('../'))
dataset_path = base_path.joinpath('datasets')
semeval_path = dataset_path.joinpath('cleaning', 'SemEval-2013-task7')
semeval3_path = dataset_path.joinpath('semeval-2013-task7', 'semeval-3way')
# Training and test splits live side by side under the semeval-3way folder.
training_path, testing_path = (semeval3_path / split for split in ('training', 'test'))

In [70]:
# DEFINE TESTING SET AND MAKE PROMPTS
# test_state code -> sub-directory that holds that split.
_split_dirs = {
    'UA': 'test-unseen-answers',
    'UQ': 'test-unseen-questions',
    'UD': 'test-unseen-domains',
}
# (directory name on disk, dataset name handed to DataSet, splits available)
_test_corpora = [
    ('beetle', 'beetle', ('UA', 'UQ')),
    ('sciEntsBank', 'scientsbank', ('UA', 'UQ', 'UD')),
]
# Resulting order (other cells index this list by position):
#   0-1 beetle 2-way UA/UQ, 2-4 sciEntsBank 2-way UA/UQ/UD,
#   5-6 beetle 3-way UA/UQ, 7-9 sciEntsBank 3-way UA/UQ/UD.
testing_sets = [
    dataset.DataSet(
        testing_path / f'{ways}way' / corpus_dir / _split_dirs[state],
        False,
        corpus_name,
        ways,
        test_state=state,
    )
    for ways in (2, 3)
    for corpus_dir, corpus_name, states in _test_corpora
    for state in states
]




# Training splits, one DataSet per (label granularity, corpus) pair.
# The fourth argument is the number of label classes ("ways"); it now matches
# the '2way' / '3way' directory each set is read from. Previously both 3-way
# sets were constructed with ways=2, so any tuning file generated before this
# fix was built with that setting.
# NOTE(review): the dataset name is 'sciEntsBank' here but 'scientsbank' for
# the test sets - confirm whether DataSet treats the name case-sensitively.
training_sets = [
    dataset.DataSet(training_path / '2way' / 'beetle', True, 'beetle', 2),
    dataset.DataSet(training_path / '2way' / 'sciEntsBank', True, 'sciEntsBank', 2),
    dataset.DataSet(training_path / '3way' / 'beetle', True, 'beetle', 3),
    dataset.DataSet(training_path / '3way' / 'sciEntsBank', True, 'sciEntsBank', 3),
]


# Build the kortemeyer-style prompts for every test set and record each set's
# token count, then print the total across all sets.
# NOTE(review): the six trailing integers are passed straight through to
# DataSet.make_prompts; their meaning is defined there - consider naming them.
tokens = []
for setd in testing_sets:
    setd.make_prompts('gpt-3.5-turbo', 'kortemeyer', 3, 3, 2,2,1,2)
    tokens.append(setd.count_tokens('kortemeyer', 'gpt-3.5-turbo'))

print(sum(tokens))



455152


In [89]:
# One asynchronous and one synchronous OpenAI client.
client, non_client = openai.AsyncOpenAI(), openai.OpenAI()

# Model and file identifiers come from environment variables; each value is
# None when its variable is not set.
three_epoch_id = os.environ.get("THREE_EPOCH_MODEL")
two_epoch_id = os.environ.get("TWO_EPOCH_MODEL")
training_file = os.environ.get("TRAINING_FILE")
valid_file = os.environ.get("TESTING_FILE")

In [6]:
async def evaluate_model(client, model : str, prompt_style: str, rate_limit : int, sets, cooldown_seconds : float = 65):
    """Run one model over every dataset in ``sets``, pausing between datasets.

    For each dataset, in order, this awaits
    ``ds.gpt_async(client, model, prompt_style, rate_limit)`` and then sleeps
    for ``cooldown_seconds`` (also after the last dataset). Timestamped
    progress lines are printed before and after each step.

    Args:
        client: Client object forwarded unchanged to ``gpt_async``.
        model: Model identifier forwarded to ``gpt_async``.
        prompt_style: Prompt style forwarded to ``gpt_async``.
        rate_limit: Rate limit forwarded to ``gpt_async``.
        sets: Iterable of objects providing an async ``gpt_async`` method.
        cooldown_seconds: Pause after each dataset; defaults to the
            previously hard-coded 65 seconds.

    Returns:
        None.
    """
    for i, ds in enumerate(sets):
        print(f'{datetime.datetime.now()}: {i} - Starting')
        await ds.gpt_async(client, model, prompt_style, rate_limit)
        print(f'{datetime.datetime.now()}: {i} - Completed')
        print(f'{datetime.datetime.now()}: Sleeping after {i}')
        # Cool down before the next dataset is started.
        await asyncio.sleep(cooldown_seconds)
        print(f'{datetime.datetime.now()}: Awake from sleep after {i}')

In [25]:

# Evaluate the base gpt-3.5-turbo-1106 model on every test set with the
# kortemeyer prompt style (rate_limit argument: 70000).
await evaluate_model(client, 'gpt-3.5-turbo-1106', 'kortemeyer', 70000, testing_sets)

0 - Starting
0 - Completed
Sleeping after 0
Awake from sleep after 0
1 - Starting
1 - Completed
Sleeping after 1
Awake from sleep after 1
2 - Starting
Rate limit reached, sleeping for a bit :)
Up and awake
2 - Completed
Sleeping after 2
Awake from sleep after 2
3 - Starting
3 - Completed
Sleeping after 3
Awake from sleep after 3
4 - Starting
4 - Completed
Sleeping after 4
Awake from sleep after 4
5 - Starting
5 - Completed
Sleeping after 5
Awake from sleep after 5
6 - Starting
6 - Completed
Sleeping after 6
Awake from sleep after 6
7 - Starting
Rate limit reached, sleeping for a bit :)
Up and awake
7 - Completed
Sleeping after 7
Awake from sleep after 7


In [26]:
# Evaluate gpt-4 on every test set with the kortemeyer prompt style
# (rate_limit argument: 35000).
await evaluate_model(client, 'gpt-4', 'kortemeyer', 35000, testing_sets)

0 - Starting
0 - Completed
Sleeping after 0
Awake from sleep after 0
1 - Starting
1 - Completed
Sleeping after 1
Awake from sleep after 1
2 - Starting
Rate limit reached, sleeping for a bit :)
Up and awake
2 - Completed
Sleeping after 2
Awake from sleep after 2
3 - Starting
3 - Completed
Sleeping after 3
Awake from sleep after 3
4 - Starting
Rate limit reached, sleeping for a bit :)
Up and awake
Rate limit reached, sleeping for a bit :)
Up and awake
Rate limit reached, sleeping for a bit :)
Up and awake
4 - Completed
Sleeping after 4
Awake from sleep after 4
5 - Starting
5 - Completed
Sleeping after 5
Awake from sleep after 5
6 - Starting
6 - Completed
Sleeping after 6
Awake from sleep after 6
7 - Starting
Rate limit reached, sleeping for a bit :)
Up and awake
7 - Completed
Sleeping after 7
Awake from sleep after 7
8 - Starting
8 - Completed
Sleeping after 8
Awake from sleep after 8
9 - Starting
Rate limit reached, sleeping for a bit :)
Up and awake
Rate limit reached, sleeping for a b

In [35]:
# Evaluate the fine-tuned model whose id is read from THREE_EPOCH_MODEL
# (rate_limit argument: 72000).
await evaluate_model(client, three_epoch_id, 'kortemeyer', 72000, testing_sets)

2023-12-22 19:37:48.271868: 0 - Starting
2023-12-22 19:37:54.427153: 0 - Completed
2023-12-22 19:37:54.427245: Sleeping after 0
2023-12-22 19:38:59.428829: Awake from sleep after 0
2023-12-22 19:38:59.429985: 1 - Starting
2023-12-22 19:39:44.635762: 1 - Completed
2023-12-22 19:39:44.636390: Sleeping after 1
2023-12-22 19:40:49.638416: Awake from sleep after 1
2023-12-22 19:40:49.639581: 2 - Starting
2023-12-22 19:40:52.157553: 2 - Completed
2023-12-22 19:40:52.157631: Sleeping after 2
2023-12-22 19:41:57.159440: Awake from sleep after 2
2023-12-22 19:41:57.161001: 3 - Starting
2023-12-22 19:42:10.423803: 3 - Completed
2023-12-22 19:42:10.424157: Sleeping after 3
2023-12-22 19:43:15.425797: Awake from sleep after 3
2023-12-22 19:43:15.426943: 4 - Starting
Rate limit reached, sleeping for a bit :)
Up and awake
2023-12-22 19:44:42.050491: 4 - Completed
2023-12-22 19:44:42.050601: Sleeping after 4
2023-12-22 19:45:47.052922: Awake from sleep after 4
2023-12-22 19:45:47.054826: 5 - Starting

In [13]:
# Evaluate the fine-tuned GPT-3.5 model trained for 2 epochs (id read from
# TWO_EPOCH_MODEL) on every test set (rate_limit argument: 60000).

await evaluate_model(client, two_epoch_id, 'kortemeyer', 60000, testing_sets)

2023-12-31 23:07:12.042782: 0 - Starting
0 : 3884.79
1 : 7700.65
2 : 11317.63
3 : 15123.32
4 : 18334.63
5 : 21896.24
6 : 25341.460000000003
7 : 29045.450000000004
8 : 32715.540000000005
9 : 36098.61
10 : 39775.48
11 : 43268.16
12 : 46830.9
13 : 50499.86
14 : 54111.19
15 : 57479.57
Rate limit reached, sleeping for a bit :)
Up and awake
16 : 3420.36
17 : 6800.04
18 : 10266.73
19 : 13863.369999999999
20 : 17221.579999999998
21 : 20819.35
22 : 24130.1
23 : 27681.539999999997
24 : 31067.999999999996
25 : 34695.149999999994
26 : 38115.509999999995
27 : 41830.799999999996
28 : 45073.74999999999
29 : 48353.98999999999
30 : 51852.31999999999
31 : 55606.02999999999
32 : 58881.74999999999
Rate limit reached, sleeping for a bit :)
Up and awake
33 : 3628.2799999999997
34 : 7335.66
35 : 10967.33
36 : 14501.82
37 : 17849.86
38 : 21419.38
39 : 24900.760000000002
40 : 28319.99
41 : 32675.99
42 : 36120.08
43 : 39515.58
44 : 43183.41
45 : 46608.29
46 : 50484.04
2023-12-31 23:09:37.788079: 0 - Completed
2

Task exception was never retrieved
future: <Task finished name='Task-50' coro=<DataSet.single_prompt() done, defined at /Users/mustafakhan/Library/Mobile Documents/com~apple~CloudDocs/My Stuff/Tech Shtuff/Code/OSR/Project1-ASAG/code/dataset.py:73> exception=RateLimitError("Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo-1106-shared in organization org-kNWrlbE4H3rBX7IlaZ2VXPbH on tokens_usage_based per min: Limit 80000, Used 76726, Requested 4096. Please try again in 616ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens_usage_based', 'param': None, 'code': 'rate_limit_exceeded'}}")>
Traceback (most recent call last):
  File "/Users/mustafakhan/Library/Mobile Documents/com~apple~CloudDocs/My Stuff/Tech Shtuff/Code/OSR/Project1-ASAG/code/dataset.py", line 74, in single_prompt
    completion = await client.chat.completions.create(messages=messages, model=model, max_tokens=2000)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^

Up and awake
124 : 3462.17
125 : 6925.47
126 : 10340.18
127 : 13791.05
128 : 17080.329999999998
129 : 20398.989999999998
130 : 23785.449999999997
131 : 27169.649999999998
132 : 30405.82
133 : 33750.47
134 : 37046.53
2023-12-31 23:21:33.621185: 2 - Completed
2023-12-31 23:21:33.621275: Sleeping after 2
2023-12-31 23:22:38.622770: Awake from sleep after 2
2023-12-31 23:22:38.623173: 3 - Starting
0 : 4179.719999999999
1 : 8321.02
2 : 12943.7
3 : 17221.73
4 : 21618.41
5 : 25706.6
6 : 30058.079999999998
7 : 34702.229999999996
8 : 39792.729999999996
9 : 43980.35999999999
10 : 50251.70999999999
11 : 54656.29999999999
12 : 59191.96999999999
Rate limit reached, sleeping for a bit :)
Up and awake
13 : 4265.6
14 : 8626.12
2023-12-31 23:23:53.343926: 3 - Completed
2023-12-31 23:23:53.344072: Sleeping after 3
2023-12-31 23:24:58.345473: Awake from sleep after 3
2023-12-31 23:24:58.345802: 4 - Starting
0 : 5762.85
1 : 11705.369999999999
2 : 19399.39
3 : 25640.23
4 : 31170.3
5 : 37153.5
6 : 42430.45


In [71]:
# Reload datasets for GPT-4, GPT-3.5, and Finetuned
# Each call loads the stored kortemeyer results of one model into one test set.
for ds in testing_sets:
    for model in ['gpt-4', 'gpt-3.5-turbo-1106', three_epoch_id, two_epoch_id]:
        ds.load_processed(model, 'kortemeyer')

        
        



In [9]:
# Code that regenerated results on specific places where the token limit was surpassed
from IPython.display import display, HTML

async def revaluate_answer(testing_set : dataset.DataSet, model, prompt_style):
    """Re-query the model for every question that has a missing answer.

    Rows of ``testing_set.prompts`` whose ``'{prompt_style}_{model}_answer'``
    value is NaN are grouped by ``question_id``. One new prompt is built per
    question from just those rows and sent through
    ``testing_set.single_prompt``. Afterwards the answer/correct columns are
    dropped and rebuilt with ``testing_set.process_responses``.

    Uses the notebook-level ``client`` for the requests and displays an HTML
    table per re-queried question.

    Args:
        testing_set: Dataset whose missing answers should be regenerated.
        model: Model identifier used in the column names and the request.
        prompt_style: Prompt style used in the column names and the request.

    Returns:
        A tuple ``(new_problems, lens)``: the rows whose answer is still NaN
        after re-processing, and the number of problem rows per re-queried
        question (in request order).
    """
    answer_col = f'{prompt_style}_{model}_answer'
    correct_col = f'{prompt_style}_{model}_correct'

    problems = testing_set.prompts[testing_set.prompts[answer_col].isna()]
    print(problems.shape)

    tasks = []
    lens = []
    for quest in problems['question_id'].drop_duplicates():
        current_problem = problems[problems['question_id'] == quest]
        display(HTML(current_problem.isin(problems).to_html()))
        lens.append(current_problem.shape[0])

        # Only the first two items returned by kortemeyer_prompt are used:
        # the system message text and the user message text.
        system_prompt, user_prompt = dataset_util.kortemeyer_prompt(
            current_problem, testing_set.ways, testing_set.dataset, (-1,-1,-1), (-1,-1,-1), False
        )[0:2]
        messages = [
            {'role' : 'system', 'content' : system_prompt},
            {'role' : 'user', 'content' : user_prompt},
        ]
        tasks.append(asyncio.create_task(
            testing_set.single_prompt(client, messages, model, prompt_style, current_problem['question_id'].iloc[0])
        ))

    # gather() awaits all requests together; the first failure is still raised
    # to the caller, and the outcome of every other task is collected as well.
    await asyncio.gather(*tasks)

    # Remove the stale derived columns so process_responses can rebuild them.
    testing_set.prompts = testing_set.prompts.drop(columns=[answer_col, correct_col])
    testing_set.process_responses(model, prompt_style)

    new_problems = testing_set.prompts[testing_set.prompts[answer_col].isna()]
    print(new_problems.shape)
    return new_problems, lens



In [72]:
print(testing_sets[1].prompts.shape)
# Mark every row of test set 1 whose answer_id occurs more than once and keep
# those rows for inspection before anything is removed.
hielo=testing_sets[1].prompts.duplicated(subset='answer_id', keep=False)
duplicate_rows = testing_sets[1].prompts[hielo]

# Remove one hand-picked row from test sets 6 and 1 and renumber the rows.
# drop=True discards the old index instead of inserting it as an extra
# 'index' column.
# NOTE: this cell is not idempotent - running it again drops whatever rows
# now sit at labels 578 / 579.
testing_sets[6].prompts = testing_sets[6].prompts.drop(index=[578]).reset_index(drop=True)
testing_sets[1].prompts = testing_sets[1].prompts.drop(index=[579]).reset_index(drop=True)

# Show the duplicated rows found in test set 1 (as they were before the drop).
duplicate_rows

(820, 24)


In [None]:
# Checking Cell
# Reports (a) test sets whose columns do not all hold the same number of
# non-null entries and (b) model answer columns whose number of distinct
# labels differs from the set's number of ways.
checked_models = ['gpt-4', 'gpt-3.5-turbo-1106', three_epoch_id, two_epoch_id]
for i, ds in enumerate(testing_sets):
    non_null_counts = ds.prompts.count()
    if non_null_counts.nunique() != 1:
        print(f'Dataset {i}, {ds.dataset} {ds.ways}way {ds.test_state} had the following counts:')
        print(non_null_counts)
    for model in checked_models:
        label_counts = ds.prompts[f'kortemeyer_{model}_answer'].value_counts()
        if len(label_counts) == ds.ways:
            continue
        print(f'Dataset {i}, {ds.dataset} {ds.ways}way {ds.test_state}, on {model}, had the following value counts:')
        print(label_counts)
        print()

In [45]:
# Three-level column header: corpus / label granularity / test split.
col = [
    ['SCIENTSBANK'] * 6 + ['BEETLE'] * 4,
    ['2-way'] * 3 + ['3-way'] * 3 + ['2-way'] * 2 + ['3-way'] * 2,
    ['UA', 'UQ', 'UD'] * 2 + ['UA', 'UQ'] * 2,
]
inds = ['FT-2_Epochs: GPT-3.5', 'FT-3_Epochs: GPT-3.5',  'GPT-3.5', 'GPT-4']

# Test sets re-ordered to match the column header above.
order_testing_sets = testing_sets[2:5] + testing_sets[7:] + testing_sets[:2] + testing_sets[5:7]


def _pooled_f1_row(model):
    """Build one table row of pooled F1 scores for a non-fine-tuned model.

    Each (corpus, ways) group yields a single score pooled over its test
    sets; the score is written to the group's second column and the group's
    remaining columns are filled with pd.NA.
    """
    row = []
    for group in (testing_sets[2:5], testing_sets[7:], testing_sets[:2], testing_sets[5:7]):
        score = calc_f1_multi_df(group, model, 'kortemeyer')
        row += [pd.NA, score] + [pd.NA] * (len(group) - 2)
    return row


# Fine-tuned models get one score per test set; the base models get one
# pooled score per (corpus, ways) group.
data = [
    [ds.model_f1_score(fine_tuned_id, 'kortemeyer') for ds in order_testing_sets]
    for fine_tuned_id in (two_epoch_id, three_epoch_id)
] + [_pooled_f1_row(base_model) for base_model in ('gpt-3.5-turbo-1106', 'gpt-4')]

results = pd.DataFrame(data=data, index=inds, columns=col)

In [75]:
# Display the results table (head() shows at most the first five rows).
results.head()


Unnamed: 0_level_0,SCIENTSBANK,SCIENTSBANK,SCIENTSBANK,SCIENTSBANK,SCIENTSBANK,SCIENTSBANK,BEETLE,BEETLE,BEETLE,BEETLE
Unnamed: 0_level_1,2-way,2-way,2-way,3-way,3-way,3-way,2-way,2-way,3-way,3-way
Unnamed: 0_level_2,UA,UQ,UD,UA,UQ,UD,UA,UQ,UA,UQ
FT-2_Epochs: GPT-3.5,0.783133,0.736301,0.717856,0.796875,0.783151,0.707384,0.742991,0.717029,0.712329,0.705602
FT-3_Epochs: GPT-3.5,0.767635,0.719449,0.693975,0.765531,0.746544,0.724348,0.736318,0.631579,0.730769,0.648045
GPT-3.5,,0.663121,,,0.644678,,,0.56148,,0.583514
GPT-4,,0.758691,,,0.742158,,,0.644295,,0.681452


In [49]:
# Persist the results table to 'results.pickle' in the current working directory.
results.to_pickle('results.pickle')

In [50]:
# Round every score to two decimals for presentation.
# The cells are converted to floats first (missing cells become NaN) so that
# rounding applies to every column. Unlike the earlier
# replace(NA -> 0) / round / replace(0 -> NA) round-trip, a genuine score that
# is, or rounds to, 0 stays 0 instead of being reported as missing.
rn_results = results.apply(pd.to_numeric, errors='coerce').round(2)
rn_results

Unnamed: 0_level_0,SCIENTSBANK,SCIENTSBANK,SCIENTSBANK,SCIENTSBANK,SCIENTSBANK,SCIENTSBANK,BEETLE,BEETLE,BEETLE,BEETLE
Unnamed: 0_level_1,2-way,2-way,2-way,3-way,3-way,3-way,2-way,2-way,3-way,3-way
Unnamed: 0_level_2,UA,UQ,UD,UA,UQ,UD,UA,UQ,UA,UQ
FT-2_Epochs: GPT-3.5,0.78,0.74,0.72,0.8,0.78,0.71,0.74,0.72,0.71,0.71
FT-3_Epochs: GPT-3.5,0.77,0.72,0.69,0.77,0.75,0.72,0.74,0.63,0.73,0.65
GPT-3.5,,0.66,,,0.64,,,0.56,,0.58
GPT-4,,0.76,,,0.74,,,0.64,,0.68


In [None]:
# Submit a fine-tuning job for gpt-3.5-turbo-1106 through the synchronous
# client, using the training / validation file ids read from the environment.
# Running this cell creates a new job on the OpenAI account each time.
fine_tune_hyperparameters = {"n_epochs": 2}
non_client.fine_tuning.jobs.create(
    model="gpt-3.5-turbo-1106",
    training_file=training_file,
    validation_file=valid_file,
    hyperparameters=fine_tune_hyperparameters,
)

In [180]:

# Collect the kortemeyer tuning messages of every training set and split
# them per set: the first tenth (rounded down) goes to validation, the
# remainder to training. No shuffling is applied.
train_msgs = []
valid_msgs = []
for tset in training_sets:
    new_msgs = tset.tune_messages('kortemeyer')
    holdout = len(new_msgs) // 10
    valid_msgs.extend(new_msgs[:holdout])
    train_msgs.extend(new_msgs[holdout:])

# Write each split as a JSON Lines file, overwriting any existing file.
for out_name, msgs in (
    ('semeval-kortemeyer-tuning-v1.jsonl', train_msgs),
    ('semeval-kortemeyer-valid-v1.jsonl', valid_msgs),
):
    with jsonlines.open(out_name, mode='w') as writer:
        writer.write_all(msgs)