In [1]:
from json import JSONDecodeError
from utils import *
from output_formatter import transform_to_prodigy, prodigy_to_interpreteval
from nervaluate import Evaluator

In [4]:
base_model = get_base_model()
tokenizer = get_base_tokenizer()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
sentences = []
true_responses = []
pred_responses_base = []
pred_responses_finetuned = []

In [3]:
with open('data/CDR_TestSet.json', 'r') as file:
    testing_data = json.load(file)

for item in testing_data:
    if 'user' not in item or 'assistant' not in item:
        print("problem with item:", item)
    sentences.append(item['user'])
    true_responses.append(transform_to_prodigy(item['user'], item['assistant']))

### Responses From Base Model

In [7]:
generated_responses_base = []
for sentence in sentences:
    generated_responses_base.append(generate_response(sentence, base_model, tokenizer))

To use the generated responses after restarting the notebook:

In [6]:
from datetime import datetime
with open('output/base_model.txt', 'w') as file:
    file.write("\n$SEP$\n".join(generated_responses_base))

# Create a backup, as to avoid accidental overwriting
with open('output/base_model_' + datetime.now().strftime("%Y%m%d%H%M%S") + '.txt' , 'w') as file:
    file.write("\n$SEP$\n".join(generated_responses_base))

In [34]:
with open('output/base_model_20250228150609.txt', 'r') as file:
    generated_responses_base1 = file.read().split("\n$SEP$\n")

In [26]:
invalid_json_format = 0
invalid_key = 0
pred_responses_base = []
for i in range(len(generated_responses_base)):
    predicted = []
    try:
        predicted = transform_to_prodigy(sentences[i], generated_responses_base[i])
    except JSONDecodeError:
        invalid_json_format += 1
    except KeyError:
        invalid_key += 1
    pred_responses_base.append(predicted)

print(f"Invalid format: {invalid_json_format + invalid_key} out of {len(generated_responses_base)} times ({invalid_json_format} JSONDecodeError, {invalid_key} KeyError)")

Invalid format: 7 out of 30 times (6 JSONDecodeError, 1 KeyError)


### Responses From OpenAI Model

In [6]:
generated_responses_openai = []
at = 0
for sentence in sentences:
    at += 1
    generated_responses_openai.append(generate_openai(sentence))
    if at % 100 == 0:
        with open(f'output/openai{at}.txt', 'w') as file:
            file.write("\n$SEP$\n".join(generated_responses_openai))

In [7]:
from datetime import datetime
with open('output/openai.txt', 'w') as file:
    file.write("\n$SEP$\n".join(generated_responses_openai))

# Create a backup, as to avoid accidental overwriting
with open('output/openai_' + datetime.now().strftime("%Y%m%d%H%M%S") + '.txt' , 'w') as file:
    file.write("\n$SEP$\n".join(generated_responses_openai))

In [8]:
invalid_json_format = 0
invalid_key = 0
pred_responses_openai = []
for i in range(len(generated_responses_openai)):
    predicted = []
    try:
        predicted = transform_to_prodigy(sentences[i], generated_responses_openai[i])
    except JSONDecodeError:
        invalid_json_format += 1
    except KeyError:
        invalid_key += 1
    pred_responses_openai.append(predicted)

print(f"Invalid format: {invalid_json_format + invalid_key} out of {len(generated_responses_openai)} times ({invalid_json_format} JSONDecodeError, {invalid_key} KeyError)")

Invalid format: 0 out of 2422 times (0 JSONDecodeError, 0 KeyError)


### Responses From Fine-tuned Model

In [10]:
finetuned_model = get_finetuned_model()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
generated_responses_finetuned = []
for sentence in sentences[:60]:
    generated_responses_finetuned.append(generate_response(sentence, finetuned_model, tokenizer))

NameError: name 'tokenizer' is not defined

In [10]:
invalid_json_format = 0
invalid_key = 0
for i in range(len(generated_responses_finetuned)):
    predicted = []
    try:
        predicted = transform_to_prodigy(sentences[i], generated_responses_base[i])
    except JSONDecodeError:
        invalid_json_format += 1
    except KeyError:
        invalid_key += 1
    pred_responses_finetuned.append(predicted)
print(f"Invalid format: {invalid_json_format + invalid_key} out of {len(generated_responses_finetuned)} times ({invalid_json_format} JSONDecodeError, {invalid_key} KeyError)")

Invalid json format: 10 out of 60 times


### NERvaluate evaluation

#### Base model evaluation

In [37]:
evaluator = Evaluator(true_responses, pred_responses_base, tags=['Disease', 'Chemical'])
results, results_per_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
results

{'ent_type': {'correct': 5837,
  'incorrect': 225,
  'partial': 0,
  'missed': 3758,
  'spurious': 3054,
  'possible': 9820,
  'actual': 9116,
  'precision': 0.6403027643703378,
  'recall': 0.5943991853360489,
  'f1': 0.6164976763836079},
 'partial': {'correct': 5009,
  'incorrect': 0,
  'partial': 1053,
  'missed': 3758,
  'spurious': 3054,
  'possible': 9820,
  'actual': 9116,
  'precision': 0.6072290478279947,
  'recall': 0.5636965376782077,
  'f1': 0.5846535699197296},
 'strict': {'correct': 4931,
  'incorrect': 1131,
  'partial': 0,
  'missed': 3758,
  'spurious': 3054,
  'possible': 9820,
  'actual': 9116,
  'precision': 0.540917068889864,
  'recall': 0.5021384928716904,
  'f1': 0.5208069286016055},
 'exact': {'correct': 5009,
  'incorrect': 1053,
  'partial': 0,
  'missed': 3758,
  'spurious': 3054,
  'possible': 9820,
  'actual': 9116,
  'precision': 0.5494734532689777,
  'recall': 0.510081466395112,
  'f1': 0.5290452049007183}}

#### OpenAI model evaluation

In [14]:
evaluator = Evaluator(true_responses, pred_responses_openai, tags=['Disease', 'Chemical'])
results, results_per_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
results

{'ent_type': {'correct': 6795,
  'incorrect': 71,
  'partial': 0,
  'missed': 2946,
  'spurious': 944,
  'possible': 9812,
  'actual': 7810,
  'precision': 0.8700384122919335,
  'recall': 0.6925193640440277,
  'f1': 0.7711950970377937},
 'partial': {'correct': 6161,
  'incorrect': 0,
  'partial': 705,
  'missed': 2946,
  'spurious': 944,
  'possible': 9812,
  'actual': 7810,
  'precision': 0.8339948783610756,
  'recall': 0.6638300040766408,
  'f1': 0.7392463965497674},
 'strict': {'correct': 6143,
  'incorrect': 723,
  'partial': 0,
  'missed': 2946,
  'spurious': 944,
  'possible': 9812,
  'actual': 7810,
  'precision': 0.7865556978233035,
  'recall': 0.6260701182225846,
  'f1': 0.697196685960731},
 'exact': {'correct': 6161,
  'incorrect': 705,
  'partial': 0,
  'missed': 2946,
  'spurious': 944,
  'possible': 9812,
  'actual': 7810,
  'precision': 0.7888604353393086,
  'recall': 0.6279046066041581,
  'f1': 0.6992395868800363}}

#### Fine-tuned model evaluation

In [12]:
evaluator = Evaluator(true_responses[:60], pred_responses_finetuned, tags=['Disease', 'Chemical'])
results, results_per_tag, result_indices, result_indices_by_tag = evaluator.evaluate()
results

{'ent_type': {'correct': 144,
  'incorrect': 9,
  'partial': 0,
  'missed': 91,
  'spurious': 79,
  'possible': 244,
  'actual': 232,
  'precision': 0.6206896551724138,
  'recall': 0.5901639344262295,
  'f1': 0.6050420168067226},
 'partial': {'correct': 126,
  'incorrect': 0,
  'partial': 27,
  'missed': 91,
  'spurious': 79,
  'possible': 244,
  'actual': 232,
  'precision': 0.6012931034482759,
  'recall': 0.5717213114754098,
  'f1': 0.5861344537815125},
 'strict': {'correct': 124,
  'incorrect': 29,
  'partial': 0,
  'missed': 91,
  'spurious': 79,
  'possible': 244,
  'actual': 232,
  'precision': 0.5344827586206896,
  'recall': 0.5081967213114754,
  'f1': 0.5210084033613445},
 'exact': {'correct': 126,
  'incorrect': 27,
  'partial': 0,
  'missed': 91,
  'spurious': 79,
  'possible': 244,
  'actual': 232,
  'precision': 0.5431034482758621,
  'recall': 0.5163934426229508,
  'f1': 0.5294117647058824}}

### InterpretEval evaluation