In [1]:
from json import JSONDecodeError
from utils import *
from output_formatter import transform_to_prodigy, prodigy_to_interpreteval
from nervaluate import Evaluator

In [2]:
# Instantiate the base model that produces the baseline responses further down.
# NOTE(review): get_base_model is not imported by name; presumably it arrives
# through `from utils import *` — confirm.
base_model = get_base_model()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Tokenizer passed to generate_response together with either model below.
# NOTE(review): get_base_tokenizer is not imported by name; presumably it
# arrives through `from utils import *` — confirm.
tokenizer = get_base_tokenizer()

In [3]:
# Accumulators filled by later cells: the input sentences, their gold
# annotations, and the per-model predictions. Each name gets its own list.
sentences, true_responses = [], []
pred_responses_base, pred_responses_finetuned = [], []

In [4]:
# Load the CDR test set and build two index-aligned lists: the input
# sentences and their gold annotations converted with transform_to_prodigy.
# NOTE(review): `json` is not imported by name in the import cell; it
# presumably arrives through `from utils import *` — confirm.
with open('data/CDR_TestSet.json', 'r') as file:
    testing_data = json.load(file)

# Rebind instead of extending whatever is already there, so re-running this
# cell cannot duplicate entries and shift the sentences out of line with
# responses that were generated (or reloaded from disk) earlier.
sentences = []
true_responses = []
for item in testing_data:
    if 'user' not in item or 'assistant' not in item:
        # Report and skip the malformed record; indexing it below would
        # otherwise raise KeyError immediately after the warning.
        print("problem with item:", item)
        continue
    # Convert first, append second: a failing conversion then cannot leave a
    # sentence behind without its matching gold annotation.
    gold_annotation = transform_to_prodigy(item['user'], item['assistant'])
    sentences.append(item['user'])
    true_responses.append(gold_annotation)

### Responses From Base Model

In [7]:
# Run the base model over every test sentence, keeping the raw outputs in
# input order. Appending one at a time preserves partial results if the loop
# is interrupted.
generated_responses_base = []
for text in sentences:
    raw_output = generate_response(text, base_model, tokenizer)
    generated_responses_base.append(raw_output)

Save the generated responses to disk so that they can be reloaded after restarting the notebook:

In [6]:
from datetime import datetime

# Serialize once: responses are joined with a "$SEP$" line between them.
serialized_base = "\n$SEP$\n".join(generated_responses_base)

# Canonical copy; this is the path the reload cell reads.
with open('output/base_model.txt', 'w') as file:
    file.write(serialized_base)

# Timestamped backup, so that a later run overwriting the canonical file does
# not destroy these results.
backup_path = 'output/base_model_' + datetime.now().strftime("%Y%m%d%H%M%S") + '.txt'
with open(backup_path, 'w') as file:
    file.write(serialized_base)

In [5]:
# Restore the base-model responses from disk by splitting the file on the
# "$SEP$" separator line.
with open('output/base_model.txt', 'r') as file:
    saved_text = file.read()
generated_responses_base = saved_text.split("\n$SEP$\n")

In [6]:
# Convert every raw base-model response with transform_to_prodigy and count
# the ones that fail. A failed response contributes an empty prediction so
# the list stays index-aligned with `sentences`.
invalid_json_format, invalid_key = 0, 0
pred_responses_base = []
for idx, raw_response in enumerate(generated_responses_base):
    try:
        predicted = transform_to_prodigy(sentences[idx], raw_response)
    except JSONDecodeError:
        # The response could not be decoded as JSON.
        predicted = []
        invalid_json_format += 1
    except KeyError:
        # A key lookup failed while converting the response.
        predicted = []
        invalid_key += 1
    pred_responses_base.append(predicted)

print(f"Invalid format: {invalid_json_format + invalid_key} out of {len(generated_responses_base)} times ({invalid_json_format} JSONDecodeError, {invalid_key} KeyError)")

Invalid format: 665 out of 2422 times (610 JSONDecodeError, 55 KeyError)


### Responses From OpenAI Model

In [6]:
# Query the OpenAI model for every test sentence. After each 100th response,
# everything collected so far is dumped to a numbered checkpoint file.
generated_responses_openai = []
for at, sentence in enumerate(sentences, start=1):
    generated_responses_openai.append(generate_openai(sentence))
    if at % 100 != 0:
        continue
    with open(f'output/openai{at}.txt', 'w') as file:
        file.write("\n$SEP$\n".join(generated_responses_openai))

In [7]:
from datetime import datetime

# Serialize once: responses are joined with a "$SEP$" line between them.
serialized_openai = "\n$SEP$\n".join(generated_responses_openai)

# Canonical copy of the OpenAI responses.
with open('output/openai.txt', 'w') as file:
    file.write(serialized_openai)

# Timestamped backup, so that a later run overwriting the canonical file does
# not destroy these results.
backup_path = 'output/openai_' + datetime.now().strftime("%Y%m%d%H%M%S") + '.txt'
with open(backup_path, 'w') as file:
    file.write(serialized_openai)

In [8]:
# Convert every raw OpenAI response with transform_to_prodigy and count the
# ones that fail. A failed response contributes an empty prediction so the
# list stays index-aligned with `sentences`.
invalid_json_format, invalid_key = 0, 0
pred_responses_openai = []
for idx, raw_response in enumerate(generated_responses_openai):
    try:
        predicted = transform_to_prodigy(sentences[idx], raw_response)
    except JSONDecodeError:
        # The response could not be decoded as JSON.
        predicted = []
        invalid_json_format += 1
    except KeyError:
        # A key lookup failed while converting the response.
        predicted = []
        invalid_key += 1
    pred_responses_openai.append(predicted)

print(f"Invalid format: {invalid_json_format + invalid_key} out of {len(generated_responses_openai)} times ({invalid_json_format} JSONDecodeError, {invalid_key} KeyError)")

Invalid format: 0 out of 2422 times (0 JSONDecodeError, 0 KeyError)


### Responses From Fine-tuned Model

In [7]:
# Load the fine-tuned model from the "checkpoint-1460" checkpoint.
# NOTE(review): the argument starts with "/"; presumably get_finetuned_model
# appends it to a base directory rather than treating it as an absolute
# filesystem path — confirm against the helper.
finetuned_model = get_finetuned_model("/checkpoint-1460")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# Run the fine-tuned model over every test sentence, keeping the raw outputs
# in input order. Appending one at a time preserves partial results if the
# loop is interrupted.
generated_responses_finetuned = []
for text in sentences:
    raw_output = generate_response(text, finetuned_model, tokenizer)
    generated_responses_finetuned.append(raw_output)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausa

In [13]:
# Convert every raw fine-tuned response with transform_to_prodigy and count
# the ones that fail. A failed response contributes an empty prediction so
# the list stays index-aligned with `sentences`.
invalid_json_format = 0
invalid_key = 0
# Start from an empty list, as the base and OpenAI parsing cells do. Without
# this reset, re-running the cell appends a second copy of every prediction
# and the list no longer lines up one-to-one with true_responses.
pred_responses_finetuned = []
for i, raw_response in enumerate(generated_responses_finetuned):
    predicted = []
    try:
        predicted = transform_to_prodigy(sentences[i], raw_response)
    except JSONDecodeError:
        # The response could not be decoded as JSON.
        invalid_json_format += 1
    except KeyError:
        # A key lookup failed while converting the response.
        invalid_key += 1
    pred_responses_finetuned.append(predicted)
print(f"Invalid format: {invalid_json_format + invalid_key} out of {len(generated_responses_finetuned)} times ({invalid_json_format} JSONDecodeError, {invalid_key} KeyError)")

Invalid format: 114 out of 2422 times (76 JSONDecodeError, 38 KeyError)


In [14]:
from datetime import datetime

# Serialize once: responses are joined with a "$SEP$" line between them.
serialized_finetuned = "\n$SEP$\n".join(generated_responses_finetuned)

# Canonical copy of the fine-tuned responses.
with open('output/finetuned_mixed_train.txt', 'w') as file:
    file.write(serialized_finetuned)

# Timestamped backup, matching the base and OpenAI save cells: re-running
# this cell overwrites the canonical file, and regenerating the responses is
# expensive. This is also what the datetime import above is for.
backup_path = 'output/finetuned_mixed_train_' + datetime.now().strftime("%Y%m%d%H%M%S") + '.txt'
with open(backup_path, 'w') as file:
    file.write(serialized_finetuned)

### NERvaluate evaluation

#### Base model evaluation

In [7]:
# Score the base-model predictions against the gold annotations for both
# entity tags. The overall metrics are shown as the cell output; the per-tag
# results and index lists stay available in the remaining names.
entity_tags = ['Disease', 'Chemical']
evaluator = Evaluator(true_responses, pred_responses_base, tags=entity_tags)
evaluation_output = evaluator.evaluate()
results, results_per_tag, result_indices, result_indices_by_tag = evaluation_output
results

{'ent_type': {'correct': 5088,
  'incorrect': 201,
  'partial': 0,
  'missed': 4540,
  'spurious': 2894,
  'possible': 9829,
  'actual': 8183,
  'precision': 0.6217768544543566,
  'recall': 0.5176518465764575,
  'f1': 0.5649566955363092},
 'partial': {'correct': 4325,
  'incorrect': 0,
  'partial': 964,
  'missed': 4540,
  'spurious': 2894,
  'possible': 9829,
  'actual': 8183,
  'precision': 0.5874373701576439,
  'recall': 0.48906297690507683,
  'f1': 0.5337552742616034},
 'strict': {'correct': 4248,
  'incorrect': 1041,
  'partial': 0,
  'missed': 4540,
  'spurious': 2894,
  'possible': 9829,
  'actual': 8183,
  'precision': 0.5191250152755713,
  'recall': 0.4321904568114762,
  'f1': 0.4716855429713524},
 'exact': {'correct': 4325,
  'incorrect': 964,
  'partial': 0,
  'missed': 4540,
  'spurious': 2894,
  'possible': 9829,
  'actual': 8183,
  'precision': 0.5285347672002932,
  'recall': 0.4400244175399329,
  'f1': 0.48023539862314013}}

#### OpenAI model evaluation

In [14]:
# Score the OpenAI predictions against the gold annotations for both entity
# tags. The overall metrics are shown as the cell output; the per-tag results
# and index lists stay available in the remaining names.
entity_tags = ['Disease', 'Chemical']
evaluator = Evaluator(true_responses, pred_responses_openai, tags=entity_tags)
evaluation_output = evaluator.evaluate()
results, results_per_tag, result_indices, result_indices_by_tag = evaluation_output
results

{'ent_type': {'correct': 6795,
  'incorrect': 71,
  'partial': 0,
  'missed': 2946,
  'spurious': 944,
  'possible': 9812,
  'actual': 7810,
  'precision': 0.8700384122919335,
  'recall': 0.6925193640440277,
  'f1': 0.7711950970377937},
 'partial': {'correct': 6161,
  'incorrect': 0,
  'partial': 705,
  'missed': 2946,
  'spurious': 944,
  'possible': 9812,
  'actual': 7810,
  'precision': 0.8339948783610756,
  'recall': 0.6638300040766408,
  'f1': 0.7392463965497674},
 'strict': {'correct': 6143,
  'incorrect': 723,
  'partial': 0,
  'missed': 2946,
  'spurious': 944,
  'possible': 9812,
  'actual': 7810,
  'precision': 0.7865556978233035,
  'recall': 0.6260701182225846,
  'f1': 0.697196685960731},
 'exact': {'correct': 6161,
  'incorrect': 705,
  'partial': 0,
  'missed': 2946,
  'spurious': 944,
  'possible': 9812,
  'actual': 7810,
  'precision': 0.7888604353393086,
  'recall': 0.6279046066041581,
  'f1': 0.6992395868800363}}

#### Fine-tuned model evaluation

In [16]:
# Score the fine-tuned predictions against the gold annotations for both
# entity tags. The overall metrics are shown as the cell output; the per-tag
# results and index lists stay available in the remaining names.
entity_tags = ['Disease', 'Chemical']
evaluator = Evaluator(true_responses, pred_responses_finetuned, tags=entity_tags)
evaluation_output = evaluator.evaluate()
results, results_per_tag, result_indices, result_indices_by_tag = evaluation_output
results

{'ent_type': {'correct': 5958,
  'incorrect': 298,
  'partial': 0,
  'missed': 3558,
  'spurious': 2587,
  'possible': 9814,
  'actual': 8843,
  'precision': 0.6737532511591089,
  'recall': 0.6070919095170165,
  'f1': 0.6386878919440424},
 'partial': {'correct': 4965,
  'incorrect': 0,
  'partial': 1291,
  'missed': 3558,
  'spurious': 2587,
  'possible': 9814,
  'actual': 8843,
  'precision': 0.6344566323645822,
  'recall': 0.5716833095577746,
  'f1': 0.6014364581658358},
 'strict': {'correct': 4868,
  'incorrect': 1388,
  'partial': 0,
  'missed': 3558,
  'spurious': 2587,
  'possible': 9814,
  'actual': 8843,
  'precision': 0.5504919145086509,
  'recall': 0.4960260851844304,
  'f1': 0.5218416680066462},
 'exact': {'correct': 4965,
  'incorrect': 1291,
  'partial': 0,
  'missed': 3558,
  'spurious': 2587,
  'possible': 9814,
  'actual': 8843,
  'precision': 0.5614610426325908,
  'recall': 0.5059099245975137,
  'f1': 0.5322399099533687}}

### InterpretEval evaluation

In [None]:
# Convert each model's predictions with prodigy_to_interpreteval, one result
# per model. All three pred_responses_* lists must already exist, so the base,
# OpenAI and fine-tuned parsing cells have to be run before this one.
# NOTE(review): the output format is defined in output_formatter and is not
# visible here — presumably the input layout expected by InterpretEval; confirm.
interpret_eval_base = prodigy_to_interpreteval(pred_responses_base)
interpret_eval_openai = prodigy_to_interpreteval(pred_responses_openai)
interpret_eval_finetuned = prodigy_to_interpreteval(pred_responses_finetuned)