In [1]:
import os
import openai
import sys
import json
sys.path.append("../")

import pandas as pd
from sklearn.model_selection import train_test_split

from scipy.special import softmax
from sklearn.model_selection import train_test_split

from util.logging.logger_manager import LoggerManager, logger
from util.logging.logger_config import LoggerConfig

# Log directory under the current user's home; os.path.expanduser resolves
# the leading "~" so the path is not tied to one account.
log_path = os.path.expanduser("~/desktop/safa/logs")
# Hand that directory to the project's LoggerManager as its output location.
LoggerManager.configure_logger(LoggerConfig(output_dir=log_path))

def get_score(probs):
    """
    Turn the top log-probabilities of the first generated token into a score
    for the positive (" yes") class.

    :param probs: Sequence of mappings from token text to log-probability, one
        per generated token; only element 0 is read.
    :return: Softmax weight of " yes" against " no" (a float between 0 and 1).
    """
    first_token = probs[0]
    has_no = " no" in first_token
    has_yes = " yes" in first_token

    if has_no and has_yes:
        # Both labels were returned: renormalise their log-probabilities so
        # the two classes sum to 1.
        pair = [first_token[" no"], first_token[" yes"]]
    else:
        # Only the top-k tokens are available, so a label may be absent.
        # Use fixed pseudo-logits (1 = present, 0 = absent) instead of
        # indexing a missing key.  A lone " yes" scores ~0.73 as before, a
        # lone " no" mirrors it at ~0.27, and neither label yields 0.5.
        pair = [1 if has_no else 0, 1 if has_yes else 0]

    return softmax(pair)[1]

def calculate_metrics(labels, scores, threshold):
    """
    Compute evaluation metrics for a set of scored predictions.

    :param labels: Ground-truth binary labels, aligned with ``scores``.
    :param scores: Predicted scores for the positive class.
    :param threshold: Currently unused.  It previously fed a thresholded
        prediction list that was never consumed by any metric; the parameter
        is kept so existing callers keep working.
    :return: Nested dict with an "sklearn" section (average precision under
        "ap") and a "tgen" section that merges whatever the project's
        PrecisionAtRecallMetric and FMetric ``_compute`` calls return.
    """
    # Imports stay local to the function, as in the original cell.
    from sklearn.metrics import average_precision_score
    from train.metrics.precision_at_recall_metric import PrecisionAtRecallMetric
    from train.metrics.f1_metric import FMetric

    f_metric = FMetric()
    p_at_r = PrecisionAtRecallMetric()

    return {
        "sklearn": {
            "ap": average_precision_score(labels, scores),
        },
        "tgen": {
            # NOTE(review): the project metrics are called with (scores, labels),
            # the reverse of sklearn's (labels, scores) - confirm against their
            # signatures.
            **p_at_r._compute(scores, labels),
            **f_metric._compute(scores, labels)
        }
    }

def create_summary(labels, scores, test_df):
    """
    Build one human-readable entry per example, highest score first.

    :param labels: Ground-truth labels, aligned with ``scores``.
    :param scores: Predicted scores, aligned with the rows of ``test_df``.
    :param test_df: Object whose "prompt" entry yields the prompt texts.
    :return: List of strings of the form "Label: <label>\\tScore: <score>\\n<prompt>".
    """
    rows = list(zip(labels, scores, test_df["prompt"]))
    # Stable descending sort on the score, so ties keep their input order.
    rows.sort(key=lambda row: row[1], reverse=True)
    return [f"Label: {label}\tScore: {score}\n{prompt}" for label, score, prompt in rows]

def eval_df(test_df):
    """
    Score every prompt in ``test_df`` with the fine-tuned model, print the
    metrics as JSON, and return a per-example summary.

    Reads the notebook globals ``ft_model`` (model id sent to the API) and
    ``pos_label`` (completion string that counts as the positive class).

    :param test_df: DataFrame with "prompt" and "completion" columns.
    :return: List of summary strings from ``create_summary``, highest score first.
    """
    prompts = list(test_df["prompt"])
    # All prompts go out in one request; a single generated token with its
    # top-2 log-probabilities is all get_score needs.
    res = openai.Completion.create(model=ft_model, prompt=prompts, temperature=0, max_tokens=1, logprobs=2)
    # Batched completions are not guaranteed to come back in prompt order, so
    # re-order by each choice's "index" before pairing scores with labels.
    choices = sorted(res["choices"], key=lambda choice: choice["index"])
    labels = test_df["completion"].map(lambda s: 1 if s == pos_label else 0)
    scores = [get_score(choice["logprobs"]["top_logprobs"]) for choice in choices]
    metrics = calculate_metrics(labels, scores, 0.5)
    print(json.dumps(metrics,indent=4))
    return create_summary(labels, scores, test_df)

# Constants

In [2]:
# Credentials are read from the environment so that no secret is stored in
# the notebook.  NOTE(review): the API key that used to be hard-coded in this
# cell has been exposed and should be revoked.
openai.organization = os.environ.get("OPENAI_ORGANIZATION")
openai.api_key = os.environ["OPENAI_API_KEY"]  # KeyError here means the variable is not set

ft_model = 'ada:ft-safa:cm1-test-2023-03-29-23-31-09'
test_dir_name = "formatted"
test_file_name = "test.jsonl"
# Resolved under the current user's home (same root as the log directory)
# rather than one developer's absolute path.
data_dir = os.path.join(os.path.expanduser("~/desktop/safa/datasets/openai"), test_dir_name)
pos_label = " yes###"
neg_label = " no###"

test_path = os.path.join(data_dir, test_file_name)

# Predict

In [3]:
# Read the JSON-lines test set, then keep only the rows whose prompt is
# shorter than 3000 characters.  The unfiltered frame keeps its own name so
# re-running this cell always starts from the same data.
raw_test_df = pd.read_json(test_path, lines=True, orient="records")
is_short_prompt = raw_test_df["prompt"].map(lambda prompt: len(prompt) < 3000)
test_df = raw_test_df[is_short_prompt]

# Row count and class balance after filtering, followed by a preview.
print(len(test_df))
print(test_df["completion"].value_counts())
test_df.head()

1191
 no###     618
 yes###    573
Name: completion, dtype: int64


Unnamed: 0,prompt,completion
0,1. The TG-PTC shall provide a mechanism for th...,no###
2,1. The system shall allow user classes to be d...,yes###
3,1. Since the underlying communication pattern ...,yes###
4,1. If there is a predictive brake warning cond...,yes###
5,1. The system shall allow other VistA applicat...,no###


### All Positives and Undersampled Negatives

In [4]:
# Balanced evaluation set: every positive plus an equally sized random draw
# of negatives.
pos_df = test_df[test_df["completion"] == pos_label]
neg_df = test_df[test_df["completion"] == neg_label]
# sample() raises when asked for more rows than exist, so cap the draw at the
# number of negatives available.
n_neg = min(len(pos_df), len(neg_df))
# Fixed seed so that re-running the notebook evaluates the same negatives.
df = pd.concat([pos_df, neg_df.sample(n=n_neg, random_state=42)])
print(len(df))
summary_1 = eval_df(df)

1146
{
    "sklearn": {
        "ap": 0.9484218085137677
    },
    "tgen": {
        "precision_at_recall_95": 0.7830459770114943,
        "best_threshold": 0.22108745269613916,
        "f1": 0.8835978835978837,
        "f2": 0.9130289903365544
    }
}


  f_metric = FMetric()


### Full Test Set (random slice disabled)

In [5]:
# Disabled: a stratified 200-row random slice of the test set.  While these
# two lines stay commented out, the cell evaluates the whole filtered test_df.
# _, df_2 = train_test_split(test_df, test_size=200, stratify=test_df["completion"])
# print(len(df_2))
print(len(test_df))
summary_2 = eval_df(test_df)

1191
{
    "sklearn": {
        "ap": 0.9466485285681704
    },
    "tgen": {
        "precision_at_recall_95": 0.7752489331436699,
        "best_threshold": 0.22133030946599005,
        "f1": 0.8804920913884008,
        "f2": 0.910904255319149
    }
}


# Review

In [6]:
# Show the highest-scoring negatives (label 0) from the selected summary.
# Summaries are ordered by descending score, so these are the model's most
# confident false positives.
max_display = 5
summary = summary_2  # switch to summary_1 to review the balanced run instead

# Match on the entry's own "Label: 0" prefix rather than anywhere in the
# text, so a prompt that happens to contain that phrase is not picked up.
negatives = [s for s in summary if s.startswith("Label: 0\t")]
# Slice to exactly max_display entries (the old counter loop printed one extra).
for s in negatives[:max_display]:
    print(s)

Label: 0	Score: 0.9983657834574283
1. The system shall integrate orders with progress notes, results, procedures, diagnosis, and Problem List.
2. The system shall provide the ability to document a verbal order, including the clinician taking (receiving) the verbal order and the ordering physician in the patient record.

###


Label: 0	Score: 0.9929402572067226
1. The system shall include the ability to print prescriptions for signature and fax prescriptions to a local pharmacy with an electronic signature.
2. The system shall provide the ability to specify prescription/medication order details including strength, route, frequency and comments. Strength, route and frequency must be captured and maintained as discrete data.

###


Label: 0	Score: 0.9867629075576962
1. The system shall require handwritten signatures for outpatient medication orders of schedule 2 and schedule 2n controlled substances due to DEA policy.
2. The system shall provide the ability to utilize unique identifiers f