## FDA Data
Load FDA label set, segment

In [9]:
import pandas as pd

# Read the trial search-space workbook, renumber the rows from zero, and
# discard the 'Unnamed: 0' column.
search_space = (
    pd.read_excel('../20240424_trial_searchspace.xlsx')
    .reset_index(drop=True)
    .drop(columns='Unnamed: 0')
)


## Outlines 
Load model, prepare prompts

In [11]:
import outlines

# Load the "Open-Orca/Mistral-7B-OpenOrca" checkpoint through outlines'
# transformers backend. The captured cell output shows two shards being
# downloaded (about four minutes) and then loaded, so this cell is slow on a
# fresh environment.
model = outlines.models.transformers("Open-Orca/Mistral-7B-OpenOrca")

  from .autonotebook import tqdm as notebook_tqdm
Downloading shards: 100%|██████████| 2/2 [03:55<00:00, 117.97s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:21<00:00, 10.92s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
# Prompt template for extracting the primary outcome from a clinical trial text.
#
# The function has no body other than its docstring: the docstring IS the
# prompt text, and `{{ clinical_trial }}` is the placeholder that receives the
# `clinical_trial` argument when the function is called. Every character inside
# the triple quotes is therefore sent to the model, so do not add explanatory
# text there.
#
# The template contains one worked example (CLINICAL_TRIAL / RESULT) and asks
# for a JSON object with the keys "outcome", "value" and "regiment". Those keys
# match the field names of the ClinicalOutcome model defined in this notebook.
#
# NOTE(review): "practicioner" and "regiment" look like misspellings of
# "practitioner" and "regimen". They are left untouched here because they are
# part of the prompt, and "regiment" is also the ClinicalOutcome field name;
# if they are corrected, change the prompt and the model together.
@outlines.prompt
def identify_outcomes(clinical_trial):
    """You are a professional medical practicioner with a medical degree. Other doctors \
    send you clinical trial reports from which you need to extract:

    1. The primary outcome measure and measured value
    2. The therapeutic treatment regiment used to achieve the primary outcome
    
    # EXAMPLE
    CLINICAL_TRIAL: The median overall survival was 9.0 months in the triplet-therapy group and 5.4 months in the control group (hazard ratio for death, 0.52; 95% confidence interval [CI], 0.39 to 0.70; P<0.001). The confirmed response rate was 26% (95% CI, 18 to 35) in the triplet-therapy group and 2% (95% CI, 0 to 7) in the control group (P<0.001). The median overall survival in the doublet-therapy group was 8.4 months (hazard ratio for death vs. control, 0.60; 95% CI, 0.45 to 0.79; P<0.001). Adverse events of grade 3 or higher occurred in 58% of patients in the triplet-therapy group, in 50% in the doublet-therapy group, and in 61% in the control group.
    RESULT: {"outcome": "Median overall survival", "value": "9.0 months", "regiment":"triplet-therapy group"}

    # OUTPUT INSTRUCTIONS    
    
    Answer in valid JSON. Here are different objects relevant for the output:

    ClinicalOutcome:
        outcome (str): name of the primary outcome measure
        value (str): the value that was measured from the outcome
        regiment (str): the therapeutic treatment strategy used to obtain the outcome

    # OUTPUT
    
    CLINICAL_TRIAL: {{ clinical_trial }}
    RESULT: """

In [14]:
from pydantic import BaseModel

# Schema for one extracted result: the primary outcome, its measured value, and
# the treatment it was obtained with. The field names mirror the JSON keys that
# the identify_outcomes prompt asks the model to produce.
#
# Design notes (open questions):
#   - `outcome` could become a large Enum of outcome buckets instead of free text.
#   - `value` is a str so that it can hold many kinds of values (the prompt
#     example uses "9.0 months"); a strict int could be enforced instead.
class ClinicalOutcome(BaseModel):
    # Name of the primary outcome measure.
    outcome: str 
    # Measured value for that outcome, kept as text.
    value: str
    # Therapeutic treatment strategy used to obtain the outcome.
    regiment: str

In [17]:
# Render one prompt per entry in the 5:35 slice of the 'clinical_studies' column.
prompts = list(map(identify_outcomes, search_space['clinical_studies'][5:35]))
# Show the first two rendered prompts as the cell output.
prompts[:2]

['You are a professional medical practicioner with a medical degree. Other doctors send you clinical trial reports from which you need to extract:\n\n1. The primary outcome measure and measured value\n2. The therapeutic treatment regiment used to achieve the primary outcome\n\n# EXAMPLE\nCLINICAL_TRIAL: The median overall survival was 9.0 months in the triplet-therapy group and 5.4 months in the control group (hazard ratio for death, 0.52; 95% confidence interval [CI], 0.39 to 0.70; P<0.001). The confirmed response rate was 26% (95% CI, 18 to 35) in the triplet-therapy group and 2% (95% CI, 0 to 7) in the control group (P<0.001). The median overall survival in the doublet-therapy group was 8.4 months (hazard ratio for death vs. control, 0.60; 95% CI, 0.45 to 0.79; P<0.001). Adverse events of grade 3 or higher occurred in 58% of patients in the triplet-therapy group, in 50% in the doublet-therapy group, and in 61% in the control group.\nRESULT: {"outcome": "Median overall survival", "va

In [None]:
# Build a JSON generator from the loaded model and the ClinicalOutcome schema.
generator = outlines.generate.json(model, ClinicalOutcome)

# Run the generator over the whole list of prompts in a single call.
# NOTE(review): no token limit or batch size is set here; passing every prompt
# at once to a 7B model may be slow or memory-heavy — confirm before running.
results = generator(prompts)