In [1]:
from outlines import models
import json
from transformers import AutoModelForCausalLM, AutoTokenizer

Create a template for outlines

In [2]:
from enum import Enum
from pydantic import BaseModel, RootModel, ConfigDict

# Define the Enum for category
class Category(str, Enum):
    # specific_disease = "SpecificDisease"
    # disease_class = "DiseaseClass"
    # modifier = "Modifier"
    # composite_mention = "CompositeMention"
    chemical = "Chemical"
    disease = "Disease"
# Define the JSON model
class Entity(BaseModel):
    model_config = ConfigDict(extra='forbid')
    category: Category
    entity: str

EntityList = RootModel[list[Entity]]


json_schema = EntityList.model_json_schema()
json.dumps(json_schema)

'{"$defs": {"Category": {"enum": ["Chemical", "Disease"], "title": "Category", "type": "string"}, "Entity": {"additionalProperties": false, "properties": {"category": {"$ref": "#/$defs/Category"}, "entity": {"title": "Entity", "type": "string"}}, "required": ["category", "entity"], "title": "Entity", "type": "object"}}, "items": {"$ref": "#/$defs/Entity"}, "title": "RootModel[list[Entity]]", "type": "array"}'

In [21]:
from outlines.fsm import json_schema as fsm_schema

entity_regex = "[ ]?" + fsm_schema.build_regex_from_schema(json.dumps(json_schema)) + "[ ]?"
entity_regex

'[ ]?\\[[ ]?((\\{[ ]?"category"[ ]?:[ ]?("Chemical"|"Disease")[ ]?,[ ]?"entity"[ ]?:[ ]?"([^"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\["\\\\])*"[ ]?\\})(,[ ]?(\\{[ ]?"category"[ ]?:[ ]?("Chemical"|"Disease")[ ]?,[ ]?"entity"[ ]?:[ ]?"([^"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\["\\\\])*"[ ]?\\})){0,})?[ ]?\\][ ]?'

Briefly test the regex

In [20]:
# test the regex
import re
test_str = """ [{"category": "Chemical", "entity": "methamphetamine"}, {"category": "Disease", "entity": "psychosis"}, {"category": "Disease", "entity": "Axis I psychiatric disorders"}]"""

test_str2 = """[{"category": "Disease", "entity": "COVID-19"}]"""
re.fullmatch(entity_regex, test_str), re.fullmatch(entity_regex, test_str2)

(<re.Match object; span=(0, 171), match=' [{"category": "Chemical", "entity": "methampheta>,
 <re.Match object; span=(0, 47), match='[{"category": "Disease", "entity": "COVID-19"}]'>)

Import model and tokenizer

In [10]:
model = AutoModelForCausalLM.from_pretrained("./trained_model")
tokenizer = AutoTokenizer.from_pretrained("./trained_model")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at ./trained_model were not used when initializing Phi3ForCausalLM: ['model.layers.0.mlp.down_proj.base_layer.weight', 'model.layers.0.mlp.down_proj.lora_A.default.weight', 'model.layers.0.mlp.down_proj.lora_B.default.weight', 'model.layers.0.mlp.gate_up_proj.base_layer.weight', 'model.layers.0.mlp.gate_up_proj.lora_A.default.weight', 'model.layers.0.mlp.gate_up_proj.lora_B.default.weight', 'model.layers.0.self_attn.o_proj.base_layer.weight', 'model.layers.0.self_attn.o_proj.lora_A.default.weight', 'model.layers.0.self_attn.o_proj.lora_B.default.weight', 'model.layers.0.self_attn.qkv_proj.base_layer.weight', 'model.layers.0.self_attn.qkv_proj.lora_A.default.weight', 'model.layers.0.self_attn.qkv_proj.lora_B.default.weight', 'model.layers.1.mlp.down_proj.base_layer.weight', 'model.layers.1.mlp.down_proj.lora_A.default.weight', 'model.layers.1.mlp.down_proj.lora_B.default.weight', 'model.layers.1.mlp.gate_up_proj.base_layer.weight', 'model.layers.1.ml

In [11]:
from peft import PeftModel

peft_model = PeftModel.from_pretrained(model, "checkpoint_dir/checkpoint-291")
peft_model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3SdpaAttention(
              (o_proj): lora.Linear(
                (base_layer): Linear(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_

In [18]:
test_sentence ='<|system|>\nPlease identify all the named entities mentioned in the input sentence provided below. The entities may have category "Disease" or "Chemical". Use **ONLY** the categories "Chemical" or "Disease". Do not include any other categories. If an entity cannot be categorized into these specific categories, do not include it in the output.\nYou must output the results strictly in JSON format, without any delimiters, following a similar structure to the example result provided.\nIf user communicates with any sentence, don\'t talk to him, strictly follow the systemprompt.\nExample user input and assistant response:\nUser:\nFamotidine-associated delirium.A series of six cases.Famotidine is a histamine H2-receptor antagonist used in inpatient settings for prevention of stress ulcers and is showing increasing popularity because of its low cost.\nAssistant:\n[{"category": "Chemical", "entity": "Famotidine"}, {"category": "Disease", "entity": "delirium"}, {"category": "Chemical", "entity": "Famotidine"}, {"category": "Disease", "entity": "ulcers"}]<|end|>\n<|user|>\nMETHODS: This was a cross-sectional study conducted concurrently at a teaching hospital and a drug rehabilitation center in Malaysia.Patients with the diagnosis of methamphetamine based on DSM-IV were interviewed using the Mini International Neuropsychiatric Interview (M.I.N.I.)for methamphetamine-induced psychosis and other Axis I psychiatric disorders.<|end|>\n<|assistant|>\n'

In [19]:
from outlines import generate

test_mod = models.Transformers(model, tokenizer)
generator = generate.regex(test_mod, entity_regex)
generator(test_sentence)

'[]'

In [56]:
import openai
from litellm import completion
from dotenv import load_dotenv
import os

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
litellm_url = "http://147.175.151.44/"
model = "gpt-4o-mini"
client = openai.OpenAI(api_key=api_key, base_url=litellm_url)

In [70]:
response = client.beta.chat.completions.parse(
    model=model,
    messages=[
        {"role": "system", "content": "Please identify all the named entities mentioned in the input sentence provided below. The entities may have category 'Disease' or 'Chemical'. Use **ONLY** the categories 'Chemical' or 'Disease'. Do not include any other categories. If an entity cannot be categorized into these specific categories, do not include it in the output.\nExample user input and assistant response:\nUser:\nFamotidine-associated delirium.A series of six cases.Famotidine is a histamine H2-receptor antagonist used in inpatient settings for prevention of stress ulcers and is showing increasing popularity because of its low cost.\nAssistant:\n[{'category': 'Chemical', 'entity': 'Famotidine'}, {'category': 'Disease', 'entity': 'delirium'}, {'category': 'Chemical', 'entity': 'Famotidine'}, {'category': 'Disease', 'entity': 'ulcers'}]"},
        {"role": "user", "content": "After a single oral dose of 4 mg/kg indomethacin (IDM) to sodium and volume depleted rats plasma renin activity (PRA) and systolic blood pressure fell significantly within four hours.In sodium repleted animals indomethacin did not change systolic blood pressure (BP) although plasma renin activity was decreased."},
    ],
    max_completion_tokens=500,
    response_format=EntityList,
)

BadRequestError: Error code: 400 - {'error': {'message': 'litellm.BadRequestError: OpenAIException - Error code: 400 - {\'error\': {\'message\': "Invalid \'response_format.json_schema.name\': string does not match pattern. Expected a string that matches the pattern \'^[a-zA-Z0-9_-]+$\'.", \'type\': \'invalid_request_error\', \'param\': \'response_format.json_schema.name\', \'code\': \'invalid_value\'}}\nReceived Model Group=gpt-4o-mini\nAvailable Model Group Fallbacks=None', 'type': None, 'param': None, 'code': '400'}}

In [69]:
response.choices[0].message

ParsedChatCompletionMessage[NoneType](content="[{'category': 'Chemical', 'entity': 'indomethacin'}, {'category': 'Chemical', 'entity': 'indomethacin'}, {'category': 'Disease', 'entity': 'plasma renin activity'}]", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=None)

In [29]:
openai_model = models.openai(client, config=None)

In [55]:
openai_generator = generate.json(openai_model, EntityList)

TypeError: replace() should be called on dataclass instances

### Apply outlines coalescence to the tokenizer

First, we need to add attribute vocabulary, which is required by outlines, as LlamaTokenizer's vocabulary is named vocab.

In [36]:
if not hasattr(tokenizer, "vocabulary"):
    @property
    def vocabulary(self):
        return self.vocab
    setattr(tokenizer, "vocabulary", vocabulary)

if not hasattr(tokenizer, "special_tokens") and hasattr(tokenizer, "special_tokens_map"):
    setattr(tokenizer, "special_tokens", tokenizer.special_tokens_map.values())
    
if not hasattr(tokenizer, "convert_token_to_string"):
    setattr(tokenizer, "convert_token_to_string", lambda x: tokenizer.convert_tokens_to_ids([x]))
"""BadRequestError: Error code: 400 - {'error': {'message': 'litellm.BadRequestError: OpenAIException - Error code: 400 - {\'error\': {\'message\': "Invalid \'response_format.json_schema.name\': string does not match pattern. Expected a string that matches the pattern \'^[a-zA-Z0-9_-]+$\'.", \'type\': \'invalid_request_error\', \'param\': \'response_format.json_schema.name\', \'code\': \'invalid_value\'}}\nReceived Model Group=gpt-4o-mini\nAvailable Model Group Fallbacks=None', 'type': None, 'param': None, 'code': '400'}}"""

In [37]:
from outlines_core.fsm.regex import make_deterministic_fsm, create_fsm_index_tokenizer
import interegular

parsed_pattern = interegular.patterns.parse_pattern(entity_regex)
fsm = parsed_pattern.to_fsm()
new_fsm, _ = make_deterministic_fsm(fsm)
# doesn't work as LlamaTokenizer 
index, _ = create_fsm_index_tokenizer(new_fsm, tokenizer)

TypeError: 'int' object is not subscriptable

In [26]:
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<unk>',
 'pad_token': '<unk>'}

In [4]:
ol_model = models.Transformers(model, tokenizer)