In [1]:
from outlines import models
import json
from transformers import AutoModelForCausalLM, AutoTokenizer

Create a template for outlines

In [2]:
from enum import Enum
from pydantic import BaseModel, RootModel

# Define the Enum for category
class Category(str, Enum):
    specific_disease = "SpecificDisease"
    disease_class = "DiseaseClass"
    modifier = "Modifier"
    composite_mention = "CompositeMention"

# Define the JSON model
class Entity(BaseModel):
    category: Category
    entity: str

EntityList = RootModel[list[Entity]]
    
json_schema = EntityList.model_json_schema()
json.dumps(json_schema)

'{"$defs": {"Category": {"enum": ["SpecificDisease", "DiseaseClass", "Modifier", "CompositeMention"], "title": "Category", "type": "string"}, "Entity": {"properties": {"category": {"$ref": "#/$defs/Category"}, "entity": {"title": "Entity", "type": "string"}}, "required": ["category", "entity"], "title": "Entity", "type": "object"}}, "items": {"$ref": "#/$defs/Entity"}, "title": "RootModel[list[Entity]]", "type": "array"}'

In [3]:
from outlines.fsm import json_schema as fsm_schema
fsm_schema.build_regex_from_schema(json.dumps(json_schema))

'\\[[ ]?((\\{[ ]?"category"[ ]?:[ ]?("SpecificDisease"|"DiseaseClass"|"Modifier"|"CompositeMention")[ ]?,[ ]?"entity"[ ]?:[ ]?"([^"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\["\\\\])*"[ ]?\\})(,[ ]?(\\{[ ]?"category"[ ]?:[ ]?("SpecificDisease"|"DiseaseClass"|"Modifier"|"CompositeMention")[ ]?,[ ]?"entity"[ ]?:[ ]?"([^"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\["\\\\])*"[ ]?\\})){0,})?[ ]?\\]'

In [7]:
from outlines.fsm import json_schema as fsm_schema

entity_regex = fsm_schema.build_regex_from_schema(json.dumps(json_schema))
entity_regex

'\\[[ ]?((\\{[ ]?"category"[ ]?:[ ]?("SpecificDisease"|"DiseaseClass"|"Modifier"|"CompositeMention")[ ]?,[ ]?"entity"[ ]?:[ ]?"([^"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\["\\\\])*"[ ]?\\})(,[ ]?(\\{[ ]?"category"[ ]?:[ ]?("SpecificDisease"|"DiseaseClass"|"Modifier"|"CompositeMention")[ ]?,[ ]?"entity"[ ]?:[ ]?"([^"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\["\\\\])*"[ ]?\\})){0,})?[ ]?\\]'

Briefly test the regex

In [13]:
# test the regex
import re
test_str = """[
    {"category": "SpecificDisease", "entity": "COVID-19"},
    {"category": "DiseaseClass", "entity": "virus"},    {"category": "Modifier", "entity": "severe"},    
      {"category": "CompositeMention", "entity": "COVID-19 virus"}
]"""

test_str2 = """[{"category": "SpecificDisease", "entity": "COVID-19"},{"category": "DiseaseClass", "entity": "virus"}, { "category": "Modifier", "entity": "severe"}, {"category": "CompositeMention", "entity": "COVID-19 virus"}]"""
re.fullmatch(entity_regex, test_str), re.fullmatch(entity_regex, test_str2)

(None,
 <re.Match object; span=(0, 212), match='[{"category": "SpecificDisease", "entity": "COVID>)

Import model and tokenizer

In [15]:
model = AutoModelForCausalLM.from_pretrained("./trained_model")
tokenizer = AutoTokenizer.from_pretrained("./trained_model")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at ./trained_model were not used when initializing Phi3ForCausalLM: ['model.layers.0.mlp.down_proj.base_layer.weight', 'model.layers.0.mlp.down_proj.lora_A.default.weight', 'model.layers.0.mlp.down_proj.lora_B.default.weight', 'model.layers.0.mlp.gate_up_proj.base_layer.weight', 'model.layers.0.mlp.gate_up_proj.lora_A.default.weight', 'model.layers.0.mlp.gate_up_proj.lora_B.default.weight', 'model.layers.0.self_attn.o_proj.base_layer.weight', 'model.layers.0.self_attn.o_proj.lora_A.default.weight', 'model.layers.0.self_attn.o_proj.lora_B.default.weight', 'model.layers.0.self_attn.qkv_proj.base_layer.weight', 'model.layers.0.self_attn.qkv_proj.lora_A.default.weight', 'model.layers.0.self_attn.qkv_proj.lora_B.default.weight', 'model.layers.1.mlp.down_proj.base_layer.weight', 'model.layers.1.mlp.down_proj.lora_A.default.weight', 'model.layers.1.mlp.down_proj.lora_B.default.weight', 'model.layers.1.mlp.gate_up_proj.base_layer.weight', 'model.layers.1.ml

In [16]:
from peft import PeftModel

peft_model = PeftModel.from_pretrained(model, "checkpoint_dir/checkpoint-120")
peft_model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3SdpaAttention(
              (o_proj): lora.Linear(
                (base_layer): Linear(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_

In [47]:
test_sentence = '<|system|>\nPlease identify all the named entities mentioned in the input sentence provided below. Use only the categories: SpecificDisease, DiseaseClass, CompositeMention, and Modifier. Remember, some terms might refer to broader disease classes, while others are specific diseases or composite mentions involving multiple diseases. You should only output the results in JSON format, following a similar structure to the example result provided.\n\nExample sentence and results:\n"A common human skin tumour is caused by activating mutations in beta-catenin."\n\n"Results": [\n{ "category": "DiseaseClass", "entity": "skin tumour" }\n]\n<|end|>\n<|user|>\nSomatic-cell selection is a major determinant of the blood-cell phenotype in heterozygotes for glucose-6-phosphate dehydrogenase mutations causing severe enzyme deficiency.X-chromosome inactivation in mammals is regarded as an essentially random process, but the resulting somatic-cell mosaicism creates the opportunity for cell selection.<|end|>\n<|assistant|>\n'

In [58]:
from outlines import generate

test_mod = models.Transformers(model, tokenizer)
generator = generate.regex(test_mod, entity_arr_regex)
generator(test_sentence)

KeyboardInterrupt: 

In [60]:
import openai
from litellm import completion
from dotenv import load_dotenv
import os

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
litellm_url = "http://147.175.151.44/"
model = "gpt-4o-mini"
client = openai.OpenAI(api_key=api_key, base_url=litellm_url)

In [65]:
openai_model = models.openai(client, config=None)

In [69]:
openai_generator = generate.json(openai_model, json.dump(json_schema))
#openai_generator(test_sentence)

ValueError: Cannot parse schema {'$defs': {'Category': {'enum': ['SpecificDisease', 'DiseaseClass', 'Modifier', 'CompositeMention'], 'title': 'Category', 'type': 'string'}}, 'properties': {'category': {'$ref': '#/$defs/Category'}, 'entity': {'title': 'Entity', 'type': 'string'}}, 'required': ['category', 'entity'], 'title': 'Entity', 'type': 'object'}. The schema must be either a Pydantic object, a function or a string that contains the JSON Schema specification

### Apply outlines coalescence to the tokenizer

First, we need to add attribute vocabulary, which is required by outlines, as LlamaTokenizer's vocabulary is named vocab.

In [36]:
if not hasattr(tokenizer, "vocabulary"):
    @property
    def vocabulary(self):
        return self.vocab
    setattr(tokenizer, "vocabulary", vocabulary)

if not hasattr(tokenizer, "special_tokens") and hasattr(tokenizer, "special_tokens_map"):
    setattr(tokenizer, "special_tokens", tokenizer.special_tokens_map.values())
    
if not hasattr(tokenizer, "convert_token_to_string"):
    setattr(tokenizer, "convert_token_to_string", lambda x: tokenizer.convert_tokens_to_ids([x]))


In [37]:
from outlines_core.fsm.regex import make_deterministic_fsm, create_fsm_index_tokenizer
import interegular

parsed_pattern = interegular.patterns.parse_pattern(entity_arr_regex)
fsm = parsed_pattern.to_fsm()
new_fsm, _ = make_deterministic_fsm(fsm)
# doesn't work as LlamaTokenizer 
index, _ = create_fsm_index_tokenizer(new_fsm, tokenizer)

TypeError: 'int' object is not subscriptable

In [26]:
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<unk>',
 'pad_token': '<unk>'}

In [4]:
ol_model = models.Transformers(model, tokenizer)