## Prepare the model

In [None]:
from transformers import AutoTokenizer
import torch
from vllm import LLM,SamplingParams

# Checkpoint directory shared by the tokenizer and the vLLM engine.
model_path = './DeepSeek-R1-Distill-Llama-8B-SFT'

# Load the tokenizer and reuse its end-of-sequence id as the padding id.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token_id = tokenizer.eos_token_id

# vLLM engine used for generation in the inference cell.
model = LLM(model_path)


## Prepare the input

In [None]:
# System instruction for the model. The pieces below are joined by implicit
# string concatenation into one single-line string (no embedded newlines).
prompt = (
    "Act as a clinical coding assistant. Based on the patient's medical records, "
    "output ONLY the top 1-3 most probable ICD codes from the provided disease list. "
    'Strictly format as valid JSON with a "diagnoses" array containing codes ordered by '
    "descending probability. Use only codes from the predefined 50-disease ICD list. "
    'Omit all explanatory text. Example output format: {"diagnoses": ["code1", "code2", ...]}'
)

# Case text that is sent as the user message. The encoding is pinned so the
# file decodes the same way regardless of the platform's locale default
# (assumes the case file is stored as UTF-8).
content_path = './case_0.txt'
with open(content_path, 'r', encoding='utf-8') as f:
    input_content = f.read()

# Two-message conversation: the instructions go in as the system message and
# the case text as the user message.
messages = [
    {"role": "system", "content": prompt},
    {"role": "user", "content": input_content},
]

# Render the conversation through the tokenizer's chat template as text
# (tokenize=False), with the generation prompt appended.
input_temp = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Sampling configuration for generation: at most 200 new tokens, and the stop
# string is kept in the returned output.
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.95,
    max_tokens=200,
    include_stop_str_in_output=True,
    # Stop on the tokenizer's EOS id and on the explicit end-of-sentence token.
    stop_token_ids=[
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<｜end▁of▁sentence｜>"),
    ],
)

## Run inference

In [None]:
# Echo the chat-templated prompt (`input_temp`, built in the input cell), run
# generation with the configured sampling parameters, and print the result
# object returned by the engine.
print(f'Input text: {input_temp}')
output_texts = model.generate(input_temp, sampling_params)
print(f'Output texts: {output_texts}')