# 導入模組

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-reasoning")
model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-4-reasoning", device_map="auto", torch_dtype="auto")

messages = [
    {"role": "system", "content": "You are Phi, a language model trained by Microsoft to help users. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion. Now, try to solve the following question through the above guidelines:"},
    {"role": "user", "content": "什麼是ERAS？請問繁體中文解釋。"},
]
inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")

outputs = model.generate(
    inputs.to(model.device),
    max_new_tokens=4096,
    temperature=0.8,
    top_k=50,
    top_p=0.95,
    do_sample=True,
)
print(tokenizer.decode(outputs[0]))



In [1]:
from core.db_manager import DBManager
db_manager = DBManager()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
db_manager.connect_db

print(db_manager.collection_name)


None
sentence-transformers_all-MiniLM-L6-v2
[Document(metadata={'author': 'G. Nelson', 'creationdate': '2023-04-20T06:14:22+00:00', 'creator': 'Elsevier', 'crossmarkdomainexclusive': 'true', 'crossmarkdomains[1]': 'elsevier.com', 'crossmarkdomains[2]': 'sciencedirect.com', 'crossmarkmajorversiondate': '2010-04-23', 'doi': '10.1016/j.ygyno.2023.04.009', 'elsevierwebpdfspecifications': '7.0', 'keywords': 'Enhanced recovery after surgery; \t\t\tERAS; \t\t\tPerioperative care; \t\t\tSurgical quality; \t\t\tGynecologic oncology surgery', 'moddate': '2023-04-20T06:14:22+00:00', 'page': 1, 'page_label': '2', 'producer': 'PyPDF', 'robots': 'noindex', 'source': './PDFS/Enhanced recovery after surgery (ERAS®) society guidelines for gynecologic oncology Addressing implementation challenges -.pdf', 'source_file_name': 'Enhanced recovery after surgery (ERAS®) society guidelines for gynecologic oncology Addressing implementation challenges -.pdf', 'start_index': 0, 'subject': 'Gynecologic Oncology, 

In [3]:
db_manager.get_available_source_files()

['A-framework-for-perioperative-care-for-lower-extremity-vascular-bypasses-a-Consensus-Statement-by-the-Enhanced-Recovery-after-Surgery-ERAS-Society-and-Society-for-Vascular-Surgery.pdf',
 'British Journal of Surgery - 2014 - Mortensen - Consensus guidelines for enhanced recovery after gastrectomy.pdf',
 'Consensus Guidelines for Perioperative Care for Emergency Laparotomy Enhanced Recovery After Surgery (ERAS®) Society Recommendations Part 2—Emergency Laparotomy Intra- and Postoperative Care.pdf',
 'Consensus Guidelines for Perioperative Care in Neonatal Intestinal Surgery Enhanced Recovery After Surgery (ERAS®) Society Recommendations.pdf',
 'Consensus Review of Optimal Perioperative Care in Breast Reconstruction Enhanced Recovery after Surgery (ERAS) Society Recommendations.pdf',
 'Consensus statement for perioperative care in lumbar spinal fusion Enhanced Recovery After Surgery (ERAS®) Society recommendations.pdf',
 'Consensus statement for perioperative care in total hip replaceme

In [4]:

print(db_manager.search("什麼是ERAS？請問繁體中文解釋。", source_files=['A-framework-for-perioperative-care-for-lower-extremity-vascular-bypasses-a-Consensus-Statement-by-the-Enhanced-Recovery-after-Surgery-ERAS-Society-and-Society-for-Vascular-Surgery.pdf',
 'British Journal of Surgery - 2014 - Mortensen - Consensus guidelines for enhanced recovery after gastrectomy.pdf',
 'Consensus Guidelines for Perioperative Care for Emergency Laparotomy Enhanced Recovery After Surgery (ERAS®) Society Recommendations Part 2—Emergency Laparotomy Intra- and Postoperative Care.pdf',
 'Consensus Guidelines for Perioperative Care in Neonatal Intestinal Surgery Enhanced Recovery After Surgery (ERAS®) Society Recommendations.pdf',
 'Consensus Review of Optimal Perioperative Care in Breast Reconstruction Enhanced Recovery after Surgery (ERAS) Society Recommendations.pdf',
 'Consensus statement for perioperative care in lumbar spinal fusion Enhanced Recovery After Surgery (ERAS®) Society recommendations.pdf',
 'Consensus statement for perioperative care in total hip replacement and totalknee replacement surgery Enhanced Recovery After Surgery (ERAS®) Societyrecommendations.pdf',
 'Enhanced Recovery After Surgery (ERAS) for gastrointestinal surgery, part 2 consensus statement for anaesthesia practice.pdf'] ))

[Document(metadata={'creationdate': '2017-04-05T08:22:17+05:30', 'creator': 'Adobe InDesign CS5.5 (7.5.3)', 'keywords': 'LWW', 'moddate': "D:20250428092157Z00'00'", 'page': 8, 'page_label': 'e1064', 'producer': 'PD4ML 4.0.18', 'source': './PDFS/Consensus Review of Optimal Perioperative Care in Breast Reconstruction Enhanced Recovery after Surgery (ERAS) Society Recommendations.pdf', 'source_file_name': 'Consensus Review of Optimal Perioperative Care in Breast Reconstruction Enhanced Recovery after Surgery (ERAS) Society Recommendations.pdf', 'start_index': 5414, 'total_pages': 16, 'trapped': '/False', 'distance': 0.8181858062744141}, page_content='access and preferences? Can J Plast Surg. 2012;20:37–42.\nDownloaded from http://journals.lww.com/plasreconsurg by BhDMf5ePHKav1zEoum1tQfN4a+kJLhEZgbsIHo4XMi0\nhCywCX1AWnYQp/IlQrHD3i3D0OdRyi7TvSFl4Cf3VC1y0abggQZXdgGj2MwlZLeI= on 04/28/2025'), Document(metadata={'author': 'Bertrand Debono MD', 'authoritativedomain[1]': 'sciencedirect.com', 'au

In [5]:
db_manager.rebuild_db()

True

# Model selection
model_name = "microsoft/Phi-4-reasoning"
model_name = "Qwen/Qwen3-30B-A3B"
model_name = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"
model_name = "Qwen/Qwen2.5-14B-Instruct-1M"

In [1]:
from core.model_manager import ModelManager
manager = ModelManager(model_name = "microsoft/Phi-4-reasoning", inactivity_timeout=60)
print(manager.model_name)


  from .autonotebook import tqdm as notebook_tqdm


microsoft/Phi-4-reasoning


In [2]:
manager.initialize()
print(manager.get_status())

Loading checkpoint shards: 100%|██████████| 6/6 [00:16<00:00,  2.69s/it]


{'model_name': 'microsoft/Phi-4-reasoning', 'initialized': True, 'current_device': 'cuda', 'device_details': {'type': 'cuda', 'index': 0}, 'last_used': 1747463457.6273034, 'gpu_available': True, 'gpu_memory': {'allocated': 8.48, 'reserved': 11.29, 'total': 31.37, 'unit': 'GB'}, 'max_context_length': 32768, 'monitor_thread_alive': True, 'load_in_4bit_setting': True, 'inactivity_timeout_setting': 60, 'monitor_check_interval_setting': 60}


In [3]:
model = manager.get_model()
tokenizer = manager.tokenizer

messages = [
    {"role": "system", "content": "You are Phi, a language model trained by Microsoft to help users. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion. Now, try to solve the following question through the above guidelines:"},
    {"role": "user", "content": "什麼是ERAS？請問繁體中文解釋。"},
]
inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")

outputs = model.generate(
    inputs.to(model.device),
    max_new_tokens=4096,
    temperature=0.8,
    top_k=50,
    top_p=0.95,
    do_sample=True,
)
print(tokenizer.decode(outputs[0]))

<|im_start|>system<|im_sep|>You are Phi, a language model trained by Microsoft to help users. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format:<think>{Thought section}</think>{Solution section}. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, 

In [3]:
simple_prompt = "ERAS是什麼的縮寫？請用繁體中文回答。"
response = manager.generate_response(
    prompt=simple_prompt,
    max_new_tokens=4000,
    temperature=0.7,
    do_sample=True
)
print(f"模型直接回應: {response}")

模型直接回應: 


In [4]:
simple_prompt = "USER:ERAS是什麼的縮寫？請用繁體中文回答？根據ERAS指引，腰椎手術術前需要做怎麼樣的復健訓練? ASSISTANT:"
print("\n模型流式回應:")
for chunk in manager.generate_stream_response(
    prompt=simple_prompt,
    max_new_tokens=4000,
    temperature=0.1,
    do_sample=True
):
    print(chunk, end="", flush=True)
print()


模型流式回應:



In [11]:
print(manager.get_status())

{'model_name': 'Qwen/Qwen2.5-14B-Instruct-1M', 'initialized': True, 'current_device': 'cuda', 'device_details': {'type': 'cuda', 'index': 0}, 'last_used': 1747449971.331978, 'gpu_available': True, 'gpu_memory': {'allocated': 9.33, 'reserved': 9.61, 'total': 31.37, 'unit': 'GB'}, 'max_context_length': 1010000, 'monitor_thread_alive': True, 'load_in_4bit_setting': True, 'inactivity_timeout_setting': 60, 'monitor_check_interval_setting': 60}


In [13]:
manager.shutdown()

In [5]:
manager.model = None
manager.shutdown()