In [None]:
! git clone https://github.com/NCHU-NLP-Lab/demeterchain.git
! pip install demeterchain

In [None]:
cd demeterchain/examples

# 讀取文件

In [1]:
# Load every *.txt file under wiki_datas/ into a list of Document objects.
# The path is relative to the working directory set by the earlier
# `cd demeterchain/examples` cell.
from demeterchain.loaders import TextLoader
# NOTE(review): TextSplitter is imported but not used in any visible cell — confirm whether it is still needed.
from demeterchain.splitters import TextSplitter

# show_progress=True presumably enables the progress bar seen in this cell's output.
loader = TextLoader('wiki_datas/', glob='*.txt', show_progress=True)
documents = loader.load()

100%|██████████| 12/12 [00:00<00:00, 17741.15it/s]


In [2]:
# Display the loaded documents; each entry's repr shows its source file in `metadata`.
documents

[Document(page_content='蘋果樹（學名：Malus domestica）是薔薇科蘋果亞科蘋果屬植物，為落葉喬木，在世界上廣泛種...', metadata={'source': 'wiki_datas/蘋果.txt'}),
 Document(page_content='智慧型手機（英語：smartphone）是一種既可用來撥打移動電話具有多功能任務處理移動計算的行動裝...', metadata={'source': 'wiki_datas/智慧型手機.txt'}),
 Document(page_content='香蕉（學名：Musa × paradisiaca），又名甘蕉、芎蕉、芽蕉，弓蕉（閩南語：king-t...', metadata={'source': 'wiki_datas/香蕉.txt'}),
 Document(page_content='太陽餅，是一種甜餡薄餅，一般內餡是麥芽糖，源起於台中市神岡區社口一帶林家崑派的麥芽餅，是台灣台中市的...', metadata={'source': 'wiki_datas/太陽餅.txt'}),
 Document(page_content='草莓（學名：Fragaria × ananassa），中國大陸和台灣稱草莓，在香港常稱為士多啤梨（英...', metadata={'source': 'wiki_datas/草莓.txt'}),
 Document(page_content='冰箱[註 1]，是以低溫保存食物等物品的機械設備。工業用冰箱適用於工業環境，如餐廳、食品加工和超市。...', metadata={'source': 'wiki_datas/冰箱.txt'}),
 Document(page_content='筆記型電腦（英語：notebook computer）又稱膝上電腦（laptop computer）...', metadata={'source': 'wiki_datas/筆記型電腦.txt'}),
 Document(page_content='稻（學名：Oryza sativa），古稱禾稻[1]，其果實為可作為糧食食用部位，稱米，是禾本科稻屬...', metadata={'source': 'wiki_datas/水稻.txt'}),
 Document

# 設定retriever

In [22]:
pip install rank_bm25 jieba

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Build a BM25 retriever over the documents loaded above.
# NOTE(review): the captured output shows jieba initializing during this call, so
# tokenization presumably relies on jieba — the rank_bm25/jieba install cell must have run first.
from demeterchain.retrievers import RankBM25Retriever

retriever = RankBM25Retriever.from_documents(documents)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.358 seconds.
Prefix dict has been built successfully.


In [4]:
# Smoke test: ask (in Chinese) "which fruit is native to South America?" and request k=3 results.
retriever.invoke("甚麼水果產於南美洲", k=3)

[Document(page_content='鳳梨（Ananas comosus），俗名菠蘿、露兜子[1]，是原產於南美洲的熱帶水果，為禾本目鳳梨...', metadata={'source': 'wiki_datas/鳳梨.txt'}),
 Document(page_content='草莓（學名：Fragaria × ananassa），中國大陸和台灣稱草莓，在香港常稱為士多啤梨（英...', metadata={'source': 'wiki_datas/草莓.txt'}),
 Document(page_content='蘋果樹（學名：Malus domestica）是薔薇科蘋果亞科蘋果屬植物，為落葉喬木，在世界上廣泛種...', metadata={'source': 'wiki_datas/蘋果.txt'})]

In [4]:
# Save the retriever to "bm25" so it can be reloaded without rebuilding.
retriever.save("bm25")

In [4]:
# Load the previously saved retriever back from "bm25".
# The import is repeated here, presumably so this cell can be run on its own
# in a fresh kernel instead of rebuilding the index — TODO confirm intent.
from demeterchain.retrievers import RankBM25Retriever

retriever = RankBM25Retriever.load("bm25")

# 設定回答模型

In [None]:
# Authenticate with the Hugging Face Hub; the model configured in the next cell
# is marked as requiring an access request.
! huggingface-cli login

In [1]:
model_name_or_path = "NchuNLP/taide-qa" # access to this model must be requested first

In [6]:
from demeterchain.models import GenerativeModel
from demeterchain.utils import QAModelConfig, PromptTemplate


# Configuration for the reader (answer-generating) model, followed by model construction.
model_config = QAModelConfig(
    model_name_or_path = model_name_or_path,
    device_map = "auto",
    quantize = "bitsandbytes-nf4",
    # NOTE(review): this string ends with a full-width period, while the prompt below
    # tells the model to reply "無法回答" without one — confirm the difference is intended.
    noanswer_str = "無法回答。",
    template = PromptTemplate(
        input_variables = ["doc", "query"],
        # Prompt wrapped in [INST] / <<SYS>> markers. The Chinese system text instructs the
        # model to answer the question from the given passage, output only the answer, and
        # reply "無法回答" ("cannot answer") when the passage does not contain it.
        # {query} and {doc} are the placeholders listed in input_variables.
        template="[INST] <<SYS>>\n請根據提供的問題，從提供的內文中尋找答案並回答，回答時只需要輸出答案，不需輸出其他資訊，如果從提供的內文無法找到答案，請回答\"無法回答\"\n<</SYS>>\n\n問題:\n{query}\n\n內文:\n{doc}\n [/INST]答案:\n"
     )
)
reader = GenerativeModel(config=model_config)

[2024-05-14 07:10:15,567] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [7]:
# Smoke test: run the reader on four hand-written passages for one query.
from demeterchain.utils import Document

inputs = {"query": "什麼水果是紅色的?"}
# Raw passage texts are kept under their own name so `docs` only ever holds Document objects.
passages = (
    "蘋果是紅色的水果",
    "香蕉是黃色的水果",
    "我是小明，今天天氣很好",
    "草莓不是黑的，是紅色的水果",
)
docs = [Document(page_content=text) for text in passages]
answer_doc = reader.get_answer(inputs, docs)

print(answer_doc)

100%|██████████| 4/4 [00:02<00:00,  1.36it/s]

{'蘋果': Document(page_content='蘋果是紅色的水果', metadata={}), '草莓': Document(page_content='草莓不是黑的，是紅色的水果', metadata={})}





# 串接全部套件

In [8]:
from demeterchain.chains import RetrievalQA
from demeterchain.utils import QAConfig

# Per-query settings: retrieve_k = 3 is passed to every qa(...) call below.
qa_config = QAConfig(retrieve_k=3)

# Chain the retriever and the reader into a single question-answering pipeline.
qa = RetrievalQA(
    reader=reader,
    retriever=retriever,
)

In [9]:
# Question the corpus does not cover ("Are dinosaurs tasty?"); the recorded
# output for this cell is a fallback "cannot answer" message.
result = qa({"query": "恐龍好吃嗎?"}, qa_config=qa_config)
answers = [item.answer for item in result.answers]
print(answers)

100%|██████████| 3/3 [00:03<00:00,  1.18s/it]

['很抱歉，模型無法根據現有資料集回答您的問題。']





In [10]:
# Question: "Which university is strong in agricultural science?"
result = qa({"query": "哪間大學的農業科學很厲害?"}, qa_config=qa_config)
answers = [item.answer for item in result.answers]
print(answers)

100%|██████████| 3/3 [00:01<00:00,  1.71it/s]

['國立中興大學']





In [11]:
# Question: "What is DPP an abbreviation of?"
result = qa({"query": "DPP是甚麼的簡寫?"}, qa_config=qa_config)
answers = [item.answer for item in result.answers]
print(answers)

100%|██████████| 3/3 [00:02<00:00,  1.29it/s]

['民主進步黨']



