In [None]:
import re
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma


In [None]:
# Load the work-rules PDF and flatten it into a single cleaned string.
loader = PyPDFLoader('Yongshun.pdf')
documents = loader.load()
documents = documents[1:]  # drop the first loaded document (presumably the cover page -- TODO confirm)
# Drop the first character of every page, then join the pages with newlines.
documents_texts = '\n'.join([doc.page_content[1:] for doc in documents])
# Tag every article marker ("第…條") with '@' so it survives the whitespace removal below.
documents_texts = re.sub(r'(第[一二三四五六七八九十零百千万]+條)', r'@\1', documents_texts)
# Remove chapter heading lines ("第…章" up to the end of the line).
documents_texts = re.sub(r'(第[一二三四五六七八九十]+章.*?)(?=\n)', '', documents_texts)
documents_texts = documents_texts.replace('\n', '').replace(' ', '')

# Strip the repeated page header, which ends with the page number (1..15).
# Iterate from the highest page number down: the header for page 1 ("...C151") is a
# prefix of the headers for pages 10-15 ("...C1510" .. "...C1515"), so removing it
# first would leave a stray digit behind for every one of those pages.
for i in range(15, 0, -1):
    text = "永純化學工業股份有限公司工作規則桃園市政府113年1月9日府勞條字第1120369127號函核備文件名稱文件編號版次總頁頁次工作規則YS-A-01C15" + str(i)
    text2 = "永純化學工業股份有限公司工作規則文件名稱文件編號版次總頁頁次工作規則YS-A-01C15" + str(i)
    documents_texts = documents_texts.replace(text, "").replace(text2, "")
# Turn each '@' tag into a space, so a space now precedes every article marker
# (presumably so the splitter can break chunks at article boundaries -- verify).
documents_texts = documents_texts.replace('@', ' ')

text_splitter = RecursiveCharacterTextSplitter(chunk_size=570, chunk_overlap=300)
texts = text_splitter.split_text(documents_texts)
print(documents_texts)

In [None]:
# Embedding model handed to the vector store to vectorize the text chunks.
embeddings = HuggingFaceEmbeddings(
    model_name="paraphrase-multilingual-MiniLM-L12-v2",
)

In [15]:
# Rebuild the vector store from scratch on every run.
# Checking for a live `db2` variable only covers re-running this cell in the same
# kernel; after a restart the collection persisted under "db2" would still exist and
# from_texts() would append to it, duplicating every chunk. Opening the persisted
# store and deleting its collection covers both cases.
Chroma(persist_directory="db2", embedding_function=embeddings).delete_collection()
db2 = Chroma.from_texts(
    texts,
    embeddings,
    persist_directory="db2",
    collection_metadata={"hnsw:space": "cosine"},
)

In [None]:
import torch
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Tokenizer and model weights are loaded from the same Hugging Face checkpoint.
tokenizer = AutoTokenizer.from_pretrained("MediaTek-Research/Breeze-7B-Instruct-v0_1")
model = AutoModelForCausalLM.from_pretrained(
    "MediaTek-Research/Breeze-7B-Instruct-v0_1",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",  # optional
)

# Wrap the model in a text-generation pipeline (at most 500 new tokens per call)
# and expose it to LangChain as `llm`.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,
)
llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
# Build the generation pipeline with a 600-token cap on newly generated text;
# rebinding `pipe` and `llm` replaces whatever those names referred to before.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=600,
)
llm = HuggingFacePipeline(pipeline=pipe)

In [16]:
# Retriever over the Chroma store; search_kwargs requests k=10 results per query.
retriever = db2.as_retriever(search_kwargs={"k": 10})

In [None]:
# Sanity-check retrieval for one sample question: for every returned chunk, list the
# article markers found in it, then print the chunk text and a divider.
hits = retriever.invoke("員工報到時需要繳交哪些文件？")
for hit in hits:
    chunk_text = hit.page_content
    print(re.findall(r'第.{1,5}條(?:之.)*', chunk_text))
    print(chunk_text)
    print('=' * 30)

In [None]:
import pandas as pd
from langchain_core.prompts import PromptTemplate
# Prompt text (Traditional Chinese) for the question-answering chain. It tells the model
# to answer only from the supplied article, to reply "沒有提及" when the article does not
# contain the answer, and to answer in detail. The {context} and {question} placeholders
# are the template's two input variables.
template = """你是問答任務的助手。注意:請根據以下文章來回答問題。注意:如果你在文章中找不到答案，請回答"沒有提及"。答案只能從文章內尋找，請勿根據其他資訊來回答。

文章:{context}

問題:{question}

請詳細回答並說明。

回答:"""
# Build the PromptTemplate object from the format string above.
custom_rag_prompt = PromptTemplate.from_template(template)


from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs):
    """Concatenate the ``page_content`` of every document, in order, with no separator.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string made of all page contents joined back to back.
    """
    pieces = [document.page_content for document in docs]
    return "".join(pieces)

# RAG pipeline: the incoming question is sent to the retriever (whose results are
# flattened by format_docs into the prompt's "context") and also passed through
# unchanged as "question"; the filled prompt goes to the LLM and the output is
# parsed to a plain string.
context_chain = retriever | format_docs
prompt_inputs = {"context": context_chain, "question": RunnablePassthrough()}
rag_chain = prompt_inputs | custom_rag_prompt | llm | StrOutputParser()

# Run the chain over the spreadsheet of reference questions and print the model's
# answer next to the reference answer for each one.
df = pd.read_excel("Questions_Answers.xlsx")
question = df['Query']
answer = df['標準答案_永純化學']
# One ad-hoc query before the batch run.
print("模型答案", rag_chain.invoke("假期總類有哪些?"))
for (ques, ans) in zip(question, answer):
    # Empty spreadsheet cells are read by pandas as NaN, which never compares equal
    # to None; pd.isna() catches both NaN and None. The check runs before anything is
    # printed or sent to the chain, so processing stops at the first blank question.
    if pd.isna(ques):
        break
    print(ques)
    print("模型答案:", rag_chain.invoke(ques))
    print()
    print("標準答案:", ans)
    print("=" * 50)