In [13]:
from datasets import load_dataset
import tqdm

# Load the 'train' split of the NTTUNLPTEAM/class-textbook dataset.
dataset = load_dataset(
    "NTTUNLPTEAM/class-textbook",
    split='train',
)
def format_func(data):
    """Render one record as a single labelled string.

    Args:
        data: Mapping-like object indexable by the keys ``text``,
            ``metadata``, ``type`` and ``summary``.

    Returns:
        A string of the form
        ``"text: ..., metadata: ..., type: ..., summary: ..."`` with each
        value interpolated through f-string formatting.

    Raises:
        KeyError: If ``data`` is a dict and one of the four keys is missing.
    """
    return f"text: {data['text']}, metadata: {data['metadata']}, type: {data['type']}, summary: {data['summary']}"

In [14]:
from dotenv import load_dotenv
load_dotenv()

from langchain_community.vectorstores.faiss import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [19]:
# Embedding backend served by Ollama; the model name is kept in one place.
EMBEDDING_MODEL = 'all-minilm'
embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)

# Smoke test: embed a short string and show the first five vector components.
text = 'hello world'
query_result = embeddings.embed_query(text)
query_result[:5]

[-0.028497308492660522,
 0.08764791488647461,
 -0.02262119948863983,
 -0.07729446142911911,
 -0.09653183072805405]

In [26]:
# Smoke-test index construction. `FAISS.from_texts` expects a list of strings;
# a bare string would be iterated character by character ('t', 'e', 's', 't'),
# so the sample text is wrapped in a list.
vectorstore = FAISS.from_texts(['test'], embeddings)


In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# `format_func` formats a single record, so apply it row by row. Passing the
# whole dataset would interpolate the repr of entire columns (with escaped
# newlines), which the splitter's newline separators cannot match.
splits = [
    chunk
    for record in dataset
    for chunk in text_splitter.split_text(format_func(record))
]

# TODO: skip the rebuild when a saved index already exists on disk.
# Reuse the shared `embeddings` object so the index is built with the same
# embedding model it is loaded and queried with below.
vectorstore = FAISS.from_texts(texts=splits, embedding=embeddings)
vectorstore.save_local("faiss-index")

# NOTE(security): loading a saved FAISS store unpickles data from disk, hence
# the explicit opt-in flag. Only do this for index files written by trusted
# code (here: the `save_local` call directly above).
newvector = FAISS.load_local("faiss-index", embeddings, allow_dangerous_deserialization=True)

# Retriever over the reloaded index; supplies the context for the RAG chain.
retriever = newvector.as_retriever()

In [None]:
# from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

# llm = HuggingFacePipeline.from_model_id(
#     model_id="gpt2",
#     task="text-generation",
#     pipeline_kwargs={"max_new_tokens": 100}
# )

# from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# model_id = "gpt2"
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(model_id)
# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=10)
# hf = HuggingFacePipeline(pipeline=pipe)


In [None]:
from langchain_community.llms.ollama import Ollama

# Ollama LLM client created with the library's default settings (no model specified).
llm = Ollama()
# Connectivity check: send a trivial prompt and display the reply.
llm.invoke("test")

'Hello! How can I help you today?'

In [None]:
from langchain import hub
# Fetch the "rlm/rag-prompt" template from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

# Render the template with placeholder values to inspect what the model will see.
example_messages = prompt.invoke(
    {"context": "filler context", "question": "filler question"}
).to_messages()

# Only the last expression of a cell is displayed, so the former bare
# `example_messages` line had no effect; print the rendered content instead.
print(example_messages[0].content)

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: filler question 
Context: filler context 
Answer:


In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        One string with the contents joined by ``"\\n\\n"``; empty for no documents.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


# Inputs for the prompt: the question is piped through the retriever and
# `format_docs` to build "context", and passed through unchanged as "question".
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Prompt -> LLM -> plain-string output.
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

In [None]:
import translators as ts

# Run the RAG chain on the question and keep the English answer in `ans`.
ans = rag_chain.invoke("give me a summary of the text and generate ten questions")

# Display the answer translated from English to Traditional Chinese via Google.
ts.translate_text(
    query_text=ans,
    translator='google',
    from_language='en',
    to_language='zh-TW',
)

'以下是基於提供的上下文的十個問題： \n\n 1.“醫院感染”一詞是什麼意思是什麼？醫院感染的兩種一般類型是什麼？ \n 2.命名病原體進入人體的五個潛在入口。 \n 3.傳染病和傳染性疾病有什麼區別？ \n 4.解釋巴氏滅菌和滅菌之間的差異。 \n 5.描述病毒的結構及其如何引起疾病。 \n 6.命名人體病原體的五個潛在出口門戶。 \n 7.什麼是酵母和黴菌，它們彼此之間有何不同？ \n 8.描述表面和全身mycoses之間的差異。 \n 9.什麼是緩衝系統，如何限制pH的巨大變化？ \n 10.命名一些由向量傳播的疾病，並為每個疾病命名矢量。'

In [None]:
# print() renders the newlines in the answer; a bare `ans` would display the escaped repr.
print(ans)

Here are ten questions based on the provided context:

1. What does the term "nosocomial infection" refer to, and what are the two general types of nosocomial infections?
2. Name five potential portals of entry for pathogens into the human body.
3. What is the difference between a communicable disease and a contagious disease?
4. Explain the difference between pasteurization and sterilization.
5. Describe the structure of a virus and how it causes disease.
6. Name five potential portals of exit for pathogens from the human body.
7. What are yeasts and molds, and how do they differ from each other?
8. Describe the differences between superficial and systemic mycoses.
9. What is a buffer system, and how does it limit great changes in pH?
10. Name some diseases that are spread by vectors, and name the vector for each.


In [None]:
# Ask a single follow-up question through the RAG chain.
ans = rag_chain.invoke("Name five potential portals of entry for pathogens into the human body.")

# Print the answer translated from English to Traditional Chinese.
print(
    ts.translate_text(
        query_text=ans,
        translator='google',
        from_language='en',
        to_language='zh-TW',
    )
)

病原體進入人體的五個潛在入口是： 

 1.鼻腔：病原體可以通過鼻子進入身體，尤其是通過鼻腔中的粘膜進入身體。 
 2.口：病原體可以通過攝入污染的食物或水或直接與皮膚接觸來通過口腔進入身體。 
 3.皮膚：病原體可以通過皮膚斷裂進入身體，例如切割，刮傷或傷口。 
 4.眼睛：病原體可以通過眼睛進入身體，尤其是通過暴露於污染的水或觸摸受污染的表面然後觸摸臉部的情況。 
 5.泌尿生殖道：病原體可以在性接觸期間通過泌尿生殖道進入身體，或通過暴露於污染的水或土壤中。
