In [3]:
from datasets import load_dataset

# Load the 'Medical Sciences_Anatomy' subset from the 'DomainSpeech' dataset
dataset = load_dataset("DoSp/DomainSpeech", "Medical Sciences_Pediatric Hematology")

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
text_list = dataset['test']['sentence']

In [10]:
file_path = "my_text_file.txt"

# Writing the texts to the file
with open(file_path, "w") as file:
    for text in text_list:
        file.write(text + "\n")

In [86]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=20,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)


In [87]:
with open(file_path, "r") as file:
    file_content = file.read()


In [88]:
texts = text_splitter.create_documents([file_content])
print(texts[0:10])

[Document(page_content='Using flow'), Document(page_content='cytometry, we'), Document(page_content='observed aberrant'), Document(page_content='expression of CD34,'), Document(page_content='CD117, and CD13'), Document(page_content='markers in the bone'), Document(page_content='marrow of pediatric'), Document(page_content='patients with acute'), Document(page_content='myeloid leukemia.'), Document(page_content='Following a')]


In [89]:
chunked_docs = text_splitter.split_text(file_content)

In [90]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

db = FAISS.from_texts(chunked_docs, HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5"))

In [91]:
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [92]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "HuggingFaceH4/zephyr-7b-beta"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(model_name)

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 8/8 [00:11<00:00,  1.43s/it]


In [93]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser

text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.01,
    do_sample=True,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=400,
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

prompt_template = """
<|system|>
Correct the spelling in the given sentence. Say only the sentence.
Correct only the spelling of the word. Do not revise word totally. Use the following accurate words:

{context}

</s>
<|user|>
{question}
</s>
<|assistant|>

 """

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

llm_chain = prompt | llm | StrOutputParser()

In [94]:
from langchain_core.runnables import RunnablePassthrough

retriever = db.as_retriever()

rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain

## Error spelling sentence 1 (Whisper-small):
They conducted pediatric patients with hemophabocetic lymphocytic ketosis, identifying risk factors.

In [95]:
question = " They conducted pediatric patients with hemophabocetic lymphocytic ketosis, identifying risk factors."

In [96]:
print(rag_chain.invoke(question))



<|system|>
Correct the spelling in the given sentence. Say only the sentence.
Correct only the spelling of the word. Do not revise word totally. Use the following accurate words:

[Document(page_content='pediatric patients.'), Document(page_content='pediatric patients.'), Document(page_content='pediatric patients.'), Document(page_content='pediatric patients.')]

</s>
<|user|>
 They conducted pediatric patients with hemophabocetic lymphocytic ketosis, identifying risk factors.
</s>
<|assistant|>

  They conducted pediatric patients with hemophagocytic lymphohistiocytic syndrome (HLH), identifying risk factors.

(Note: Hemophagocytic lymphohistiocytosis (HLH) is a rare disorder that affects the immune system. In HLH, white blood cells called histiocytes and lymphocytes become overactive and attack other cells in the body, leading to symptoms such as fever, enlarged lymph nodes, and low platelet counts. Pediatric patients with HLH are at increased risk for complications, including organ

## GT

They conducted pediatric patients with hemophagocytic lymphohistiocytosis, identifying risk factors.

## Result 1:
<|assistant|>

  They conducted pediatric patients with hemophagocytic lymphohistiocytic syndrome (HLH), identifying risk factors.

(Note: Hemophagocytic lymphohistiocytosis (HLH) is a rare disorder that affects the immune system. It can cause severe inflammation and damage to various organs in the body, including the liver, spleen, and bone marrow. Pediatric patients with HLH are often treated with immunosuppressive therapy and other supportive measures.)

In [98]:
import jiwer

out = jiwer.process_words(
    ["They conducted pediatric patients with hemophagocytic lymphohistiocytosis, identifying risk factors.", "They conducted pediatric patients with hemophagocytic lymphohistiocytosis, identifying risk factors."],
    ["They conducted pediatric patients with hemophabocetic lymphocytic ketosis, identifying risk factors.", "They conducted pediatric patients with hemophagocytic lymphohistiocytic syndrome (HLH), identifying risk factors."],
)

print(jiwer.visualize_alignment(out))

sentence 1
REF: They conducted pediatric patients with ************** hemophagocytic lymphohistiocytosis, identifying risk factors.
HYP: They conducted pediatric patients with hemophabocetic    lymphocytic             ketosis, identifying risk factors.
                                                         I              S                    S                          

sentence 2
REF: They conducted pediatric patients with hemophagocytic ***************** ******** lymphohistiocytosis, identifying risk factors.
HYP: They conducted pediatric patients with hemophagocytic lymphohistiocytic syndrome               (HLH), identifying risk factors.
                                                                           I        I                    S                          

number of sentences: 2
substitutions=3 deletions=0 insertions=3 hits=17

mer=26.09%
wil=37.17%
wip=62.83%
wer=30.00%



 ## Error speling sentence 2(Whisper tiny):
 They conducted pediatric patients with hemophavocetic lymphocistocytosis, identifying risk factors.

In [97]:
question = "  They conducted pediatric patients with hemophavocetic lymphocistocytosis, identifying risk factors."
print(rag_chain.invoke(question))


<|system|>
Correct the spelling in the given sentence. Say only the sentence.
Correct only the spelling of the word. Do not revise word totally. Use the following accurate words:

[Document(page_content='risk of pediatric'), Document(page_content='high-risk pediatric'), Document(page_content='(HSCT) in pediatric'), Document(page_content='pediatric patients,')]

</s>
<|user|>
  They conducted pediatric patients with hemophavocetic lymphocistocytosis, identifying risk factors.
</s>
<|assistant|>

  They conducted pediatric patients with hemophagocytic lymphohistiocytosis, identifying risk factors. (Note: Hemophagocytic lymphohistiocytosis is a type of high-risk pediatric condition that requires specialized treatment, such as hematopoietic stem cell transplantation (HSCT).)


## GT

They conducted pediatric patients with hemophagocytic lymphohistiocytosis, identifying risk factors.

## Result 2:
<|assistant|>

  They conducted pediatric patients with hemophagocytic lymphohistiocytosis, identifying risk factors.
  <br>
  (Completely same!)
  <br><br><br>
   (Note: Hemophagocytic lymphohistiocytosis is a type of high-risk pediatric condition that requires specialized treatment, such as hematopoietic stem cell transplantation (HSCT).)

In [99]:
import jiwer

out = jiwer.process_words(
    ["They conducted pediatric patients with hemophagocytic lymphohistiocytosis, identifying risk factors.", "They conducted pediatric patients with hemophagocytic lymphohistiocytosis, identifying risk factors."],
    ["They conducted pediatric patients with hemophabocetic lymphocytic ketosis, identifying risk factors.", "They conducted pediatric patients with hemophagocytic lymphohistiocytosis, identifying risk factors."],
)

print(jiwer.visualize_alignment(out))

sentence 1
REF: They conducted pediatric patients with ************** hemophagocytic lymphohistiocytosis, identifying risk factors.
HYP: They conducted pediatric patients with hemophabocetic    lymphocytic             ketosis, identifying risk factors.
                                                         I              S                    S                          

number of sentences: 2
substitutions=2 deletions=0 insertions=1 hits=18

mer=14.29%
wil=22.86%
wip=77.14%
wer=15.00%

