# RAG with LLAMA and FAISS
credit: https://www.matillion.com/blog/step-by-step-guide-building-a-rag-model-with-open-source-llm-llama-2-and-vector-store-faiss

## install packages

## load pdf

In [None]:
#!rm "pdf_files/1.pdf"

In [3]:

import glob
md_folder="md_files"
md_path = glob.glob(md_folder + "/*.md")
print(md_path)

['md_files\\1_1733975053.267784.md', 'md_files\\2_1733975093.5062084.md', 'md_files\\3_1733975145.179506.md', 'md_files\\Adv Funct Materials - 2023 - Yu - Chalcogenide Perovskite Thin Films with Controlled Phases for Optoelectronics_1733974706.4877486.md']


In [11]:
import os
import re
import markdown
#from pdfminer.high_level import extract_text as extract_text_from_pdf
from io import StringIO
from html.parser import HTMLParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
chunk_size = 1200
chunk_overlap = 300

# List to store all document chunks
all_docs = []
class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.text = StringIO()

    def handle_data(self, d):
        self.text.write(d)

    def get_data(self):
        return self.text.getvalue()

def clean_markdown(text):
    """Clean Markdown syntax from text."""
    # Remove Markdown URL links
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    # Remove bold and italic text markers
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
    text = re.sub(r'\*([^*]+)\*', r'\1', text)
    text = re.sub(r'__([^_]+)__', r'\1', text)
    text = re.sub(r'_([^_]+)_', r'\1', text)
    # Remove images and their references
    text = re.sub(r'!\[[^\]]]*]\([^)]*\)', '', text)
    # Remove headers markers
    text = re.sub(r'#+\s?', '', text)
    # Remove other Markdown syntax as needed (e.g., tables, bullet points)
    text = re.sub(r'\|', ' ', text)
    text = re.sub(r'-{2,}', '', text)
    text = re.sub(r'\n{2,}', '\n', text)  # Remove extra newlines
    return text

def extract_text_from_md(md_path):
    """Extract and clean text from a Markdown file."""
    with open(md_path, "r", encoding="utf-8") as file:
        md_content = file.read()
        html = markdown.markdown(md_content)
        text = strip_tags(html)
        return clean_markdown(text)
def strip_tags(html):
    """Remove HTML tags from a string."""
    s = MLStripper()
    s.feed(html)
    return s.get_data()


for file_path in md_path:
    file_content = extract_text_from_md(file_path)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_text(file_content)
    file_name_without_extension = os.path.splitext(filename)[0]
    for i, chunk in enumerate(docs):
        # Define metadata for each chunk (you can customize this)
        metadata = {
            "File Name": file_name_without_extension,
            "Chunk Number": i + 1,
        }
        # Create a header with metadata and file name
        header = f"File Name: {file_name_without_extension}\n"
        for key, value in metadata.items():
            header += f"{key}: {value}\n"
        
        # Combine header, file name, and chunk content
        chunk_with_header = header + file_name_without_extension + "\n" + chunk
        all_docs.append(chunk_with_header)
    print(f"Processed: {filename}")

Processed: md_files\1_1733975053.267784.md
Processed: md_files\1_1733975053.267784.md
Processed: md_files\1_1733975053.267784.md
Processed: md_files\1_1733975053.267784.md


## create FAISS vector store

In [12]:
from langchain.vectorstores import FAISS
#from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaEmbeddings

# Initialize HuggingFaceInstructEmbeddings
'''
embeding_name = "/kaggle/input/mistral/pytorch/7b-v0.1-hf/1"#"/kaggle/input/gemma-2/pytorch/gemma-2-2b-pt/1"#"hkunlp/instructor-large"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': False}
hf_embedding = HuggingFaceEmbeddings(
    model_name=embeding_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)
'''
# use ollama embedding
local_llm = "llama3.2:3b-instruct-fp16"
embedding=OllamaEmbeddings(model=local_llm)
# Embed and index all the documents using FAISS
db = FAISS.from_texts(all_docs, embedding)

# Save the indexed data locally
db.save_local("faiss_AiDoc")

In [None]:
# Load the FAISS index from local storage
#db = FAISS.load_local("faiss_AiDoc", embeddings=hf_embedding)

## query

In [31]:
from langchain.llms import LlamaCpp
import transformers
from langchain import PromptTemplate, LLMChain
from langchain_huggingface import HuggingFacePipeline
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import (
    StreamingStdOutCallbackHandler
)
from langchain_ollama import ChatOllama

# Template for question-answer prompt
template = """Question: {question} \n\nAnswer:"""
# Initialize prompt template and callback manager
prompt = PromptTemplate(template=template, input_variables=["question"])
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])






llm = ChatOllama(model=local_llm, temperature=0)
# Create LLMChain
#llm_chain = LLMChain(prompt=prompt, llm=llm)
llm_chain = prompt | llm








# Define a query to search the indexed documents
query = "what is advantage of BaZrS3?"
# Search for semantically similar chunks and return the top 5 chunks
search = db.similarity_search(query, k=2)

# Define a template for generating a final prompt
template = '''Context: {context}
Based on the Context, please answer the following question:
Question: {question}
Provide an answer based on the context only, without using general knowledge. The answer
should be derived directly from the context provided.
Please correct any grammatical errors for improved readability.
If the context does not contain relevant information to answer the question, state that the
answer is not available in the given context.
Please include the source title of the information as a reference of how you arrive at your
answer. '''

# Create a prompt template
prompt = PromptTemplate(input_variables=["context", "question"], template=template)

# Format the final prompt with the query and search results
final_prompt = prompt.format(question=query, context=search)

# Run LLMChain to generate an answer based on the context
generation=llm_chain.invoke(final_prompt)

In [29]:
generation.content

'Based on the provided context, one advantage of BaZrS3 can be inferred:\n\nBaZrS3 has reasonably long carrier lifetimes, reaching approximately 50 ns as measured by time-resolved PL.\n\nThis suggests that BaZrS3 has a favorable property related to its carrier lifetime, which is beneficial for certain applications.'

# gradio interface

In [1]:
import gradio as gr
from langchain.vectorstores import FAISS
#from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaEmbeddings,ChatOllama
from langchain import PromptTemplate
from langchain_core.messages import HumanMessage
from langchain import PromptTemplate
from langchain_huggingface import HuggingFacePipeline
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import (
    StreamingStdOutCallbackHandler
)
from langchain_ollama import ChatOllama

import glob

import glob
md_folder="md_files"
md_path = glob.glob(md_folder + "/*.md")


import os
import re
import markdown
#from pdfminer.high_level import extract_text as extract_text_from_pdf
from io import StringIO
from html.parser import HTMLParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
chunk_size = 1200
chunk_overlap = 300

# List to store all document chunks
all_docs = []
class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.text = StringIO()

    def handle_data(self, d):
        self.text.write(d)

    def get_data(self):
        return self.text.getvalue()

def clean_markdown(text):
    """Clean Markdown syntax from text."""
    # Remove Markdown URL links
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    # Remove bold and italic text markers
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
    text = re.sub(r'\*([^*]+)\*', r'\1', text)
    text = re.sub(r'__([^_]+)__', r'\1', text)
    text = re.sub(r'_([^_]+)_', r'\1', text)
    # Remove images and their references
    text = re.sub(r'!\[[^\]]]*]\([^)]*\)', '', text)
    # Remove headers markers
    text = re.sub(r'#+\s?', '', text)
    # Remove other Markdown syntax as needed (e.g., tables, bullet points)
    text = re.sub(r'\|', ' ', text)
    text = re.sub(r'-{2,}', '', text)
    text = re.sub(r'\n{2,}', '\n', text)  # Remove extra newlines
    return text

def extract_text_from_md(md_path):
    """Extract and clean text from a Markdown file."""
    with open(md_path, "r", encoding="utf-8") as file:
        md_content = file.read()
        html = markdown.markdown(md_content)
        text = strip_tags(html)
        return clean_markdown(text)
def strip_tags(html):
    """Remove HTML tags from a string."""
    s = MLStripper()
    s.feed(html)
    return s.get_data()


for file_path in md_path:
    file_content = extract_text_from_md(file_path)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_text(file_content)
    file_name_without_extension = os.path.splitext(file_path)[0]
    for i, chunk in enumerate(docs):
        # Define metadata for each chunk (you can customize this)
        metadata = {
            "File Name": file_name_without_extension,
            "Chunk Number": i + 1,
        }
        # Create a header with metadata and file name
        header = f"File Name: {file_name_without_extension}\n"
        for key, value in metadata.items():
            header += f"{key}: {value}\n"
        
        # Combine header, file name, and chunk content
        chunk_with_header = header + file_name_without_extension + "\n" + chunk
        all_docs.append(chunk_with_header)
    print(f"Processed: {file_path}")



local_llm = "nomic-embed-text:latest"
embedding=OllamaEmbeddings(model=local_llm)
# Embed and index all the documents using FAISS
db = FAISS.from_texts(all_docs, embedding)





  from .autonotebook import tqdm as notebook_tqdm


Processed: md_files\1_1733975053.267784.md
Processed: md_files\2_1733975093.5062084.md
Processed: md_files\3_1733975145.179506.md
Processed: md_files\Adv Funct Materials - 2023 - Yu - Chalcogenide Perovskite Thin Films with Controlled Phases for Optoelectronics_1733974706.4877486.md


In [2]:
from langchain_core.messages import HumanMessage
def get_output(modelname:str,is_RAG:str,questions:str):
    if is_RAG== "RAG":
        query = questions#"what is advantage of chalcogenide perovskite?"
        # Search for semantically similar chunks and return the top 5 chunks
        search = db.similarity_search(query, k=5)
        # Template for question-answer prompt
        template = """Question: {question} \n\nAnswer:"""
        # Initialize prompt template and callback manager
        prompt = PromptTemplate(template=template, input_variables=["question"])
        llm = ChatOllama(model=modelname, temperature=0)
        llm_chain = prompt | llm
        
        # Define a template for generating a final prompt
        template = '''You are an assistant for question-answering tasks. 

        Here is the context to use to answer the question:

        {context} 

        Think carefully about the above context. 

        Now, review the user question:

        {question}

        Provide an answer to this questions using only the above context. 

        Use three sentences maximum and keep the answer concise.

        Answer:'''


        # Run LLMChain to generate an answer based on the context
        rag_prompt_formatted = template.format(context=search, question=query)
        generation = llm.invoke([HumanMessage(content=rag_prompt_formatted)])
        return generation.content
    else:
        query = questions
        template = """Answer the Question: {question} """
        # Initialize prompt template and callback manager
        prompt = PromptTemplate(template=template, input_variables=["question"])
        llm = ChatOllama(model=modelname, temperature=0)
        llm_chain = prompt | llm
        no_rag_prompt=template.format(question=query)
        generation=llm_chain.invoke(no_rag_prompt)
        return generation.content
demo = gr.Interface(
    fn=get_output,
    inputs=[
        gr.Radio(
            choices=["llama3.2:3b-instruct-fp16", "qwen2.5-coder:14b", "llama3.2-vision:latest"],
            type="value",
            value="llama3.2:3b-instruct-fp16",  # Set default value to "Model 1"
            label="Select Model"
        ),
        gr.Radio(
            choices=["RAG", "No RAG"],
            type="value",
            value="RAG",  # Set default value to "Model 1"
            label="RAG or not"
        ),
        gr.Textbox(label="Input Questions",info="input questions on chalcogenide perovskites")
    ],
    outputs="markdown",
    title="RAG Llama3.2-3b based on chalcogenide perovskite papers",
    description="""
    ## Select a model, and ask a question to get the output; or just click on the examples below.
    """,
    examples=[["llama3.2:3b-instruct-fp16","RAG","what is advantage of BaZrS3?"],
              ["qwen2.5-coder:14b","RAG","what is bandgap of SrHfS3?"],
              ["llama3.2-vision:latest","RAG","why is chalcogenide perovskite important?"]
              ]
)

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch(share=False)

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.
