## 1. Loading required libraries

In [14]:
import os
import warnings

from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader

# Silence every warning category for the remainder of the session
warnings.simplefilter("ignore")

## 2. Loading PDF files

In [15]:
# Define a helper that extracts the raw text of one or more PDF files
def get_pdf_text(pdf_docs):
    """Return the concatenated page text of the given PDF file(s).

    Args:
        pdf_docs: Path to a single PDF (``str``, ``bytes`` or ``os.PathLike``),
            or an iterable of such paths.

    Returns:
        str: The ``page_content`` of every loaded page, joined in order with
        no separator. An empty string when no pages are loaded.
    """
    # Wrap a lone path so single-file and multi-file inputs share one code path.
    if isinstance(pdf_docs, (str, bytes, os.PathLike)):
        pdf_paths = [pdf_docs]
    else:
        pdf_paths = list(pdf_docs)

    # os.fspath leaves str paths untouched and converts Path-like objects to
    # plain paths before they reach the loader; "".join avoids rebuilding the
    # string on every page the way `+=` in a loop does.
    return "".join(
        page.page_content
        for pdf_path in pdf_paths
        for page in PyPDFLoader(os.fspath(pdf_path)).load()
    )

In [16]:
# Extract the full text of the source PDF (path is relative to the notebook)
text = get_pdf_text("R - into .pdf")

## 3. Splitting the document into chunks

In [17]:
# Configure the recursive character splitter used to chunk the document
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,       # chunk sizes are measured in characters
    chunk_size=500,            # upper bound on the length of each chunk
    chunk_overlap=20,          # characters shared between neighbouring chunks
    is_separator_regex=False,  # separators are treated as literal strings
)

In [18]:
# Split the extracted text into chunks, then preview the first chunk
texts = text_splitter.split_text(text)
texts[0]


'A\nBrief\nIntroduction\nto\nR\nThe\njourney\nof\nstatistical\ncomputing\nand\ngraphics\ngets\ninteresting\nwhen\nit\ncomes\nto\nR\nbut\nbefore\nwe\ndelve\ninto\nthe\nfundamental\nintroduction\nto\nwhat\nR\nis,\nlet\nthe\nwords\nof\nstatistics,\nstatistical\ncomputing,\nand\ngraphics\nnot\ndeter\nyou\nfrom\nstarting.\nWe\nare\nabout\nto\nsimplify\nthem.\nLet\nus\nstart\nwith\ndata.\nData\nrefers\nto\nsimply\nraw\ninformation\nor\nrather\nunprocessed\ninformation\nWhat\nare\nstatistics?\nStatistics\nhighlight\nmethodologies\nused\nin\nthe\ncollection,\norganization,'

## 4. Embedding the segmented text

In [19]:
# Load the INSTRUCTOR embedding model, pinned to the CPU
embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
    model_kwargs={"device": "cpu"},
)

load INSTRUCTOR_Transformer
max_seq_length  512


## 5. Creating the Vector Database

In [20]:
# Embed every chunk and index the resulting vectors in a FAISS store
db = FAISS.from_texts(texts=texts, embedding=embeddings)

## 6. Making a retriever

In [21]:
# Wrap the FAISS store as a retriever and fetch the chunks for a sample question
retriever = db.as_retriever()
docs = retriever.get_relevant_documents(query="What is Statistics?")

In [2]:
# Sanity-check the vector store directly: run a similarity search and show the
# text of the top hit. Needs `db` from section 5 to exist in the running kernel.
query = "What is Statistics?"
docs = db.similarity_search(query=query)
docs[0].page_content

NameError: name 'db' is not defined

## 7. Creating a chain

In [23]:
import os
from getpass import getpass

# Prompt for the token only when it is missing or empty, so a re-run with the
# variable already exported does not block on interactive input.
if not os.environ.get('HUGGINGFACEHUB_API_TOKEN'):
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = getpass()

In [24]:
# Using the Mixtral API
llm = HuggingFaceHub(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
        # NOTE(review): max_length=64 looks at odds with max_new_tokens=512;
        # confirm which limit is actually intended.
        "max_length": 64,
    },
)


In [25]:
# Create QA chain to integrate similarity search with user queries (answer query from knowledge base)
import re
# Build a "stuff" QA chain and answer one sample question from the indexed chunks
chain = load_qa_chain(llm, chain_type="stuff")

query = "What is statistics?"
docs = db.similarity_search(query)

chain_response = chain.run(input_documents=docs, question=query)

# Keep only the text that follows the "Helpful Answer:" marker. `\s*` skips any
# whitespace after the marker, including a line break, so an answer that starts
# on the next line is still captured ('.' alone never crosses a newline).
match = re.search(r'Helpful Answer:\s*(.*)', chain_response)
if match:
    answer = match.group(1).strip()
    print("Answer:", answer)
else:
    print("Answer not found in the response.")

Answer: Statistics is a field that deals with methodologies used in the collection, organization, and interpretation of data. It involves analyzing raw information to extract meaningful insights and conclusions.


In [26]:
import re

# Defining a function to automate the process
def _extract_answer(chain_response):
    """Return the text after the first 'Helpful Answer:' marker, or a fallback message."""
    # `\s*` skips whitespace after the marker, including a line break, so an
    # answer that starts on the following line is not lost ('.' stops at '\n').
    match = re.search(r'Helpful Answer:\s*(.*)', chain_response)
    if match is None:
        return "Answer not found in the response."
    return match.group(1).strip()


def get_feedback(query):
    """Answer ``query`` from the indexed document chunks.

    Runs a similarity search on the module-level ``db`` store, feeds the hits
    and the question to a "stuff" QA chain built on the module-level ``llm``,
    and extracts the answer from the chain's raw response.

    Args:
        query: The question to answer.

    Returns:
        str: The first line of text following the "Helpful Answer:" marker in
        the chain response, stripped of surrounding whitespace, or
        "Answer not found in the response." when the marker is absent.
    """
    # Load the QA chain
    chain = load_qa_chain(llm, chain_type="stuff")

    # Retrieve the chunks most similar to the question
    docs = db.similarity_search(query)

    # Run the QA chain over the retrieved chunks
    chain_response = chain.run(input_documents=docs, question=query)

    return _extract_answer(chain_response)

# Read a question from the user
user_query = input("Enter your query: ")

# Answer it from the indexed document
feedback = get_feedback(user_query)

# Show the extracted answer
print("Feedback:", feedback)


Feedback: Data refers to raw or unprocessed information.
