In [None]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_huggingface.llms.huggingface_endpoint import HuggingFaceEndpoint # Now the import should work
import os
import dotenv
from dotenv import load_dotenv


In [None]:
# Show the kernel's current working directory.
%pwd

In [None]:
%cd ..
%pwd

In [None]:


# Load every PDF found in the relative "data" directory into a list of
# documents (iterated page by page further down).
PDF_DIR = "data"
loader = PyPDFDirectoryLoader(PDF_DIR)
data = loader.load()

In [None]:
# Number of documents returned by the loader.
len(data)


In [None]:
# Preview only the first few loaded documents; echoing the whole list floods
# the notebook output and bloats the saved file.
data[:3]

Optionally, concatenate the text of all pages into a single string variable.

In [None]:
# Concatenate the text of every loaded page into one string.
question_gen = "".join(page.page_content for page in data)

In [None]:
# Split the loaded documents into small overlapping chunks for embedding.
CHUNK_SIZE = 200      # maximum characters per chunk
CHUNK_OVERLAP = 20    # characters shared between neighbouring chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
text_chunk = text_splitter.split_documents(data)


In [None]:
# Preview only the first few chunks; displaying every chunk floods the
# notebook output and bloats the saved file.
text_chunk[:3]

In [None]:
import os
from getpass import getpass

# Load environment variables from the .env file (if one exists).
load_dotenv()

# Read the Hugging Face API token from the environment.
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

if HUGGINGFACEHUB_API_TOKEN:
    print("API key found.")
else:
    print("API key not found in .env file.")
    # getpass() keeps the secret out of the rendered (and saved) cell output,
    # which input() would echo.
    HUGGINGFACEHUB_API_TOKEN = getpass("Please enter your HuggingFace API token: ").strip()
    if not HUGGINGFACEHUB_API_TOKEN:
        raise ValueError("A Hugging Face API token is required to continue.")

    # Export the token to the current process: later cells read it from
    # os.environ, and load_dotenv() is not run again in this session.
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

    # Persist the token for future sessions. The .env file then holds a
    # plaintext secret, so keep it out of version control and shared folders.
    with open('.env', 'a', encoding='utf-8') as f:
        f.write(f'\nHUGGINGFACEHUB_API_TOKEN="{HUGGINGFACEHUB_API_TOKEN}"')
    print("API key stored in .env file for future use.")

In [None]:
# Never echo the raw token: cell outputs are saved with the notebook and leak
# the secret as soon as the file is shared or committed. Show a masked form
# that only confirms a token is present.
masked_token = f"{HUGGINGFACEHUB_API_TOKEN[:3]}***" if HUGGINGFACEHUB_API_TOKEN else "<not set>"
print(masked_token)

In [None]:
# Prefer the environment variable, but fall back to the token captured by the
# setup cell: a token typed in interactively there is held in
# HUGGINGFACEHUB_API_TOKEN and may never have been exported to os.environ,
# in which case os.environ.get() alone would yield None.
huggingfacehub_api_token = (
    os.environ.get("HUGGINGFACEHUB_API_TOKEN") or HUGGINGFACEHUB_API_TOKEN
)

# Hosted text-generation endpoint used as the LLM for the rest of the notebook.
llm = HuggingFaceEndpoint(
    task='text-generation',
    model="mistralai/Mistral-7B-Instruct-v0.3",
    max_new_tokens=1024,
    temperature=0.3,
    huggingfacehub_api_token=huggingfacehub_api_token
)

In [None]:
# Smoke-test the endpoint with a single prompt and print the raw completion.
smoke_prompt = "who is the owner of openAI"
res = llm.invoke(smoke_prompt)
print(res)

In [None]:


from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModel

# Sentence-embedding model used to vectorise the document chunks.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
emb_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)



In [None]:
# Display the embedding model object to inspect its configuration.
emb_model

In [None]:
# embed_documents() expects a list of strings. The original call passed the
# string literal "text_chunk" instead of the split chunks, so the real
# documents were never embedded. Embed a small sample of chunk texts as a
# sanity check and show (number of vectors, embedding dimension) rather than
# dumping the raw float vectors.
sample_texts = [chunk.page_content for chunk in text_chunk[:3]]
sample_vectors = emb_model.embed_documents(sample_texts)
(len(sample_vectors), len(sample_vectors[0]) if sample_vectors else 0)

In [None]:
from langchain import embeddings
from langchain_chroma import Chroma

# Directory where the Chroma collection is persisted.
persist_directory = 'db'

# Embed every chunk with emb_model and store the vectors in Chroma.
vectordb = Chroma.from_documents(
    documents=text_chunk,
    embedding=emb_model,
    persist_directory=persist_directory,
)

In [None]:
# Display the vector store object.
vectordb

In [None]:
# Wrap the vector store in a retriever (default settings) for the retrieval chain.
retriever = vectordb.as_retriever()

In [None]:
# Sample query used for the similarity search below.
query="tell me about SDG"

In [None]:
# Fetch the chunks most similar to the query and display them.
TOP_K = 5
docs = vectordb.similarity_search(query, k=TOP_K)
docs

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Prompt asking the LLM to produce question/answer pairs. Its only placeholder
# is {context}, which the "stuff" chain fills with the retrieved chunks.
template = """
Please analyze the conteXt and generate 20 diverse and insightful questions that could be asked about the information within the PDF. For each question, also provide a concise and accurate answer based on the PDF's content.



Requirements:
* Question types: Aim for a variety of question types (e.g., factual, inferential, definitional).
* Relevance: All questions and answers must be directly related to the information presented in the PDF without using the word PDF.
* Clarity: Questions and answers should be clearly worded and easy to understand.
* Conciseness: Answers should be concise but provide sufficient information.
* Accuracy: Answers must be factually correct according to the PDF.

Output format:
Present the questions and answers in a numbered list format, like this:

Question: [Question 1]
Answer: [Answer 1]



Context: {context}
"""
QA_PROMPT = PromptTemplate(
    template=template, input_variables=["context"]
)

# Build the retrieval chain with the custom prompt; the source documents are
# returned alongside the generated text.
query_retriever_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_PROMPT}
)

# Run the chain. invoke() is the supported entry point (and matches the
# llm.invoke call used earlier); calling the chain object directly is deprecated.
result = query_retriever_chain.invoke({"query": "SDG"})

# Split the generated text on blank lines into question/answer entries and
# drop empty fragments, which would otherwise show up as bare separators.
qa_list = [pair.strip() for pair in result["result"].split("\n\n") if pair.strip()]

for qa_pair in qa_list:
    print(qa_pair + "\n-------------------------------------------\n")

# Save the entries to QnA.txt with a separator line after each one. The
# explicit encoding keeps non-ASCII model output from failing on platforms
# whose default encoding is not UTF-8.
with open("QnA.txt", "w", encoding="utf-8") as f:
    for qa_pair in qa_list:
        f.write(qa_pair + "\n--------------------------------------------\n")
