In [66]:
import urllib
import warnings
from pathlib import Path as p
from pprint import pprint

import pandas as pd
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
import google.generativeai as genai
from langchain.vectorstores.faiss import FAISS
import faiss

In [67]:
warnings.filterwarnings("ignore")

In [68]:
import numpy as np

In [69]:
import os
from dotenv import load_dotenv
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")


In [70]:
model = ChatGoogleGenerativeAI(
    model = "gemini-1.5-pro",google_api_key = GOOGLE_API_KEY,temperature =0, convert_system_message_to_human=True
)

In [71]:
pdf_loader = PyPDFLoader("C:\Projects\Langchain\Rag_doc\Applied Natural Language Processing.pdf")
pages = pdf_loader.load_and_split()

In [72]:
pages

[Document(metadata={'source': 'C:\\Projects\\Langchain\\Rag_doc\\Applied Natural Language Processing.pdf', 'page': 0}, page_content='Applied Natural \nLanguage Processing \nwith Python\nImplementing Machine Learning  \nand Deep Learning Algorithms for  \nNatural Language Processing\n—\nTaweh Beysolow II'),
 Document(metadata={'source': 'C:\\Projects\\Langchain\\Rag_doc\\Applied Natural Language Processing.pdf', 'page': 1}, page_content='Applied Natural \nLanguage Processing \nwith Python\nImplementing Machine \nLearning and Deep Learning \nAlgorithms for Natural \nLanguage Processing\nTaweh  Beysolow  II'),
 Document(metadata={'source': 'C:\\Projects\\Langchain\\Rag_doc\\Applied Natural Language Processing.pdf', 'page': 2}, page_content='Applied Natural Language Processing with Python\nISBN-13 (pbk): 978-1-4842-3732-8     ISBN-13 (electronic): 978-1-4842-3733-5\nhttps://doi.org/10.1007/978-1-4842-3733-5\nLibrary of Congress Control Number: 2018956300\nCopyright © 2018 by Taweh Beysolow

In [73]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500,chunk_overlap =100) # How to determine the size of chunking and overlap
context = "\n\n".join(str(p.page_content) for p in pages)

In [74]:
context

'Applied Natural \nLanguage Processing \nwith Python\nImplementing Machine Learning  \nand Deep Learning Algorithms for  \nNatural Language Processing\n—\nTaweh Beysolow II\n\nApplied Natural \nLanguage Processing \nwith Python\nImplementing Machine \nLearning and Deep Learning \nAlgorithms for Natural \nLanguage Processing\nTaweh  Beysolow  II\n\nApplied Natural Language Processing with Python\nISBN-13 (pbk): 978-1-4842-3732-8     ISBN-13 (electronic): 978-1-4842-3733-5\nhttps://doi.org/10.1007/978-1-4842-3733-5\nLibrary of Congress Control Number: 2018956300\nCopyright © 2018 by Taweh Beysolow II \nThis work is subject to copyright. All rights are reserved by the Publisher, whether the whole or \npart of the material is concerned, specifically the rights of translation, reprinting, reuse of \nillustrations, recitation, broadcasting, reproduction on microfilms or in any other physical way, \nand transmission or information storage and retrieval, electronic adaptation, computer softwar

In [75]:
context = context.replace("\n","")
context

'Applied Natural Language Processing with PythonImplementing Machine Learning  and Deep Learning Algorithms for  Natural Language Processing—Taweh Beysolow IIApplied Natural Language Processing with PythonImplementing Machine Learning and Deep Learning Algorithms for Natural Language ProcessingTaweh  Beysolow  IIApplied Natural Language Processing with PythonISBN-13 (pbk): 978-1-4842-3732-8     ISBN-13 (electronic): 978-1-4842-3733-5https://doi.org/10.1007/978-1-4842-3733-5Library of Congress Control Number: 2018956300Copyright © 2018 by Taweh Beysolow II This work is subject to copyright. All rights are reserved by the Publisher, whether the whole or part of the material is concerned, specifically the rights of translation, reprinting, reuse of illustrations, recitation, broadcasting, reproduction on microfilms or in any other physical way, and transmission or information storage and retrieval, electronic adaptation, computer software, or by similar or dissimilar methodology now known

In [76]:
texts = text_splitter.split_text(context)


In [77]:
texts[1]

'ISBN-13 (electronic): 978-1-4842-3733-5https://doi.org/10.1007/978-1-4842-3733-5Library of Congress Control Number: 2018956300Copyright © 2018 by Taweh Beysolow II This work is subject to copyright. All rights are reserved by the Publisher, whether the whole or part of the material is concerned, specifically the rights of translation, reprinting, reuse of illustrations, recitation, broadcasting, reproduction on microfilms or in any other physical way, and transmission or information storage and'

In [78]:
embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001",google_api_key = GOOGLE_API_KEY)

In [79]:
# vector_index = Chroma.from_texts(text,embeddings).as_retriever(search_kwargs={"k":3})
# vector_index
text_embeddings = embeddings.embed_documents(texts)
# Create a FAISS HNSW index
dimension = len(text_embeddings[0]) # dimensionality of the embeddings
index = faiss.IndexHNSWFlat(dimension,32) #32 is the no of neighbours in HNSW


In [80]:
from langchain.docstore import InMemoryDocstore
from langchain.schema import Document

In [81]:
# convert embeddings to numpy array
embedding_matrix = np.array(text_embeddings).astype('float32')
index.add(embedding_matrix)

index_to_docstore_id = {i: str(i) for i in range(len(texts))}  # Map FAISS indices to document IDs
documents = [Document(page_content=text) for text in texts]     # Create a list of Document objects
docstore = InMemoryDocstore(dict(zip(index_to_docstore_id.values(), documents)))  # Create a SimpleDocstore
#Create vector store from faiss

# 5. Create the FAISS vector store
vector_store = FAISS(
    embedding_function=embeddings.embed_query,
    index=index,
    index_to_docstore_id=index_to_docstore_id,
    docstore=docstore
)

retriever = vector_store.as_retriever(search_kwargs={"k":3})


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [82]:
template = """You are an expert in document extration. Answer the questions with relevant information by refering the context provided
{context}
Question :{question}"""

In [83]:
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)
qa_chain = RetrievalQA.from_chain_type(
    model,
    retriever = retriever,
    return_source_documents = True,
    chain_type_kwargs = {"prompt":QA_CHAIN_PROMPT}
)

In [65]:
question = "What is neural networks"
result = qa_chain({"query":question})
result["result"]

KeyError: 622

In [23]:
result["source_documents"]

[Document(metadata={}, page_content='in the MLP , utilizing functions such as ReLU.\xa0However, one of the more straightforward solutions is to use a model devised in the 1990s by Sepp Hochreiter and Jürgen Schmidhuber: the long short-term memory unit, or LSTM. Let’s start with what this model looks like, as shown in Figure\xa0 2-7.Figure 2-6.  Tanh activation and derivative functionChapter 2  review of\xa0Deep Learning38LSTMs are distinguished structurally by the fact that we observe them as blocks, or units, rather than the traditional'),
 Document(metadata={}, page_content='(NMF)features, 87Gensim model, 90Jupyter notebook, 89–90and LDA, 90mathematical formula, 86scikit-learn implementation, 87–88,  90topic extraction, 88P,  QParagraph2Vec algorithm, 115movie review data, 116–118Principal components analysis (PCA), 97Multilayer perceptron models (MLPs) ( cont. )Index149RRecurrent neural networks (RNNs)activation function, 35BPTT , 36build_rnn() function, 32chain rule, 36data set, 33