In [1]:
import warnings
from typing import List
import boto3
from PyPDF2 import PdfReader
from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from langchain_aws import BedrockEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.schema import Document
from langchain.llms.bedrock import Bedrock
from langchain.chains import RetrievalQA
import logging

  from tqdm.autonotebook import tqdm




In [2]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

load_dotenv()

True

In [3]:
def extract_text_from_pdfs(pdf_path):
    try:
        pdf_reader = PdfReader(pdf_path)
        text = ''
        for page in pdf_reader.pages:
            extracted = page.extract_text()
            if extracted:
                text += extracted
        if not text:
            logger.warning("No text extracted from PDF.")
        return text
    except FileNotFoundError:
        logger.error(f"The file {pdf_path} does not exist.")
        return ""
    except Exception as e:
        logger.error(f"An error occurred during PDF extraction: {e}")
        return ""

In [4]:
def create_text_chunks(text: str) -> List[Document]:
    """Divide raw string data into chunks."""
    try:
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )
        chunks = text_splitter.split_text(text)
        logger.info(f"Created {len(chunks)} text chunks.")
        return [Document(page_content=chunk) for chunk in chunks]
    except Exception as e:
        logger.error(f"Error in create_text_chunks: {e}")
        return []

In [5]:
def create_vector_store(chunks: List[Document]) -> PineconeVectorStore:
    """Store data into vector database."""
    try:
        embeddings = BedrockEmbeddings()
        index_name = "rag-aws"
        vector_store = PineconeVectorStore.from_documents(chunks, embeddings, index_name=index_name)
        logger.info("Vector store created successfully.")
        return vector_store
    except Exception as e:
        logger.error(f"Error in create_vector_store: {e}")
        return None

In [6]:
def initialize_llm() -> Bedrock: 
    """Initialize the Bedrock LLM."""
    try:
        bedrock_client = boto3.client(
            service_name="bedrock-runtime",
            region_name="us-east-1"  # replace with your preferred region
        )
        llm = Bedrock(
            model_id="meta.llama2-70b-chat-v1",
            client=bedrock_client,
            model_kwargs={
                "max_gen_len": 512,
                "temperature": 0.5,
                "top_p": 0.9
            }
        )
        logger.info("LLM initialized successfully.")
        return llm
    except Exception as e:
        logger.error(f"Error in initialize_llm: {e}")
        return None

In [7]:
# Cell 7: Define Function to Create QA Chain
def create_qa_chain(llm: Bedrock, docsearch: PineconeVectorStore) -> RetrievalQA:
    try:
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",  # 'stuff' combines all retrieved docs into a single prompt
            retriever=docsearch.as_retriever(search_kwargs={"k": 5}),
            return_source_documents=False  # Set to True if you want to see source documents
        )
        logger.info("QA chain created successfully.")
        return qa_chain
    except Exception as e:
        logger.error(f"Error in create_qa_chain: {e}")
        return None


In [8]:
# Cell 8: Execute QA Pipeline (Partial)
pdf_file = r"C:\Users\kshit\Desktop\Kshitij_Sarve_Resume.pdf"

# Step 1: Extract text from PDF
raw_txt = extract_text_from_pdfs(pdf_file)
if not raw_txt:
    logger.error("No text extracted. Exiting.")
else:
    print("Extracted Text (first 1000 characters):")
    print(raw_txt[:1000])

Extracted Text (first 1000 characters):
Kshitij  Sarve  
Education  Mob:  +91-9579360733  
Mail:  kshitijsarve2001@gmail.com  
GitHub:  https://github.com/Kshitij10000  
LinkedIn: https://shorturl.at/5Nh3U  
Deogiri  Institute  of Engineering  and Management  Studies,  Aurangabad,  MH 2020 – 2024  
B. Tech  in CSE  (Artificial  Intelligence  and Machine Learning)  CGPA–7.9 Aurangabad,  Maharashtra  
 
Experience  
Aldrich  Research  Services  (USA  based  Private  Equity  Firm)  February  2024  – May  2024  
AI Developer  Intern  Onsite  
Projects:  
Candidate  Recommendation  Systems  (CRS)  
• Developed  AI solutions  that boosted  workflow  efficiency  by 30%  through  automation  and real-time  data  analysis.  
• Built AI -powered  candidate  recommendation  systems,  increasing  successful  hires  by 25%  and reducing  screening  time  by 
50%.  
• Enhanced  GPT  based  chatbot  with  90%  accuracy,  decreasing  customer  service  response  times  by 40%.  
• Continuously  improv

In [9]:
# Step 2: Create text chunks
chunks = create_text_chunks(raw_txt)
print(f"Number of Chunks: {len(chunks)}")
if chunks:
    print("Sample Chunk:")
    print(chunks[0].page_content[:500])  # Print first 500 characters of the first chunk


INFO:__main__:Created 4 text chunks.


Number of Chunks: 4
Sample Chunk:
Kshitij  Sarve  
Education  Mob:  +91-9579360733  
Mail:  kshitijsarve2001@gmail.com  
GitHub:  https://github.com/Kshitij10000  
LinkedIn: https://shorturl.at/5Nh3U  
Deogiri  Institute  of Engineering  and Management  Studies,  Aurangabad,  MH 2020 – 2024  
B. Tech  in CSE  (Artificial  Intelligence  and Machine Learning)  CGPA–7.9 Aurangabad,  Maharashtra  
 
Experience  
Aldrich  Research  Services  (USA  based  Private  Equity  Firm)  February  2024  – May  2024  
AI Developer  Intern  Onsi


In [10]:
# Step 3: Create vector store
doc_search = create_vector_store(chunks)
if doc_search is None:
    logger.error("Failed to create vector store.")
else:
    # Optionally, print the number of vectors stored
    index_name = "rag-aws"
    index = pinecone.Index(index_name)
    try:
        index_stats = index.describe_index_stats()
        num_vectors = index_stats.get('total_vector_count', 0)
        print(f"Number of vectors in Pinecone index '{index_name}': {num_vectors}")
    except Exception as e:
        logger.error(f"Error retrieving Pinecone index stats: {e}")


INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['c:\\Users\\kshit\\Desktop\\rag_pine\\rag_env\\Lib\\site-packages\\pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone
ERROR:__main__:Error in create_vector_store: Index 'rag-aws' not found in your Pinecone project. Did you mean one of the following indexes: testfiles
ERROR:__main__:Failed to create vector store.


In [11]:
doc_search = create_vector_store(chunks)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['c:\\Users\\kshit\\Desktop\\rag_pine\\rag_env\\Lib\\site-packages\\pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone
ERROR:__main__:Error in create_vector_store: Index 'rag-aws' not found in your Pinecone project. Did you mean one of the following indexes: testfiles


In [12]:
llm = initialize_llm()

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
  llm = Bedrock(
INFO:__main__:LLM initialized successfully.


In [13]:

qa_chain = create_qa_chain(llm, doc_search)


ERROR:__main__:Error in create_qa_chain: 'NoneType' object has no attribute 'as_retriever'


In [14]:
# Cell 9: Test the QA System
if qa_chain:
    try:
        user_question = "Who is Kshitij Sarve?"
        response = qa_chain.invoke(user_question)
        print("Answer:", response['result'])
    except Exception as e:
        logger.error(f"Error during QA: {e}")
else:
    logger.error("QA chain is not initialized.")


ERROR:__main__:QA chain is not initialized.
