In [1]:
# Sanity-check cell: prints a fixed greeting.
print("Hello World!")

Hello World!


In [2]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get the API key (None when the variable is not defined anywhere)
api_key = os.getenv("OPENAI_API_KEY")

# os.environ only accepts string values: assigning None raises TypeError,
# which used to crash this cell before the "Not Found" message could print.
# Re-export the key only when it was actually found.
if api_key:
  os.environ["OPENAI_API_KEY"] = api_key

print("API Key Loaded Successfully" if api_key else "API Key Not Found")


API Key Loaded Successfully


In [3]:
import fitz
import glob

def load_all_pdfs_in_directory(directory="./content/"):
  """Concatenate the text of every PDF found directly inside `directory`.

  Args:
    directory: Folder scanned for `*.pdf` files (non-recursive).

  Returns:
    A single string with the text of every page of every PDF, files being
    visited in sorted path order. Empty string when no PDF is found.
  """
  # glob gives no ordering guarantee (it depends on the file system), so sort
  # the paths to keep the combined text -- and the chunks and index derived
  # from it -- reproducible between runs and machines.
  pdf_paths = sorted(glob.glob(f"{directory}/*.pdf"))

  page_texts = []
  for file_path in pdf_paths:
    with fitz.open(file_path) as pdf:
      # Collect page texts and join once instead of repeated string `+=`.
      page_texts.extend(page.get_text() for page in pdf)

  return "".join(page_texts)

# Extract the text of all PDFs in the default directory, then show only the
# first 500 characters as a quick preview.
pdf_data = load_all_pdfs_in_directory()
preview = pdf_data[:500]
print("Pdf_data: ", preview)

Pdf_data:  Saimahith Chigurupati 
Boston, MA | 857-693-9706 | mahithchigurupati@gmail.com | LinkedIn | GitHub 
Software Engineer with 5+ Years of Expertise in Designing Scalable Systems and Delivering Innovative Solutions  
SKILLS 
Programming Languages  
Python, Java, C, JavaScript 
Web Technologies 
 
Spring Boot, FastAPI, Node.js, React.js, Next.js, Tailwind, REST API, GraphQL, Django REST 
DevOps and Cloud 
 
AWS, GCP, Linux, Git, Docker, Kubernetes, Jenkins, CI/CD, Terraform, Packer, Apache Kafka 
Dat


In [4]:
# Total number of characters in the combined PDF text.
len(pdf_data)

5577

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings kept together so they are easy to tune in one place.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Cut the combined PDF text into chunks and report how many were produced.
documents = text_splitter.split_text(pdf_data)
chunk_count = len(documents)
print("Number of documents: ", chunk_count)


Number of documents:  7


In [6]:
# Inspect the first chunk produced by the splitter.
documents[0]

'Saimahith Chigurupati \nBoston, MA | 857-693-9706 | mahithchigurupati@gmail.com | LinkedIn | GitHub \nSoftware Engineer with 5+ Years of Expertise in Designing Scalable Systems and Delivering Innovative Solutions  \nSKILLS \nProgramming Languages  \nPython, Java, C, JavaScript \nWeb Technologies \n \nSpring Boot, FastAPI, Node.js, React.js, Next.js, Tailwind, REST API, GraphQL, Django REST \nDevOps and Cloud \n \nAWS, GCP, Linux, Git, Docker, Kubernetes, Jenkins, CI/CD, Terraform, Packer, Apache Kafka \nDatabases & Tools \n \nMySQL, PostgreSQL, NoSQL, MongoDB, Redis, Azure SQL, Hibernate, RabbitMQ, SwiftUI \nCertifications \n \n                AWS Certified Solutions Architect, Pega Certified Senior System Architect \n \nWORK EXPERIENCE \nSoftware Engineer, Walmart                                                                                                                                                          Oct 2024 – present \n•'

In [7]:
# Inspect the second chunk produced by the splitter.
documents[1]

'• \nCollaborating with the SRE team to develop automation tools, improving uptime for cloud services powering Walmart \napplications using Java, Spring Boot, Azure SQL, Python, and FastAPI. \n• \nContributing to Business Continuity and Disaster Recovery (BCDR) initiatives, enhancing the reliability of Walmart’s applications \nduring high-traffic events like thanksgiving sales, directly impacting major revenue generation. \n• \nDesigning and developing backend applications to enhance security and compliance of applications across the organization, \nensuring alignment with security best practices and regulatory standards. \n \nSoftware Engineer, Keelworks                                                                                                                                                       Jul 2024 – Oct 2024 \n• \nContributed to the development of a digital knowledge hub, building a centralized platform for community-driven education, \ncareer advancement, and knowledge s

In [8]:
from langchain_ollama import OllamaEmbeddings

# Embedding client configured with the "llama3.2" model; the same object is
# passed to FAISS below when the index is built and when it is loaded.
embeddings = OllamaEmbeddings(model="llama3.2")

# Embed every text chunk; embedded_documents[0] is inspected in the next cell.
embedded_documents = embeddings.embed_documents(documents)

In [9]:
# Show the first six components of the first chunk's embedding vector.
print("Embedding for the first document")
first_embedding = embedded_documents[0]
for position, component in enumerate(first_embedding[:6], start=1):
  print(f"Component {position}: {component}")

Embedding for the first document
Component 1: -0.01651998
Component 2: 0.0035332777
Component 3: -0.0048407987
Component 4: -0.028871365
Component 5: 0.0041457387
Component 6: -0.0028303354


In [10]:
from langchain.vectorstores import FAISS

# Build a FAISS vector store from the text chunks using the `embeddings`
# object, then persist it under the local "faiss_index" directory.
faiss_index = FAISS.from_texts(texts=documents, embedding=embeddings)
faiss_index.save_local("faiss_index")

In [11]:
# Report the number of chunks handed to the index, then show the vector
# stored at position 0 of the underlying FAISS index.
indexed_count = len(documents)
print(f"Number of documents indexed: {indexed_count}")
first_vector = faiss_index.index.reconstruct(0)
print("Sample vector from first document", first_vector)

Number of documents indexed: 7
Sample vector from first document [-0.01652064  0.00353308 -0.00484387 ...  0.00915766  0.01250717
 -0.01615427]


In [12]:
import os

# Confirm the saved index is on disk and list what was written there.
index_path = "./faiss_index"
index_dir_exists = os.path.exists(index_path)
print("Index directory exists:", index_dir_exists)
if index_dir_exists:
  index_listing = os.listdir(index_path)
else:
  index_listing = "Directory not found"
print("Files in index directory:", index_listing)


Index directory exists: True
Files in index directory: ['index.faiss', 'index.pkl']


In [13]:
# Reload the persisted index from disk with the same embedding object.
# NOTE: allow_dangerous_deserialization=True opts in to unpickling the saved
# index.pkl file. Only use it on index directories created by this notebook,
# never on files from an untrusted source.
vectorstore = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

# Report how many vectors the loaded index holds and their dimensionality.
print("vectorstore loaded successfully")
print(f"Number of vectors in vectorstore: {vectorstore.index.ntotal}")
print(f"Dimensionlity of vectors: {vectorstore.index.d}")

vectorstore loaded successfully
Number of vectors in vectorstore: 7
Dimensionlity of vectors: 3072


In [14]:
# Similarity-search retriever over the loaded index, configured with k=6.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

In [15]:
def pretty_print(docs):
  """Print each document's `page_content` under a numbered heading.

  Documents are numbered from 1 and separated by a line of 100 dashes.
  """
  divider = f"\n{'-' * 100}\n"
  sections = []
  for number, doc in enumerate(docs, start=1):
    sections.append(f"Document {number}:\n\n" + doc.page_content)
  print(divider.join(sections))

In [16]:
# Repeats the earlier load/retriever setup (same arguments apart from the
# "./" path prefix), rebinding `vectorstore` and `retriever`.
# NOTE: allow_dangerous_deserialization=True opts in to unpickling the saved
# index; only use it on index directories created by this notebook.
vectorstore = FAISS.load_local("./faiss_index", embeddings, allow_dangerous_deserialization=True)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

In [17]:
# Run both sample questions through the plain similarity retriever and
# print whatever chunks come back for each.
query_1 = "who does the document discuss about?"
query_2 = "what are his achievements?"

for query in (query_1, query_2):
  docs = retriever.invoke(query)
  pretty_print(docs)

Document 1:

design patterns to ensure timely and high-quality feature delivery. 
 
EDUCATION 
MS in Information Systems, Northeastern University                                                                                                                         GPA: 3.8/4.0 
 
Achievements: Winner at Harvard Blockchain Conference (HBC ’23), ETH Boston ‘23, Columbia Blockchain Hackathon (LionHack ‘23) 
Coursework: Application Engineering and Development, Data Structures and Algorithms, Network Structures and Cloud Computing, 
Web Design and UX Engineering, iOS Mobile App Development, Engineering of Advanced Cryptocurrency Systems
----------------------------------------------------------------------------------------------------
Document 2:

• 
Contributed to the development of a digital knowledge hub, building a centralized platform for community-driven education, 
career advancement, and knowledge sharing. 
• 
Developed backend REST APIs using Node.js, integrating Redis for sessio

In [18]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_cohere import CohereRerank
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
import cohere

# Cohere client built from the COHERE_API_KEY environment variable.
cohere_api_key = os.getenv("COHERE_API_KEY")
cohere_client = cohere.ClientV2(cohere_api_key)

# Reranker configured to keep the top 3 results.
cohere_reranker = CohereRerank(
    client=cohere_client,
    model="rerank-english-v3.0",
    top_n=3,
)

# Wrap the similarity retriever so its results pass through the reranker.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=cohere_reranker,
    base_retriever=retriever,
)

# Same two sample questions as before, now through the reranking retriever.
query_1 = "who does the document discuss about?"
query_2 = "what are his achievements?"

for query in (query_1, query_2):
  reranked_docs = compression_retriever.invoke(query)
  pretty_print(reranked_docs)

Document 1:

Saimahith Chigurupati 
Boston, MA | 857-693-9706 | mahithchigurupati@gmail.com | LinkedIn | GitHub 
Software Engineer with 5+ Years of Expertise in Designing Scalable Systems and Delivering Innovative Solutions  
SKILLS 
Programming Languages  
Python, Java, C, JavaScript 
Web Technologies 
 
Spring Boot, FastAPI, Node.js, React.js, Next.js, Tailwind, REST API, GraphQL, Django REST 
DevOps and Cloud 
 
AWS, GCP, Linux, Git, Docker, Kubernetes, Jenkins, CI/CD, Terraform, Packer, Apache Kafka 
Databases & Tools 
 
MySQL, PostgreSQL, NoSQL, MongoDB, Redis, Azure SQL, Hibernate, RabbitMQ, SwiftUI 
Certifications 
 
                AWS Certified Solutions Architect, Pega Certified Senior System Architect 
 
WORK EXPERIENCE 
Software Engineer, Walmart                                                                                                                                                          Oct 2024 – present 
•
--------------------------------------------------------

In [19]:
from langchain.chains import RetrievalQA
from langchain_ollama import ChatOllama

# ChatOllama has no `max_tokens` option, so the intended 3000-token cap was
# not being applied; the Ollama setting for the maximum number of generated
# tokens is `num_predict`.
llm = ChatOllama(model="llama3.2", temperature=0.5, num_predict=3000)

# Question-answering chain that pulls its context from the reranking retriever.
qa_chain = RetrievalQA.from_chain_type(llm, retriever=compression_retriever)

In [20]:
# All candidate questions, bound in one unpacking assignment.
(
    user_query_1,
    user_query_2,
    user_query_3,
    user_query_4,
    user_query_5,
    user_query_6,
    user_query_7,
) = (
    "who does the document discuss about?",
    "what are his achievements?",
    "what certifications does he have?",
    "what are his skills?",
    "Does he have any experience working with AI/ML or Blockchain?",
    "what are his qualifications?",
    "what are his experiences?",
)

# Only questions 3-7 are sent to the QA chain; 1 and 2 are left out here.
user_queries = [
    user_query_3,
    user_query_4,
    user_query_5,
    user_query_6,
    user_query_7,
]

In [21]:
# Ask each question through the QA chain, wrapping it in formatting
# instructions, and print the answer followed by a separator line.
# NOTE(review): the whole formatted prompt is passed as the chain input, so it
# presumably also serves as the retrieval/rerank query -- consider moving the
# formatting instructions into the chain's prompt template and passing only
# `user_query` here. Confirm against the RetrievalQA documentation.
for user_query in user_queries:
  # Template fixes: the answer placeholder is now closed with "]" (it ended in
  # a stray "l") and "relevant" is spelled correctly, since this text is sent
  # to the model verbatim.
  prompt = (
      "You are an expert assistant with a strong grasp of the subject matter. "
      "Please answer the following question succinctly, highlighting the key points. "
      "Format your response as follows: \n\n"
      " [Your answer here]\n"
      "Key Points: \n"
      "- Point 1: [Key insight 1]\n"
      "- Point 2: [Key insight 2]\n"
      "- Point 3: [Key insight 3]\n\n"
      "Ensure your response is relevant and avoid unnecessary elaboration. "
      f"Answer the following question: '{user_query}'"
    )
  response = qa_chain.invoke(prompt)
  print(f"Question: {user_query}\nAnswer: {response['result']}\n")
  print("-----------------------------------------------------------------------------------------------------------------")

Question: what certifications does he have?
Answer: [He has two relevant certifications.]

Key Points:
- AWS Certified Solutions Architect
- Pega Certified Senior System Architect

-----------------------------------------------------------------------------------------------------------------
Question: what are his skills?
Answer: Saimahith Chigurupati's key skills include:

* Programming Languages: Python, Java, C, JavaScript
* Web Technologies:
	+ Front-end: React.js, Next.js, Tailwind
	+ Back-end: Spring Boot, FastAPI, Node.js, GraphQL, Django REST
* DevOps and Cloud: AWS, GCP, Linux, Git, Docker, Kubernetes, Jenkins, CI/CD, Terraform, Packer, Apache Kafka
* Databases & Tools: MySQL, PostgreSQL, NoSQL, MongoDB, Redis, Azure SQL, Hibernate, RabbitMQ

Key Points:
- Skilled in a range of programming languages and web technologies.
- Experienced with DevOps tools and cloud platforms.
- Proficient in various databases and tools.

---------------------------------------------------------