In [74]:
print("Hello World!")

Hello World!


In [75]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")

# os.environ only accepts str values: assigning None raises TypeError, which
# would crash this cell before the "API Key Not Found" message could be shown.
if api_key:
  os.environ["OPENAI_API_KEY"] = api_key

print("API Key Loaded Successfully" if api_key else "API Key Not Found")


API Key Loaded Successfully


In [76]:
import fitz
import glob

def load_all_pdfs_in_directory(directory="./content/"):
  """Extract and concatenate the text of every ``*.pdf`` file in ``directory``.

  Args:
    directory: Folder to scan (non-recursive). A trailing slash is optional.

  Returns:
    A single string holding the text of every page of every PDF, visited in
    sorted file-path order and joined without any separator. Returns an
    empty string when the folder contains no PDF files.
  """
  # glob does not guarantee an order; sorting keeps the combined text (and
  # therefore every downstream chunk) identical from run to run.
  pdf_paths = sorted(glob.glob(os.path.join(directory, "*.pdf")))

  page_texts = []
  for file_path in pdf_paths:
    with fitz.open(file_path) as pdf:
      # extend() consumes the generator immediately, while the PDF is still open.
      page_texts.extend(page.get_text() for page in pdf)

  return "".join(page_texts)

pdf_data = load_all_pdfs_in_directory()
print("Pdf_data: ", pdf_data[:500])

Pdf_data:  Maruti Mohit Rayalacheruvu 
(857) 313-2407 | rayalacheruvu.m@northeastern.edu | https://linkedin.com/in/marutimohitr 
 
EDUCATION 
Master of Science in Information Systems, Northeastern University​
December 2024 
 
Bachelor of Engineering in Computer Science, Visvesvaraya Technological University​
July 2021 
 
TECHNICAL SKILLS 
Programming Languages: Python, Java, C#, C,  C++ 
Data Science & ML: Scikit-learn, Statistical Analysis, Data Visualization (Matplotlib, Seaborn), Classification Models, 


In [77]:
len(pdf_data)

13088

In [78]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the combined PDF text into chunks of at most 2000 units with a
# 100-unit overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=2000)
documents = text_splitter.split_text(pdf_data)

print("Number of documents: ", len(documents))


Number of documents:  7


In [79]:
documents[0]

'Maruti Mohit Rayalacheruvu \n(857) 313-2407 | rayalacheruvu.m@northeastern.edu | https://linkedin.com/in/marutimohitr \n \nEDUCATION \nMaster of Science in Information Systems, Northeastern University\u200b\nDecember 2024 \n \nBachelor of Engineering in Computer Science, Visvesvaraya Technological University\u200b\nJuly 2021 \n \nTECHNICAL SKILLS \nProgramming Languages: Python, Java, C#, C,  C++ \nData Science & ML: Scikit-learn, Statistical Analysis, Data Visualization (Matplotlib, Seaborn), Classification Models, \nFeature Engineering, Pandas, NumPy, Model Optimization, Jupyter, Predictive Modeling \nTechnologies: SQL (PostgreSQL), AWS, JavaScript, TypeScript, React, Node.js, Express, MongoDB, Docker, CI/CD \nFrameworks: Data Structures & Algorithms, Microservices Architecture, .NET Core, Git, Linux, Unit Testing, Postman \n \nWORK EXPERIENCE \nAssociate Software Engineer, Conga\u200b\n\u200b\nJuly 2021 – August 2022 \n●\u200b Architected cloud-native microservices with C#, .NET Co

In [80]:
documents[1]

'making \n \nPROJECTS \nOvarian Cancer Detection Using Machine Learning\u200b\nDecember 2023 \n●\u200b\nSpearheaded the development of a machine learning-based ovarian cancer detection system by processing 7 \ndistinct gene expression datasets, implementing dimensionality reduction through PCA, and utilizing SMOTE for \nclass balancing, resulting in a comprehensive dataset of 3,840 genetic markers \n●\u200b\nEngineered 5 classification models (KNN, XGBoost, Decision Tree, Logistic Regression, SVM) through \nhyperparameter tuning and cross-validation, achieving up to 78.19% accuracy in distinguishing between 6 ovarian \ntissue types, validated through detailed confusion matrices \n●\u200b\nBuilt an end-to-end data pipeline with StandardScaler normalization and PCA, reducing feature dimensionality by \n98% while maintaining 99% of variance; created interactive Plotly visualizations for gene expression patterns and \ncomparative model performance analysis \n \nMusic Streaming Database Man

In [81]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

embedded_documents = embeddings.embed_documents(documents)

In [82]:
print("Embedding for the first document")
first_embedding = embedded_documents[0]
# Preview only the six leading components of the vector.
for position, value in enumerate(first_embedding[:6], start=1):
  print(f"Component {position}: {value}")

Embedding for the first document
Component 1: -0.01341044157743454
Component 2: -0.011500553227961063
Component 3: 0.013767686672508717
Component 4: -0.02007444202899933
Component 5: -0.027617815881967545
Component 6: 0.01268221065402031


In [83]:
from langchain.vectorstores import FAISS

# Build the index from the vectors already held in `embedded_documents`.
# FAISS.from_texts would send every chunk to the embeddings API a second time.
assert len(documents) == len(embedded_documents), "chunks and embeddings are out of sync"
faiss_index = FAISS.from_embeddings(
    text_embeddings=list(zip(documents, embedded_documents)),
    embedding=embeddings,  # still needed to embed queries at search time
)
faiss_index.save_local("faiss_index")

In [84]:
print(f"Number of documents indexed: {len(documents)}")
print("Sample vector from first document", faiss_index.index.reconstruct(0))

Number of documents indexed: 7
Sample vector from first document [-0.01341044 -0.01150055  0.01376769 ... -0.0347902   0.00155522
 -0.01596612]


In [85]:
import os

index_path = "./faiss_index"
print("Index directory exists:", os.path.exists(index_path))
print("Files in index directory:", os.listdir(index_path) if os.path.exists(index_path) else "Directory not found")


Index directory exists: True
Files in index directory: ['index.faiss', 'index.pkl']


In [86]:
# Reload the index stored under "faiss_index", pairing it with the same
# embeddings object so queries are embedded consistently.
# NOTE(security): allow_dangerous_deserialization=True opts in to loading the
# pickled index.pkl side file. Unpickling can execute arbitrary code, so only
# load index directories that this notebook produced itself.
vectorstore = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

print("vectorstore loaded successfully")
# Report how many vectors were restored and their dimensionality.
print(f"Number of vectors in vectorstore: {vectorstore.index.ntotal}")
print(f"Dimensionlity of vectors: {vectorstore.index.d}")

vectorstore loaded successfully
Number of vectors in vectorstore: 7
Dimensionlity of vectors: 1536


In [87]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

In [88]:
def pretty_print(docs):
  """Print each document's ``page_content`` under a numbered heading.

  Documents are numbered from 1 and separated by a line of 100 dashes.
  """
  divider = "\n" + "-" * 100 + "\n"
  sections = []
  for position, doc in enumerate(docs, start=1):
    sections.append(f"Document {position}:\n\n" + doc.page_content)
  print(divider.join(sections))

In [89]:
# Reload the saved index and rebuild the similarity retriever (top 6 hits per query).
# NOTE(review): this rebinds `vectorstore` and `retriever`; confirm whether the
# reload is still needed or is a leftover from iterating on the notebook.
# NOTE(security): allow_dangerous_deserialization=True permits unpickling the
# stored index; only use it on index directories you created yourself.
vectorstore = FAISS.load_local("./faiss_index", embeddings, allow_dangerous_deserialization=True)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

In [90]:
query_1 = "who does the document discuss about?"
query_2 = "what are his achievements?"

# Send each question to the plain similarity retriever and show its hits.
for question in (query_1, query_2):
  docs = retriever.invoke(question)
  pretty_print(docs)

Document 1:

SMS APIs, boosting operational resilience and ensuring uninterrupted business continuity. 
 
Programmer Analyst, Cognizant                                                                                                                                                  Jun 2019 - Jul 2021 
• 
Revamped the architecture of an Equity and Debt investment processing application for World Bank Group, improving case 
processing lifecycle and disbursement time by 30% through advanced system design. 
• 
Designed and implemented REST APIs for Java and .NET applications, integrating core business logic and configuring SLAs and 
routings to optimize workflow efficiency. 
• 
Utilized Splunk and Tracer for troubleshooting and resolving production issues, ensuring zero downtime and delivering timely 
fixes with minimal impact on end-users. 
• 
Provided L2 support for Pegasystems, diagnosing and resolving infrastructure and product-level issues to maintain system 
integrity and enhance user

In [91]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_cohere import CohereRerank
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
import cohere

cohere_api_key = os.getenv("COHERE_API_KEY")
cohere_client = cohere.ClientV2(cohere_api_key)

# Second stage: rerank the base retriever's hits and keep the best 3.
cohere_reranker = CohereRerank(
    top_n=3,
    model="rerank-english-v3.0",
    client=cohere_client,
)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=cohere_reranker,
)

query_1 = "who does the document discuss about?"
query_2 = "what are his achievements?"

# Same two questions as the plain retriever, now through the reranking retriever.
for question in (query_1, query_2):
  reranked_docs = compression_retriever.invoke(question)
  pretty_print(reranked_docs)

Document 1:

Maruti Mohit Rayalacheruvu 
(857) 313-2407 | rayalacheruvu.m@northeastern.edu | https://linkedin.com/in/marutimohitr 
 
EDUCATION 
Master of Science in Information Systems, Northeastern University​
December 2024 
 
Bachelor of Engineering in Computer Science, Visvesvaraya Technological University​
July 2021 
 
TECHNICAL SKILLS 
Programming Languages: Python, Java, C#, C,  C++ 
Data Science & ML: Scikit-learn, Statistical Analysis, Data Visualization (Matplotlib, Seaborn), Classification Models, 
Feature Engineering, Pandas, NumPy, Model Optimization, Jupyter, Predictive Modeling 
Technologies: SQL (PostgreSQL), AWS, JavaScript, TypeScript, React, Node.js, Express, MongoDB, Docker, CI/CD 
Frameworks: Data Structures & Algorithms, Microservices Architecture, .NET Core, Git, Linux, Unit Testing, Postman 
 
WORK EXPERIENCE 
Associate Software Engineer, Conga​
​
July 2021 – August 2022 
●​ Architected cloud-native microservices with C#, .NET Core, and AWS to process documents a

In [92]:
# from langchain_core.messages import HumanMessage, AIMessage
# from langchain_community.chat_message_histories import ChatMessageHistory

# chat_history = ChatMessageHistory()

In [93]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

# Chat model used by the QA chains built in this notebook.
llm = ChatOpenAI(model="gpt-4-0613", temperature=0.5, max_tokens=3000)

# Earlier variant, kept commented out: a chain that also returned its source
# documents and read its history from a module-level `chat_history` object.
# qa_chain_with_history = ConversationalRetrievalChain.from_llm(
#     llm =llm,
#     retriever =compression_retriever,
#     return_source_documents=True,
#     get_chat_history=lambda h: chat_history.messages
# )

# Conversational chain over the reranking retriever, created without memory.
# NOTE(review): with memory=None the caller presumably has to pass
# "chat_history" on every invoke — confirm before using this binding.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=compression_retriever,
    memory=None  # We will add memory below
)

In [108]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# ConversationalRetrievalChain reads prior turns from the "chat_history" input,
# while ConversationBufferMemory publishes them under "history" unless told
# otherwise. Without an explicit memory_key the chain cannot find its history.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=compression_retriever,
    memory=memory
)


In [115]:
# Scripted conversation: questions about the documents first, then two
# personal facts (queries 8-9) that query 10 asks the chain to recall.
# There is no user_query_6 in this set.
user_query_1 = "who does the document discuss about?"
user_query_2 = "what are their achievements?"
user_query_3 = "what certifications does mahith have? "
user_query_4 = "what certifications does maruti have?"
user_query_5 = "Does maruti have any experience working with AI/ML or Blockchain?"
user_query_7 = "what are their emails?"
user_query_8 = "my name is mahith"
user_query_9 = "my roommate name is ankit"
user_query_10 = "whats my name ? who's my roommate? "

# Order matters: the recall question must come after the facts it refers to.
user_queries = [user_query_1, user_query_2, user_query_3, user_query_4, user_query_5, user_query_7, user_query_8, user_query_9, user_query_10]

In [116]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# Initialize memory (this stores the conversation history under "chat_history")
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Initialize QA chain with memory
# NOTE(review): this chain is built on the plain `retriever`, not on
# `compression_retriever` — confirm that skipping the reranker is intended.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm, retriever=retriever, memory=memory
)

# Loop through user queries and invoke the chain
# NOTE(review): `memory` already tracks the conversation, so this hand-kept
# list of (question, answer) tuples looks redundant and may be ignored by the
# chain — confirm. It also rebinds the global name `chat_history` to a plain list.
chat_history = []  # Initialize chat history

for user_query in user_queries:
    # Pass both question and chat history to the chain
    response = qa_chain.invoke({
        "question": user_query,           # Pass the user query
        "chat_history": chat_history   # Pass the chat history
    })

    # Append the user query and answer to chat history
    chat_history.append((user_query, response["answer"]))

    # Print the answer
    print(f"Question: {user_query}\nAnswer: {response['answer']}\n")
    print("-----------------------------------------------------------------------------------------------------------------")


Question: who does the document discuss about?
Answer: The document discusses about three individuals:

1. Deepak Swaminathan, who has experience as a Programmer Analyst at Cognizant and as a Data Scientist and Machine Learning Engineer at Sopra Steria. He also served as a Teaching Assistant at Boston University.

2. Saimahith Chigurupati, who is a Software Engineer with expertise in designing scalable systems and delivering innovative solutions.

3. Maruti Mohit Rayalacheruvu, who has technical skills in various programming languages and technologies, and has worked as an Associate Software Engineer at Conga. He has also undertaken several projects related to machine learning and database management.

-----------------------------------------------------------------------------------------------------------------
Question: what are their achievements?
Answer: Deepak Swaminathan's achievements include:
- Improved model performance through feature engineering and hyperparameter tuning, 

In [71]:
def process_query_with_history(user_query):
    """Answer ``user_query`` through the history-aware QA chain and record the turn.

    Depends on two module-level objects that must exist before the first call:
    ``qa_chain_with_history`` (provides ``invoke`` and returns a mapping with an
    ``"answer"`` entry) and ``chat_history`` (provides ``messages``,
    ``add_user_message`` and ``add_ai_message``).

    Args:
        user_query: The raw question typed by the user.

    Returns:
        The answer text produced by the chain.
    """
    prompt = (
      "You are an expert assistant with a strong grasp of the subject matter. "
      "Please answer the following question succinctly, highlighting the key points. "
      "Format your response as follows: \n\n"
      " [Your answer here]\n"
      "Key Points: \n"
      "- Point 1: [Key insight 1]\n"
      "- Point 2: [Key insight 2]\n"
      "- Point 3: [Key insight 3]\n\n"
      "Ensure your response is relevant and avoid unnecessary elaboration. "
      "Consider both document context and our previous conversation when answering. "
      "if user asks to remember something, remember it and use it for answering when asked about it later. "
      f"Answer the following question: '{user_query}'"
    )

    # Invoke with only the *previous* turns as history. Recording the user
    # message first would hand the current question to the chain twice (as
    # history and as the question) and leave a dangling entry if invoke raises.
    response = qa_chain_with_history.invoke({
        "question": prompt,
        "chat_history": chat_history.messages
    })
    answer = response["answer"]

    # Store the completed turn: the raw user text, not the wrapped prompt.
    chat_history.add_user_message(user_query)
    chat_history.add_ai_message(answer)
    return answer

In [72]:
# Minimal interactive loop over the QA chain; type "exit" to stop.
while True:
    # Strip so that "exit " or " exit" still ends the session.
    user_input = input("\nYour Question: ").strip()
    if user_input.lower() == 'exit':
        break
    if not user_input:
        # Nothing was asked: skip the LLM round-trip and prompt again.
        continue

    response = process_query_with_history(user_input)
    print(f"\nRrsponse: {response}")


Your Question:  hey



Rrsponse: The greeting 'hey' is a friendly, informal way to say hello or attract attention.



Your Question:  my name is mahith



Rrsponse: As an expert assistant, I would respond: "Sure, Mahith. I've made a note of your name. How can I assist you further?" In subsequent interactions, I would use this information to personalize my responses, for example, "Mahith, based on your previous query, you might find this information useful..." or "How can I assist you today, Mahith?"



Your Question:  whats my name



Rrsponse: Your name is Mahith.



Your Question:  what do you think about my resume



Rrsponse: As an AI, I can't remember previous conversations, but based on the information provided, your resume is impressive. You have a strong background in software engineering, with experience in various programming languages and technologies. Your work history showcases your ability to lead projects and teams, develop complex systems, and contribute to high-stakes initiatives. Your skills in cloud computing, web technologies, and databases are well-documented and supported by your work experience. You have a Master's degree in Information Systems from a reputable university and have won notable awards at blockchain conferences. Your resume also shows a steady career progression, with roles at Cognizant, Optum, Northeastern University, Keelworks, and currently at Walmart.



Your Question:  exit


In [63]:
# Single-person question set: each question is asked independently, and the
# "his"/"he" wording assumes the documents describe one individual.
# NOTE(review): rebinds `user_queries` and the `user_query_N` names — confirm
# this cell runs before any loop that expects this particular set.
user_query_1 = "who does the document discuss about?"
user_query_2 = "what are his achievements?"
user_query_3 = "what certifications does he have?"
user_query_4 = "what are his skills?"
user_query_5 = "Does he have any experience working with AI/ML or Blockchain?"
user_query_6 = "what are his qualifications?"
user_query_7 = "what are his experiences?"

user_queries = [user_query_1, user_query_2, user_query_3, user_query_4, user_query_5, user_query_6, user_query_7]

In [64]:
# Dedicated single-turn RetrievalQA chain for this batch. The loop reads
# response['result'], which is RetrievalQA's output key; the notebook's
# `qa_chain` is a ConversationalRetrievalChain (answers under "answer"), so
# reusing it here fails once the notebook is run top to bottom.
retrieval_qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=compression_retriever)

for user_query in user_queries:
  # Wrap the question in a fixed response template before sending it.
  prompt = (
      "You are an expert assistant with a strong grasp of the subject matter. "
      "Please answer the following question succinctly, highlighting the key points. "
      "Format your response as follows: \n\n"
      " [Your answer here]\n"
      "Key Points: \n"
      "- Point 1: [Key insight 1]\n"
      "- Point 2: [Key insight 2]\n"
      "- Point 3: [Key insight 3]\n\n"
      "Ensure your response is relevant and avoid unnecessary elaboration. "
      f"Answer the following question: '{user_query}'"
    )
  response = retrieval_qa_chain.invoke({"query": prompt})
  print(f"Question: {user_query}\nAnswer: {response['result']}\n")
  print("-----------------------------------------------------------------------------------------------------------------")

# response = qa_chain.invoke(prompt)
# print(response['result'])

Question: who does the document discuss about?
Answer: The document discusses about Saimahith Chigurupati.

Key Points: 
- Point 1: Saimahith Chigurupati is a Software Engineer based in Boston, MA.
- Point 2: He has over 5 years of expertise in designing scalable systems and delivering innovative solutions.
- Point 3: His work experience includes roles at Walmart and Northeastern University, and he holds certifications as an AWS Certified Solutions Architect and Pega Certified Senior System Architect.

-----------------------------------------------------------------------------------------------------------------
Question: what are his achievements?
Answer: The individual's achievements include:

Key Points: 
- Point 1: Winning at several prestigious events such as the Harvard Blockchain Conference (HBC ’23), ETH Boston ‘23, and the Columbia Blockchain Hackathon (LionHack ‘23).
- Point 2: Successfully architecting and engineering downstream RESTful APIs with Java Spring Boot, integrat