In [22]:
print("Hello World!")

Hello World!


In [23]:
import fitz
import glob

def load_all_pdfs_in_directory(directory="./content/"):
  combined_text = ""
  pdf_paths = glob.glob(f"{directory}/*.pdf")

  for file_path in pdf_paths:
    with fitz.open(file_path) as pdf:
      for page in pdf:
        combined_text += page.get_text()

  return combined_text

pdf_data = load_all_pdfs_in_directory()
print("Pdf_data: ", pdf_data[:500])

Pdf_data:  Maruti Mohit Rayalacheruvu 
(857) 313-2407 | rayalacheruvu.m@northeastern.edu | https://linkedin.com/in/marutimohitr 
 
EDUCATION 
Master of Science in Information Systems, Northeastern University​
December 2024 
 
Bachelor of Engineering in Computer Science, Visvesvaraya Technological University​
July 2021 
 
TECHNICAL SKILLS 
Programming Languages: Python, Java, C#, C,  C++ 
Data Science & ML: Scikit-learn, Statistical Analysis, Data Visualization (Matplotlib, Seaborn), Classification Models, 


In [24]:
len(pdf_data)

13088

In [25]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=300
)

documents = text_splitter.split_text(pdf_data)
print("Number of documents: ", len(documents))


Number of documents:  15


In [26]:
documents[0]

'Maruti Mohit Rayalacheruvu \n(857) 313-2407 | rayalacheruvu.m@northeastern.edu | https://linkedin.com/in/marutimohitr \n \nEDUCATION \nMaster of Science in Information Systems, Northeastern University\u200b\nDecember 2024 \n \nBachelor of Engineering in Computer Science, Visvesvaraya Technological University\u200b\nJuly 2021 \n \nTECHNICAL SKILLS \nProgramming Languages: Python, Java, C#, C,  C++ \nData Science & ML: Scikit-learn, Statistical Analysis, Data Visualization (Matplotlib, Seaborn), Classification Models, \nFeature Engineering, Pandas, NumPy, Model Optimization, Jupyter, Predictive Modeling \nTechnologies: SQL (PostgreSQL), AWS, JavaScript, TypeScript, React, Node.js, Express, MongoDB, Docker, CI/CD \nFrameworks: Data Structures & Algorithms, Microservices Architecture, .NET Core, Git, Linux, Unit Testing, Postman \n \nWORK EXPERIENCE \nAssociate Software Engineer, Conga\u200b\n\u200b\nJuly 2021 – August 2022 \n●\u200b Architected cloud-native microservices with C#, .NET Co

In [27]:
from langchain_ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(model="llama3.2")

embedded_documents = embeddings.embed_documents(documents)

In [28]:
print("Embedding for the first document")
for i, value in enumerate(embedded_documents[0]):
  print(f"Component {i+1}: {value}")
  if i >= 5:
    break

Embedding for the first document
Component 1: 0.008273192
Component 2: -0.0009663236
Component 3: -0.013073141
Component 4: 0.008556816
Component 5: -0.0065383804
Component 6: 0.013164695


In [29]:
from langchain_chroma import Chroma
import chromadb

client = chromadb.Client()

chroma_db_directory = "./chroma_index"

vectorstore = Chroma.from_texts(texts=documents, embedding=embeddings, client=client, persist_directory=chroma_db_directory)

print("Vectorstore created and stored successfully")

Vectorstore created and stored successfully


In [30]:
print(f"Number of documents indexed: {len(documents)}")

Number of documents indexed: 15


In [31]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

In [32]:
def pretty_print(docs):
  print(
      f"\n{'-' * 100}\n".join(
          [f"Document {i+1}:\n\n"+ d.page_content for i,d in enumerate(docs)]
      )
  )

In [33]:
# Define your queries
query_1 = "who does the document discuss about?"
query_2 = "what are his achievements?"

# Retrieve relevant documents for the queries
docs_1 = retriever.invoke(query_1)
docs_2 = retriever.invoke(query_2)

# Pretty print the results
print("Results for query 1:")
pretty_print(docs_1)

print("\nResults for query 2:")
pretty_print(docs_2)


Results for query 1:
Document 1:

SKILLS
Frameworks: TensorFlow, TernsorFlow Serving, PyTorch, PySpark
Models: DBSCAN, BERT, R-CNN, Multi-Linear Regression, Polynomial Regression, Random Forest, SVM, SARSA
Cloud: Amazon Web Services, Google Cloud Platform.
EXPERIENCE
Sopra Steria
Jan 2021 - Dec 2023
Data Scientist
• Gathered historical data on user roles and access patterns. Applied Singular Value Decomposition to reduce
the dimensionality of the user-permission matrix. The DBSCAN clustering algorithm was used to analyze
user roles, attributes, and group relationships, ensuring accurate and scalable access control.
Machine Learning Engineer
• Built an NLP model using BERT to analyze detailed descriptions of ServiceNow tickets and automatically
classify them into resolution categories. The model was trained on historical ticket data to improve assign-
ment accuracy and reduce manual efforts.
• Developed a RAG model using Hugging Face Transformers, integrating organizational password pol

In [35]:
from langchain.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma
import chromadb
from langchain_cohere import CohereRerank
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
import os
import cohere

cohere_api_key = os.getenv("COHERE_API_KEY")
cohere_client = cohere.ClientV2(cohere_api_key)

cohere_reranker = CohereRerank(client=cohere_client, model="rerank-english-v3.0", top_n=3)

compression_retriever = ContextualCompressionRetriever(
    base_compressor=cohere_reranker,
    base_retriever=retriever
)

Results for query 1:
Document 1:

Object Detection on Google’s Open Images Dataset:
• Using the distributed computing capabilities of PySpark to preprocess Google’s Open Images Dataset (1M+
images) and implemented R-CNN and SSD object detection models using PyTorch on GCP’s distributed in-
frastructure.
Game of Tigers and Goats:
• Developed and implemented a game that features two players (Goat and Tiger). Designed and trained SARSA
reinforcement learning agents to optimize player strategies, improving model performance through iterative
game simulations.
Parkinson’s Disease Regression Analysis
• Statistical tests (ANOVA, F-Test) and feature selection techniques were performed to develop regression
models to predict Total UPDRS scores in Parkinson’s patients, utilizing voice-related features. Built and op-
timized multiple models, including Multi-Linear and Polynomial Regression, to improve prediction accuracy.
Online Shoppers Purchasing Intention
• Built predictive models using classi

In [36]:
from langchain.chains import RetrievalQA
from langchain_ollama import ChatOllama

llm = ChatOllama(model="llama3.2", temperature=0.5, max_tokens=3000)
qa_chain = RetrievalQA.from_chain_type(llm, retriever=compression_retriever)

In [37]:
user_query_1 = "who does the document discuss about?"
user_query_2 = "what are their achievements?"
user_query_3 = "what certifications does mahith have? "
user_query_4 = "what certifications does maruti have?"
user_query_5 = "Does maruti have any experience working with AI/ML or Blockchain?"
user_query_7 = "what are their emails?"
user_query_8 = "my name is mahith"
user_query_9 = "my roommate name is ankit"
user_query_10 = "whats my name ? who's my roommate? "

user_queries = [user_query_1, user_query_2, user_query_3, user_query_4, user_query_5, user_query_7, user_query_8, user_query_9, user_query_10]

In [38]:
for user_query in user_queries:
  prompt = (
      "You are an expert assistant with a strong grasp of the subject matter. "
      "Please answer the following question succinctly, highlighting the key points. "
      f"Ensure your response is relevant and avoid unnecessary elaboration. "
      "make sure to utilize
      f"Answer the following question: '{(user_query)}'"
    )
  response = qa_chain.invoke(prompt)
  print(f"Question: {user_query}\nAnswer: {response['result']}\n")
  print("-----------------------------------------------------------------------------------------------------------------")

Question: who does the document discuss about?
Answer: [The document discusses various individuals, but primarily focuses on a single person.]

Key Points:
- The document highlights the author's own experiences and achievements.
- It also mentions the author's education background at Boston University (expected May 2025).
- There is no specific individual discussed in the main body of the text.

-----------------------------------------------------------------------------------------------------------------
Question: what are their achievements?
Answer: [Their Achievements]

Key Points:
- Winner at Harvard Blockchain Conference (HBC ’23)
- Winner at ETH Boston ‘23
- Winner at Columbia Blockchain Hackathon (LionHack ‘23)

-----------------------------------------------------------------------------------------------------------------
Question: what certifications does mahith have? 
Answer: I don't have information on Mahith's certifications.

Key Points:
- None

------------------------

TooManyRequestsError: status_code: 429, body: {'message': "You are using a Trial key, which is limited to 10 API calls / minute. You can continue to use the Trial key for free or upgrade to a Production key with higher rate limits at 'https://dashboard.cohere.com/api-keys'. Contact us on 'https://discord.gg/XW44jPfYJu' or email us at support@cohere.com with any questions"}