<a href="https://colab.research.google.com/github/MBilalSharif/RAG-based-CV-Reader/blob/main/CV_Reader_RAG_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the packages this notebook imports. The leading "!" hands the line
# to the shell (notebook syntax, not plain Python); -q keeps pip's output quiet.
!pip install -q langchain langchain-community chromadb sentence-transformers pypdf google-generativeai
# Reached only after the install command has returned.
print("Installed!")

In [None]:
# File name of the CV to index; the loading step chooses a loader from its
# extension (.pdf, .txt, anything else is read as plain text).
cv_filename = "M.Bilal Sharif.pdf"
# The check-mark emoji had been saved mis-encoded (its UTF-8 bytes decoded as
# cp1252); it is restored here so the message prints as intended.
print(f"✅ Using CV file: {cv_filename}")

In [None]:
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_core.documents import Document

# Choose a loader from the file extension. The comparison is done on the
# lower-cased name so "CV.PDF" / "CV.TXT" are treated like their lower-case
# forms instead of silently falling through to the plain-text branch.
if cv_filename.lower().endswith('.pdf'):
    loader = PyPDFLoader(cv_filename)
    documents = loader.load()
elif cv_filename.lower().endswith('.txt'):
    # Decode explicitly as UTF-8 rather than with the platform's default
    # encoding, which differs between machines.
    loader = TextLoader(cv_filename, encoding='utf-8')
    documents = loader.load()
else:
    # Unknown extension: read the file as UTF-8 text and wrap it in a single
    # Document so later steps get the same list-of-Document shape.
    with open(cv_filename, 'r', encoding='utf-8') as f:
        documents = [Document(page_content=f.read())]

# Quick sanity report on what was loaded.
print(f"Loaded {len(documents)} document(s)")
print(f"Total characters: {sum(len(doc.page_content) for doc in documents)}")



In [None]:
import re


# Clean every loaded page in place, then merge all pages into one Document.
for page in documents:
    # Delete a lone whitespace character that sits directly between two word
    # characters (runs of two or more whitespace characters are untouched).
    # NOTE(review): this also glues ordinary words separated by one space;
    # presumably the PDF text comes out letter-spaced - confirm on the input.
    cleaned = re.sub(r'(?<=\w)\s(?=\w)', '', page.page_content)
    # Squash every remaining whitespace run (newlines included) to one space.
    cleaned = re.sub(r'\s+', ' ', cleaned)
    # Remove whitespace that precedes a punctuation mark.
    cleaned = re.sub(r'\s+([.,;:!?])', r'\1', cleaned)
    page.page_content = cleaned.strip()

# Join the pages with single spaces. Splitting on whitespace and re-joining
# collapses any doubled spaces (e.g. around an empty page) and trims the ends;
# no newlines remain after the per-page pass above.
full_text = ' '.join(page.page_content for page in documents)
full_text = ' '.join(full_text.split())

# From here on the CV is a single Document holding the cleaned text.
documents = [Document(page_content=full_text)]

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the document(s) into chunks of at most 200 characters, with up to
# 100 characters of overlap between neighbouring chunks.
splitter_settings = {"chunk_size": 200, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

chunks = text_splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks")

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Embedding model used to turn each chunk into a vector.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

print("Creating vector database...")
# Build the Chroma store from the chunks; persist_directory points it at
# ./cv_vectorstore.
vectorstore = Chroma.from_documents(
    persist_directory="./cv_vectorstore",
    embedding=embeddings,
    documents=chunks,
)
print("Vector database created!")

In [None]:
# Expose the vector store as a retriever: plain similarity search that
# returns only the single best-matching chunk (k = 1).
retriver = vectorstore.as_retriever(
    search_kwargs={"k": 1},
    search_type='similarity',
)
print("Retriver ready!")

In [None]:
# Manual check of the retriever: run one sample query. As the last bare
# expression of the cell, its return value is what the notebook displays.
retriver.invoke("Summary of CV")

In [None]:
import google.generativeai as genai
from getpass import getpass


# Read the API key with getpass so the typed value is not echoed into the
# cell output. The key emoji in the prompt had been saved mis-encoded (UTF-8
# bytes decoded as cp1252) and is restored here. Surrounding whitespace from
# copy/paste is stripped before the key is used.
api_key = getpass("🔑 Paste your Google AI API key: ").strip()
if not api_key:
    # Fail here with a clear message instead of at the first model call.
    raise ValueError("No Google AI API key was entered.")
genai.configure(api_key=api_key)

# Model handle used for answering questions: a low temperature and a cap of
# 500 output tokens per reply.
model = genai.GenerativeModel(
    model_name='models/gemini-2.5-flash',
    generation_config={
        'temperature': 0.3,
        'max_output_tokens': 500,
    }
)

print("Gemini ready!")

In [None]:
from langchain_core.prompts import PromptTemplate

# Prompt sent to the model. It has two placeholders: {context} receives the
# retrieved CV text and {question} the user's question. The instructions tell
# the model to answer only from that context and to say it does not know
# otherwise.
prompt = PromptTemplate(
    input_variables=['context', 'question'],
    template="""
    You are a helpful assistant
    Answer only from the provided context
    If the context is insufficent , just say you don't know the answer.

    {context}
    Question:{question}
    """,
)

In [None]:
# The question to answer from the CV.
question = "What is the name of Student in CV?"

# Fetch the chunk(s) the retriever ranks as most similar to the question.
retrieved_docs = retriver.invoke(question)

In [None]:
# Concatenate the retrieved chunks, separated by a blank line. The separator
# used to be "n\n" (a typo), which put a literal "n" between chunks.
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)

In [None]:
# Fill the template's two placeholders to get the exact text sent to the model.
final_prompt = prompt.format(
    question=question,
    context=context_text,
)

In [None]:
# Send the assembled prompt to the model.
response = model.generate_content(final_prompt)

# `response.text` raises ValueError when the reply carries no text part (for
# example a blocked prompt, or the output-token cap reached before any text
# was produced). Report that instead of ending the cell with a traceback.
try:
    print(response.text)
except ValueError as err:
    print(f"Gemini returned no text: {err}")