<a href="https://colab.research.google.com/github/Josogrephy/simple_rag/blob/main/Simple_Resume_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
!pip install sentence-transformers faiss-cpu pypdf2
# Install the Google Generative AI library
!pip install -q -U google-generativeai


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-tran

In [14]:
import numpy as np
from sentence_transformers import SentenceTransformer
import PyPDF2
import re
import io

file_name = "/content/jsian_resume.pdf"

# 2. Extract text from the PDF
raw_text_from_pdf = ""
try:
  with open(file_name, "rb") as f:
    pdf_reader = PyPDF2.PdfReader(f)
    num_pages = len(pdf_reader.pages)
    print(f"Number of pages in PDF: {num_pages}")

    for page_num in range(num_pages):
        page_obj = pdf_reader.pages[page_num]
        raw_text_from_pdf += page_obj.extract_text() + "\n" # Add a newline between pages

except Exception as e:
  print(f"Error reading PDF: {e}")
  raw_text_from_pdf = None

if raw_text_from_pdf and raw_text_from_pdf.strip(): # Check if text was successfully extracted
  # 3. Chunk your document by sections
  # Define section headers (case-insensitive for robustness)
  section_headers = ["EXPERIENCE", "EDUCATION", "SKILLS"]
  # Create a regex pattern to match these headers at the beginning of a line
  # We use \s* to account for potential leading whitespace
  # We use (?:...) for a non-capturing group to match the options
  # We use re.IGNORECASE for case-insensitive matching
  section_header_pattern = re.compile(r'^\s*(?:' + '|'.join(re.escape(h) for h in section_headers) + r')\s*$', re.IGNORECASE)

  lines = raw_text_from_pdf.split('\n')
  chunks = []
  current_chunk_lines = []
  first_section_found = False

  for line in lines:
      stripped_line = line.strip()
      # Check if the line is a section header
      if section_header_pattern.match(stripped_line):
          # If we have accumulated lines for a previous chunk, save it
          if current_chunk_lines:
              chunks.append("\n".join(current_chunk_lines).strip())
              current_chunk_lines = [] # Reset for the new section
          current_chunk_lines.append(stripped_line) # Add the section header itself to the new chunk
          first_section_found = True # Mark that we've started with a section

      else:
          # If it's not a section header, add it to the current chunk.
          # If no section header has been found yet, this content is part of the initial "header" chunk.
          current_chunk_lines.append(line)

  # Add the last accumulated chunk
  if current_chunk_lines:
      chunks.append("\n".join(current_chunk_lines).strip())

  # Post-processing: If the first chunk doesn't start with a section header,
  # it's likely the personal details/contact info. We want to keep that as a separate chunk.
  # The above logic correctly handles this, but a quick check to ensure.
  # If the very first chunk contains only pre-section data, it will be the first chunk.

  # Filter out any completely empty chunks that might result from extra newlines
  chunks = [chunk for chunk in chunks if chunk.strip()]


  print(f"\nNumber of text chunks created: {len(chunks)}")
  print("\n--- Sample Chunks ---")
  for i, chunk in enumerate(chunks): # Print all chunks to verify
    print(f"Chunk {i+1} (approx {len(chunk)} chars):\n{chunk[:500]}...\n-------------------\n") # Print first 500 chars of chunk

  my_documents = chunks # These are the "documents" we will embed

  # 4. Load a pre-trained Sentence Transformer model
  model = SentenceTransformer('all-MiniLM-L6-v2')

  # 5. Generate embeddings for your document chunks
  print("\nGenerating embeddings for document chunks...")
  document_embeddings = model.encode(my_documents)

  # Check the shape of our embeddings
  print("Shape of document embeddings:", document_embeddings.shape)

  # Save these embeddings for later use.
  np.save("my_document_embeddings.npy", document_embeddings)
  # Also save the processed text chunks if you need to map back to them later
  with open("my_document_chunks.txt", "w", encoding="utf-8") as f:
      for chunk in my_documents:
          f.write(chunk + "\n===\n") # Using a separator

  print("Embeddings generated and saved!")
  print("Text chunks also saved to my_document_chunks.txt")

else:
  print("No text extracted or text is empty.")

Number of pages in PDF: 3

Number of text chunks created: 4

--- Sample Chunks ---
Chunk 1 (approx 110 chars):
Joseph Sian Gou Wei
siangouweijoseph@gmail.com |+65 92265048 |https://www.linkedin.com/in/joseph-sian-gou-wei/...
-------------------

Chunk 2 (approx 7342 chars):
Experience
Advertising Solutions Architect, Apps & Measurement Jul. 2022 – Present
Google
•Served as the primary point of contact for professional services in the Malaysia market, enhancing client engagement
and improving service delivery.
•Delivered cross-functional support to Commercial, Product, and Operational teams, enhancing collaboration and
streamlining processes to improve overall efficiency.
•Analyzed clients’ challenges and developed tailored solutions, resulting in improved client sa...
-------------------

Chunk 3 (approx 325 chars):
Education
Murdoch University 2017
Bachelor of Commerce - BCom, Marketing and Web Communication (Double Majors)
Singapore Polytechnic 2022
Specialist diploma in Data science

In [15]:
# Ensure you have run the previous code block or have the necessary libraries and data loaded.

# If you haven't run the previous block in the same session, load the embeddings:
# document_embeddings = np.load("my_document_embeddings.npy")
# model = SentenceTransformer('all-MiniLM-L6-v2') # if needed for query embedding later

# 1. Get the dimensionality of our embeddings
d = document_embeddings.shape[1] # Dimension of embeddings

# 2. Create a FAISS index
# IndexFlatL2 is a basic index that performs exact L2 distance search.
# For very large datasets, you might explore more complex FAISS indexes like IndexIVFFlat.
index = faiss.IndexFlatL2(d)

# 3. Add the document embeddings to the index
index.add(document_embeddings)

# Check if the embeddings are added
print("Number of vectors in the FAISS index:", index.ntotal)

# We can save the FAISS index to disk
faiss.write_index(index, "my_faiss_index.index")

print("FAISS index created, populated, and saved!")

Number of vectors in the FAISS index: 4
FAISS index created, populated, and saved!


In [21]:
import google.generativeai as genai

# --- Configuration ---
# Make sure you have your Gemini API key stored as a secret in Colab.
# Name it 'GEMINI_API_KEY'.
try:
    GEMINI_API_KEY = 'Gemini API KEy' #Get it at https://aistudio.google.com/
    genai.configure(api_key=GEMINI_API_KEY)
except Exception as e:
    print(f"An error occurred during API key configuration: {e}")
    GEMINI_API_KEY = None



# 1. Define a user query
user_query = "Why should I hire Joseph?"

# 2. Embed the user query
# It's crucial to use the SAME model for embedding the query as you used for the documents.
query_embedding = model.encode([user_query]) # Pass the query as a list

# 3. Search the FAISS index
k = 5 # Number of top relevant documents to retrieve
distances, indices = index.search(query_embedding, k)

# 'indices' will contain the row numbers of the most similar documents in your original 'my_documents' list.
# 'distances' will contain the corresponding similarity scores (e.g., L2 distances).

print(f"Query: {user_query}")
print(f"Retrieved document indices: {indices}")
print(f"Distances: {distances}")

# 4. Retrieve the actual document content
retrieved_docs_content = [my_documents[i] for i in indices[0]]

print("\n--- Retrieved Documents ---")
for i, doc in enumerate(retrieved_docs_content):
    print(f"Doc {i+1}: {doc}")

# 5. Prepare the context and prompt for Gemini
context_for_llm = "\n\n".join(retrieved_docs_content)

prompt_template = f"""Based ONLY on the following context, answer the question.
If the context doesn't contain the answer, say "I don't have enough information from the provided documents."

Context:
{context_for_llm}

Question: {user_query}

Answer:
"""

print("\n--- Prompt for LLM ---")
print(prompt_template)

# 6. Call the Gemini API (if the API key is available)
if GEMINI_API_KEY:
    try:
        llm_model = genai.GenerativeModel('gemini-1.5-flash-latest') # Or your preferred Gemini model
        response = llm_model.generate_content(prompt_template)

        print("\n--- LLM Response ---")
        print(response.text)
    except Exception as e:
        print(f"\nError during Gemini API call: {e}")
        print("Please ensure your API key is correct and you have API access.")
else:
    print("\nSkipping Gemini API call as API key is not configured.")

Query: Why should I hire Joseph?
Retrieved document indices: [[ 0  2  1  3 -1]]
Distances: [[1.5808572e+00 1.8517233e+00 1.8551801e+00 1.8894284e+00 3.4028235e+38]]

--- Retrieved Documents ---
Doc 1: Joseph Sian Gou Wei
siangouweijoseph@gmail.com |+65 92265048 |https://www.linkedin.com/in/joseph-sian-gou-wei/
Doc 2: Education
Murdoch University 2017
Bachelor of Commerce - BCom, Marketing and Web Communication (Double Majors)
Singapore Polytechnic 2022
Specialist diploma in Data science(Artificial intelligence), Artificial Intelligence
Singapore Polytechnic 2013
Diploma in Business Information Technology, Business information technology
Doc 3: Experience
Advertising Solutions Architect, Apps & Measurement Jul. 2022 – Present
Google
•Served as the primary point of contact for professional services in the Malaysia market, enhancing client engagement
and improving service delivery.
•Delivered cross-functional support to Commercial, Product, and Operational teams, enhancing collaboration a