In [None]:
#imports
import pandas as pd
import numpy as np
import pymupdf
import re
from sentence_transformers import SentenceTransformer, util
import faiss
import json
import google.generativeai as genai
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if one exists) into the process
# environment so the os.getenv() lookup for the API key can find them.
load_dotenv()

In [None]:
# Read the Gemini API key from the environment; this is None when the variable is unset.
api_key = os.environ.get("GOOGLE_API_KEY")

In [76]:
# Register the API key with the google.generativeai client used later for generation.
genai.configure(api_key=api_key)

In [2]:
# Location of the resume PDF to index.
# NOTE(review): this is a hardcoded absolute path at the filesystem root, so the
# notebook only runs where the file sits exactly there; consider a path relative
# to a configurable data directory.
PDF_PATH = "/Moazzam resume.pdf"

In [None]:
def read_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    Pages are visited in document order and their extracted text is joined
    with no separator added between pages.
    """
    with pymupdf.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)


def clean_text(text):
    """Normalise extracted PDF text for chunking and embedding.

    Steps:
      * lowercase the text;
      * turn every whitespace character (newline, tab, carriage return,
        non-breaking space, ...) into a plain space, so the words on either
        side stay separated;
      * drop any remaining non-printable characters and characters in the
        Unicode Private Use Area (U+E000-U+F8FF);
      * strip leading and trailing whitespace.

    Runs of spaces are not collapsed.

    Args:
        text: Raw text, e.g. as returned by ``read_pdf``.

    Returns:
        The cleaned, lowercased string.
    """
    kept = []
    for ch in text.lower():
        if ch.isspace():
            # Only '\n' used to be mapped to a space; other whitespace is
            # non-printable and was deleted, gluing neighbouring words together.
            kept.append(' ')
        elif ch.isprintable() and not (0xE000 <= ord(ch) <= 0xF8FF):
            kept.append(ch)
    return ''.join(kept).strip()

def chunk_text(text, chunk_size=300, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Each chunk holds up to ``chunk_size`` words, and consecutive chunks start
    ``chunk_size - overlap`` words apart. Chunking stops as soon as a chunk
    reaches the end of the text, so no trailing chunk is emitted that consists
    only of words already fully contained in the previous chunk.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of chunk strings (words joined by single spaces). Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(' '.join(words[start:end]))
        if end >= len(words):
            # This window already covers the tail; another one would only
            # repeat overlap words.
            break
    return chunks


In [27]:
# Extract the raw resume text from the PDF, then normalise it.
# NOTE(review): the second assignment rebinds the name `clean_text` from the
# function to its string result. Re-running this cell without first re-running
# the cell that defines clean_text() fails, because a str is not callable.
# A distinct variable name would avoid this, but later cells read `clean_text`
# as the cleaned string, so they would have to be renamed together.
raw_text = read_pdf(PDF_PATH)
clean_text = clean_text(raw_text)

In [28]:
# Display the cleaned text for inspection.
# NOTE(review): the rendered output includes personal contact details
# (phone number, e-mail address); clear the output before sharing the notebook.
clean_text

'muhammad moazzam ml/ai engineer lahore +923228032990 moazzamaleem786@gmail.com summary i am a dynamic and motivated software engineering student at ucp with a deep passion for machine learning and artificial intelligence. proficient in python, with skills in data manipulation and visualization using libraries such as pandas, numpy, and matplotlib. experienced in building machine learning models and translating complex data into actionable insights. eager to apply my knowledge of ai/ml fundamentals through innovative projects, i thrive on challenges and am committed to developing intelligent solutions that drive real-world impact. skills programming languages: python, c++ data visualization matplotlib, seaborn, plotly machine learning model building, supervised and unsupervised learning artificial intelligence basic understanding of ai concepts tools & technologies jupyter notebooks, scikit-learn problem solving: analytical thinking, data-driven decision making database management sql,

In [40]:
# Split the cleaned text into overlapping word chunks using chunk_text()'s defaults.
# At this point `clean_text` is the cleaned string, not the function of the same name.
chunks = chunk_text(clean_text)

In [41]:
# Number of chunks produced from the resume text.
len(chunks)

2

In [42]:
# Inspect the chunk contents.
# NOTE(review): this prints the full resume text, including contact details;
# clear the output before sharing the notebook.
chunks

['muhammad moazzam ml/ai engineer lahore +923228032990 moazzamaleem786@gmail.com summary i am a dynamic and motivated software engineering student at ucp with a deep passion for machine learning and artificial intelligence. proficient in python, with skills in data manipulation and visualization using libraries such as pandas, numpy, and matplotlib. experienced in building machine learning models and translating complex data into actionable insights. eager to apply my knowledge of ai/ml fundamentals through innovative projects, i thrive on challenges and am committed to developing intelligent solutions that drive real-world impact. skills programming languages: python, c++ data visualization matplotlib, seaborn, plotly machine learning model building, supervised and unsupervised learning artificial intelligence basic understanding of ai concepts tools & technologies jupyter notebooks, scikit-learn problem solving: analytical thinking, data-driven decision making database management sql

In [50]:
# Load the sentence-embedding model used to encode both the resume chunks and
# the query. The model files are downloaded on first use.
model = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [51]:
# Encode every chunk into a dense vector (one row per chunk).
embedding = model.encode(chunks)

In [53]:
# Sanity check: expect (number of chunks, embedding dimension).
embedding.shape

(2, 384)

In [61]:
# FAISS operates on C-contiguous float32 matrices. Enforce that layout here
# instead of relying on whatever dtype/layout `embedding` happens to have.
embedding_matrix = np.ascontiguousarray(embedding, dtype=np.float32)
dimensions = embedding_matrix.shape[1]

# Flat L2 index: vector i in the index corresponds to chunks[i].
index = faiss.IndexFlatL2(dimensions)
index.add(embedding_matrix)

# Persist the index to disk so it can be reloaded later.
faiss.write_index(index, "index.faiss")

In [63]:
# Store the chunk texts as a JSON list in chunks.json.
with open("chunks.json", mode="w") as chunks_file:
    json.dump(chunks, fp=chunks_file)


In [79]:
query = "What kinda candidate is Muhammad Moazzam for ML intenrship"
# encode() receives a one-element list; take row 0 to get the query vector.
query_embedding = model.encode([query])[0]

# Never ask for more neighbours than the index holds: FAISS pads missing
# results with index -1, and chunks[-1] would silently return the LAST chunk,
# duplicating it in the context. Keep k >= 1 so the search call stays valid
# even for an empty index (all of its results are then filtered out below).
top_k = min(5, max(index.ntotal, 1))
distances, indices = index.search(np.array([query_embedding]), top_k)

# Drop any -1 padding before mapping result positions back to chunk texts.
matched_chunks = [chunks[i] for i in indices[0] if i >= 0]

print("Matched chunks:")
matched_chunks

Matched chunks:


['muhammad moazzam ml/ai engineer lahore +923228032990 moazzamaleem786@gmail.com summary i am a dynamic and motivated software engineering student at ucp with a deep passion for machine learning and artificial intelligence. proficient in python, with skills in data manipulation and visualization using libraries such as pandas, numpy, and matplotlib. experienced in building machine learning models and translating complex data into actionable insights. eager to apply my knowledge of ai/ml fundamentals through innovative projects, i thrive on challenges and am committed to developing intelligent solutions that drive real-world impact. skills programming languages: python, c++ data visualization matplotlib, seaborn, plotly machine learning model building, supervised and unsupervised learning artificial intelligence basic understanding of ai concepts tools & technologies jupyter notebooks, scikit-learn problem solving: analytical thinking, data-driven decision making database management sql

In [None]:
# Assemble the prompt first: retrieved chunks as context, followed by the question.
context = "\n\n".join(matched_chunks)
prompt = f"""Answer the question based on the context below:

{context}

Question: {query}

Make sure the answer is concise and relevant only, no extra text.
"""

# Then create the Gemini model handle and request a completion for the prompt.
gen_model = genai.GenerativeModel("gemini-1.5-flash-latest")
response = gen_model.generate_content(prompt)

answer = response.text
print(answer)

A strong candidate; proficient in Python, experienced in building ML models, and has several relevant projects.

