In [None]:
##Old Version

In [13]:
# imports
from openai import OpenAI
from pypdf import PdfReader
import gradio as gr
from google.colab import userdata

client = OpenAI(
    api_key= userdata.get('OPENAI_API_KEY')
)

model_name = "gpt-4o-mini"

reader = PdfReader(r"/Leena Almoradi CV 2025 (6).pdf")
cv = ""
for page in reader.pages:
    text = page.extract_text()
    if text:
        cv += text

name = "leena almoradi"

system_prompt = f"You are acting as {name}. You are answering questions on {name}'s website, \
particularly questions related to {name}'s career, background, skills and experience. \
Your responsibility is to represent {name} for interactions on the website as faithfully as possible. \
You are given a cv of {name}'s background use to answer questions. \
Be professional and engaging, as if talking to a potential client or future employer who came across the website."

system_prompt += f"\n\n## CV:\n{cv}\n\n"
system_prompt += f"With this context, please chat with the user, always staying in character as {name}."

def chat(message, history):
    messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": message}]
    response = client.chat.completions.create(model=model_name, messages=messages)

    return response.choices[0].message.content

gr.ChatInterface(chat, type="messages").launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c5cfd4b6ebfefaa38f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
## Fixed Chunking

In [None]:
# =============================
# Imports
# =============================
from openai import OpenAI
from pypdf import PdfReader
import gradio as gr
import faiss
import numpy as np
from google.colab import userdata
from langchain.text_splitter import RecursiveCharacterTextSplitter

# =============================
# Init
# =============================
client = OpenAI(api_key=userdata.get("OPENAI_API_KEY"))
chat_model = "gpt-4o-mini"
embedding_model = "text-embedding-3-small"

# =============================
# Step 1: Load CV
# =============================
reader = PdfReader(r"/content/Karim Nabil PM Resume - August 2025.pdf")
cv_text = "".join(page.extract_text() or "" for page in reader.pages)

# =============================
# Step 2: Chunking (LangChain)
# =============================
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    separators=["\n\n", "\n", ".", " ", ""]
)
chunks = text_splitter.split_text(cv_text)

# =============================
# Step 3: Embeddings + FAISS
# =============================
# Create embeddings for all chunks
embeddings = [
    client.embeddings.create(model=embedding_model, input=chunk).data[0].embedding
    for chunk in chunks
]

embeddings = np.array(embeddings).astype("float32")

# build FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# keep chunks aligned by index
chunk_list = chunks

# =============================
# Step 4: Retrieval
# =============================
def retrieve(query, top_k=3):
    query_emb = client.embeddings.create(model=embedding_model, input=query).data[0].embedding
    query_emb = np.array([query_emb], dtype="float32")
    D, I = index.search(query_emb, top_k)
    return [chunk_list[i] for i in I[0]]

# =============================
# Step 5: Chat
# =============================
def chat(message, history):
    context_chunks = retrieve(message, top_k=3)
    context = "\n\n".join(context_chunks)

    system_prompt = f"""You are acting as Karim Nabil.
Answer questions faithfully using the following retrieved context from Karim’s CV:

{context}

Always stay professional and engaging, as if speaking to a potential client or employer.
If something is not in the CV, politely say you don’t have that info.
"""

    messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": message}]
    response = client.chat.completions.create(model=chat_model, messages=messages)
    return response.choices[0].message.content

# =============================
# Gradio UI
# =============================
gr.ChatInterface(chat, type="messages").launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f4d6420dcce30e0f50.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [None]:
## Fixed Chunking

ModuleNotFoundError: No module named 'pypdf'

In [None]:
!pip install faiss-cpu




In [None]:
## Context-Aware Chunking

In [None]:
import json
from openai import OpenAI
from pypdf import PdfReader
import gradio as gr
import faiss
import numpy as np
from google.colab import userdata
from langchain.text_splitter import RecursiveCharacterTextSplitter

# =============================
# Init
# =============================
client = OpenAI(api_key=userdata.get("OPENAI_API_KEY"))
chat_model = "gpt-4o-mini"
embedding_model = "text-embedding-3-small"

# =============================
# Step 1: Load CV
# =============================
reader = PdfReader(r"/content/Karim Nabil PM Resume - August 2025.pdf")
cv_text = "".join(page.extract_text() or "" for page in reader.pages)

def llm_chunk_cv(cv_text):
    messages = [
        {
            "role": "system",
            "content": (
                "Split the following CV into logical sections for example"
                "(Experience, Education, Summary). "
                "Return each section as a JSON array of objects with 'section' and 'content' ."
            )
        },
        {"role": "user", "content": cv_text}
    ]

    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0
    )

    raw_output = resp.choices[0].message.content.strip()

    # Remove markdown code block fences if they exist
    if raw_output.startswith("```json"):
        raw_output = raw_output[7:]
    if raw_output.endswith("```"):
        raw_output = raw_output[:-3]

    parsed = json.loads(raw_output)
    return parsed

sections = llm_chunk_cv(cv_text)

In [None]:
print(json.dumps(sections, indent=2))

[
  {
    "section": "Summary",
    "content": "Driven professional with a blend of engineering expertise, business development acumen, and product management skills. Demonstrated success in pioneering product initiatives, forging strategic partnerships, and leveraging data for user-centric solutions. Adept at translating market insights into actionable product strategies. Eager to apply my versatile background to lead and innovate in product management roles, delivering value and driving growth."
  },
  {
    "section": "Experience",
    "content": [
      {
        "title": "AI Product Manager/ Business Developer",
        "company": "Stealth Mode",
        "duration": "March 2025 \u2013 Ongoing",
        "location": "Canada, Full-time, remote",
        "industry": "Agentic AI OS SaaS",
        "responsibilities": [
          "Led the product discovery phase by engaging directly with target users to uncover pain points, validate core problems, and define solution fit.",
          "De

In [None]:
embeddings = []
section_texts = []
metadata = []

for s in sections:
    section_name = s["section"]
    content = s["content"]

    if not content:
        continue

    emb = client.embeddings.create(
        model=embedding_model,
        input=json.dumps(content, indent=2)
    ).data[0].embedding

    embeddings.append(emb)
    section_texts.append(content)
    metadata.append(section_name)

# Convert to numpy for FAISS
embeddings = np.array(embeddings, dtype="float32")
print(metadata, embeddings, section_texts)

['Summary', 'Experience', 'Extracurricular Activities', 'Education', 'Courses and Certifications', 'Skill Highlights', 'Languages', 'Initiatives and Accomplishments'] [[ 8.89946613e-03 -9.45531297e-03  1.71721261e-02 ...  8.44118826e-04
   3.25702727e-02 -9.28382855e-03]
 [-2.84280125e-02 -2.81259650e-03  3.96570787e-02 ... -6.21862803e-03
  -7.73952669e-03  4.12561558e-03]
 [-4.71006474e-03 -8.84763524e-03  8.09948985e-03 ... -1.03048924e-02
  -1.16710719e-02 -5.97540662e-03]
 ...
 [-1.61280092e-02 -2.36737952e-02  6.13268837e-02 ... -1.82135291e-02
  -3.78444602e-05  1.85800735e-02]
 [-1.49572631e-02 -3.10543017e-03  7.57737160e-02 ... -3.92277688e-02
  -1.75781362e-02  5.06803766e-03]
 [-5.97637845e-03 -3.85707058e-02  1.50536755e-02 ... -3.84597108e-02
  -1.93265427e-04 -1.65382307e-02]] ['Driven professional with a blend of engineering expertise, business development acumen, and product management skills. Demonstrated success in pioneering product initiatives, forging strategic pa

In [None]:
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

In [None]:
def retrieve(query, top_k=2):
    query_emb = client.embeddings.create(
        model=embedding_model,
        input=query
    ).data[0].embedding
    query_emb = np.array([query_emb], dtype="float32")

    D, I = index.search(query_emb, top_k)

    results = []
    for idx in I[0]:
        results.append({
            "section": metadata[idx],
            "content": section_texts[idx]
        })
    return results


In [None]:
results = retrieve("what is your Education?")
for r in results:
    print(f"== {r['section']} ==")
    print(r["content"][:200], "...\n")


== Education ==
[{'degree': 'BCs in Mechanical Design and Production Engineering', 'institution': 'Zagazig University', 'year': 2021, 'details': '92%, Excellent with Honor, 1st of department.'}, {'degree': 'Marketing nanodegree', 'institution': 'Udacity'}, {'degree': 'Google career certificate', 'field': 'Automation using Python'}] ...

== Initiatives and Accomplishments ==
['Agentic AI Engineering instructor.', 'n8n with Karim YouTube course.', 'NASA Space Apps Cairo winner and global nominee twice.', '2nd place in YLP (Youth Leadership Program) With UNDP.', 'African youth forum for entrepreneur’s winner.', 'The Ideal Student @ Zagazig university 2020.'] ...



In [None]:
def chat(message, history):
    context_chunks = retrieve(message, top_k=3)
    context = "\n\n".join([f"### {c['section']}\n{c['content']}" for c in context_chunks])

    system_prompt = f"""You are acting as Karim Nabil.
Answer questions faithfully using the following retrieved context from Karim’s CV:

{context}

Always stay professional and engaging, as if speaking to a potential client or employer.
If something is not in the CV, politely say you don’t have that info.
"""

    messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": message}]
    response = client.chat.completions.create(model=chat_model, messages=messages)
    return response.choices[0].message.content


# =============================
# Gradio UI
# =============================
gr.ChatInterface(chat, type="messages").launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://910edd34c80a9bd65d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


