In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import List
def extract_jd_keywords(job_description: str, top_k: int = 50) -> List[str]:
    """Extract keywords from ONE job description"""
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
    tfidf_matrix = vectorizer.fit_transform([job_description])
    
    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray().flatten()
    
    top_indices = scores.argsort()[-top_k:][::-1]
    return [feature_names[i] for i in top_indices]

In [17]:
jd = """Job Description:
We are seeking an experienced Oracle Sales Cloud Consultant to support implementation, customization, and optimization of Oracle CX Sales applications. The ideal candidate will work closely with business stakeholders to gather requirements, configure the system, and ensure seamless integration with other Oracle and third-party applications.

Key Responsibilities:

Implement and configure Oracle Sales Cloud modules (Leads, Opportunities, Accounts, Contacts, and Forecasting).
Gather business requirements and translate them into functional solutions.
Develop custom reports and dashboards using OTBI/BIP.
Collaborate with technical teams for integrations and data migration.
Provide end-user training, documentation, and post-implementation support.
Required Skills:

Hands-on experience in Oracle Sales Cloud (B2B/B2C) implementation and support.
Strong understanding of sales automation processes and CRM best practices.
Knowledge of OIC, Groovy scripting, and REST/SOAP integrations is a plus.
Excellent communication and problem-solving skills.
"""

In [None]:
import re
from collections import Counter
import spacy
from wordfreq import zipf_frequency

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# English + job boilerplate stopwords
STOPWORDS = set(nlp.Defaults.stop_words).union({
    "responsibilities", "requirements", "requirement", "qualification",
    "qualifications", "responsible", "candidate", "role", "position",
    "skills", "ability", "abilities", "experience", "team", "work",
    "years", "job", "environment", "department"
})

def is_acronym(token):
    return bool(re.fullmatch(r"[A-Z0-9\-\.]{2,}", token))

def is_common_word(word):
    """
    Uses word frequency (Zipf score). Higher = more common.
    Common English words have Zipf >= 4.0 typically.
    Technical terms tend to have low frequency.
    """
    return zipf_frequency(word.lower(), "en") >= 4.0

def extract_candidates(text):
    doc = nlp(text)
    candidates = []

    # 1️⃣ Single-token candidates (nouns, proper nouns, acronyms)
    for token in doc:
        if token.pos_ in ("NOUN", "PROPN"):
            w = token.text.strip()
            if len(w) > 1 and w.lower() not in STOPWORDS:
                candidates.append(w)
        # Acronym pattern
        if is_acronym(token.text):
            candidates.append(token.text)

    # 2️⃣ Multi-word noun chunks (e.g., "cloud infrastructure")
    for chunk in doc.noun_chunks:
        phrase = chunk.text.strip()
        # Remove chunks that are entirely stopwords
        if not all(w.lower() in STOPWORDS for w in phrase.split()):
            candidates.append(phrase)

    return candidates

def score_terms(candidates):
    # Count frequency in the JD
    freq = Counter([c for c in candidates])

    scored = {}
    for term, count in freq.items():
        # Base score = frequency
        score = count

        # Boost acronyms (high chance of being tech)
        if is_acronym(term):
            score *= 2.0

        # Penalize common everyday English words
        if is_common_word(term):
            score *= 0.4

        # Boost multi-word technical phrases
        if " " in term:
            score *= 1.3

        scored[term] = score

    return scored


def extract_keywords(text, min_score=0.9):
    candidates = extract_candidates(text)
    scored = score_terms(candidates)

    sorted_terms = sorted(scored.items(), key=lambda x: x[1], reverse=True)

    return [term.lower() for term, score in sorted_terms if score >= min_score]


keywords = extract_keywords(jd)
print(keywords)



['oic', 'oracle', 'cx', 'otbi', 'bip', 'b2b', 'b2c', 'crm', 'integrations', 'customization', 'optimization', 'forecasting', 'dashboards', 'documentation', 'sales', 'implementation', 'rest', 'soap', 'an experienced oracle sales cloud consultant', 'oracle cx sales applications', 'business stakeholders', 'seamless integration', 'other oracle and third-party applications', 'oracle sales cloud modules', 'oracle sales cloud', 'sales automation processes', 'crm best practices', 'groovy scripting', 'rest/soap integrations', 'problem-solving skills', 'cloud', 'support', 'stakeholders', 'modules', 'automation', 'scripting', 'otbi/bip', '(b2b/b2c']


In [6]:
words = extract_jd_keywords(jd)
print(words)

['oracle', 'sales', 'implementation', 'cloud', 'support', 'sales cloud', 'oracle sales', 'gather', 'skills', 'configure', 'requirements', 'business', 'implementation support', 'integrations', 'applications', 'ensure', 'excellent communication', 'data migration', 'excellent', 'ensure seamless', 'description', 'end user', 'end', 'experience', 'documentation', 'develop custom', 'develop', 'description seeking', 'documentation post', 'work closely', 'experience oracle', 'groovy scripting', 'integration', 'implementation customization', 'implement configure', 'implement', 'ideal candidate', 'ideal', 'hands experience', 'hands', 'groovy', 'experienced', 'gather requirements', 'gather business', 'functional solutions', 'functional', 'forecasting gather', 'forecasting', 'dashboards using', 'experienced oracle']


In [5]:
import sys
from pathlib import Path

# Add project root to Python path for imports
project_root = Path().resolve().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from src.resume_ingestion.vector_store.qdrant_manager import QdrantManager
from typing import Dict, List
from qdrant_client.http import models as qmodels

def has_job_roles(qdrant_manager: QdrantManager, job_roles: List[str]) -> Dict[str, bool]:
    """
    Check whether each job role exists anywhere in Qdrant.
    Returns dict: role -> True/False
    """
    result = {}

    for role in job_roles:
        flt = qmodels.Filter(
            must=[
                qmodels.FieldCondition(
                    key="job_role",
                    match=qmodels.MatchValue(value=role.strip().lower())
                )
            ]
        )

        points, _ = qdrant_manager.client.scroll(
            collection_name="experiences",
            with_payload=False,
            with_vectors=False,
            scroll_filter=flt,
            limit=1
        )

        result[role] = len(points) > 0

    return result

q = QdrantManager()

dr = ["oracle cloud procurement sme", "oracle cloud hcm consultant", "oracle consultant"]
print(has_job_roles(q, dr))

INFO     | Config                    | Base path to find the config file: /Users/naveenpoliasetty/Downloads/RAG-1
INFO     | Config                    | Loading config file: /Users/naveenpoliasetty/Downloads/RAG-1/src/core/config.yaml


  from .autonotebook import tqdm as notebook_tqdm


INFO     | ReliableBatchWorker       | Loading embedding model: intfloat/e5-base-v2
INFO     | sentence_transformers.SentenceTransformer | Use pytorch device_name: mps
INFO     | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: intfloat/e5-base-v2
INFO     | ReliableBatchWorker       | Successfully loaded embedding model: intfloat/e5-base-v2
INFO     | ReliableBatchWorker       | Model dimension: 768


QdrantError: Failed to connect to Qdrant after 3 attempts: [Errno 61] Connection refused

In [1]:
import sys
from pathlib import Path

# Add project root to Python path for imports
project_root = Path().resolve().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

In [2]:
# Since project root is in sys.path (from Cell 6), use 'src.' prefix
from src.resume_ingestion.database import MongoDBManager


m = MongoDBManager()

ids = ['fbc1ff96-8081-430d-a41b-2f34b9a75c12', 'fb7aadc2-1bfe-441f-a49c-39db4a2b0bdc', '5c0d5af1-246f-443b-8667-bb30603db8d9']

# Correct method: get_sections_by_resume_ids (not get_resume_by_ids)
res = m.get_sections_by_resume_ids(ids, "professional_summary")
print(f"Found {len(res)} results")

INFO     | src.utils.logger          | Logging configured for pipeline: app
INFO     | src.utils.logger          |  Logger module loaded
INFO     | Config                    | Base path to find the config file: /Users/naveenpoliasetty/Downloads/RAG-1
INFO     | Config                    | Loading config file: /Users/naveenpoliasetty/Downloads/RAG-1/src/core/config.yaml
INFO     | ReliableBatchWorker       | Found 3 documents for 3 requested resume IDs in section 'professional_summary'
Found 7391 results


In [4]:
res

'[\n    {\n        "professional_summary_1": [\n            "7+ years of experience as Oracle PL/SQL Developer with expertise in design, development, testing and deployment of applications built on top of Oracle Database and also involved in support DBA activities.",\n            "Experience in working with Oracle 9i, 10g and 11g database.",\n            "Experience in Database design using Normalization and E/R Diagrams.",\n            "Experience includes extensive coding in PL/SQL, developing Packages, Procedures & Functions including Cursors and Exception handling.",\n            "Experience in writing database triggers & anonymous SQL scripts.",\n            "Expertise in using user defined and system defined Exceptions for Error handling.",\n            "Experience in using Oracle supplied packages such as DBMS FLASHBACK, DBMS SQL, DBMS JOB, UTL FILE for file handling.",\n            "Experience also includes writing sub queries and advanced SQL queries.",\n            "Experienc

In [5]:
from src.generation.call_llm import llm_json
from src.generation.resume_generator import ResumeGenerator

r = ResumeGenerator(llm_json)

  from .autonotebook import tqdm as notebook_tqdm


INFO     | src.core.db_manager       | Initializing Qdrant connection...
INFO     | ReliableBatchWorker       | Loading embedding model: intfloat/e5-base-v2
INFO     | sentence_transformers.SentenceTransformer | Use pytorch device_name: mps
INFO     | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: intfloat/e5-base-v2
INFO     | ReliableBatchWorker       | Successfully loaded embedding model: intfloat/e5-base-v2
INFO     | ReliableBatchWorker       | Model dimension: 768
INFO     | QdrantManager             | Connecting to Qdrant at 34.130.75.211:6333
INFO     | httpx                     | HTTP Request: GET http://34.130.75.211:6333 "HTTP/1.1 200 OK"
INFO     | httpx                     | HTTP Request: GET http://34.130.75.211:6333/collections "HTTP/1.1 200 OK"
INFO     | QdrantManager             | Successfully connected to Qdrant
INFO     | httpx                     | HTTP Request: GET http://34.130.75.211:6333/collections "HTTP/1.1 200 OK"
INFO     |

In [6]:
jd = "hi man"
from src.generation.prompts import SUMMARY_USER_PROMPT

file = r._build_prompt(SUMMARY_USER_PROMPT, jd, res, top_k=3)

In [7]:
with open("prompt.txt", "w") as f:
    f.write(file)

In [3]:
from src.resume_ingestion.database import MongoDBManager
import json

manager = MongoDBManager()
resume_id = "bb5fd9e7-a57a-4c3e-af0e-88a90445aaf1"

resume = manager.get_resume_by_id(resume_id)

if resume:
    # Remove MongoDB _id for cleaner output
    resume.pop('_id', None)
    print(json.dumps(resume, indent=2, ensure_ascii=False, default=str))
else:
    print(f"Resume not found: {resume_id}")

INFO     | ReliableBatchWorker       | Found resume document for resume_id: bb5fd9e7-a57a-4c3e-af0e-88a90445aaf1
{
  "job_role": "oracle pl/sql developer",
  "professional_summary": [
    "IT professional with 9+ years of diverse experience in IT, including business analysis, applications management, strong software development skills and a solid technical aptitude for troubleshooting and problem solving.",
    "Excel in identifying system needs, implementing multi - faceted application solutions, and working with a variety of operational systems, software, development languages and tools."
  ],
  "technical_skills": [
    "Operating Systems: Linux, Unix, IBM AIX 5.3 and Windows 95/98/2000/NT/XP",
    "Languages: Oracle PL/SQL, UNIX Shell Scripting, Pro C, 4GL",
    "Databases: Oracle8i, 9i, 10g & 11g, Informix",
    "Tools: SQL*Plus, SQL* Loader, SQL Developer V 1.5.0.54.40, CTRL M",
    "Supporting Software: Borland StarTeam, Putty, Smartbear Code Collaborator V7, Exceed",
    "Middl