In [1]:
import json
import re
from pprint import pprint

from rapidfuzz import fuzz, process

from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import SentenceTransformerEmbeddings

from doman_dict import full_domain_mapping

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
def lexical_match(input_string: str, data_dict: dict):
    input_string = input_string.lower()
    best_score = 0
    best_key = None
    best_value = None

    for key, values in data_dict.items():
        if not isinstance(values, list):
            values = [values]
        
        # Compare input_string against all values in the list using char-level similarity
        match, score, _ = process.extractOne(input_string, values, scorer=fuzz.ratio)
        
        if score > best_score:
            best_score = score
            best_key = key
            best_value = match

    return best_key, best_value, best_score


In [12]:
best_key, best_value, _ = lexical_match("ORACLE DeveLoper", full_domain_mapping)

In [13]:
best_key, best_value

('enterprise-apps', 'oracle-developer')

In [6]:
with open("resumes.json", "r") as f:
    data = json.load(f)

In [11]:
parsed_data = {}

def extract_resume_category(url: str) -> str:
    match = re.search(r"/resume-database/\d+-([a-zA-Z0-9-]+)/", url)
    return match.group(1) if match else None

for key, value in data.items():
    role = extract_resume_category(key)  # e.g., "oracle-resumes"

    # Initialize list if role not in parsed_data
    if role not in parsed_data:
        parsed_data[role] = []

    # Append the current resume to the list
    parsed_data[role].append(value)

In [35]:
list(parsed_data.keys())

['oracle-resumes',
 'peoplesoft-resumes',
 'oracle-developers-resumes',
 'sql-developers-resumes',
 'business-analyst-resumes',
 'oracle-dba-resumes',
 'sap-resumes',
 'business-intelligence-business-object-resumes',
 'datawarehousing-etl-informatica-resumes',
 'project-manager-resumes',
 'quality-assurance-resumes']

In [None]:
pprint(parsed_data.keys())

{'PROFESSIONAL EXPERIENCE': [{'Environment:': 'Oracle 12c, SQL, PL/SQL, '
                                              'Reports 10g/6i,SQL * Loader, '
                                              'PL/SQL, Workflow Builder, '
                                              'XML/BI Publisher, UNIX Shell '
                                              'Scripting, JDA, MMS, HPALM '
                                              '(Agile Lifecycle Management), '
                                              'Tortoise SVN.',
                              'Responsibilities:': ['Involved in the complete '
                                                    'design, development and '
                                                    'testing phases of AP '
                                                    '(Assortment Planning) and '
                                                    'EP (Enterprise Planning) '
                                                    'projects, part of MPI '
    

In [29]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

qdrant_client = QdrantClient(host="localhost", port=6333)
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

vectorstore = Qdrant(
    client=qdrant_client,
    collection_name="resume_embeddings",
    embeddings=embedding_function
)

texts = [
    "AI Engineer at Databricks",
    "Machine Learning Intern at Tesla",
    "Data Scientist at OpenAI"
]

vectorstore.add_texts(texts)

  embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


UnexpectedResponse: Unexpected Response: 404 (Not Found)
Raw response content:
b'{"status":{"error":"Not found: Collection `resume_embeddings` doesn\'t exist!"},"time":0.00001596}'

In [None]:
def ingest_section(section_name, collection_name):
    """Ingests a specific section (e.g., summary, skills) into Qdrant"""
    print(f"\n🚀 Ingesting {section_name.upper()} into collection: {collection_name}")

    texts, metadatas, ids = [], [], []

    for idx, resume in enumerate(tqdm(resumes, desc=f"Processing {section_name}")):
        # Fix: handle case where each resume is a string
        if isinstance(resume, str):
            resume = json.loads(resume)

        text = resume.get(section_name, "").strip()
        if not text:
            continue

        texts.append(text)
        metadatas.append({
            "resume_id": idx,
            "section": section_name
        })
        ids.append(f"{section_name}_{idx}")

    if len(texts) == 0:
        print(f"⚠️ No data found for section: {section_name}")
        return

    # Create collection in Qdrant
    Qdrant.from_texts(
        texts=texts,
        embedding=embedding_model,
        metadatas=metadatas,
        ids=ids,
        url=QDRANT_URL,
        collection_name=collection_name
    )
    print(f"✅ {section_name} section ingested into Qdrant collection '{collection_name}'")

In [None]:
from langchain_community.llms import HuggingFaceHub
from langchain_community.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import Qdrant

In [None]:
text_split = RecursiveCharacterTextSplitter(chunk_size=512,chunk_overlap=50)
chunks = text_split.split_documents(docs)