In [1]:
import pandas as pd
import os
import datetime
import dateparser
import numpy as np
from dotenv import load_dotenv
import json
from tqdm import tqdm
import string

from mistralai import Mistral
from datasets import load_dataset
from huggingface_hub import login
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from pydantic import BaseModel, Field
from typing import List, Optional, get_origin, get_args, Union
from langchain.output_parsers import PydanticOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser

from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType, IntegerType, FloatType, BooleanType, TimestampType, StructField, StructType

  from .autonotebook import tqdm as notebook_tqdm


# SOURCE

In [2]:
# Load the resume / job-description fit dataset and keep its "train" split as a pandas
# DataFrame; `df` is the frame every later cell reads from and adds columns to.
dataset = load_dataset("cnamuangtoun/resume-job-description-fit")
df = dataset["train"].to_pandas()


Downloading data: 100%|██████████| 53.4M/53.4M [00:08<00:00, 6.01MB/s]
Downloading data: 100%|██████████| 15.2M/15.2M [00:02<00:00, 5.17MB/s]
Generating train split: 100%|██████████| 6241/6241 [00:00<00:00, 18368.45 examples/s]
Generating test split: 100%|██████████| 1759/1759 [00:00<00:00, 19980.39 examples/s]


In [3]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Fail early with a readable message when the key is missing. The previous self-assignment
# (os.environ[k] = os.getenv(k)) changed nothing when the key was set and raised an opaque
# TypeError when it was not.
if not os.getenv("MISTRAL_API_KEY"):
    raise RuntimeError(
        "MISTRAL_API_KEY is not set; define it in the environment or in a .env file."
    )

In [4]:
# Shared Mistral client used by the model-listing and parsing cells; the API key is read
# from the MISTRAL_API_KEY environment variable.
mistral = Mistral(api_key=os.environ.get("MISTRAL_API_KEY"))


Generate random snapshot dates

In [26]:
# Seeded generator so the synthetic snapshot dates are identical on every run
rng = np.random.default_rng(seed=42)

# Sampling window for the snapshot dates
start_date = pd.to_datetime('2024-01-01')
end_date = pd.to_datetime('2025-01-01')

# Draw one value per row between the integer `.value` of the two bounds,
# then turn the draws back into timestamps
random_dates = pd.to_datetime(
    rng.uniform(low=start_date.value, high=end_date.value, size=len(df))
)

# Keep only the calendar date (drops the time-of-day component)
snapshot_series = pd.Series(random_dates)
df['snapshot_date'] = snapshot_series.dt.date

Generate random IDs

In [27]:
def generate_random_id(prefix: str, length=8, use_digits=True, use_letters=True, seed=42):
    """Build a reproducible pseudo-random identifier.

    Args:
        prefix: Text placed in front of the random part.
        length: Number of random characters to append.
        use_digits: Include ``string.digits`` in the alphabet.
        use_letters: Include ``string.ascii_letters`` in the alphabet.
        seed: Seed for the NumPy generator; the same seed always yields the same suffix.

    Returns:
        ``prefix`` followed by ``length`` characters drawn from the alphabet.

    Raises:
        ValueError: If both ``use_digits`` and ``use_letters`` are False.
    """
    generator = np.random.default_rng(seed=seed)

    # Assemble the alphabet: digits first, then letters
    alphabet = (string.digits if use_digits else '') + (string.ascii_letters if use_letters else '')
    if not alphabet:
        raise ValueError("At least one of 'use_digits' or 'use_letters' must be True.")

    # Sample with replacement, one draw per output character
    picks = generator.choice(list(alphabet), size=length)
    return prefix + ''.join(picks)

In [28]:
# Each ID is seeded with the row's index label, so it is stable across runs.
# A row's resume ID and job ID use the same seed and therefore share the same random suffix.
df['resume_id'] = [generate_random_id('RES_', seed=label) for label in df.index]
df['job_id'] = [generate_random_id('JD_', seed=label) for label in df.index]

# BRONZE TABLE

## Resume Feature Extraction

In [29]:
# Define models
    
class Experience(BaseModel):
    # One work-experience entry of a resume. Every field is optional and defaults to None.
    # Dates are kept as plain strings: the descriptions ask for ISO 8601 or one of the
    # keywords 'present' / 'current' / 'ongoing', so nothing is validated as a datetime here.
    role: Optional[str] = Field(None, description="The job title or position held")
    company: Optional[str] = Field(None, description="The name of the company. Exclude other description or location")
    date_start: Optional[str] = Field(None, description="The start date of the job. Dates must be in ISO 8601 format (YYYY-MM-DDTHH:MM:SS) or use the keywords 'present', 'current', or 'ongoing'")
    date_end: Optional[str] = Field(None, description="The end date of the job. Dates must be in ISO 8601 format (YYYY-MM-DDTHH:MM:SS) or use the keywords 'present', 'current', or 'ongoing'")
    role_description: Optional[str] = Field(None, description="A description of the responsibilities and achievements in the role")

class Education(BaseModel):
    # One education entry of a resume. Every field is optional and defaults to None.
    # Dates are plain strings (ISO 8601 or the keywords listed in the descriptions).
    degree: Optional[str] = Field(None, description="The academic degree obtained")
    institution: Optional[str] = Field(None, description="The name of the educational institution")
    date_start: Optional[str] = Field(None, description="The start date of the education program. Dates must be in ISO 8601 format (YYYY-MM-DDTHH:MM:SS) or use the keywords 'present', 'current', or 'ongoing'")
    date_end: Optional[str] = Field(None, description="The end date of the education program. Dates must be in ISO 8601 format (YYYY-MM-DDTHH:MM:SS) or use the keywords 'present', 'current', or 'ongoing'")
    # NOTE(review): typed as float, so a non-numeric grade (e.g. a letter grade) would
    # presumably fail validation — confirm this is the intended behaviour.
    grade: Optional[float] = Field(None, description="The GPA or final grade, if available")
    description: Optional[str] = Field(None, description="Additional details about the education")

class Resume(BaseModel):
    # Top-level schema for a parsed resume. Scalar fields are optional (default None);
    # list fields default to an empty list via default_factory.
    name: Optional[str] = Field(None, description="Full name of the person")
    location_preference: Optional[str] = Field(None, description="Preference for their work location / remote, if stated")
    # NOTE(review): "work_authorizaton" is misspelled, but it is the field name and thus the
    # key in the dumped JSON, so renaming it would change the output schema.
    work_authorizaton: Optional[str] = Field(None, description="Work authorization that the person holds, such as citizenship, if stated")
    employment_type_preference: Optional[str] = Field(
        None,
        description="Type of employment the resume is looking for such as Full-time, Part-time, Contract, Freelance, or Internship, if stated. It can also be a preference for remote work or on-site work"
    )
    # NOTE(review): the two skill descriptions below spell "keywords" as "keywwords".
    hard_skills: List[str] = Field(default_factory=list, description="A list of hard or technical skills mentioned in the resume. All hard skills are tools, frameworks, or programming languages (e.g., Python, TensorFlow, Docker). Keep it as keywwords. Exclude certification or license")
    soft_skills: List[str] = Field(default_factory=list, description="A list of soft skills mentioned in the resume. Soft skills are qualities like communication, teamwork, leadership. Keep it as keywwords. Exclude required languages")
    languages: List[str]= Field(default_factory=list, description="A list of language proficiencies mentioned in the resume. If the resume does not mention any languages, then fill this with the language that the resume is written in")
    # Nested models: each entry is validated against Experience / Education.
    experience: List[Experience] = Field(default_factory=list, description="A list of past work experiences in reverse chronological order (most recent first).")
    education: List[Education] = Field(default_factory=list, description="A list of educational qualifications")
    certifications: List[str] = Field(default_factory=list, description="A list of certifications or licenses related with hard skills, medical skills, and software tools mentioned in the resume. For example, AWS Certified Solutions Architect, PMP, etc. Certifications must exclude any work role IDs, only include valid licenses or certifications.")

# Create the parser that validates model output against the Resume schema.
resume_parser = PydanticOutputParser(pydantic_object=Resume)
# NOTE(review): this module-level `format_instructions` is reassigned in the job-description
# cell; the parsing calls in this notebook pass `<parser>.get_format_instructions()` directly.
format_instructions = resume_parser.get_format_instructions()

## Job Desc Feature Extraction

In [30]:
# Define models for job desc

class JD(BaseModel):
    # Top-level schema for a parsed job description. Scalar fields are optional (default None);
    # list fields default to an empty list via default_factory. Dates are plain strings.
    company_name: Optional[str] = Field(None, description="Name of the company posting the job")
    role_title: Optional[str] = Field(None, description="Job title or position being offered")
    application_deadline: Optional[str] = Field(None, description="The deadline for submitting applications. Dates must be in ISO 8601 format (YYYY-MM-DDTHH:MM:SS)")
    date_posted: Optional[str] = Field(None, description="The date when the job was posted. Dates must be in ISO 8601 format (YYYY-MM-DDTHH:MM:SS)")
    employment_type: Optional[str] = Field(None, description="Type of employment, such as Full-time, Part-time, Contract, Freelance, or Internship. If not stated, it is assumed to be Full-time")
    about_the_company: Optional[str] = Field(None, description="A brief overview or description of the company")
    job_responsibilities: List[str] = Field(default_factory=list, description="A list of key duties, tasks, or responsibilities associated with the job")
    required_hard_skills: List[str] = Field(default_factory=list, description="A list of technical or hard skills required or preferred for the job. Keep it as keywords. This includes programming languages, software tools, or frameworks like Python, Java, SQL")
    required_soft_skills: List[str] = Field(default_factory=list, description="A list of soft skills or character required or preferred for the job. Keep it as keywords. This includes communication, teamwork, or leadership skills")
    required_language_proficiencies: List[str] = Field(default_factory=list, description="A list of language proficiencies required for the job if stated. If the job description does not mention any languages, then fill this with the language that the job description is written in")
    required_education: Optional[str] = Field(None, description="The minimum educational qualification required for the job, such as a degree or certification")
    required_work_authorization: Optional[str] = Field(None, description="Work authorization required for the job")
    job_location: Optional[str] = Field(None, description="Location where the job is based, such as a city or remote")
    # The description previously said "mentioned in the resume" (copied from the Resume model);
    # this schema is applied to job descriptions, so the instruction now names the right document.
    certifications: List[str] = Field(default_factory=list, description="A list of certifications or licenses related with hard skills, medical skills, and software tools mentioned in the job description. certifications should relate only to verifiable credentials (e.g., AWS, CISSP, PMP). Do not include work roles or job titles as certifications")
    
# Create the parser that validates model output against the JD schema.
jd_parser = PydanticOutputParser(pydantic_object=JD)
# NOTE(review): this rebinds the module-level `format_instructions` that the resume cell
# also assigns, so after this cell the name holds the job-description instructions.
format_instructions = jd_parser.get_format_instructions()

## Parse

### Parse resume

In [31]:
# model options

# Print the id of every model returned by the Mistral API, to pick one for parsing.
models = mistral.models.list()
for m in models.data:
    print(m.id)


ministral-3b-2410
ministral-3b-latest
ministral-8b-2410
ministral-8b-latest
open-mistral-7b
mistral-tiny
mistral-tiny-2312
open-mistral-nemo
open-mistral-nemo-2407
mistral-tiny-2407
mistral-tiny-latest
open-mixtral-8x7b
mistral-small
mistral-small-2312
open-mixtral-8x22b
open-mixtral-8x22b-2404
mistral-small-2402
mistral-small-2409
mistral-medium-2312
mistral-large-2402
mistral-large-2407
mistral-large-2411
mistral-large-latest
pixtral-large-2411
pixtral-large-latest
mistral-large-pixtral-2411
codestral-2405
codestral-2501
codestral-latest
codestral-2412
codestral-2411-rc5
devstral-small-2505
devstral-small-latest
pixtral-12b-2409
pixtral-12b
pixtral-12b-latest
mistral-small-2501
mistral-small-2503
mistral-small-latest
mistral-saba-2502
mistral-saba-latest
mistral-medium-2505
mistral-medium-latest
mistral-medium
mistral-embed
codestral-embed
codestral-embed-2505
mistral-moderation-2411
mistral-moderation-latest
mistral-ocr-2503
mistral-ocr-2505
mistral-ocr-latest


In [32]:
import json

def parse_with_mistral(text: str, parser, format_instructions: str, label: str) -> BaseModel:
    """Ask the Mistral chat model to extract structured data from ``text``.

    Args:
        text: Raw document text (a resume or a job description).
        parser: Output parser whose ``parse`` method turns the model reply into an object.
        format_instructions: Schema / formatting instructions embedded in the prompt.
        label: Heading placed before the document text in the prompt
            (e.g. "Resume" or "Job Description").

    Returns:
        Whatever ``parser.parse`` returns for the model's reply.

    Raises:
        Errors from the Mistral client call and from ``parser.parse`` propagate unchanged;
        callers in this notebook catch them per row.
    """
    # The instruction sentences used to be concatenated with no separator
    # ("...schema.If the same role ... dates.{format_instructions}"); they are now separated
    # by a space and a blank line so the instructions and the schema do not run together.
    prompt = (
        "Parse the following text into a structured format according to the provided schema. "
        "If the same role at the same company appears more than once, merge the role descriptions "
        "and preserve the earliest start and latest end dates.\n\n"
        f"{format_instructions}\n\n"
        f"{label}:\n{text}"
    )

    response = mistral.chat.complete(
        model="mistral-medium-latest",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,  # keep extraction as repeatable as the API allows
        max_tokens=2048
    )
    raw = response.choices[0].message.content
    return parser.parse(raw)

In [80]:
# Parse a single resume as a smoke test. Row 6236 is hard-coded; it is one of the
# "Good Fit" rows shown in the subset preview further down.
resume_text = df["resume_text"].iloc[6236]
parsed_resume = parse_with_mistral(resume_text, resume_parser, resume_parser.get_format_instructions(), "Resume")


In [81]:
# Pretty-print the parsed resume as indented JSON for visual inspection.
print(parsed_resume.model_dump_json(indent=2))

{
  "name": null,
  "location_preference": null,
  "work_authorizaton": null,
  "employment_type_preference": null,
  "hard_skills": [
    "Microsoft Excel",
    "Microsoft Outlook",
    "Adobe Software",
    "Microsoft Office Suite",
    "Microsoft Access",
    "Database Management",
    "Data Compilation",
    "Data Review",
    "Data Verification"
  ],
  "soft_skills": [
    "Decision Making",
    "Service-Oriented",
    "Self-Starter",
    "Workflow Management",
    "Attention to Detail",
    "Multitasking and Prioritization",
    "Time Management",
    "Team Player",
    "Communication",
    "Leadership"
  ],
  "languages": [
    "English"
  ],
  "experience": [
    {
      "role": "Data Entry Specialist",
      "company": "Sonic Healthcare Usa",
      "date_start": "2020-09-01T00:00:00",
      "date_end": "current",
      "role_description": "Input client information into spreadsheets and company database to provide leaders with quick access to essential client data. Identified, 

### Parse job desc

In [83]:
# Show the raw job-description text of the same row (6236) before parsing it.
print(df['job_description_text'].iloc[6236])

Hi,
Hope you are doing great today. Please find the job description below. Let me know your job interest as soon as possible. I will highly appreciate it if you can refer somebody suitable for this position. 
Role: Data Engineer (Oracle and DataStage).Location: RemoteContract Position
Job Description:RoleResponsibilities:Skills: Oracle, Datastage, UNIX, PLSQL, SQL. Good to have: AWS, Matillion, Snowflake. Data engineering experience; expert level experience with SQL. Experience with the cloud (AWS, Azure andor Google Cloud Platform).  Experience in cloud-based data warehouses (Snowflake, Google BigQuery, Amazon Redshift, Azure Synapse Analytics).  Experience with cloud-based ETLELT tools (Matillion, Glue, Data Factory) and data modelling.  Experience with version control systems (Git, SVN).  Understanding of and willingness to embrace Agile Principles. 
Looking forward to your response . 
Shubhanshu Tripathishubhanshu.t@cblsolutions.com 469-947-7816 (Ext  209)Cerebral Technologies, Inc

In [84]:
# Parse the job description of row 6236 with the JD schema.
parsed_jd = parse_with_mistral(df['job_description_text'].iloc[6236], jd_parser, jd_parser.get_format_instructions(), "Job Description")

In [85]:
# Pretty-print the parsed job description as indented JSON for visual inspection.
print(parsed_jd.model_dump_json(indent=2))

{
  "company_name": "Cerebral Technologies, Inc (D.B.A CBLSolutions)",
  "role_title": "Data Engineer (Oracle and DataStage)",
  "application_deadline": null,
  "date_posted": null,
  "employment_type": "Contract",
  "about_the_company": null,
  "job_responsibilities": [
    "Data engineering experience",
    "Expert level experience with SQL",
    "Experience with the cloud (AWS, Azure, and/or Google Cloud Platform)",
    "Experience in cloud-based data warehouses (Snowflake, Google BigQuery, Amazon Redshift, Azure Synapse Analytics)",
    "Experience with cloud-based ETL/ELT tools (Matillion, Glue, Data Factory) and data modeling",
    "Experience with version control systems (Git, SVN)",
    "Understanding of and willingness to embrace Agile Principles"
  ],
  "required_hard_skills": [
    "Oracle",
    "DataStage",
    "UNIX",
    "PLSQL",
    "SQL",
    "AWS",
    "Matillion",
    "Snowflake",
    "Google Cloud Platform",
    "Azure",
    "Google BigQuery",
    "Amazon Redshift",


### Parse 10 rows

In [33]:
# Small sample for a trial run: the first five and the last five rows of the dataset
first_rows = df.head(5)
last_rows = df.tail(5)
df_subset = pd.concat([first_rows, last_rows])
df_subset

Unnamed: 0,resume_text,job_description_text,label,snapshot_date,resume_id,job_id
0,SummaryHighly motivated Sales Associate with e...,Net2Source Inc. is an award-winning total work...,No Fit,2024-10-10,RES_QDvgj241,JD_QDvgj241
1,Professional SummaryCurrently working with Cat...,At Salas OBrien we tell our clients that were ...,No Fit,2024-06-09,RES_tvKW28PW,JD_tvKW28PW
2,SummaryI started my construction career in Jun...,Schweitzer Engineering Laboratories (SEL) Infr...,No Fit,2024-11-10,RES_Pg6ipOr5,JD_Pg6ipOr5
3,SummaryCertified Electrical Foremanwith thirte...,"Mizick Miller & Company, Inc. is looking for a...",No Fit,2024-09-12,RES_O5bebNRA,JD_O5bebNRA
4,SummaryWith extensive experience in business/r...,Life at Capgemini\nCapgemini supports all aspe...,No Fit,2024-02-04,RES_JWSvWYY5,JD_JWSvWYY5
6236,SummaryResults-driven Data Entry Clerk with ex...,"Hi,\nHope you are doing great today. Please fi...",Good Fit,2024-06-02,RES_vNEJ62Py,JD_vNEJ62Py
6237,Professional SummaryWith the attitude of learn...,Job Title: DHT - Front End Software Engineer W...,Good Fit,2024-09-01,RES_DPqh0lVb,JD_DPqh0lVb
6238,Summary• \nOver\nThree years of extensi...,LHH Recruitment Solutions is looking for a Sof...,Good Fit,2024-11-02,RES_1HWrRA5T,JD_1HWrRA5T
6239,ProfileAbility to prioritize and multi-task in...,Our client is a growing Medical Device company...,Good Fit,2024-07-26,RES_XdUNowSD,JD_XdUNowSD
6240,SummaryFull stack Software Engineer with 8+ ye...,Robert Half is looking for a Senior Full Stack...,Good Fit,2024-08-22,RES_2RPwzELC,JD_2RPwzELC


In [82]:
# Parse every row of the sample and write the results as JSON files under
# examples_mistral/resume/<idx>.json and examples_mistral/jd/<idx>.json.
# NOTE: `parsed_resume` and `parsed_jd` stay bound to the last successfully parsed row after
# the loop; the scoring cells at the end of the notebook read these two names.
for idx, row in tqdm(df_subset.iterrows(), total=len(df_subset)):
    resume_text = row['resume_text']
    jd_text = row['job_description_text']
    
    # One try block covers both documents: if the resume step fails, the job description of
    # that row is skipped as well, and the loop moves on to the next row.
    try:
        # Process resume
        parsed_resume = parse_with_mistral(
            resume_text,
            resume_parser,
            resume_parser.get_format_instructions(),
            "Resume"
        )
        parsed_resume_dict = parsed_resume.model_dump(mode="json")
        resume_output_path = os.path.join('examples_mistral', 'resume', f"{idx}.json")
        # Create the target directory on demand (no error if it already exists)
        os.makedirs(os.path.dirname(resume_output_path), exist_ok=True) 
        with open(resume_output_path, "w") as f:
            json.dump(parsed_resume_dict, f, indent=2)

        # Process JD
        parsed_jd = parse_with_mistral(
            jd_text,
            jd_parser,
            jd_parser.get_format_instructions(),
            "Job Description"
        )
        parsed_jd_dict = parsed_jd.model_dump(mode="json")
        jd_output_path = os.path.join('examples_mistral', 'jd', f"{idx}.json")
        os.makedirs(os.path.dirname(jd_output_path), exist_ok=True) 
        with open(jd_output_path, "w") as f:
            json.dump(parsed_jd_dict, f, indent=2)

    # Best-effort: report the failing row and continue instead of aborting the run
    except Exception as e:
        print(f"Error parsing row {idx}: {e}")

100%|██████████| 10/10 [03:01<00:00, 18.14s/it]


# Connecting to MongoDB

In [34]:
# Connection string comes from the environment
uri = os.environ.get("MONGO_DB_URL")

# Client pinned to version 1 of the server API
client = MongoClient(uri, server_api=ServerApi('1'))

# Round-trip a ping so connection problems surface here; failures are printed, not raised
try:
    client.admin.command('ping')
except Exception as e:
    print(e)
else:
    print("Pinged your deployment. You successfully connected to MongoDB!")

Pinged your deployment. You successfully connected to MongoDB!


In [37]:
# Spark session configured with the MongoDB connector; both the read and the write
# connection URI come from the MONGO_DB_URL environment variable.
spark = (
    SparkSession.builder
    .appName("MongoDBIntegration")
    .config("spark.mongodb.read.connection.uri", os.environ.get("MONGO_DB_URL"))
    .config("spark.mongodb.write.connection.uri", os.environ.get("MONGO_DB_URL"))
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.0")
    .getOrCreate()
)

### Save to MongoDB per item (appending to the collections)

In [24]:
# Target database and collections for the per-row documents
db = client["jobmirror_db"]
resume_collection = db["resumes"]
jd_collection = db["job_descriptions"]

# Parse every row and store the result immediately, so work done before an interruption
# (rate limits, manual stop) is not lost.
for idx, row in tqdm(df.iterrows(), total=len(df)):
    resume_text = row['resume_text']
    jd_text = row['job_description_text']
    try:
        # Process resume
        parsed_resume = parse_with_mistral(
            resume_text,
            resume_parser,
            resume_parser.get_format_instructions(),
            "Resume"
        )
        parsed_resume_dict = parsed_resume.model_dump(mode="json")
        parsed_resume_dict["row_idx"] = idx
        # Upsert keyed on row_idx: re-running the cell replaces a row's document instead of
        # inserting a duplicate, so the collections need not be cleared before a re-run.
        resume_collection.replace_one({"row_idx": idx}, parsed_resume_dict, upsert=True)

        # Process JD
        parsed_jd = parse_with_mistral(
            jd_text,
            jd_parser,
            jd_parser.get_format_instructions(),
            "Job Description"
        )
        parsed_jd_dict = parsed_jd.model_dump(mode="json")
        parsed_jd_dict["row_idx"] = idx
        jd_collection.replace_one({"row_idx": idx}, parsed_jd_dict, upsert=True)

    # Best-effort: report the failing row (e.g. an API rate-limit error) and continue
    except Exception as e:
        print(f"Error parsing row {idx}: {e}")

  0%|          | 12/6241 [03:16<23:37:10, 13.65s/it]

Error parsing row 11: API error occurred: Status 429
{"message":"Requests rate limit exceeded"}


  9%|▉         | 590/6241 [3:03:25<29:16:54, 18.65s/it]


KeyboardInterrupt: 

In [23]:
# Clear collections

# Destructive: removes every document from both collections of jobmirror_db
db = client["jobmirror_db"]
resume_collection, jd_collection = db["resumes"], db["job_descriptions"]

everything = {}  # an empty filter matches all documents
resume_collection.delete_many(everything)
jd_collection.delete_many(everything)



DeleteResult({'n': 640, 'electionId': ObjectId('7fffffff00000000000001b7'), 'opTime': {'ts': Timestamp(1748846844, 37), 't': 439}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1748846844, 37), 'signature': {'hash': b',;\x0f\x14\xdc\x04\xae#\x05\xf8\x8d\xce\n\x97y\xad\xfd\xffO\xa5', 'keyId': 7450535577176244240}}, 'operationTime': Timestamp(1748846844, 37)}, acknowledged=True)

Convert into PySpark DataFrames (overwrite the tables)

In [38]:
def python_type_to_spark_type(annotation):
    """Map a Python / typing annotation to a Spark SQL data type.

    Args:
        annotation: A field annotation such as ``str``, ``Optional[float]``,
            ``List[str]`` or a pydantic ``BaseModel`` subclass.

    Returns:
        The matching Spark type. ``Optional[X]`` / ``Union[X, ...]`` resolve to the type of
        the first non-None member, lists become ``ArrayType``, pydantic models become a
        ``StructType`` built by ``pydantic_to_spark_schema``, and anything unrecognised
        falls back to ``StringType``.
    """
    origin = get_origin(annotation)

    if origin is Union:  # Handle Optional: use the first non-None member
        args = [arg for arg in get_args(annotation) if arg is not type(None)]
        return python_type_to_spark_type(args[0])

    if origin in (list, List):
        item_args = get_args(annotation)
        # A bare `List` carries no element annotation; treat its elements as strings
        # instead of failing with an IndexError.
        element_type = python_type_to_spark_type(item_args[0]) if item_args else StringType()
        return ArrayType(element_type)

    if isinstance(annotation, type):
        if issubclass(annotation, BaseModel):
            return pydantic_to_spark_schema(annotation)
        if issubclass(annotation, str):
            return StringType()
        # bool must be tested before int: bool is a subclass of int, so the int branch
        # would otherwise claim it and booleans would map to IntegerType.
        if issubclass(annotation, bool):
            return BooleanType()
        if issubclass(annotation, int):
            return IntegerType()
        if issubclass(annotation, float):
            return FloatType()
        if issubclass(annotation, datetime.datetime):
            # datetimes are stored as strings (the models dump them in JSON mode)
            return StringType()

    return StringType()

def pydantic_to_spark_schema(model: type) -> StructType:
    """Build a Spark ``StructType`` from a pydantic model class.

    Args:
        model: A pydantic model class exposing ``model_fields``.

    Returns:
        A struct with one nullable field per model field (types resolved by
        ``python_type_to_spark_type``), followed by two nullable string columns,
        ``snapshot_date`` and ``id``.

    Note:
        The two trailing columns are appended for every model passed in, which includes
        nested models reached through ``python_type_to_spark_type``.
    """
    # The conversion must run once per field. Previously it sat after the loop, so only the
    # last model field reached the schema (and an empty model raised NameError).
    fields = [
        StructField(name, python_type_to_spark_type(field.annotation), True)  # assume all nullable
        for name, field in model.model_fields.items()
    ]

    # Bookkeeping columns added to each parsed record alongside the model's own fields
    fields.append(StructField('snapshot_date', StringType(), True))
    fields.append(StructField('id', StringType(), True))

    return StructType(fields)

In [None]:
# Accumulate the parsed records in memory; they are turned into Spark DataFrames afterwards.
parsed_resumes = []
parsed_jds = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    try:
        # Parse resume
        parsed_resume = parse_with_mistral(
            row['resume_text'],
            resume_parser,
            resume_parser.get_format_instructions(),
            "Resume"
        )
        parsed_resume_dict = parsed_resume.model_dump(mode="json")
        # snapshot_date holds datetime.date objects, while the Spark schema declares the
        # column as a string; store the ISO text (YYYY-MM-DD) so value and schema agree.
        parsed_resume_dict['snapshot_date'] = str(row['snapshot_date'])
        parsed_resume_dict['id'] = row['resume_id']
        parsed_resumes.append(parsed_resume_dict)

        # Parse JD
        parsed_jd = parse_with_mistral(
            row['job_description_text'],
            jd_parser,
            jd_parser.get_format_instructions(),
            "Job Description"
        )
        parsed_jd_dict = parsed_jd.model_dump(mode="json")
        parsed_jd_dict['snapshot_date'] = str(row['snapshot_date'])
        parsed_jd_dict['id'] = row['job_id']
        parsed_jds.append(parsed_jd_dict)
    # Best-effort: report the failing row and continue with the next one
    except Exception as e:
        print(f"Error parsing row {idx}: {e}")

  3%|▎         | 159/6241 [1:32:54<658:56:45, 390.04s/it]

Error parsing row 158: Server disconnected without sending a response.


  3%|▎         | 196/6241 [1:53:24<31:27:15, 18.73s/it]  

In [None]:
# Derive the Spark schemas from the pydantic models ...
resume_schema = pydantic_to_spark_schema(Resume)
jd_schema = pydantic_to_spark_schema(JD)

# ... and build one DataFrame per record list collected by the parsing loop.
resume_df = spark.createDataFrame(parsed_resumes, schema=resume_schema)
jd_df = spark.createDataFrame(parsed_jds, schema=jd_schema)


In [None]:
# Overwrite the "resumes" collection of the "jobmirror" database with the parsed resumes
(
    resume_df.write.format("mongodb")
    .mode("overwrite")
    .option("database", "jobmirror")
    .option("collection", "resumes")
    .save()
)

In [None]:
# Overwrite the "jd" collection of the "jobmirror" database with the parsed job descriptions
(
    jd_df.write.format("mongodb")
    .mode("overwrite")
    .option("database", "jobmirror")
    .option("collection", "jd")
    .save()
)

# SILVER

# GOLD

## Get scores

In [37]:
# Embedding client used for the similarity scores below, configured for the
# SEMANTIC_SIMILARITY task type.
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004", task_type="SEMANTIC_SIMILARITY"
)

In [39]:
# Embed every required skill of the job description and every hard skill of the resume.
# NOTE: `parsed_jd` / `parsed_resume` are whatever an earlier parsing cell last assigned,
# so the pair being scored depends on which cell ran last.
embeddings_required_skills = embedding_model.embed_documents(parsed_jd.required_hard_skills)
embeddings_skills_owned = embedding_model.embed_documents(parsed_resume.hard_skills)

In [40]:
# One row per skill, one column per embedding dimension
required_skills = np.array(embeddings_required_skills)
skills_owned = np.array(embeddings_skills_owned)

# Scale every row to unit length (L2 norm) so that a dot product equals cosine similarity
required_norms = np.linalg.norm(required_skills, axis=1, keepdims=True)
owned_norms = np.linalg.norm(skills_owned, axis=1, keepdims=True)
required_skills = required_skills / required_norms
skills_owned = skills_owned / owned_norms

# Entry [i, j] is the cosine similarity of required skill i and owned skill j
similarity_matrix = required_skills @ skills_owned.T

In [42]:
# For each required skill keep its closest owned skill, provided the similarity reaches 0.6
best_matches = []

for req_skill, row_scores in zip(parsed_jd.required_hard_skills, similarity_matrix):
    j = row_scores.argmax()
    score = row_scores[j]
    if score < 0.6:
        continue
    best_matches.append((req_skill, parsed_resume.hard_skills[j], score))

# Print
for match in best_matches:
    req_skill, own_skill, score = match
    print(f"Required: {req_skill}  <=> Best Owned: {own_skill}  | Similarity: {score:.2f}")

Required: PostgreSQL  <=> Best Owned: PostgreSQL  | Similarity: 1.00
Required: Express  <=> Best Owned: EF  | Similarity: 0.63
Required: React  <=> Best Owned: HTML5  | Similarity: 0.63
Required: NodeJS  <=> Best Owned: AngularJS  | Similarity: 0.73
Required: Redux  <=> Best Owned: Redmine  | Similarity: 0.63
Required: HTML  <=> Best Owned: HTML  | Similarity: 1.00
Required: CSS  <=> Best Owned: CSS  | Similarity: 1.00
Required: JavaScript  <=> Best Owned: jQuery  | Similarity: 0.86
Required: JSON  <=> Best Owned: JSON  | Similarity: 1.00
Required: Git  <=> Best Owned: GIT  | Similarity: 0.95
Required: REST  <=> Best Owned: REST  | Similarity: 1.00
Required: Firebase  <=> Best Owned: Hangfire  | Similarity: 0.62
Required: Material-UI  <=> Best Owned: AngularJS  | Similarity: 0.63
Required: D3js  <=> Best Owned: jQuery  | Similarity: 0.72
Required: Docker (Compose)  <=> Best Owned: Composer  | Similarity: 0.67
Required: AWS  <=> Best Owned: AWS EC2  | Similarity: 0.85


In [47]:
# Embed the advertised role title and the title of every past role on the resume.
# NOTE(review): both `role_title` and `Experience.role` are Optional in the schemas; a None
# value would be passed to the embedding call as-is — confirm how that should be handled.
embeddings_role_name = embedding_model.embed_query(parsed_jd.role_title)
embeddings_experience_titles = embedding_model.embed_documents([exp.role for exp in parsed_resume.experience])

In [43]:
# Display the role title that was embedded
parsed_jd.role_title

'Senior Full Stack Engineer (PERN Stack)'

In [48]:
# Display the resume's past role titles, in the same order as the embeddings
[exp.role for exp in parsed_resume.experience]

['Software Developer',
 'Software .Net Developer',
 'Software Engineer and Professor']

In [49]:
# role_name is a single vector; experiences has one row per past role
role_name = np.array(embeddings_role_name)
experiences = np.array(embeddings_experience_titles)

# Scale to unit length (L2 norm) so that a dot product equals cosine similarity
role_name = role_name / np.linalg.norm(role_name)
experience_norms = np.linalg.norm(experiences, axis=1, keepdims=True)
experiences = experiences / experience_norms

# One cosine similarity per past role. Note that this rebinds `similarity_matrix`,
# replacing the skills matrix computed earlier.
similarity_matrix = experiences @ role_name

In [50]:
# Display the similarity of each past role title to the advertised role title
similarity_matrix

array([0.65028087, 0.62905722, 0.6121288 ])