# Semantical Evalution: ReAct <> Taxonomy

In [1]:
%load_ext autoreload 
%autoreload 2 

In [2]:
import ast

import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from utils import get_console

console = get_console()

## Load Taxonomy 

In [3]:
# Load the datasets 
occupations_df: pd.DataFrame = pd.read_csv(
    "./data/occupations_en.csv",
    header=0,
    delimiter=",",
    usecols=["conceptType", "conceptUri","iscoGroup", "preferredLabel", "altLabels", "description"]
).assign(
    conceptUri = lambda uri: uri["conceptUri"].str.rsplit("/").apply(lambda uri: uri[-1]),
    alternative_labels_exp = lambda label_text: label_text["altLabels"].str.split("\n"), 
).drop("altLabels", axis=1)

skill_df: pd.DataFrame = pd.read_csv(
    "./data/skills_en.csv",
    header=0,
    delimiter=",",
    usecols=["conceptType", "conceptUri", "skillType", "preferredLabel", "altLabels", "description"]
).assign(
    conceptUri = lambda uri: uri["conceptUri"].str.rsplit("/").apply(lambda uri: uri[-1]),
    alternative_labels_exp = lambda label_text: label_text["altLabels"].str.split("\n") 
).drop("altLabels", axis=1)

# Display datasets 
print(occupations_df.shape)
display(occupations_df.head())
print(skill_df.shape)
display(skill_df.head())

(3039, 6)


Unnamed: 0,conceptType,conceptUri,iscoGroup,preferredLabel,description,alternative_labels_exp
0,Occupation,00030d09-2b3a-4efd-87cc-c4ea39d27c34,2654,technical director,Technical directors realise the artistic visio...,"[technical and operations director, head of te..."
1,Occupation,000e93a3-d956-4e45-aacb-f12c83fedf84,8121,metal drawing machine operator,Metal drawing machine operators set up and ope...,"[metal drawing machine technician, metal drawi..."
2,Occupation,0019b951-c699-4191-8208-9822882d150c,7543,precision device inspector,Precision device inspectors make sure precisio...,"[inspector of precision instruments, precision..."
3,Occupation,0022f466-426c-41a4-ac96-a235c945cf97,3155,air traffic safety technician,Air traffic safety technicians provide technic...,[air traffic safety electronics hardware speci...
4,Occupation,002da35b-7808-43f3-83bf-63596b8b351f,2431,hospitality revenue manager,Hospitality revenue managers maximise revenue ...,"[hospitality revenues manager, yield manager, ..."


(13939, 6)


Unnamed: 0,conceptType,conceptUri,skillType,preferredLabel,description,alternative_labels_exp
0,KnowledgeSkillCompetence,0005c151-5b5a-4a66-8aac-60e734beb1ab,skill/competence,manage musical staff,Assign and manage staff tasks in areas such as...,"[manage staff of music, coordinate duties of m..."
1,KnowledgeSkillCompetence,00064735-8fad-454b-90c7-ed858cc993f2,skill/competence,supervise correctional procedures,Supervise the operations of a correctional fac...,"[oversee prison procedures, manage correctiona..."
2,KnowledgeSkillCompetence,000709ed-2be5-4193-b056-45a97698d828,skill/competence,apply anti-oppressive practices,"Identify oppression in societies, economies, c...","[apply non-oppressive practices, apply an anti..."
3,KnowledgeSkillCompetence,0007bdc2-dd15-4824-b7d6-416522c46f35,skill/competence,control compliance of railway vehicles regulat...,"Inspect rolling stock, components and systems ...",[monitoring of compliance with railway vehicle...
4,KnowledgeSkillCompetence,00090cc1-1f27-439e-a4e0-19a87a501bfc,skill/competence,identify available services,Identify the different services available for ...,"[establish available services, determine rehab..."


In [4]:
# Combining all labels 
occupations_df["all_labels"] = occupations_df["preferredLabel"].apply(lambda x: [x]) + occupations_df["alternative_labels_exp"]
skill_df["all_labels"] = skill_df["preferredLabel"].apply(lambda x: [x]) + skill_df["alternative_labels_exp"]

# Adding alternative_size column for both 
occupations_df["alternative_size"] = occupations_df["all_labels"].apply(lambda alternatives: len(alternatives) if not isinstance(alternatives, float) else alternatives)
skill_df["alternative_size"] = skill_df["all_labels"].apply(lambda alternatives: len(alternatives) if not isinstance(alternatives, float) else alternatives)

# Drop irrelevant columns & NA values 
skill_df = skill_df.drop(columns=["alternative_labels_exp", "preferredLabel", "conceptType", "skillType"]).dropna()
occupations_df = occupations_df.drop(columns=["alternative_labels_exp", "preferredLabel", "iscoGroup", "conceptType"]).dropna()

In [5]:
print(occupations_df.shape)
display(occupations_df.head())
print(skill_df.shape)
display(skill_df.head())

(3011, 4)


Unnamed: 0,conceptUri,description,all_labels,alternative_size
0,00030d09-2b3a-4efd-87cc-c4ea39d27c34,Technical directors realise the artistic visio...,"[technical director, technical and operations ...",7.0
1,000e93a3-d956-4e45-aacb-f12c83fedf84,Metal drawing machine operators set up and ope...,"[metal drawing machine operator, metal drawing...",12.0
2,0019b951-c699-4191-8208-9822882d150c,Precision device inspectors make sure precisio...,"[precision device inspector, inspector of prec...",11.0
3,0022f466-426c-41a4-ac96-a235c945cf97,Air traffic safety technicians provide technic...,"[air traffic safety technician, air traffic sa...",8.0
4,002da35b-7808-43f3-83bf-63596b8b351f,Hospitality revenue managers maximise revenue ...,"[hospitality revenue manager, hospitality reve...",6.0


(13591, 4)


Unnamed: 0,conceptUri,description,all_labels,alternative_size
0,0005c151-5b5a-4a66-8aac-60e734beb1ab,Assign and manage staff tasks in areas such as...,"[manage musical staff, manage staff of music, ...",5.0
1,00064735-8fad-454b-90c7-ed858cc993f2,Supervise the operations of a correctional fac...,"[supervise correctional procedures, oversee pr...",7.0
2,000709ed-2be5-4193-b056-45a97698d828,"Identify oppression in societies, economies, c...","[apply anti-oppressive practices, apply non-op...",8.0
3,0007bdc2-dd15-4824-b7d6-416522c46f35,"Inspect rolling stock, components and systems ...",[control compliance of railway vehicles regula...,9.0
4,00090cc1-1f27-439e-a4e0-19a87a501bfc,Identify the different services available for ...,"[identify available services, establish availa...",9.0


In [6]:
# Exploring on average how many alternatives 
console.print(f"On average there are {occupations_df["alternative_size"].mean().round()} alternatives for OCCUPATIONS!")
console.print(f"On average there are {skill_df["alternative_size"].mean().round()} alternatives for SKILLS!")

## Loading the sample results from our ReAct agent

In [7]:
react_res_df: pd.DataFrame = pd.read_csv("./data/sample_results.csv")
console.print(react_res_df.shape)
react_res_df.head()

Unnamed: 0,thread_id,messages,user_id,skills,job_ads,todos,requirements,responsibilities,evaluation
0,thread-0,"[SystemMessage(content=""Based upon a user's re...",user-dEf@ulT-1,"{'hard': ['education qualifications', 'client ...",[{'ad_text': 'Are you looking to join a thrivi...,[],"['Experience within Early Education', 'Passion...",['Building relationships with range of existin...,"{'skills': [{'hard': {'reasoning': [""The job a..."
1,thread-1,"[SystemMessage(content=""Based upon a user's re...",user-dEf@ulT-1,"{'hard': ['retail sales', 'sales target achiev...",[{'ad_text': '· Casual hours as required (tra...,[{'content': 'Extract raw job advertisement te...,"['Have a passion for retail', 'Be highly motiv...",[],"{'skills': [{'hard': {'reasoning': [""The skill..."
2,thread-2,"[SystemMessage(content=""Based upon a user's re...",user-dEf@ulT-1,"{'hard': ['software development', 'continuous ...","[{'ad_text': ""Readify helps organizations inno...",[{'content': 'Extract raw job advertisement te...,['willingness to learn and a positive attitude...,"['Design, develop, test and deliver custom sof...","{'skills': [{'hard': {'reasoning': [""The job a..."
3,thread-3,"[SystemMessage(content=""Based upon a user's re...",user-dEf@ulT-1,"{'hard': [""driver's license"", 'community work'...","[{'ad_text': ""Be part of an exciting start-up,...",[{'content': 'Extract raw job advertisement te...,['Substantial experience in and knowledge of y...,"['Provide comprehensive intake, initial screen...","{'skills': [{'hard': {'reasoning': [""The job a..."
4,thread-4,"[SystemMessage(content=""Based upon a user's re...",user-dEf@ulT-1,"{'hard': ['strategic initiative management', '...","[{'ad_text': ""This is a key role within a mark...",[{'content': 'Extract the raw context of the j...,[],[],"{'skills': [{'hard': {'reasoning': [""The skill..."


In [8]:
results_skills = []
result_responsibilities = []
result_requirements = []

for record in react_res_df.to_dict(orient="records"): 
    thread_skills = {}
    thread_responsibilities = {}
    thread_requirements = {}

    if record.get("skills") != 'hard=[] soft=[] both=[]':
        # Using ast.literal_eval to safly parse strings containing python code (dict, list, etc.)
        thread_id = record.get("thread_id")
        skills = ast.literal_eval(record.get("skills"))
        responsibilities = ast.literal_eval(record.get("responsibilities"))
        requirements = ast.literal_eval(record.get("requirements"))
        evaluation = ast.literal_eval(record.get("evaluation"))
        has_values = any(value for value in skills.values() if skills) # Ensure skill dictionary contain values 

        
        if has_values and requirements and responsibilities:
            hard_skills = skills["hard"]
            soft_skills = skills["soft"]

            # Extract evaluation 
            if evaluation != {}: 
                skill_eval = evaluation.get("skills")
                responsibility_eval = evaluation.get("responsibilities")
                requirements_eval = evaluation.get("requirements")

                # Filtering only for non-empty results 
                if skill_eval != [] and responsibility_eval is not None and requirements_eval is not None:
                    # Check for both 
                    if len(skill_eval) > 2: 
                        print("Both found! ")
                        hard_eval = skill_eval[0]["hard"]["score"]
                        soft_eval = skill_eval[1]["soft"]["score"]
                        both_eval = skill_eval[2]["both"]["score"]
                        both_skills = skills["both"]
                        if (len(hard_skills) == len(hard_eval)) and (len(soft_skills) == len(soft_eval)) and (len(both_skills) == len(both_eval)): 
                            thread_skills["thread_id"] = thread_id
                            thread_skills["skills"] = hard_skills + soft_skills + both_skills
                            thread_skills["skill_eval"] = hard_eval + soft_eval + both_eval
                            results_skills.append(thread_skills)
                    else: 
                        hard_eval = skill_eval[0]["hard"]["score"]
                        soft_eval = skill_eval[1]["soft"]["score"]
                        
                        if (len(hard_skills) == len(hard_eval)) and len(soft_skills) == len(soft_eval): 
                            thread_skills["thread_id"] = thread_id
                            thread_skills["skills"] = hard_skills + soft_skills
                            thread_skills["skill_eval"] = hard_eval + soft_eval
                            results_skills.append(thread_skills)

                
                    # Responsibilities 
                    responsibility_eval_scores = responsibility_eval.get("score")
                    requirements_eval_scores = requirements_eval.get("score")
                    if (len(responsibilities) == len(responsibility_eval_scores)) and (len(requirements) == len(requirements_eval_scores)):
                        thread_responsibilities["thread_id"] = thread_id
                        thread_responsibilities["responsibilities"] = responsibilities
                        thread_responsibilities["responsibilities_eval"] = responsibility_eval_scores

                        thread_requirements["thread_id"] = thread_id
                        thread_requirements["requirements"] = requirements
                        thread_requirements["requirements_eval"] = requirements_eval_scores
                        result_responsibilities.append(thread_responsibilities)
                        result_requirements.append(thread_requirements)

#? Skill DataFrame                  
react_skill_df: pd.DataFrame = pd.DataFrame(results_skills).explode(column=["skills", "skill_eval"])
react_skill_df["skills"] = react_skill_df["skills"].apply(lambda x: " ".join(x.split("_")) if "_" in x else x)

#? Responsibilities DataFrame                  
react_responsibilities_df: pd.DataFrame = pd.DataFrame(result_responsibilities).explode(column=["responsibilities", "responsibilities_eval"])

#? Requirements DataFrame                  
react_requirements_df: pd.DataFrame = pd.DataFrame(result_requirements).explode(column=["requirements", "requirements_eval"])


# Display results 
print(react_skill_df.shape)
display(react_skill_df.head())
print(react_responsibilities_df.shape)
display(react_responsibilities_df.head())
print(react_requirements_df.shape)
display(react_requirements_df.head())

Both found! 
(299, 3)


Unnamed: 0,thread_id,skills,skill_eval
0,thread-7,regional and remote work experience,1.0
0,thread-7,stakeholder engagement,1.0
0,thread-7,cultural competency,0.8
0,thread-7,youth mental health support & intervention,1.0
0,thread-7,driving license,0.0


(192, 3)


Unnamed: 0,thread_id,responsibilities,responsibilities_eval
0,thread-0,Building relationships with range of existing ...,1.0
0,thread-0,Identify their issues and help provide the sol...,0.3
0,thread-0,Attending client visits and understanding clie...,1.0
0,thread-0,Representing Pulse Child Care Crew as an ambas...,1.0
0,thread-0,Managing recruitment processes to deliver mont...,1.0


(221, 3)


Unnamed: 0,thread_id,requirements,requirements_eval
0,thread-0,Experience within Early Education,1.0
0,thread-0,Passion for business development,0.3
0,thread-0,"Ambition, focus, drive, enthusiasm and profess...",0.8
0,thread-0,Desire to become an expert in your area,0.8
1,thread-2,willingness to learn and a positive attitude,1.0


## Creating embeddings 
For this I will use [JobBert](https://huggingface.co/TechWolf/JobBERT-v2): 
> JobBert is a `sentence-transformers` model specifically trained for job title matching and similarity. It's finetuned from `sentence-transformers/all-mpnet-base-v2` on a large dataset of **job titles and their associated skills/requirements**. The model maps job titles and descriptions to a 1024-dimensional dense vector space and **can be used for semantic job title matching, job similarity search, and related HR/recruitment tasks.**
<br>

Main points to capture about the model: 
* The model was trained on 5.5M+ job title - skills pairs
* Can be used for sentence-similarity
* The expensions from 768 (`hidden_size`) to 1024 dense vecors was conducted to improve semantic robustness and similarity performance in contrastive or retrieval tasks.

Other embeddings models for production level:
* Cohere
* OpenAI
* MongoDB
* Gimini etc.

---


### Approach & Assumptions
#### **Assumptions**
1. Responsibilities can be derived from the `description` column of `occupations_df`. 
2. Requirements can be derived from the `description` column of `skill_df`. 

#### **Approach** 
**General**: To get embeddings for the values in a scalable manner we will use 🤗 `Dataset.map()` utility, as the `map` utility posses "super powers" to apply batch processing (`batch=true`), pararallism with `num_proc`, and `memory-mapping` (preprocessing elements of the datasets without the requirement to load it fully to memory). 
1. Taxonomy: 
    - **Skills**: As per the above, I have combined all possible results into a single list of values 
        * OR use `encode_batch` to get the embeddings for each of the alternatives that coexist in `all_labels` <br>
        🎯 **I will encode each of the alternatives** to get the top k matches that were found to be relivant with the extracted results from the `ReAct` agent. 

    - **Responsibilities & Requirements**: Use `encode_batch` to get the embeddings only for the `description` columns and compare with `responsibilities` and `requirements` accordingly! 

2. ReAct: Here we will first filter based on the `_eval` column where `Correctness` GEval score was computed to ensure that we only evaluate results that were identified to be well aligned! 

In [9]:
# Load the model
model = SentenceTransformer("TechWolf/JobBERT-v2", device="cpu")

In [10]:
# Encoding example
res = model.encode(skill_df["all_labels"].iloc[0], normalize_embeddings=True)
console.print(res.shape)
console.print(res)

In [11]:
# Creating a Dataset to leverage the map function to convert a column to embedding with batch processing 
skill_dataset: Dataset = Dataset.from_pandas(skill_df)
occupation_dataset: Dataset = Dataset.from_pandas(occupations_df)

react_skill_dataset: Dataset = Dataset.from_pandas(react_skill_df)
react_responsibilities_dataset: Dataset = Dataset.from_pandas(react_responsibilities_df)
react_requirements_dataset: Dataset = Dataset.from_pandas(react_requirements_df)

In [12]:
def encode_batch(batch, col_names: list[str]):
    """
    Encodes multiple text columns in a batched Hugging Face Dataset.
    Handles both string and list-of-string columns.
    
    Returns a dict of new embedding columns (as lists of arrays).
    """
    results = {}

    for col_name in col_names:
        if col_name not in batch:
            print(f"Warning: Column '{col_name}' not found in the batch.")
            continue

        column_values = batch[col_name]

        # Case 1: each row contains a list of strings (e.g., skills)
        if isinstance(column_values[0], list):
            # Flatten
            flat_texts = [text for sublist in column_values for text in sublist]

            # Encode all at once
            embeddings = model.encode(
                flat_texts,
                convert_to_numpy=True,
                normalize_embeddings=True,
                show_progress_bar=False
            )

            # Reconstruct per-row embeddings
            reconstructed_embeds = []
            idx = 0
            for sublist in column_values:
                n = len(sublist)
                reconstructed_embeds.append(embeddings[idx: idx + n])
                idx += n
                
            results[col_name + "_embeds"] = [emb.tolist() for emb in reconstructed_embeds]

        # Case 2: each row is a single string
        else:
            embeddings = model.encode(
                column_values,
                convert_to_numpy=True,
                normalize_embeddings=True,
                show_progress_bar=False
            )
            results[col_name + "_embeds"] = [emb.tolist() for emb in embeddings]

    return results

In [13]:
# Encode values ["description", "all_labels"]
skill_dataset = skill_dataset.map(
    encode_batch,
    fn_kwargs={"col_names": ["description", "all_labels"]},
    batched=True,
    batch_size=100
)

Map:   0%|          | 0/13591 [00:00<?, ? examples/s]

In [14]:
# Encode values ["description"]
occupation_dataset = occupation_dataset.map(
    encode_batch,
    fn_kwargs={"col_names": ["description"]},
    batched=True,
    batch_size=100
)

Map:   0%|          | 0/3011 [00:00<?, ? examples/s]

In [15]:
# Encode values ["skills"]
react_skill_dataset = react_skill_dataset.map(
    encode_batch,
    fn_kwargs={"col_names": ["skills"]},
    batched=True,
    batch_size=100
)

# Encode values ["responsibilities"]
react_responsibilities_dataset = react_responsibilities_dataset.map(
    encode_batch,
    fn_kwargs={"col_names": ["responsibilities"]},
    batched=True,
    batch_size=100
)

# Encode values ["requirements"]
react_requirements_dataset = react_requirements_dataset.map(
    encode_batch,
    fn_kwargs={"col_names": ["requirements"]},
    batched=True,
    batch_size=100
)

Map:   0%|          | 0/299 [00:00<?, ? examples/s]

Map:   0%|          | 0/192 [00:00<?, ? examples/s]

Map:   0%|          | 0/221 [00:00<?, ? examples/s]

## Semantical Evaluation 

In [16]:
def embedding_similarity(embed1, embed2):
    """
    Compute cosine similarity between:
    - embed1: (d,) size diemsntions (1024) -> converted to (1, 1024)
    - embed2: (n, d) where n is the number of vectors and d is the size of the dimensions (for example: 4, 1024)
    Returns: max similarity if multiple embeddings are in embed2.
    """
    e1 = np.array(embed1, dtype=np.float32).reshape(1, -1) # Ensuring we have at list one dimension in our embed

    # Ensure embed2 is a 2D array
    e2 = np.array(embed2, dtype=object)

    # embed2 is list of vectors (list of list of floats -> all_labels_embeds )
    if e2.ndim == 1 and isinstance(e2[0], (list, np.ndarray)):
        # Convert properly to 2D float array
        e2 = np.vstack([np.array(x, dtype=np.float32) for x in e2])
    elif e2.ndim == 1:
        e2 = e2.reshape(1, -1).astype(np.float32)

    # Compute cosine similarity
    sims = cosine_similarity(e1, e2)
    return float(np.max(sims))


def rank_similar_skills(
        react_embed, 
        df: pd.DataFrame,
        emb_col,
        text_col: str,
        k: int 
    ):
    """
    This function compute rank the similarity between the react results and the embedding column at the source skill/occupation taxonomy. It returns the most similar items (k)

    Args: 
        react_embed: The react embeddings created
        df: Skill/occupation taxonomy 
        emb_col: The embedding column within the source table
        text_col: Sorted labels to return 
        k: The k results to return 
    Return: 
        Top k similar results
    """
    similarities = df[emb_col].apply(
        lambda source_embeds: embedding_similarity(react_embed, source_embeds)
    )

    sorted_indices = similarities.argsort()[::-1] 
    return df[text_col].iloc[sorted_indices].tolist()[:k], similarities.iloc[sorted_indices].tolist()[:k]


In [17]:
# Convert results to pandas 
skill_dataset_df =  skill_dataset.to_pandas()
occupation_dataset_df = occupation_dataset.to_pandas()

react_skill_dataset_df = react_skill_dataset.to_pandas()
react_requirements_dataset_df = react_requirements_dataset.to_pandas()
react_responsibilities_dataset_df = react_responsibilities_dataset.to_pandas() 

In [18]:
# Filter react results based on skill_eval threshold 
alpha = 0.8 #? I am setting 0.8 as per the GEval score (High compliance with all aspects of the criterion, please see GEval for more context)
react_skill_dataset_filtered = react_skill_dataset_df.loc[react_skill_dataset_df["skill_eval"]>=alpha]
react_requirements_dataset_filtered = react_requirements_dataset_df.loc[react_requirements_dataset_df["requirements_eval"]>=alpha]
react_responsibilities_dataset_filtered = react_responsibilities_dataset_df.loc[react_responsibilities_dataset_df["responsibilities_eval"]>=alpha]

console.print(react_skill_dataset_filtered.shape)
display(react_skill_dataset_filtered.head())
console.print(react_requirements_dataset_filtered.shape)
display(react_requirements_dataset_filtered.head())
console.print(react_responsibilities_dataset_filtered.shape)
display(react_responsibilities_dataset_filtered.head())

Unnamed: 0,thread_id,skills,skill_eval,__index_level_0__,skills_embeds
0,thread-7,regional and remote work experience,1.0,0,"[0.002536714542657137, 0.0054779439233243465, ..."
1,thread-7,stakeholder engagement,1.0,0,"[0.06041805073618889, 0.0003966428921557963, -..."
2,thread-7,cultural competency,0.8,0,"[-0.0014480315148830414, 0.02782272920012474, ..."
3,thread-7,youth mental health support & intervention,1.0,0,"[-0.02431654930114746, 0.012723845429718494, 0..."
5,thread-7,youth program facilitation & coordination,1.0,0,"[0.0015053973766043782, -0.025403685867786407,..."


Unnamed: 0,thread_id,requirements,requirements_eval,__index_level_0__,requirements_embeds
0,thread-0,Experience within Early Education,1.0,0,"[-0.0322035551071167, -0.022759711369872093, 0..."
2,thread-0,"Ambition, focus, drive, enthusiasm and profess...",0.8,0,"[-0.0441158264875412, -0.013053668662905693, 0..."
3,thread-0,Desire to become an expert in your area,0.8,0,"[-0.008897569961845875, -0.011824222281575203,..."
4,thread-2,willingness to learn and a positive attitude,1.0,1,"[-0.03703227639198303, -0.004422439262270927, ..."
5,thread-2,strong communication skills,1.0,1,"[-0.04897941276431084, 0.046800900250673294, 0..."


Unnamed: 0,thread_id,responsibilities,responsibilities_eval,__index_level_0__,responsibilities_embeds
0,thread-0,Building relationships with range of existing ...,1.0,0,"[-0.012364339083433151, -0.007528501097112894,..."
2,thread-0,Attending client visits and understanding clie...,1.0,0,"[-0.0050888098776340485, -0.030687129124999046..."
3,thread-0,Representing Pulse Child Care Crew as an ambas...,1.0,0,"[-0.003397054271772504, -0.02220015786588192, ..."
4,thread-0,Managing recruitment processes to deliver mont...,1.0,0,"[0.08528010547161102, -0.0147091718390584, -0...."
6,thread-0,Coordinating resources to ensure all vacancies...,1.0,0,"[0.010971897281706333, -0.021807627752423286, ..."


In [19]:
# Define value for k 
k = 7 

In [20]:
# Compute similarities 
tqdm.pandas()

# Soft hard and both eval
skill_ranked_res = (
    react_skill_dataset_filtered["skills_embeds"].progress_apply(
        lambda skill_embed: rank_similar_skills(
            skill_embed,
            skill_dataset_df,
            "all_labels_embeds",
            "all_labels",
            k
        )
    ).apply(pd.Series)
    .explode([0, 1])
    .rename(columns= {0: "all_labels", 1: "similarity"})
    .assign(react_extractions=react_skill_dataset_filtered["skills"]).reset_index()
    .drop(columns="index")
)

100%|██████████| 137/137 [03:40<00:00,  1.61s/it]


In [21]:
# Explore results 
(
    skill_ranked_res.groupby("react_extractions", group_keys=True)
    [["all_labels", "similarity"]]
    .agg({"all_labels": "first", "similarity": "mean"}) # first == the top match; similarity == the average similarity score for K results
    .sort_values(by="similarity")
)

Unnamed: 0_level_0,all_labels,similarity
react_extractions,Unnamed: 1_level_1,Unnamed: 2_level_1
sdlc experience,"[query languages, HTSQL, Gellish, Concept-Orie...",0.512002
windows desktop management,"[protect ICT devices, take care of ICT devices...",0.546125
regional and remote work experience,"[macro-regional strategy, large-scale regional...",0.55051
windows server experience,"[database management systems, Jasmine ii, Db.s...",0.55517
typescript experience,"[JavaScript Framework, JavaScript Framework]",0.560491
...,...,...
glass installation,"[install frameless glass, setting frameless gl...",0.841166
glazing installation,"[install structural glazing, fitting structura...",0.843417
business analysis,"[financial analysis, economic audit, business ...",0.885501
project management,"[project commissioning, managing projects, pro...",0.886229


In [22]:
# Overall similarity score 
skill_ranked_res["similarity"].mean().round(2)

np.float64(0.71)

In [28]:
requirements_ranked_res = (
    react_requirements_dataset_filtered["requirements_embeds"].progress_apply(
        lambda requirements_embeds: rank_similar_skills(
            requirements_embeds,
            skill_dataset_df,
            "description_embeds",
            "description",
            k
        )
    )
    .apply(pd.Series).explode([0, 1])
    .rename(columns= {0: "descriptions", 1: "similarity"})
    .assign(react_extractions=react_requirements_dataset_filtered["requirements"]).reset_index()
    .drop(columns="index")
)

100%|██████████| 174/174 [04:56<00:00,  1.70s/it]


In [33]:
# Explore results 
(
    requirements_ranked_res.groupby("react_extractions", group_keys=True)
    [["descriptions", "similarity"]]
    .agg({"descriptions": "first", "similarity": "mean"})
    .sort_values(by="similarity")
)

Unnamed: 0_level_0,descriptions,similarity
react_extractions,Unnamed: 1_level_1,Unnamed: 2_level_1
Windows Server Experience,Set up and run a media server.,0.425074
experience supporting at the c-suite level,Support designers in the course of the develop...,0.42543
Available to work Monday Friday 8am - 4pm,Be able to avoid threats to psychological well...,0.427227
Minimum 12 months experience in a customer service supervisor role,Processes and principles related to the custom...,0.428908
experience consulting,Create customer experiences to maximise client...,0.432265
...,...,...
Minimum 2 years working as a dietitian,Formulate and supervise nutrition schemes to m...,0.705146
"Understanding of construction, techniques, terminology, costs and be able to communicate this in Layman's terms",Ensure that the drawings of the architectural ...,0.718811
Experience in measuring and installing glass,"Manipulate the properties, shape and size of g...",0.720312
Fluent verbal and written communication skills in English and Chinese,Read and comprehend written texts in Chinese.,0.736053


In [34]:
# Overall similarity score 
requirements_ranked_res["similarity"].mean().round(2)

np.float64(0.58)

In [35]:
responsibilities_ranked_res =  (
    react_responsibilities_dataset_filtered["responsibilities_embeds"].progress_apply(
        lambda responsibilities_embed: rank_similar_skills(
            responsibilities_embed,
            occupation_dataset_df,
            "description_embeds",
            "description",
            k
        )
    )
    .apply(pd.Series).explode([0, 1])
    .rename(columns= {0: "descriptions", 1: "similarity"})
    .assign(react_extractions=react_responsibilities_dataset_filtered["responsibilities"]).reset_index()
    .drop(columns="index")
)

100%|██████████| 163/163 [01:01<00:00,  2.66it/s]


In [36]:
# Explore results 
(
    responsibilities_ranked_res.groupby("react_extractions", group_keys=True)
    [["descriptions", "similarity"]]
    .agg({"descriptions": "first", "similarity": "mean"})
    .sort_values(by="similarity")
)

Unnamed: 0_level_0,descriptions,similarity
react_extractions,Unnamed: 1_level_1,Unnamed: 2_level_1
Follow up leads,Case administrators supervise the progress of ...,0.33111
Prompt and professional,Promotions demonstrators proactively seek out ...,0.344328
Working in a small team coaching junior staff as required,Business coaches guide employees of a company ...,0.365461
Effective diary and inbox management,Secretaries perform a variety of administrativ...,0.369754
"Actioning opportunities via all channels including in person, phone, email, etc","Activism officers promote or hinder social, po...",0.3714
...,...,...
"Qualified with minimum 3 years’ experience in glazing, fabricating, installing and working with aluminium",Plate glass installers fit panes of glass into...,0.660584
fabricator/welder,Welders operate welding equipment in order to ...,0.70408
Preparing complete sets of working drawings,Drafters prepare and create technical drawings...,0.708375
Fully maintain rental files including accurate notes.,Rental managers supervise the activities of a ...,0.712185


In [37]:
# Overall similarity score 
responsibilities_ranked_res["similarity"].mean().round(2)

np.float64(0.48)