# Green Job Detection and Normalization
This notebooks aims to **detect** (know if a job contains green skills or not) and normalize (map the green skills from the *.csv* to the `ESCO` taxonomy).

In [17]:
import pandas as pd
import numpy as np
import faiss
import json 
from openai import OpenAI
from dotenv import load_dotenv
import os
from tqdm import tqdm

from warning_management import send_warning
from warning_management import send_error

load_dotenv()

True

Load indexes

In [5]:
index_green_skills = faiss.read_index("../data/embeddings/esco_green_skills_text-embedding-3-large.index")
index_job_skills = faiss.read_index("../data/embeddings/job_skills_embeddings.index")

a, b = (index_green_skills.ntotal, index_green_skills.d,)
c, d = (index_job_skills.ntotal, index_job_skills.d,)

print(f"Green skills index contains {a} vectors of dimension {b}.")
print(f"Job skills index contains {c} vectors of dimension {d}.")

Green skills index contains 2539 vectors of dimension 3072.
Job skills index contains 70318 vectors of dimension 3072.


In [6]:
df_jobs = pd.read_csv("../data/jan_to_apr_2025_with_languages_cleaned.csv")

## Normalization

## Hyperparameters
- `K_N`: number of nearest neighbors to retrieve from the index.
- `MIN_THRESHOLD`: minimum score threshold to consider an entry relevant.

In [10]:
# Modify this value in case of needing a different threshold for the 
# confidence of the green job detection
MIN_THRESHOLD = 0.1

# Modify this value in case of needing a different number of neighbors
K_N = 5

## Loading the indexes and mappings

In [12]:
# Load indexes
index_green_skills = faiss.read_index("../data/embeddings/esco_green_skills_text-embedding-3-large.index")
index_job_skills = faiss.read_index("../data/embeddings/job_skills_embeddings.index")

# Load mappings
id_to_job = json.load(open("../data/mapping/id_to_job.json", "r"))
id_to_skill = json.load(open("../data/mapping/id_to_skill.json", "r"))


## Prompt
We will use the following prompt to detect and normalize green skills in job descriptions, it receives as arguments: 
- `k_closest`: a list of the `k` closest green skills from the `ESCO` taxonomy.
- `skill`: the skill extracted from the job description, and also the one we want to normalize.
- `job`: the name of the job.

In [8]:
# Source of the definition: https://esco.ec.europa.eu/system/files/2025-01/Green%20Skills%20and%20Knowledge%20-%20Labelling%20ESCO.pdf#page=4
def get_prompt(k_closest: list[str], skill: str, job: str) -> str:
    prompt = f"""
You are an expert in identifying whether a skill can be mapped to a *green skill* in the ESCO taxonomy.
A green skill is defined as the abilities, values and attitudes needed to live in, develop and support
a society which reduces the impact of human activity on the environment.
        
Focus on environmental or sustainability-related aspects.
Ignore social or economic aspects unless they have a clear environmental connection.
        
Your task:
Determine if the following skill can be semantically matched to one of the provided green skills from ESCO:
Skill to classify: "{skill}", and is used in the context of the job: "{job}".
        
Closest ESCO green skills (formatted as 'Main Name': 'Alternative Name'):
"""

    for k in k_closest:
        prompt += f"- {k}\n"

    prompt += """
Decide whether the skill can be reasonably mapped to one of the ESCO green skills above
based on meaning and context.

If it can, output exactly the 'Main Name' of the most semantically similar ESCO skill.
If it cannot, output exactly "No".

Output:
<Main Name or No> 
//
Only answer with the Main Name or "No", without any additional text or explanation (the diagonals and this text should not be included in the output).
        """
    return prompt

## Model
We will use the **gpt-5-mini** model from **OpenAI** for the normalization task.

In [7]:
OPENAI_KEY = os.getenv("OPENAI_KEY")
def run_model(prompt: str) -> str:
    client = OpenAI(api_key = OPENAI_KEY)
    response = client.responses.create(
        model="gpt-5-mini",
        input=prompt
    )
    return response.output_text


## Example of a prompt with index 0

In [13]:
D, I = index_green_skills.search(np.array([index_job_skills.reconstruct(0)]), K_N)
k_closest = [f"{id_to_skill[str(idx)][0]}: {id_to_skill[str(idx)][1]}" for idx in I[0]]
skill = id_to_job[str(0)][1]
name  = df_jobs[df_jobs["Job_ID"] == id_to_job[str(0)][0]]["Title"].values[0]
prompt = get_prompt(k_closest, skill, name)
print(prompt)


You are an expert in identifying whether a skill can be mapped to a *green skill* in the ESCO taxonomy.
A green skill is defined as the abilities, values and attitudes needed to live in, develop and support
a society which reduces the impact of human activity on the environment.

Focus on environmental or sustainability-related aspects.
Ignore social or economic aspects unless they have a clear environmental connection.

Your task:
Determine if the following skill can be semantically matched to one of the provided green skills from ESCO:
Skill to classify: "experiencia en prospeccion de clientes", and is used in the context of the job: "ventas flotillas".

Closest ESCO green skills (formatted as 'Main Name': 'Alternative Name'):
- conduct environmental site assessments: site prospection management
- conduct environmental site assessments: management of site prospection
- conduct environmental site assessments: managing of site prospection
- conduct environmental site assessments: mana

## Prompt generator examples
An entry used to generate sample prompts to test the prompt generator function.

In [15]:
# Deprecated cell
with open("../test/example_prompts.txt", "w") as f:

    for i in range(0, 100):
        D, I = index_green_skills.search(np.array([index_job_skills.reconstruct(i)]), K_N)
    
        k_closest = [f"\t{id_to_skill[str(idx)][0]}: {id_to_skill[str(idx)][1]}" for idx in I[0]]
        name  = df_jobs[df_jobs["Job_ID"] == id_to_job[str(i)][0]]["Title"].values[0]
        
        skill = id_to_job[str(i)][1]
        prompt = get_prompt(k_closest, skill, name)
        f.write(prompt + "\n\n")
    

## Normalization process
The following code cell shows how the normalization process is done for every job skill in the dataset. The steps are:
1. For every i-th job, get the top `K_N` nearest neighbors to the i-th job skill, comparing them with the `ESCO` green skill taxonomy using **cosine similarity** as distance metric.
2. If the **max score** among the `K_N` neighbors is greater or equal than `MIN_THRESHOLD`, then we consider that the job **may** contains green skills, and we proceed to normalize it using the prompt defined before and the language model.
3. We run the prompt, were we could get two types of answers:
   - The skill has been normalized to one of the `k` closest green skills, in that case it returns the **normalized skill**.
   - The skill could not be normalized to any of the `k` closest green skills, in that case it returns **No**.
4. We save the entry in a new dataframe, and also keep a set to avoid duplicates (i.e. the same job skill being normalized to the same green skill more than once, which would lead to counting it multiple times when analyzing the results).

In [None]:
df_new_dataset = pd.DataFrame(list(job_skill_set), columns=["job_id", "job_skill", "skill_id" ,"esco_skill_name", "alternative_name", "prompt"])
new_dataset_set = set()

ctr = 0
SAVE_EVERY = 100  

for i in tqdm(range(0, index_job_skills.ntotal), desc="Classifying Green Skills", unit="entry"):  
    try:
        D, I = index_green_skills.search(np.array([index_job_skills.reconstruct(i)]), K_N)
    except Exception as e:
        send_error(f"Searching embeddings at index {i}: {e}")
        continue

    try:
        if max(D[0]) < MIN_THRESHOLD:
            df_new_dataset = pd.concat([df_new_dataset, pd.DataFrame({
                "job_id": [id_to_job[str(i)][0]],
                "job_skill": [id_to_job[str(i)][1]],
                "skill_id": ["No"],
                "esco_skill_name": ["No"],
                "alternative_name": ["No"],
            })], ignore_index=True)
            continue
    except Exception as e:
        send_error(f"Threshold check or concatenation failed at index {i}: {e}")
        continue

    flag = False
    try:
        for score, idx in zip(D[0], I[0]):
            try:
                name  = df_jobs[df_jobs["Job_ID"] == id_to_job[str(i)][0]]["Title"].values[0]
                prompt = get_prompt([f"{id_to_skill[str(idx)][0]}: {id_to_skill[str(idx)][1]}" for idx in I[0]], id_to_job[str(i)][1], name)
                response = run_model(prompt)

                response.replace("//", "").strip()
            except Exception as e:
                df_new_dataset = pd.concat([df_new_dataset, pd.DataFrame({
                    "job_id": [id_to_job[str(i)][0]],
                    "job_skill": [id_to_job[str(i)][1]],
                    "skill_id": ["No"],
                    "esco_skill_name": ["No"],
                    "alternative_name": ["No"],
                    "prompt": ["No"]
                })], ignore_index=True)
                continue

            try:
                if response != "No" and (id_to_job[str(i)][0], id_to_skill[str(idx)][0]) not in new_dataset_set:
                    new_prompt = prompt + " | " + response
                    new_prompt = new_prompt.replace("\n", " ").replace("\t", " ").strip()
                    df_new_dataset = pd.concat([df_new_dataset, pd.DataFrame({
                        "job_id": [id_to_job[str(i)][0]],
                        "job_skill": [id_to_job[str(i)][1]],
                        "skill_id": [list(id_to_skill.keys())[idx]],
                        "esco_skill_name": [id_to_skill[str(idx)][0]],
                        "alternative_name": [id_to_skill[str(idx)][1]],
                        "prompt": [new_prompt]
                    })], ignore_index=True)

                    new_dataset_set.add((id_to_job[str(i)][0], id_to_skill[str(idx)][0]))
                    flag = True
                    break
            except Exception as e:
                send_error(f"Adding green skill at job index {i}, skill index {idx}: {e}")
                continue
    except Exception as e:
        send_error(f"Iterating through scores at index {i}: {e}")
        continue

    try:
        if not flag and (id_to_job[str(i)][1], "No") not in new_dataset_set:
            df_new_dataset = pd.concat([df_new_dataset, pd.DataFrame({
                "job_id": [id_to_job[str(i)][0]],
                "job_skill": [id_to_job[str(i)][1]],
                "skill_id": ["No"],
                "esco_skill_name": ["No"],
                "alternative_name": ["No"],
                "prompt": ["No"]
            })], ignore_index=True)
            new_dataset_set.add((id_to_job[str(i)][0], "No"))
    except Exception as e:
        df_new_dataset = pd.concat([df_new_dataset, pd.DataFrame({
            "job_id": [id_to_job[str(i)][0]],
            "job_skill": [id_to_job[str(i)][1]],
            "skill_id": ["No"],
            "esco_skill_name": ["No"],
            "alternative_name": ["No"],
            "prompt": ["No"]
         })], ignore_index=True)
        continue
    ctr += 1
    if ctr % SAVE_EVERY == 0:
        df_new_dataset.to_csv("../data/green_skills_with_GPT-5.csv", index=False)

df_new_dataset.to_csv("../data/green_skills_with_GPT-5.csv", index=False)

Classifying Green Skills:   4%|▍         | 2740/70318 [15:19:43<378:03:41, 20.14s/entry]


KeyboardInterrupt: 

In [130]:
df_new_dataset.to_csv("../data/green_skills_with_GPT-5.csv", index=False)