# Green Job Detection and Normalization
This notebooks aims to **detect** (know if a job contains green skills or not) and normalize (map the green skills from the *.csv* to the `ESCO` taxonomy).

In [1]:
import pandas as pd
import numpy as np
import faiss
import json 
from openai import OpenAI
from dotenv import load_dotenv
import os
from tqdm import tqdm
import jsonschema
from pydantic import BaseModel
import threading
from threading import Lock

from warning_management import send_warning
from warning_management import send_error

load_dotenv()

True

## Loading the indexes and mappings

Load indexes

In [2]:
# Load indexes
"""
@Returns:
    index_green_skills: FAISS index for green skills embeddings.
    id_to_skill: Dictionary mapping skill IDs to skill names.
    index_job_skills: FAISS index for job skills embeddings.
    id_to_job: Dictionary mapping job IDs to job names.
"""
def get_indexes_and_mappings() -> tuple[faiss.Index, dict[str, str], faiss.Index, dict[str, str]]:
    index_green_skills = faiss.read_index("../data/embeddings/esco_green_skills_text-embedding-3-large.index")
    index_job_skills = faiss.read_index("../data/embeddings/full_job_skills_embeddings.index")

    # Load mappings
    id_to_job = json.load(open("../data/mapping/id_to_jobs.json", "r"))
    id_to_skill = json.load(open("../data/mapping/id_to_skills.json", "r"))
    return index_green_skills, id_to_skill, index_job_skills, id_to_job


In [None]:
index_green_skills, id_to_skill, index_job_skills, id_to_job = get_indexes_and_mappings()

a, b = (index_green_skills.ntotal, index_green_skills.d,)
c, d = (index_job_skills.ntotal, index_job_skills.d,)

print(f"Green skills index contains {a} vectors of dimension {b}.")
print(f"Job skills index contains {c} vectors of dimension {d}.")

In [3]:
df_jobs = pd.read_csv("../data/full_dataset/jul24_to_jul_2025_cleaned_sorted.csv")
print("Full dataset shape:", df_jobs.shape)

Full dataset shape: (204372, 6)


## Normalization

## Hyperparameters
- `K_N`: number of nearest neighbors to retrieve from the index.
- `MIN_THRESHOLD`: minimum score threshold to consider an entry relevant.

In [4]:
# Modify this value in case of needing a different threshold for the 
# confidence of the green job detection
MIN_THRESHOLD = 0.1

# Modify this value in case of needing a different number of neighbors
K_N = 5

## Prompts
## Green Skill Classification Prompt
We will use the following prompt to detect and normalize green skills in job descriptions, it receives as arguments: 
- `k_closest`: a list of the `k` closest green skills from the `ESCO` taxonomy.
- `skill`: the skill extracted from the job description, and also the one we want to normalize.
- `job`: the name of the job.
It returns a list of dictionaries representing the messages to be sent to the model and the roles of each message.
* **system**: It is a system message that defines the role of the model.
* **user**: It is the user message that contains the actual prompt with the context and instructions.
* **developer**: It is the developer message that contains the instructions for the model's response format (in this case the pydantic class).

In [5]:
def get_green_skill_classification_prompt(k_closest: list[str], skill: str, job: str) -> list[dict]:
    system_prompt = """
You are an expert in identifying whether a skill can be mapped to a *green skill* in the ESCO taxonomy.
A green skill is defined as the abilities, values and attitudes needed to live in, develop and support
a society which reduces the impact of human activity on the environment.

See if the provided skill can be used to perform tasks that contribute to environmental sustainability.
Examples: 
- knowledge of vehicle cleaning standards and processes -> perform cleaning activities in an environmentally friendly way
- supervision de limpieza y preparacion de unidades -> perform cleaning activities in an environmentally friendly way
- experiencia en reparaciones y mantenimiento de unidades -> maintain concentrated solar power systems
- experiencia en pintura automotriz -> use environmental friendly materials
- control de procesos -> monitor manufacturing impact
- experiencia en proceso de gestion de inventario y ventas en el area de refacciones -> monitor ingredient storage
- conocimiento en logistica -> develop efficiency plans for logistics operations

Meaning that the skill can be used to carry out tasks that have a positive impact on the environment.
"""
    
    user_prompt = f"""
Determine if the following skill can be semantically matched to one of the provided green skills from ESCO.

Skill to classify: "{skill}"
Job context: "{job}"

Closest ESCO green skills (formatted as 'Main Name': 'Alternative Name'):
"""

    for k in k_closest:
        user_prompt += f"- {k}\n"

    user_prompt += """
Decide whether the skill can be reasonably mapped to one of the ESCO green skills above
based on meaning, context and if it contributes to environmental sustainability in any way.
"""

    developer_prompt = """
You must respond strictly following the response schema.
If the skill matches one of the ESCO green skills, return its 'Main Name'.
Otherwise, return 'No'.
"""

    return [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": user_prompt.strip()},
        {"role": "developer", "content": developer_prompt.strip()}
    ]

# Models for output parsing

## GreenSkillClassification
Model to parse the output of the green skill normalization.

In [6]:
class GreenSkillClassification(BaseModel):
    mapped_skill: str # The main name of the mapped ESCO green skill, or "No" if there is no match.

## Model
We will use the **gpt-4o** model from **OpenAI** for the normalization task.

In [7]:
load_dotenv()

# Load OpenAI key
API_KEY = os.getenv("API_KEY")
ENDPOINT = os.getenv("ENDPOINT")
DEPLOYMENT = os.getenv("DEPLOYMENT")    
API_VERSION = os.getenv("API_VERSION")

OPENAI_KEY = os.getenv("OPENAI_KEY")

# Source: https://platform.openai.com/docs/guides/structured-outputs
# Function to run the model with output parsing
# Load OpenAI key

""" def run_model(prompt: list[str], client: OpenAI) -> str:
    response = client.responses.parse(
        model="gpt-4o",
        input=[
            {"role": msg["role"], "content": msg["content"]} for msg in prompt
        ],
        text_format=GreenSkillClassification,
    )
    return response.output_parsed.mapped_skill """


def run_model(prompt: list[str], client: OpenAI) -> str:
    response = client.chat.completions.create(
        model=DEPLOYMENT,  
        messages=[
            {"role": msg["role"], "content": msg["content"]} for msg in prompt
        ]
    )
    return response.choices[0].message.content.strip()

def createClient() -> OpenAI:
    return OpenAI(
        base_url=f"{ENDPOINT}openai/deployments/{DEPLOYMENT}/",
        api_key=API_KEY,
        default_query={"api-version": API_VERSION},
        default_headers={"api-key": API_KEY},
    )


""" def createClient() -> OpenAI:
    return OpenAI (api_key=OPENAI_KEY) """


' def createClient() -> OpenAI:\n    return OpenAI (api_key=OPENAI_KEY) '

In [None]:
# Test run of the model

D, I = index_green_skills.search(np.array([index_job_skills.reconstruct(0)]), K_N)
k_closest = [f"\t{id_to_skill[str(idx)][0]}: {id_to_skill[str(idx)][1]}" for idx in I[0]]
name  = df_jobs[df_jobs["Job_ID"] == id_to_job[str(0)][0]]["Title"].values[0]
skill = id_to_job[str(0)][1]
prompt = get_green_skill_classification_prompt(k_closest, skill, name)
response = run_model(prompt, createClient())
print("Model response:", response)
print(type(response))
print(response)

## Prompt generator examples
An entry used to generate sample prompts to test the prompt generator function.

In [28]:
FROM, TO = 0, 100

with open("../test/example_prompts.txt", "w") as f:
    index_green_skills, id_to_skill, index_job_skills, id_to_job = get_indexes_and_mappings()
    for i in range(FROM, TO):
        D, I = index_green_skills.search(np.array([index_job_skills.reconstruct(i)]), K_N)
    
        k_closest = [f"\t{id_to_skill[str(idx)][0]}: {id_to_skill[str(idx)][1]}" for idx in I[0]]
        name  = df_jobs[df_jobs["Job_ID"] == id_to_job[str(i)][0]]["Title"].values[0]
        
        skill = id_to_job[str(i)][1]
        prompt = get_green_skill_classification_prompt(k_closest, skill, name)
        for msg in prompt:
            f.write(f"{msg['role'].upper()}:\n{msg['content']}\n\n")
        f.write("-----\n\n")

## Normalization process
The following code cell shows how the normalization process is done for every job skill in the dataset. The steps are:
1. For every i-th job, get the top `K_N` nearest neighbors to the i-th job skill, comparing them with the `ESCO` green skill taxonomy using **cosine similarity** as distance metric.
2. If the **max score** among the `K_N` neighbors is greater or equal than `MIN_THRESHOLD`, then we consider that the job **may** contains green skills, and we proceed to normalize it using the prompt defined before and the language model.
3. We run the prompt, were we could get two types of answers:
   - The skill has been normalized to one of the `k` closest green skills, in that case it returns the **normalized skill**.
   - The skill could not be normalized to any of the `k` closest green skills, in that case it returns **No**.
4. We save the entry in a new dataframe, and also keep a set to avoid duplicates (i.e. the same job skill being normalized to the same green skill more than once, which would lead to counting it multiple times when analyzing the results).

In this case, we have to use **threads** in order to speed up the process, since we have to process a lot of entries and the model calls are time consuming.

At first, every entry was taking around **20 seconds**, but with threads we were able to reduce it to around **2-3 seconds** per entry, which is a significant improvement.
> WARNING: 20 threads crashed my system due to high memory usage (Ryzen 7 3800x, 32GB RAM), so I had to reduce it to 10 threads. Be careful when setting this parameter.

In [7]:
"""  
Main task function to process a partition of job skills, classify them using the language model,
and save the results to a CSV file.
@Params:
    left: Left index of the partition (inclusive).
    right: Right index of the partition (exclusive).
    partition_id: Identifier for the partition (used in the output file name).
    global_set: A shared set to track already processed (job_id, skill_id) pairs which prevent overcounting.
    save_every: Number of processed entries after which to save intermediate results. (default is 50)
"""

def write_log_string(message: str, ERROR_FILE_NAME: str) -> str:
    with open(ERROR_FILE_NAME, "a") as f:
        f.write(message + "\n")
    return message

def task(left : int, right : int, partition_id: int, global_set: set[str], save_every: int = 50):
    FILE_NAME = f"../data/green_skill_classification/full_green_skills_with_GPT-4_part_{partition_id}.csv"
    ERROR_FILE_NAME = f"../data/logs/green_skill_classification/errors_part_{partition_id}.log"
    CLIENT = createClient()

    # Use a python array here because it was faster than a pandas dataframe for appending
    df_new_dataset = []

    # Every threads gets their own indexes and mappings in order to avoid conflicts
    index_green_skills, id_to_skill, index_job_skills, id_to_job = get_indexes_and_mappings()

    ctr = 0
    for i in range(left, right):
        try:
            # Search for the k nearest green skills for the job skill at index i
            D, I = index_green_skills.search(np.array([index_job_skills.reconstruct(i)]), K_N)
        except Exception as e:
            send_error(f"Searching embeddings at index {i}: {e}")
            continue

        try:
            # Check if the maximum similarity score is below the minimum threshold
            # In that case, we directly add a "No" entry for this job skill, since it is not similar enough to any green skill
            if max(D[0]) < MIN_THRESHOLD:
                df_new_dataset.append({
                    "job_id": id_to_job[str(i)][0],
                    "job_skill": id_to_job[str(i)][1],
                    "month": id_to_job[str(i)][2],
                    "year": id_to_job[str(i)][3],
                    "skill_id": "No",
                    "esco_skill_name": "No",
                    "alternative_name": "No",
                    "prompt": "No"})
                continue
        except Exception as e:
            send_error(f"Threshold check or concatenation failed at index {i}: {e}")
            continue

        # Iterate through the k closest green skills and classify using the language model
        flag = False # Flag to indicate if a mapping was found
        try:
            # Iterate through the k closest green skills (score, idx)
            for score, idx in zip(D[0], I[0]):
                try:
                    name  = df_jobs[df_jobs["Job_ID"] == id_to_job[str(i)][0]]["Title"].values[0]
                    prompt = get_green_skill_classification_prompt([f"{id_to_skill[str(idx)][0]}: {id_to_skill[str(idx)][1]}" for idx in I[0]], id_to_job[str(i)][1], name)
                    response = run_model(prompt, CLIENT)
                except Exception as e:
                    write_log_string(f"Cannot generate prompt or run model at job index {i}, skill index {idx}: {e}", ERROR_FILE_NAME)

                    df_new_dataset.append({
                        "job_id": id_to_job[str(i)][0],
                        "job_skill": id_to_job[str(i)][1],
                        "month": id_to_job[str(i)][2],
                        "year": id_to_job[str(i)][3],
                        "skill_id": "No",
                        "esco_skill_name": "No",
                        "alternative_name": "No",
                        "prompt": "No"})
                    break;

                try:
                    if response != "No" and (id_to_job[str(i)][0], id_to_skill[str(idx)][0]) not in global_set:
                        new_prompt = str(prompt) + " | " + response
                        new_prompt = new_prompt.replace("\n", " ").replace("\t", " ").strip()

                        df_new_dataset.append({
                            "job_id": id_to_job[str(i)][0],
                            "job_skill": id_to_job[str(i)][1],
                            "month": id_to_job[str(i)][2],
                            "year": id_to_job[str(i)][3],
                            "skill_id": list(id_to_skill.keys())[idx],
                            "esco_skill_name": id_to_skill[str(idx)][0],
                            "alternative_name": id_to_skill[str(idx)][1],
                            "prompt": new_prompt})
                        with lock:
                            global_set.add((id_to_job[str(i)][0], id_to_skill[str(idx)][0]))
                        flag = True
                        break
                except Exception as e:
                    send_error(f"Adding green skill at job index {i}, skill index {idx}: {e}")
                    continue
        except Exception as e:
            send_error(f"Iterating through scores at index {i}: {e}")
            continue

        try:
            if not flag and (id_to_job[str(i)][1], "No") not in global_set:                
                new_prompt = str(prompt) + " | " + response
                new_prompt = new_prompt.replace("\n", " ").replace("\t", " ").strip()
                df_new_dataset.append({
                    "job_id": id_to_job[str(i)][0],
                    "job_skill": id_to_job[str(i)][1],
                    "month": id_to_job[str(i)][2],
                    "year": id_to_job[str(i)][3],
                    "skill_id": "No",
                    "esco_skill_name": "No",
                    "alternative_name": "No",
                    "prompt": new_prompt})
                with lock:
                    global_set.add((id_to_job[str(i)][0], "No"))
        except Exception as e:
            df_new_dataset.append({
                    "job_id": id_to_job[str(i)][0],
                    "job_skill": id_to_job[str(i)][1],
                    "month": id_to_job[str(i)][2],
                    "year": id_to_job[str(i)][3],
                    "skill_id": "No",
                    "esco_skill_name": "No",
                    "alternative_name": "No",
                    "prompt": "No"})
            continue
        ctr += 1
        if ctr % save_every == 0:
            pd.DataFrame(df_new_dataset).to_csv(FILE_NAME, index=False)
        with lock:
            pbar.update(1)
    pd.DataFrame(df_new_dataset).to_csv(FILE_NAME, index=False)

Modified task for indexed task

In [1]:
def index_task(index_list: list[int], partition_id: int, global_set: set[str], save_every: int = 50):
    """
    Main task function to process a list of dataframe indices, classify them using the language model,
    and save the results to a CSV file.
    @Params:
        index_list: List of actual dataframe indices (not positional).
        partition_id: Identifier for the partition (used in the output file name).
        global_set: Shared set to track already processed (job_id, skill_id) pairs.
        save_every: Number of processed entries after which to save intermediate results.
    """
    FILE_NAME = f"../data/green_skill_classification/full_green_skills_with_GPT-4_part_{partition_id}_2.csv"
    CLIENT = createClient()

    df_new_dataset = []

    # Each thread gets its own local mappings
    index_green_skills, id_to_skill, index_job_skills, id_to_job = get_indexes_and_mappings()

    ctr = 0
    for idx in index_list:  # now iterating over absolute dataframe indices
        try:
            # Attempt to reconstruct vector at index `idx`
            D, I = index_green_skills.search(np.array([index_job_skills.reconstruct(idx)]), K_N)
        except Exception as e:
            send_error(f"Searching embeddings at index {idx}: {e}")
            continue

        try:
            if max(D[0]) < MIN_THRESHOLD:
                df_new_dataset.append({
                    "job_id": id_to_job[str(idx)][0],
                    "job_skill": id_to_job[str(idx)][1],
                    "skill_id": "No",
                    "esco_skill_name": "No",
                    "alternative_name": "No",
                    "prompt": "No"})
                continue
        except Exception as e:
            send_error(f"Threshold check failed at index {idx}: {e}")
            continue

        flag = False
        try:
            for score, jdx in zip(D[0], I[0]):
                try:
                    name = df_jobs[df_jobs["Job_ID"] == id_to_job[str(idx)][0]]["Title"].values[0]
                    prompt = get_green_skill_classification_prompt(
                        [f"{id_to_skill[str(k)][0]}: {id_to_skill[str(k)][1]}" for k in I[0]],
                        id_to_job[str(idx)][1],
                        name
                    )
                    response = run_model(prompt, CLIENT)
                except Exception as e:
                    send_error(f"Cannot run model at job index {idx}, skill index {jdx}: {e}")
                    df_new_dataset.append({
                        "job_id": id_to_job[str(idx)][0],
                        "job_skill": id_to_job[str(idx)][1],
                        "skill_id": "No",
                        "esco_skill_name": "No",
                        "alternative_name": "No",
                        "prompt": "No"})
                    break

                try:
                    if response != "No" and (id_to_job[str(idx)][0], id_to_skill[str(jdx)][0]) not in global_set:
                        new_prompt = f"{prompt} | {response}".replace("\n", " ").replace("\t", " ").strip()
                        df_new_dataset.append({
                            "job_id": id_to_job[str(idx)][0],
                            "job_skill": id_to_job[str(idx)][1],
                            "skill_id": list(id_to_skill.keys())[jdx],
                            "esco_skill_name": id_to_skill[str(jdx)][0],
                            "alternative_name": id_to_skill[str(jdx)][1],
                            "prompt": new_prompt})
                        with lock:
                            global_set.add((id_to_job[str(idx)][0], id_to_skill[str(jdx)][0]))
                        flag = True
                        break
                except Exception as e:
                    send_error(f"Adding skill failed at job {idx}, skill {jdx}: {e}")
                    continue
        except Exception as e:
            send_error(f"Iterating through scores at index {idx}: {e}")
            continue

        try:
            if not flag and (id_to_job[str(idx)][1], "No") not in global_set:
                new_prompt = f"{prompt} | {response}".replace("\n", " ").replace("\t", " ").strip()
                df_new_dataset.append({
                    "job_id": id_to_job[str(idx)][0],
                    "job_skill": id_to_job[str(idx)][1],
                    "skill_id": "No",
                    "esco_skill_name": "No",
                    "alternative_name": "No",
                    "prompt": new_prompt})
                with lock:
                    global_set.add((id_to_job[str(idx)][0], "No"))
        except Exception as e:
            df_new_dataset.append({
                "job_id": id_to_job[str(idx)][0],
                "job_skill": id_to_job[str(idx)][1],
                "skill_id": "No",
                "esco_skill_name": "No",
                "alternative_name": "No",
                "prompt": "No"})
            continue

        ctr += 1
        if ctr % save_every == 0:
            pd.DataFrame(df_new_dataset).to_csv(FILE_NAME, index=False)
        with lock:
            pbar.update(1)

    pd.DataFrame(df_new_dataset).to_csv(FILE_NAME, index=False)


## Globar parameters for threading
- `NUMBER_OF_PARTITIONS`: number of partitions to divide the dataset into, each partition will be processed by a different thread.
- `TOTAL_ENTRIES`: total number of entries in the dataset.

In [8]:
from threading import Lock
from tqdm import tqdm

_, _, job_skills, _ = get_indexes_and_mappings()

TOTAL_ENTRIES = job_skills.ntotal
NUMBER_OF_PARTITIONS = 24
LEFT_BOUND = 0 

"""  
Generate partition limits for dividing the dataset into multiple threads.
@Params:
    total_entries: Total number of entries in the dataset.
    number_of_partitions: Number of partitions to divide the dataset into.
    left_bound: Starting index for processing (default = 0).
@Returns:
    List of tuples representing the (left, right) limits for each partition.
"""
def generate_limits(total_entries: int, number_of_partitions: int, left_bound: int = 0):
    effective_total = total_entries - left_bound
    entries_per_partition = effective_total // number_of_partitions
    partitions = [
        (left_bound + i * entries_per_partition, left_bound + (i + 1) * entries_per_partition)
        for i in range(number_of_partitions)
    ]
    partitions[-1] = (partitions[-1][0], total_entries)
    return partitions

global_set = set()

partition_limits = generate_limits(TOTAL_ENTRIES, NUMBER_OF_PARTITIONS, LEFT_BOUND)

pbar = tqdm(total=TOTAL_ENTRIES - LEFT_BOUND, desc="Overall Progress", position=0)
lock = Lock()


Overall Progress:   0%|          | 0/204372 [00:00<?, ?it/s]

In [9]:
print(partition_limits)
print(job_skills.ntotal)

[(0, 8515), (8515, 17030), (17030, 25545), (25545, 34060), (34060, 42575), (42575, 51090), (51090, 59605), (59605, 68120), (68120, 76635), (76635, 85150), (85150, 93665), (93665, 102180), (102180, 110695), (110695, 119210), (119210, 127725), (127725, 136240), (136240, 144755), (144755, 153270), (153270, 161785), (161785, 170300), (170300, 178815), (178815, 187330), (187330, 195845), (195845, 204372)]
204372


## Running the threading process

In [None]:
threads = []

MAX_THREADS = 6

for i, (left, right) in enumerate(partition_limits):
    thread = threading.Thread(target=task, args=(left, right, i, global_set))
    threads.append(thread)
    thread.start()

    if (i + 1) % MAX_THREADS == 0 or (i + 1) == len(partition_limits):
        for thread in threads:
            thread.join()
        threads = []

Overall Progress:   1%|          | 2020/204372 [27:18<39:20:19,  1.43it/s] 2025-11-05 02:15 - ERROR - Iterating through scores at index 25873: [Errno 2] No such file or directory: '../data/logs/green_skill_classification/errors_part_3.log'
2025-11-05 02:15 - ERROR - Iterating through scores at index 331: [Errno 2] No such file or directory: '../data/logs/green_skill_classification/errors_part_0.log'
Overall Progress:   1%|          | 2022/204372 [27:21<55:49:47,  1.01it/s]2025-11-05 02:15 - ERROR - Iterating through scores at index 17373: [Errno 2] No such file or directory: '../data/logs/green_skill_classification/errors_part_2.log'
Overall Progress:   1%|          | 2026/204372 [27:23<42:14:50,  1.33it/s]2025-11-05 02:15 - ERROR - Iterating through scores at index 8869: [Errno 2] No such file or directory: '../data/logs/green_skill_classification/errors_part_1.log'
Overall Progress:   1%|          | 2173/204372 [29:19<23:55:24,  2.35it/s] 2025-11-05 02:17 - ERROR - Iterating through 

Join the results from every thread into a single dataframe and save it as a *.csv* file.

In [8]:
dataframe = pd.DataFrame(columns=["job_id", "job_skill", "skill_id" , "month", "year","esco_skill_name", "alternative_name", "prompt"])
FOLDER = "../data/green_skill_classification"

folders = []

for file in os.listdir(FOLDER):
    if file.endswith(".csv"):
        folders.append(os.path.join(FOLDER, file))

folders.sort()
for file in folders:
    df_part = pd.read_csv(file)
    dataframe = pd.concat([dataframe, df_part], ignore_index=True)

  dataframe = pd.concat([dataframe, df_part], ignore_index=True)


In [9]:
dataframe.to_csv("../data/green_skill_classification/green_skills_with_GPT-4_full_dataset.csv", index=False)

In [None]:
df_full = pd.read_csv("../data/full_dataset/jul24_to_jul_2025_cleaned_sorted.csv")
df_green_skills = pd.read_csv("../data/green_skill_classification/green_skills_with_GPT-4_full_dataset.csv")

print("Full dataset shape:", df_full.shape)
print("Green skills dataset shape:", df_green_skills.shape)

print(df_full.head())
print(df_green_skills.head())

Full dataset shape: (204372, 6)
Green skills dataset shape: (196019, 8)
                   Title                Job_ID  source  \
0  auxiliar de mostrador  job_624fd0a8f34770fb  indeed   
1  auxiliar de mostrador  job_624fd0a8f34770fb  indeed   
2  auxiliar de mostrador  job_624fd0a8f34770fb  indeed   
3  auxiliar de mostrador  job_624fd0a8f34770fb  indeed   
4  auxiliar de mostrador  job_624fd0a8f34770fb  indeed   

                                        Skills  month  year  
0           experiencia en atencion al cliente    7.0  2024  
1  gusto por el servicio y atencion al cliente    7.0  2024  
2                       preparatoria terminada    7.0  2024  
3                                mayor de anos    7.0  2024  
4          experiencia en empleos presenciales    7.0  2024  
                 job_id                                    job_skill skill_id  \
0  job_624fd0a8f34770fb           experiencia en atencion al cliente       No   
1  job_624fd0a8f34770fb  gusto por el servici

In [13]:
df_full_without_no = df_green_skills[df_green_skills["skill_id"] != "No"]
df_full_without_no.to_csv("../data/green_skill_classification/green_skills_with_GPT-4_full_dataset_no_no.csv", index=False)

In [None]:
df_full = df_full.copy()
df_full["__original_index__"] = df_full.index

df_merged = pd.merge(
    df_full,
    df_green_skills,
    how="left",
    left_on=["Job_ID", "Skills", "month", "year"],
    right_on=["job_id", "job_skill", "month", "year"],
    indicator=True
)

df_missing = df_merged[df_merged["_merge"] == "left_only"]
df_missing = df_missing.set_index("__original_index__")
df_missing = df_missing[df_full.columns.drop("__original_index__")]

print("Missing entries shape:", df_missing.shape)


Missing entries shape: (8615, 6)
