# Green Job Detection and Normalization
This notebooks aims to **detect** (know if a job contains green skills or not) and normalize (map the green skills from the *.csv* to the `ESCO` taxonomy).

In [24]:
import pandas as pd
import numpy as np
import faiss
import json 


In [10]:
index = faiss.read_index("../data/embeddings/job_skills_embeddings.index")
a, b = (index.ntotal, index.d,)

print(f"Index contains {a} vectors of dimension {b}.")

Index contains 70318 vectors of dimension 3072.


In [19]:
id_to_text = json.load(open("../data/mapping/id_to_skill.json", "r"))
ctr = 0

while ctr < 5:
    print(id_to_text[str(ctr)])
    ctr+=1

['train staff to reduce food waste', 'teach students food waste reduction practices']
['train staff to reduce food waste', 'inform staff on food waste reduction practices']
['train staff to reduce food waste', 'educate workers on food recycling methods']
['train staff to reduce food waste', 'educate staff on food waste reduction']
['develop energy saving concepts', 'create concepts for energy saving']


Load indexes

In [23]:
index_green_skills = faiss.read_index("../data/embeddings/esco_green_skills_text-embedding-3-large.index")
index_job_skills = faiss.read_index("../data/embeddings/job_skills_embeddings.index")

a, b = (index_green_skills.ntotal, index_green_skills.d,)
c, d = (index_job_skills.ntotal, index_job_skills.d,)

print(f"Green skills index contains {a} vectors of dimension {b}.")
print(f"Job skills index contains {c} vectors of dimension {d}.")

Green skills index contains 3673 vectors of dimension 3072.
Job skills index contains 70318 vectors of dimension 3072.


## Normalization

Steps for normalization process: 
1. For every i-th job, get the top `k` nearest neighbors (`k` tweakable, using **cosine similarity** as distance metric) comparing them with the `ESCO` green skill taxonomy, in order to determine if it has green skills or not.
2. Go through the `k` values, if the score is greater or equal than `THRESHOLD`, then we add it to the dataframe.

In [69]:
# Modify this value in case of needing a different threshold for the 
# confidence of the green job detection
THRESHOLD = 0.5

# Modify this value in case of needing a different number of neighbors
K_N = 3

In [None]:
# Load indexes
index_green_skills = faiss.read_index("../data/embeddings/esco_green_skills_text-embedding-3-large.index")
index_job_skills = faiss.read_index("../data/embeddings/job_skills_embeddings.index")

# Load mappings
id_to_job = json.load(open("../data/mapping/id_to_job.json", "r"))
id_to_skill = json.load(open("../data/mapping/id_to_skill.json", "r"))


In [120]:
# Dataframe to store the results
df_results = pd.DataFrame(columns=["job_id", "skill_id", "similarity_score"])

"""
The same job can have multiple skills that are green, so we use a set to avoid duplicates
of the form (job_id, skill_id), this way we ensure that each job-skill pair is unique in the results.
"""
job_skill_set = set()

for i in range(0, index_job_skills.ntotal):
    D, I = index_green_skills.search(np.array([index_job_skills.reconstruct(i)]), K_N)
    
    for score, idx in zip(D[0], I[0]):
        if score >= THRESHOLD and (id_to_job[str(i)][0], id_to_skill[str(idx)][0]) not in job_skill_set:
            df_results = pd.concat([df_results, pd.DataFrame({"job_id": str(id_to_job[str(i)]), 
                                                              "skill_id": list(id_to_skill.keys())[idx], 
                                                              "similarity_score": [score]})], ignore_index=True)
            job_skill_set.add((id_to_job[str(i)][0], id_to_skill[str(idx)][0]))

  df_results = pd.concat([df_results, pd.DataFrame({"job_id": str(id_to_job[str(i)]),


In [None]:
pd.set_option('display.max_colwidth', None) 

df_results_copy = df_results.copy()

# Eliminate brackets and quotes from the job_id and skill_id columns
# ['identifier'] -> identifier

df_results_copy["job_id"] = df_results_copy["job_id"].apply(lambda x: x[2: len(x)-2])
df_results_copy.to_csv("../data/green_jobs_normalized.csv", index=False)