<a href="https://colab.research.google.com/github/JYP97/DS2_Proj_Jobs_skills_analysis/blob/master/AutomaticOntologyMatching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Loading datasets

In [None]:
# Mount Google Drive at /content/drive/ (Colab-only API); later cells read
# and write files under /content/drive/MyDrive/DS2/.
from google.colab import drive
drive.mount('/content/drive/')
import pandas as pd
import numpy as np
import itertools
import spacy

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Processing job dataset
#
# Each entry of the `skills` column is a single ', '-separated string of
# skill names. Split every entry, flatten the result, and keep the sorted
# set of distinct skill names in `job_skills`.

job = pd.read_json('/content/drive/MyDrive/DS2/career_builder_jobs_10501.json') # Read US jobs dataset

# Skip entries that are not strings (e.g. NaN/None for postings without any
# listed skills); calling `.split` on them would raise AttributeError.
skills = [
    description.split(', ')
    for description in job['skills']
    if isinstance(description, str)
]
flat_list = list(itertools.chain.from_iterable(skills))
flat_array = np.array(flat_list)
job_skills = np.unique(flat_array)  # sorted, de-duplicated skill names

In [None]:
# Build a one-column frame of the distinct job skills and drop blank names.
#
# The result of `replace` is assigned back to the column rather than using
# `inplace=True` on the `df_dataset['skill']` selection: mutating a column
# selection in place is chained assignment, which pandas warns about and
# which does not update the parent frame when Copy-on-Write is enabled.
df_dataset = pd.DataFrame(data=job_skills, columns=['skill'])
df_dataset['skill'] = df_dataset['skill'].replace('', np.nan)
# Re-number rows 0..n-1 so later cells can use positions and labels alike.
df_dataset = df_dataset.dropna(subset=['skill']).reset_index(drop=True)

display(df_dataset[0:30])

Unnamed: 0,skill
0,(American Society For Quality) ASQ Certified
1,.NET Framework
2,24/7 Services
3,3D Modeling
4,A/B Testing
5,AC Power Plugs And Sockets
6,ADO.NET (Programming Language)
7,AMAP
8,AMT Asset Management Software
9,ANCC Certified


In [None]:
# Processing esco dataset
#
# Load the ESCO skills hierarchy, restrict it to the 'skills' branch, and
# collect the distinct preferred terms found at each hierarchy level.

esco = pd.read_csv('/content/drive/MyDrive/DS2/skillsHierarchy_en.csv')

# Keep only the rows whose Level 0 term is 'skills'.
in_skills_branch = esco['Level 0 preferred term'].isin(['skills'])
esco = esco[in_skills_branch]

# Drop the URI and free-text columns; only the Level 1-3 preferred terms remain.
unused_columns = [
    'Level 0 URI',
    'Level 0 preferred term',
    'Level 1 URI',
    'Level 2 URI',
    'Level 3 URI',
    'Description',
    'Scope note',
]
esco = esco.drop(columns=unused_columns)

esco_l1 = esco['Level 1 preferred term']
esco_l2 = esco['Level 2 preferred term']
esco_l3 = esco['Level 3 preferred term']

# Distinct terms per level, in order of first appearance.
l1_skills, l2_skills, l3_skills = (
    level.unique() for level in (esco_l1, esco_l2, esco_l3)
)

# Only the Level 3 terms are used as matching targets.
esco_skills = np.array(l3_skills)

In [None]:
# Build a one-column frame of the ESCO target skills and drop blank or
# missing names.
#
# The result of `replace` is assigned back to the column rather than using
# `inplace=True` on the `df_esco['skill']` selection: mutating a column
# selection in place is chained assignment, which pandas warns about and
# which does not update the parent frame when Copy-on-Write is enabled.
df_esco = pd.DataFrame(data=esco_skills, columns=['skill'])
df_esco['skill'] = df_esco['skill'].replace('', np.nan)
# Re-number rows 0..n-1 so a row position is also a valid index label.
df_esco = df_esco.dropna(subset=['skill']).reset_index(drop=True)

display(df_esco)

Unnamed: 0,skill
0,"communication, collaboration and creativity"
1,negotiating
2,negotiating and managing contracts and agreements
3,mediating and resolving disputes
4,responding to complaints
...,...
294,maintaining mechanical equipment
295,"installing, maintaining and repairing electric..."
296,"installing and repairing electrical, electroni..."
297,"maintaining electrical, electronic and precisi..."


# 2. BERT embeddings generation and similarity matching

## 2.1 Load Transformer (BERT)

In [None]:
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## 2.2 Generate BERT embeddings of ESCO dataset & job dataset

In [None]:
# Embed every ESCO skill label in one batched call instead of one
# `model.encode` call per string; `encode` accepts a list of sentences and
# returns one embedding row per input, in input order.
# NOTE(review): batching pads sentences together, so values may differ from
# the per-string results in the last floating-point digits.
esco_embeddings = np.asarray(model.encode(df_esco['skill'].tolist()))

In [None]:
# Sanity check: expect one embedding row per ESCO skill in `df_esco`.
esco_embeddings.shape

(299, 768)

In [None]:
# Embed every job skill in one batched call instead of one `model.encode`
# call per string; `encode` accepts a list of sentences and returns one
# embedding row per input, in input order.
# NOTE(review): batching pads sentences together, so values may differ from
# the per-string results in the last floating-point digits.
job_embeddings = np.asarray(model.encode(df_dataset['skill'].tolist()))

In [None]:
# Sanity check: expect one embedding row per job skill in `df_dataset`.
job_embeddings.shape

(4712, 768)

## 2.3 Compute cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Assign to every job skill the ESCO Level 3 skill whose embedding has the
# highest cosine similarity.
#
# The full (n_job_skills, n_esco_skills) similarity matrix is computed in a
# single call, which avoids hard-coding the embedding width and replaces the
# per-row `.loc` writes with one column assignment.
similarity = cosine_similarity(job_embeddings, esco_embeddings)

# Position of the best-scoring ESCO skill for each job skill (first wins on ties).
best_match = similarity.argmax(axis=1)

# Index positionally into the ESCO labels so the lookup does not depend on
# `df_esco` having a 0..n-1 index.
df_dataset['Level 3 Ontology'] = df_esco['skill'].to_numpy()[best_match]

In [None]:
# Spot-check 50 random matches; no random_state is passed, so the rows shown
# differ from run to run.
display(df_dataset.sample(50))

Unnamed: 0,skill,Level 3 Ontology
4183,Swaption,negotiating
305,Audit Info System (AIS),managing information
2312,Irrigation (Health),developing health programmes
1404,Earned Value Management,determining values of goods or services
2181,Information Governance,managing information
845,Colorimetric Analysis,measuring dimensions and related properties
1546,Environmental Engineering,monitoring environmental conditions
1859,Genomics,developing research plans
1421,Educational Technologies,developing educational programmes
3640,Resource Conservation And Recovery Act (RCRA),developing contingency and emergency response ...


In [None]:
# Inspect the filtered ESCO hierarchy (Level 1-3 preferred terms only).
display(esco)

Unnamed: 0,Level 1 preferred term,Level 2 preferred term,Level 3 preferred term
0,,,
1,"communication, collaboration and creativity",,
2,"communication, collaboration and creativity","communication, collaboration and creativity",
3,"communication, collaboration and creativity","communication, collaboration and creativity","communication, collaboration and creativity"
4,"communication, collaboration and creativity",negotiating,
...,...,...,...
378,working with machinery and specialised equipment,"installing, maintaining and repairing electric...","installing, maintaining and repairing electric..."
379,working with machinery and specialised equipment,"installing, maintaining and repairing electric...","installing and repairing electrical, electroni..."
380,working with machinery and specialised equipment,"installing, maintaining and repairing electric...","maintaining electrical, electronic and precisi..."
381,working with machinery and specialised equipment,operating aircraft,


In [None]:
# Check how many hierarchy rows carry this Level 3 term: the first number of
# the printed shape is the count of matching rows.
print(esco[esco['Level 3 preferred term'] == 'mediating and resolving disputes'].values.shape)

(1, 3)


In [51]:
# Attach the Level 2 and Level 1 ancestors of each matched Level 3 skill.
#
# A lookup table keyed by Level 3 term is built once and applied with `.map`,
# instead of scanning the whole `esco` frame for every row of `df_dataset`.
# When a Level 3 term occurs on several hierarchy rows the first one is used,
# the same row the previous `[0, ...]` indexing selected.
level3_parents = (
    esco.dropna(subset=['Level 3 preferred term'])
        .drop_duplicates(subset='Level 3 preferred term', keep='first')
        .set_index('Level 3 preferred term')
)

# A term missing from the hierarchy now yields NaN instead of raising
# IndexError part-way through the loop.
df_dataset['Level 2 Ontology'] = df_dataset['Level 3 Ontology'].map(
    level3_parents['Level 2 preferred term']
)
df_dataset['Level 1 Ontology'] = df_dataset['Level 3 Ontology'].map(
    level3_parents['Level 1 preferred term']
)
        

In [52]:
# Count the non-null entries of every other column per Level 1 category,
# i.e. how many job skills were mapped to each top-level group.
df_dataset.groupby(['Level 1 Ontology']).count()

Unnamed: 0_level_0,skill,Level 3 Ontology,Level 2 Ontology
Level 1 Ontology,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
assisting and caring,258,258,258
"communication, collaboration and creativity",1021,1021,1021
constructing,197,197,197
handling and moving,414,414,414
information skills,1378,1378,1378
management skills,535,535,535
working with computers,409,409,409
working with machinery and specialised equipment,500,500,500


In [55]:
# Spot-check 20 random rows with all three ontology levels; no random_state
# is passed, so the rows shown differ from run to run.
display(df_dataset.sample(20))

Unnamed: 0,skill,Level 3 Ontology,Level 2 Ontology,Level 1 Ontology
3954,Smoothing,cleaning,cleaning,handling and moving
4236,Target Costing,estimating resource needs,calculating and estimating,information skills
2797,Modems,operating communications equipment,using precision instrumentation and equipment,working with machinery and specialised equipment
1078,Cryptography,resolving computer problems,setting up and protecting computer systems,working with computers
3098,Patient Education,providing medical advice,advising and consulting,"communication, collaboration and creativity"
737,Chemical Ionization,"operating petroleuem, chemical or water proces...",operating machinery for the extraction and pro...,working with machinery and specialised equipment
4412,Traumatic Brain Injury,reporting incidents and defects,documenting and recording information,information skills
2375,Kidneys,operating medical equipment,using precision instrumentation and equipment,working with machinery and specialised equipment
3910,Shopper Marketing,selling products or services,"promoting, selling and purchasing","communication, collaboration and creativity"
938,Conceptual Design,constructing,constructing,constructing


In [56]:
# Persist the annotated skills table to Google Drive.
# NOTE(review): `index` is left at its default, so the row index is presumably
# written as an unnamed first column; confirm downstream readers expect that.
df_dataset.to_csv('/content/drive/MyDrive/DS2/job_with_ontology.csv')