<a href="https://colab.research.google.com/github/JYP97/DS2_Proj_Jobs_skills_analysis/blob/master/AutomaticLabel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading the datasets

In [1]:
# Standard library
import itertools

# Third-party
import numpy as np
import pandas as pd
import spacy
from google.colab import drive

# Mount Google Drive at /content/drive/ (Colab-only; prompts for authorization).
drive.mount('/content/drive/')

Mounted at /content/drive/




In [3]:
# Processing job dataset

# Read the US jobs dataset from the mounted Drive folder.
job = pd.read_json('/content/drive/MyDrive/DS2/career_builder_jobs_10501.json')

# Each 'skills' entry is a ', '-separated string; split every entry into a
# list of individual skill names.
skills = [description.split(', ') for description in job['skills']]

# Flatten the per-posting lists into one sequence, then keep the distinct
# skill names (np.unique also returns them sorted).
flat_list = list(itertools.chain.from_iterable(skills))
flat_array = np.array(flat_list)
job_skills = np.unique(flat_array)

In [45]:
# Wrap the unique job skills in a one-column DataFrame.
df_dataset = pd.DataFrame(data=job_skills, columns=['skill'])

# Turn empty strings into NaN so dropna() can remove them. The result is
# assigned back rather than using replace(..., inplace=True) on the column
# selection: that form is chained assignment through an inplace method, which
# warns on recent pandas and does not modify the frame under Copy-on-Write.
df_dataset['skill'] = df_dataset['skill'].replace('', np.nan)

# Drop the blank entries and renumber rows 0..n-1; built as a new frame so
# re-running the cell always yields the same result.
df_dataset = df_dataset.dropna(subset=['skill']).reset_index(drop=True)

display(df_dataset[0:30])

Unnamed: 0,skill
0,(American Society For Quality) ASQ Certified
1,.NET Framework
2,24/7 Services
3,3D Modeling
4,A/B Testing
5,AC Power Plugs And Sockets
6,ADO.NET (Programming Language)
7,AMAP
8,AMT Asset Management Software
9,ANCC Certified


In [14]:
# Processing esco dataset

# Read the ESCO skills hierarchy from the mounted Drive folder.
esco = pd.read_csv('/content/drive/MyDrive/DS2/skillsHierarchy_en.csv')

# Keep only the rows whose Level 0 term is 'skills'.
esco = esco[esco['Level 0 preferred term'].isin(['skills'])]

# Discard the URI and free-text columns; only the Level 1-3 preferred terms
# are used below.
esco = esco.drop(
    columns=[
        'Level 0 URI',
        'Level 0 preferred term',
        'Level 1 URI',
        'Level 2 URI',
        'Level 3 URI',
        'Description',
        'Scope note',
    ]
)

# Preferred-term column for each hierarchy level.
esco_l1 = esco['Level 1 preferred term']
esco_l2 = esco['Level 2 preferred term']
esco_l3 = esco['Level 3 preferred term']

# Distinct terms per level, in order of first appearance.
l1_skills, l2_skills, l3_skills = (
    level_terms.unique() for level_terms in (esco_l1, esco_l2, esco_l3)
)

# Stack the three levels end-to-end: Level 1 terms first, then 2, then 3.
all_skills = np.concatenate([l1_skills, l2_skills, l3_skills], axis=0)
esco_skills = np.array(all_skills)

In [15]:
# Wrap the combined ESCO terms in a one-column DataFrame.
df_esco = pd.DataFrame(data=esco_skills, columns=['skill'])

# Turn empty strings into NaN so dropna() can remove them. The result is
# assigned back rather than using replace(..., inplace=True) on the column
# selection: that form is chained assignment through an inplace method, which
# warns on recent pandas and does not modify the frame under Copy-on-Write.
df_esco['skill'] = df_esco['skill'].replace('', np.nan)

# Drop missing entries and renumber rows 0..n-1 so that row labels coincide
# with row positions.
df_esco = df_esco.dropna(subset=['skill']).reset_index(drop=True)

display(df_esco)

Unnamed: 0,skill
0,"communication, collaboration and creativity"
1,information skills
2,assisting and caring
3,management skills
4,working with computers
...,...
377,maintaining mechanical equipment
378,"installing, maintaining and repairing electric..."
379,"installing and repairing electrical, electroni..."
380,"maintaining electrical, electronic and precisi..."


# Method 2: BERT embeddings and Clustering

In [7]:
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.7 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 16.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 61.4 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 62.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 56.8 MB/s 
Building wheels for collected pa

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [16]:
# Embed every ESCO skill in a single encode() call so the model can process
# the sentences in batches instead of running one forward pass per string.
# The result is kept as a list of per-skill vectors (one entry per row of
# df_esco, in row order), the same structure the per-item loop produced.
esco_embeddings = list(model.encode(df_esco['skill'].tolist()))

In [19]:
# Stack the per-skill vectors into one 2-D array (one row per skill) and
# show its dimensions as the cell output.
esco_embeddings = np.array(esco_embeddings)
np.shape(esco_embeddings)

(382, 768)

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

In [50]:
# Spot-check the matching on one job skill: the row labelled 29 in df_dataset.
test_job = model.encode(df_dataset.loc[29, 'skill'])

# cosine_similarity expects 2-D inputs, so turn the single vector into a
# one-row matrix. -1 lets NumPy infer the embedding width instead of
# hard-coding 768, so the cell keeps working if the encoder is swapped.
test_job = test_job.reshape(1, -1)

# Position of the ESCO embedding most similar to the job skill.
matched = np.argmax(cosine_similarity(test_job, esco_embeddings))
matched

241

In [51]:
# Show the ESCO skill text stored under row label `matched`.
df_esco.loc[matched, 'skill']

'executing financial transactions'