<a href="https://colab.research.google.com/github/JYP97/DS2_Proj_Jobs_skills_analysis/blob/master/BERT_BACKEND_TEST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [151]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [152]:
# Dependencies
!pip install transformers
!pip install -U sentence-transformers
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import numpy as np
from itertools import chain
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import collections
import math

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [154]:
# Phrase 0: Load tokenizer & models

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
print("=== BERT base tokenizer loaded. ===")

fine_tuned_model = BertForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/DataScience/DS2/acc_models_0.51_0.49_warmup100_2e-5_666", 
    num_labels = 8
)
print("=== Fine-tuned model loaded. ===")

sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
print("=== Sentence BERT loaded. ===")

job_df = pd.read_csv('/content/drive/MyDrive/DS2/clean_dataset_1794.csv', index_col=0).reset_index(drop=True)
print("=== Job dataset loaded. ===")

job_emb = np.array(pd.read_csv('/content/drive/MyDrive/DS2/job_emb.csv', header=None).to_numpy())
print("=== Pre-calculated job skill embeddings loaded. ===")

=== BERT base tokenizer loaded. ===
=== Fine-tuned model loaded. ===
=== Sentence BERT loaded. ===
=== Job dataset loaded. ===
=== Pre-calculated job skill embeddings loaded. ===


In [287]:
from pandas.io.formats.style_render import defaultdict
# Phrase I: Map user input skills to job category

# User input
# Method A
# input_skills = input("\n\nPlease input your skills: ") # only one string in this list

# Method B
rank = 1
# input_skills for outputing category
input_skills = [] # if there are n skills, n strings will be in this list
weights = []
weight_exp = 7
# input_dict for percise matching
input_dict = defaultdict(list) # key: weight, value: input_skill
print("Please input your skills with the order of proficiency, enter q to quit: ")

# Input loop
while(True):
    print(f"Your {rank}. proficient skill: ")
    input_value = input()
    if input_value == 'q':
        print("Quit")
        break
    input_skills.append(input_value)
    weights.append(math.exp(weight_exp))
    rank += 1
    weight_exp -= 0.4
    if weight_exp <= 0:
        print("Exceeds input limit, quit...")
        break

# Generate a defaultdict with type of list
for key, value in zip(weights, input_skills):
    input_dict[key].append(value)


Please input your skills with the order of proficiency, enter q to quit: 
Your 1. proficient skill: 
manager
Your 2. proficient skill: 
group management 
Your 3. proficient skill: 
communication
Your 4. proficient skill: 
leadership
Your 5. proficient skill: 
q
Quit


In [288]:
input_skills

['manager', 'group management ', 'communication', 'leadership']

In [289]:
input_skills_join = (', '.join(input_skills))
input_skills_join

'manager, group management , communication, leadership'

In [290]:
# Encoding input skills

encodings = tokenizer.encode_plus(
    input_skills_join,
    # None,
    add_special_tokens=True,
    max_length = 512,           # Pad & truncate all sentences.
    pad_to_max_length = True,
    # truncation=True,
    return_attention_mask = True,   # Construct attn. masks.
    return_tensors = 'pt',
)

# Define labels
labels = ['Managers',
 'Professionals',
 'Service and sales workers',
 'Plant and machine operators and assemblers',
 'Craft and related trades workers',
 'Technicians and associate professionals',
 'Clerical support workers',
 'Elementary occupations']

fine_tuned_model.eval()

# Evaluate fine-tuned model with input skill and output label index with the highest possibility 
with torch.no_grad():
    input_ids=encodings.input_ids
    attention_mask=encodings.attention_mask
    token_type_ids=encodings.token_type_ids
    output=fine_tuned_model(input_ids, attention_mask, token_type_ids)
    final_output = torch.sigmoid(output.logits).cpu().detach().numpy().tolist()
    print(int(np.argmax(final_output, axis=1)))

# Get output labels
probabilities=list(chain.from_iterable(final_output))
predictions = dict(zip(labels,probabilities))
pred_label = max(predictions, key=predictions.get)
pred_label

0


'Managers'

In [None]:
# Phase II: Map input skills to job titles with the help of job category

# Method A: Use **cosine similarity** to find the most approprate job titles for user's input

input_embeddings = sbert_model.encode(input_skills)
idx = list(job_df[job_df['job category'] == pred_label].index)
cos = cosine_similarity(input_embeddings.reshape(1, -1), job_emb[idx]).reshape(-1,)
match_dict = collections.defaultdict(list)

for key, value in zip(idx, cos):
    match_dict[key].append(value)

sorted_match_dict = sorted(match_dict.items(), key=lambda x:x[1], reverse=True)

print("Input skills: ", input_skills)
job_list = []
for idx in sorted_match_dict[:10]:
    print('Recommended job: ', job_df['title'][idx[0]], '\n ')
    job_list.append(job_df['title'][idx[0]])
job_list

In [291]:
# Phase II: Map input skills to job titles with the help of job category

# Method B: Use **skill proficiency** to find job title

item_skill = []
item_weight = []
for i in range(len(list(input_dict.items()))):
    item_skill.append(list(input_dict.items())[i][1])
    item_weight.append(list(input_dict.items())[i][0])
item_weight

input_embeddings = sbert_model.encode(input_skills) # (# of input skills, 768)
jobs_idx = list(job_df[job_df['job category'] == pred_label].index)
# cos = []
# job_cos_all = []
# for i_skill, skill in enumerate(input_embeddings):
#     for i_job, job in enumerate(job_emb[jobs_idx]):
#         job_cos = cosine_similarity(skill.reshape(1, -1), job_emb[jobs_idx][i_job].reshape(1, -1))
#         cos.append(job_cos * item_weight[i_skill])
#     job_cos_all.append(cos)
confidence = {}
for idx in jobs_idx:
    cos_sum = 0
    for i_skill, skill in enumerate(input_embeddings):
        cos = cosine_similarity(skill.reshape(1, -1), job_emb[idx].reshape(1, -1)).reshape(-1,)
        cos_sum += cos * item_weight[i_skill]
    confidence[idx] = cos_sum


# sum = 0  
# for idx, emb_cos in enumerate(cos):
#     sum += emb_cos * item_weight[idx]
# print(sum)
# len(job_cos_all)



In [292]:
confidence = sorted(confidence.items(), key=lambda x:x[1], reverse=True)
confidence

[(59, array([2137.92050961])),
 (1616, array([1803.55581389])),
 (0, array([1688.73749307])),
 (1532, array([1662.66859652])),
 (120, array([1617.95906401])),
 (279, array([1609.16209083])),
 (694, array([1609.16209083])),
 (464, array([1587.7934107])),
 (1281, array([1566.54082574])),
 (1159, array([1560.94405095])),
 (1693, array([1548.68405256])),
 (586, array([1524.34344677])),
 (408, array([1496.40293606])),
 (230, array([1492.30408212])),
 (210, array([1484.77340912])),
 (1352, array([1462.16591677])),
 (619, array([1460.93498769])),
 (726, array([1454.34474631])),
 (920, array([1449.64546727])),
 (1078, array([1444.08181537])),
 (713, array([1435.22913599])),
 (700, array([1432.47097701])),
 (1644, array([1431.17430411])),
 (69, array([1424.15345824])),
 (1466, array([1422.94156949])),
 (449, array([1417.09910128])),
 (239, array([1410.68197995])),
 (1470, array([1394.64132791])),
 (352, array([1362.94796495])),
 (372, array([1360.83151728])),
 (631, array([1351.57638088])),
 (1

In [293]:
print(input_skills)
for item in confidence[0:10]:
    print(job_df['title'][item[0]])

['manager', 'group management ', 'communication', 'leadership']
Court Services Manager
Nurse Manager Neuro ICU
Head Start Teacher
Service Supervisor
Clinical Supervisor - $2,000 Sign On Bonus
Facilites Operations and Maintenance Manager
Quality Control Manager
Entry Level Business Management Trainee - New Jersey Clean Energy Campaign
Data Center Project Manager
Sr Program Manager


### Appendix

```

# Encoding skill list of each job using SBERT and add a new column 'method A embedding'and 
job_df = pd.read_csv('/content/drive/MyDrive/DS2/clean_dataset_1794.csv', index_col=0)
# category_df = job_df[job_df['job category'] == str(pred_label)]
# category_df.reset_index(drop=True, inplace=True)
job_emb = []
for idx, skill in enumerate(job_df['skills']):
    job_emb.append(sbert_model.encode(skill))

np.savetxt('/content/drive/MyDrive/DS2/job_emb.csv', job_emb, delimiter=',')

job_emb = pd.read_csv('/content/drive/MyDrive/DS2/job_emb.csv', header=None)

```

