<a href="https://colab.research.google.com/github/JYP97/DS2_Proj_Jobs_skills_analysis/blob/master/BERT_BACKEND_TEST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
from google.colab import drive

# Mount Google Drive so the model checkpoint and dataset paths used by later
# cells resolve under this directory.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
!pip install transformers
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import numpy as np

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [45]:
# Load the tokenizer and the fine-tuned sequence classifier.
# The tokenizer is taken from the stock uncased BERT checkpoint, while the
# classifier is read from the fine-tuned checkpoint directory on Drive.
BASE_CHECKPOINT = 'bert-base-uncased'
FINE_TUNED_DIR = "/content/drive/MyDrive/DataScience/DS2/acc_models_0.51_0.49_warmup100_2e-5_666"

tokenizer = BertTokenizer.from_pretrained(BASE_CHECKPOINT, do_lower_case=True)
print("=== BERT tokenizer loaded. ===")

# num_labels=8 lines up with the eight job categories listed in `labels`.
fine_tuned_model = BertForSequenceClassification.from_pretrained(FINE_TUNED_DIR, num_labels=8)
print("=== Fine-tuned model loaded. ===")

=== BERT tokenizer loaded. ===
=== Fine-tuned model loaded. ===


In [146]:
# Let users input their skills and turn the input into tokens.
input_skills = input("\n\nPlease input your skills: ")
encodings = tokenizer.encode_plus(
    input_skills,
    add_special_tokens=True,        # add the tokenizer's special tokens
    max_length=512,                 # fixed sequence length for the model input
    padding='max_length',           # replaces the deprecated pad_to_max_length=True
    truncation=True,                # explicit, so over-long input is cut to max_length
    return_attention_mask=True,     # mask distinguishes real tokens from padding
    return_tensors='pt',            # PyTorch tensors, ready to feed to the model
)



Please input your skills: education, math, python, leadership




In [147]:
# Job-category names. Order matters: a later cell zips this list positionally
# with the model's per-class scores, so position i names output i.
labels = [
    'Managers',
    'Professionals',
    'Service and sales workers',
    'Plant and machine operators and assemblers',
    'Craft and related trades workers',
    'Technicians and associate professionals',
    'Clerical support workers',
    'Elementary occupations',
]

In [148]:
# Run the fine-tuned classifier on the encoded input and score each label.
fine_tuned_model.eval()  # switch to evaluation mode before inference
with torch.no_grad():    # no gradients are needed for prediction
    # Pass tensors by keyword so the call cannot silently break if the
    # positional order of the model's forward() arguments is misread.
    output = fine_tuned_model(
        input_ids=encodings.input_ids,
        attention_mask=encodings.attention_mask,
        token_type_ids=encodings.token_type_ids,
    )
    # Sigmoid maps each logit into (0, 1); the result is a nested list with one
    # inner list of per-label scores per input row.
    final_output = torch.sigmoid(output.logits).cpu().tolist()
    # argmax over axis=1 returns a 1-element array for the single input row;
    # index it explicitly, because int() on a non-0-d array is deprecated in
    # recent NumPy releases.
    print(int(np.argmax(final_output, axis=1)[0]))

1


In [149]:
# Inspect the per-label sigmoid scores (nested list: one inner list per input row).
final_output

[[0.7831828594207764,
  0.974862277507782,
  0.6745386123657227,
  0.49678298830986023,
  0.7063820958137512,
  0.14812183380126953,
  0.07069255411624908,
  0.07306355983018875]]

In [150]:
from itertools import chain

# Flatten the nested score list into one flat list of per-label scores.
probabilities = list(chain(*final_output))
# Pair every category name with its score (positional match with `labels`).
predictions = {name: score for name, score in zip(labels, probabilities)}
# The predicted category is the name carrying the highest score.
pred_label = max(predictions, key=lambda name: predictions[name])
pred_label

'Professionals'

In [151]:
import pandas as pd

# Load the cleaned job postings and keep only the rows whose 'job category'
# equals the predicted label.
# NOTE(review): this path sits under MyDrive/DS2, while the model checkpoint is
# loaded from MyDrive/DataScience/DS2 -- confirm both locations are intended.
DATASET_CSV = '/content/drive/MyDrive/DS2/clean_dataset_1794.csv'
job_df = pd.read_csv(DATASET_CSV)
in_predicted_category = job_df['job category'].eq(str(pred_label))
category_df = job_df.loc[in_predicted_category]
display(category_df)

Unnamed: 0.1,Unnamed: 0,salary,title,job category,skills,labels
1,1,$19.00 - $26.00 / hour,Teacher of English for Online Groups,Professionals,"Vocabularies, Grammars, Teaching, Lesson Plann...",1
2,2,"$106,250.00 - $125,000.00 / year",CRM / PHP Developer,Professionals,"PHP (Scripting Language), Debugging, Web Servi...",1
5,5,"$65,000.00 - $120,000.00 / year",Financial advisor/ Insurance Agent,Professionals,"Self Motivation, Positive Attitude, Attention ...",1
7,7,"$85,000.00 - $146,100.00 / year",Network Security Engineer,Professionals,"Certified Information Security Manager, EC Cou...",1
13,13,$20.00 - $25.00 / hour,Business Analyst,Professionals,"Attention To Detail, Positive Attitude, Schedu...",1
...,...,...,...,...,...,...
1780,1786,,Predictive Modeling Analyst IV,Professionals,"Scheduling, Data Mining, Predictive Analytics,...",1
1785,1791,,Sr Reimbursement Analyst,Professionals,"Accounting, Auditing, Finance, Chargemaster, I...",1
1788,1794,,Peer Recovery Facilitator-DAS,Professionals,"Mental Health, Substance Abuse",1
1791,1797,,Human Resources Recruiter ( REMOTE ),Professionals,"Recruitment, Complex Problem Solving, Leadersh...",1


In [141]:
# SBERT for similarity matching
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [152]:
# Generate the embedding of the user's input (a single string -> one vector).
input_embeddings = sbert_model.encode(input_skills)
# Generate embeddings of the skills of each job in the predicted category.
# All skill strings are passed in one call so the model can batch them,
# instead of invoking encode() once per row in a Python loop; row order is
# preserved, so row i still corresponds to category_df.iloc[i].
job_embeddings = np.asarray(sbert_model.encode(category_df['skills'].tolist()))
print(input_embeddings, '\n')
print(job_embeddings)

[-3.78561854e-01  7.74539411e-01  1.29693890e+00  1.72434166e-01
  7.28741614e-03  1.93213716e-01  5.10211103e-02  1.33779037e+00
 -3.86799514e-01  9.50074643e-02 -5.65313578e-01  6.56979918e-01
  3.83174509e-01  3.59270036e-01  3.16011570e-02  2.17214391e-01
 -4.61483806e-01 -7.47673810e-01  2.77953446e-01 -1.34245694e-01
 -4.27345306e-01 -1.11211747e-01 -1.52017191e-01 -4.59112704e-01
  3.25013012e-01  6.80829167e-01 -1.41028449e-01 -2.00653523e-01
 -1.11735988e+00 -2.65522122e-01 -8.43747437e-01 -9.51119542e-01
  1.36349142e+00 -1.76714465e-01  4.22209054e-01 -2.47856319e-01
  1.73575461e-01  9.70276177e-01  1.24686416e-02 -8.07807222e-02
  6.39228821e-01 -3.75214994e-01  4.73931104e-01  2.06603799e-02
 -1.00926173e+00 -2.52210408e-01 -4.99444485e-01  2.07805157e-01
 -6.27654433e-01 -5.98822117e-01 -1.25649047e+00  6.81016386e-01
  5.37857890e-01 -4.19182897e-01 -5.46030402e-01  9.68302488e-02
  1.58406436e+00 -1.32852495e+00  3.26640189e-01  1.72453672e-01
 -8.36882472e-01  6.15896

In [153]:
# Sanity check: one embedding row per job in the predicted category.
print(job_embeddings.shape)

(503, 768)


In [154]:
from sklearn.metrics.pairwise import cosine_similarity

# Rank every job by cosine similarity to the user's input, most similar first.
# The similarity matrix has a single row (one query against all jobs), so it is
# computed once; the previous per-job loop recomputed the identical ranking on
# every iteration and never used its loop index.
similarities = cosine_similarity(input_embeddings.reshape(1, -1), job_embeddings)
# Negate so that argsort's ascending order yields descending similarity, then
# flatten to a 1-D array of positional indices into category_df.
matched = np.argsort(-similarities).reshape(-1,)

In [155]:
# Echo the query, then list the titles of the ten best-matching jobs.
# `matched` holds positional indices, hence .iloc rather than .loc.
display("Input: ", input_skills)
top_ten_positions = matched[:10]
top_ten_titles = category_df['title'].iloc[top_ten_positions]
display(top_ten_titles)

'Input: '

'education, math, python, leadership'

577                                     Data Scientist II
1379                               Backend Java Developer
789     Software Developer to Salesforce: Paid Trainin...
1119                         Quantitative Finance Analyst
912     Infant and Toddler Teacher – Early Childhood E...
1440                                          Co-Op - COL
410                DevOps Engineer | $120k | Richmond, VA
1087    Senior Software Engineer - Customer Care Self-...
510                             Certification Coordinator
2                                     CRM / PHP Developer
Name: title, dtype: object