<a href="https://colab.research.google.com/github/JYP97/DS2_Proj_Jobs_skills_analysis/blob/master/BERT_BACKEND_TEST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive so that the model checkpoint and CSV files stored there
# are reachable from the Colab filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Dependencies
!pip install transformers
!pip install -U sentence-transformers
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import numpy as np
from itertools import chain
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us

In [81]:
# Phase 0: Load tokenizer & models

# Names / locations of every artefact loaded by this cell, gathered in one
# place so they are easy to find and change.
BERT_BASE_NAME = 'bert-base-uncased'
FINE_TUNED_MODEL_DIR = "/content/drive/MyDrive/DataScience/DS2/acc_models_0.51_0.49_warmup100_2e-5_666"
SBERT_MODEL_NAME = 'bert-base-nli-mean-tokens'
JOB_DATASET_CSV = '/content/drive/MyDrive/DS2/clean_dataset_1794.csv'
JOB_EMBEDDINGS_CSV = '/content/drive/MyDrive/DS2/job_emb.csv'
NUM_JOB_CATEGORIES = 8  # size of the classification head requested below

# Tokenizer of the base BERT model (lower-cases its input).
tokenizer = BertTokenizer.from_pretrained(BERT_BASE_NAME, do_lower_case=True)
print("=== BERT base tokenizer loaded. ===")

# Sequence classifier restored from the checkpoint directory on Drive.
fine_tuned_model = BertForSequenceClassification.from_pretrained(
    FINE_TUNED_MODEL_DIR,
    num_labels=NUM_JOB_CATEGORIES,
)
print("=== Fine-tuned model loaded. ===")

# Sentence-BERT encoder used to embed free-text skills.
sbert_model = SentenceTransformer(SBERT_MODEL_NAME)
print("=== Sentence BERT loaded. ===")

# Job postings table; the first CSV column is used as the index.
job_df = pd.read_csv(JOB_DATASET_CSV, index_col=0)
print("=== Job dataset loaded. ===")

# Embeddings file has no header row; it is converted to a plain NumPy array.
job_emb_frame = pd.read_csv(JOB_EMBEDDINGS_CSV, header=None)
job_emb = job_emb_frame.to_numpy()
print("=== Pre-calculated job skill embeddings loaded. ===")

=== BERT base tokenizer loaded. ===
=== Fine-tuned model loaded. ===
=== Sentence BERT loaded. ===
=== Job dataset loaded. ===
=== Pre-calculated job skill embeddings loaded. ===


In [77]:
# Phase I: Map user input skills to job category

# User input
input_skills = input("\n\nPlease input your skills: ")

# Encode the input skills.
# Padding and truncation are requested explicitly: `pad_to_max_length` is
# deprecated in favour of `padding='max_length'`, and without
# `truncation=True` the tokenizer emits a warning before falling back to a
# default truncation strategy.
encodings = tokenizer.encode_plus(
    input_skills,
    add_special_tokens=True,
    max_length=512,              # Pad & truncate all sentences to this length.
    padding='max_length',
    truncation=True,
    return_attention_mask=True,  # Construct attn. masks.
    return_tensors='pt',
)

# Define labels; position i is paired with the model's i-th output score.
labels = ['Managers',
 'Professionals',
 'Service and sales workers',
 'Plant and machine operators and assemblers',
 'Craft and related trades workers',
 'Technicians and associate professionals',
 'Clerical support workers',
 'Elementary occupations']

# Switch to inference mode before predicting.
fine_tuned_model.eval()

# Run the fine-tuned model on the input skills and print the index of the
# label with the highest score.
with torch.no_grad():
    input_ids = encodings.input_ids
    attention_mask = encodings.attention_mask
    token_type_ids = encodings.token_type_ids
    # Keyword arguments keep the call correct regardless of the positional
    # order of the model's forward() parameters.
    output = fine_tuned_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
    )
    # Sigmoid is monotonic, so the arg-max below equals the arg-max of the raw
    # logits; the resulting per-label scores do not sum to 1.
    final_output = torch.sigmoid(output.logits).cpu().detach().numpy().tolist()
    # There is a single input sentence, so take row 0 explicitly instead of
    # converting a one-element array to a scalar.
    print(int(np.argmax(final_output, axis=1)[0]))

# Get output labels: flatten the single-row score list, pair each score with
# its label, and pick the label with the highest score.
probabilities = list(chain.from_iterable(final_output))
predictions = dict(zip(labels, probabilities))
pred_label = max(predictions, key=predictions.get)
pred_label



Please input your skills: c++


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


1


'Professionals'

In [80]:
# Phase II: Map input skills to job titles with the help of job category

# Method A: Use **cosine similarity** to find the most appropriate job titles for user's input

# Generate embeddings of user's input
input_embeddings = sbert_model.encode(input_skills)

# Compare the single input embedding against every pre-computed job embedding
# in one call. The ranking does not depend on any per-job loop variable, so it
# is computed once rather than once per row of `job_emb`.
similarities = cosine_similarity(input_embeddings.reshape(1, -1), job_emb)

# Row positions of `job_emb`, ordered from most to least similar
# (negating the scores turns the ascending argsort into a descending one).
matched = np.argsort(-similarities).reshape(-1,)


In [83]:
# Echo the query, then list the titles of the ten highest-ranked jobs.
display("Input skills: ", input_skills)

top_matches = matched[:10]
job_titles = job_df['title']
display(job_titles.iloc[top_matches])

'Input skills: '

'c++'

1647                         Powertrain Software Engineer
549                  Mixed-Signal IC Integration Engineer
935                                    Warehouse Operator
182                                       3D Modeler -CAD
1208                             Reading Specialist - ESY
1092    Senior Software Engineer - Customer Care Self-...
1249                                        Jr. Developer
673                                     Platform Engineer
1444                       Residential Direct Care Worker
516                                Part Time Receptionist
Name: title, dtype: object

### Appendix

```

# Encode the skill list of each job using SBERT (the 'method A' embeddings) and save the result to job_emb.csv
job_df = pd.read_csv('/content/drive/MyDrive/DS2/clean_dataset_1794.csv', index_col=0)
# category_df = job_df[job_df['job category'] == str(pred_label)]
# category_df.reset_index(drop=True, inplace=True)
job_emb = []
for idx, skill in enumerate(job_df['skills']):
    job_emb.append(sbert_model.encode(skill))

np.savetxt('/content/drive/MyDrive/DS2/job_emb.csv', job_emb, delimiter=',')

job_emb = pd.read_csv('/content/drive/MyDrive/DS2/job_emb.csv', header=None)

```

