# Import Libraries

In [13]:
# import all the required packages
import sys
import os
# change according to the status of GPU
# this command must before import pytorch
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # setting the GPUs

In [14]:
import pandas as pd
import numpy as np
import json, csv

import torch
from datasets import load_dataset, load_metric
from transformers import AutoModel, BertForSequenceClassification, BertTokenizer
from sklearn.preprocessing import LabelEncoder

In [15]:
# check the running environment
print(sys.executable)
print(torch.cuda.is_available())
torch.cuda.device_count()

/home/user/jinqli/anaconda3/envs/new_env/bin/python
True


1

# Load jobads data

In [16]:
#data_path = '/srv/scratch2/jinq/taskontology/jobad_taskdata/task_sample-0.2.jsonl'
#jobads = pd.read_json(data_path,lines=True)
#print('numbers of records in job ads:', jobads.shape[0])
#jobtask = jobads[jobads['label']=='TASK']
#jobtask

gold_path = '/srv/scratch2/jinq/taskontology/jobad_taskdata/goldstandard-subspans-ActObj.jsonl'
goldCont_path = '/srv/scratch2/jinq/taskontology/jobad_taskdata/goldstandard-subspans-ActObjCont.jsonl'

gold_obj = pd.read_json(gold_path,lines=True)
gold_objCont = pd.read_json(goldCont_path,lines=True)

print('numbers of records in job ads:', gold_objCont.shape[0])
gold_obj

numbers of records in job ads: 4586


Unnamed: 0,id,meta,text,label
0,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Verpackungsarbeiten,ActObj
1,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Einrichten des eigenen Arbeitsplatzes,ActObj
2,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Kontrollarbeiten,ActObj
3,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Aussenarbeit,ActObj
4,sjmm-22011108302017,"{'id': 'sjmm-22011108302017', 'year': 2011, 'i...","Planung, Durchführung und Auswertung von hocha...",ActObj
...,...,...,...,...
4581,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",realisierst du herausfordernde Automatisierungen,ActObj
4582,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",Entwicklung einer Self-Service Plattform,ActObj
4583,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...","Weiterentwicklung, Optimierung und Erneuerung ...",ActObj
4584,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",Implementierung von Continuous Deployment Prin...,ActObj


# Import the model for inference

In [17]:

model_checkpoint = 'trained_models/job_results/checkpoint-2500/'

# import the tokenizer 
SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'
TOKENIZER_CONFIG_FILE = 'tokenizer_config.json'

# import the model
model = BertForSequenceClassification.from_pretrained(model_checkpoint, problem_type="multi_label_classification")
tokenizer = BertTokenizer.from_pretrained(model_checkpoint)

In [18]:
print(model.num_labels)

# Put model in evaluation mode to evaluate loss on the validation set
model.eval()

37


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31102, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [32]:
# example
inputs = tokenizer(gold_obj['text'].tolist()[0], padding=True, truncation=True, return_tensors="pt")
print(inputs)

# encoded all the text of jobads
encoded_data = [tokenizer(item['text'], 
                              return_tensors="pt", padding=True, truncation=True) for _,item in gold_obj.iterrows()]

print(encoded_data[0])

{'input_ids': tensor([[   101,  41357,  78560,  32346, 106172,    102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[   101,  41357,  78560,  32346, 106172,    102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}


# Prediction

In [19]:
max_length = 128
def get_prediction(text, top_n: int=5):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    # perform inference to our model
    logits = model(**inputs).logits
    # get output probabilities by doing softmax
    # probs = outputs[0].softmax(1)
    
    #get the top_n candidates and corresponding logits as score
    value, indices = logits.topk(top_n, 1,sorted=True)
    results = [(id_.item(),round(val.item(),4)) for val,id_ in zip(value[0], indices[0])]
    results = [j for item in results for j in item ]
    return results

def apply_classify_on_df(df):
    """
    Apply a function and return multiple values so that you can create multiple columns, return a pd.Series with the values instead:
    Source: https://queirozf.com/entries/pandas-dataframes-apply-examples
    """
    df[['la1', 'la1score', 'la2', 'la2score', 'la3', 'la3score', 'la4', 'la4score', 'la5', 'la5score']] = df.apply(lambda row: pd.Series(get_prediction(row.loc['text'])), axis=1)
    return df

In [20]:
# one example
get_prediction(gold_obj['text'].tolist()[0])

[13, 4.2459, 23, 1.9499, 5, 1.589, 1, 1.1303, 20, 0.9828]

In [None]:
predicted_gold = apply_classify_on_df(gold_obj)
predicted_gold.head()

Unnamed: 0,id,meta,text,label,la1,la1score,la2,la2score,la3,la3score,la4,la4score,la5,la5score
0,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Verpackungsarbeiten,ActObj,13.0,4.2459,23.0,1.9499,5.0,1.589,1.0,1.1303,20.0,0.9828
1,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Einrichten des eigenen Arbeitsplatzes,ActObj,12.0,3.7614,4.0,2.9967,1.0,2.2433,19.0,1.5339,34.0,1.3064
2,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Kontrollarbeiten,ActObj,5.0,4.5981,13.0,4.007,24.0,2.996,12.0,2.733,21.0,1.1866
3,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Aussenarbeit,ActObj,23.0,1.9933,25.0,1.924,4.0,1.7337,32.0,1.243,1.0,1.0442
4,sjmm-22011108302017,"{'id': 'sjmm-22011108302017', 'year': 2011, 'i...","Planung, Durchführung und Auswertung von hocha...",ActObj,17.0,5.0698,0.0,3.3261,11.0,2.0837,12.0,1.8981,19.0,1.5893


In [22]:
predicted_goldCon = apply_classify_on_df(gold_objCont)
predicted_goldCon.head()

Unnamed: 0,id,meta,text,label,la1,la1score,la2,la2score,la3,la3score,la4,la4score,la5,la5score
0,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Verpackungsarbeiten allgemein und in Medizinal...,ActObjCont,1.0,3.8359,23.0,2.3428,34.0,2.2222,19.0,1.2557,12.0,1.2552
1,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Einrichten des eigenen Arbeitsplatzes,ActObjCont,12.0,3.7614,4.0,2.9967,1.0,2.2433,19.0,1.5339,34.0,1.3064
2,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Kontrollarbeiten,ActObjCont,5.0,4.5981,13.0,4.007,24.0,2.996,12.0,2.733,21.0,1.1866
3,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Aussenarbeit,ActObjCont,23.0,1.9933,25.0,1.924,4.0,1.7337,32.0,1.243,1.0,1.0442
4,sjmm-22011108302017,"{'id': 'sjmm-22011108302017', 'year': 2011, 'i...","Planung, Durchführung und Auswertung von hocha...",ActObjCont,0.0,3.8098,17.0,3.6089,11.0,3.1391,15.0,1.6991,33.0,1.6983


## Inverse GWA class to title

In [27]:
de_path = '/srv/scratch2/jinq/taskontology/task_to_GWA_IWA_DWA_DE.csv'
dataset = load_dataset('csv', data_files=de_path,split='train')

# encoding labels
y_encoded = LabelEncoder().fit(dataset['GWA Title'])

gwa_titles = y_encoded.inverse_transform(predicted_gold['la1'].astype(int)).tolist()
print(len(gwa_titles))
predicted_gold['gwa_class'] = gwa_titles
predicted_gold

Using custom data configuration default-2bd0bdea745cdb59
Found cached dataset csv (/home/user/jinqli/.cache/huggingface/datasets/csv/default-2bd0bdea745cdb59/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


4586


Unnamed: 0,id,meta,text,label,la1,la1score,la2,la2score,la3,la3score,la4,la4score,la5,la5score,gwa_class
0,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Verpackungsarbeiten,ActObj,13.0,4.2459,23.0,1.9499,5.0,1.5890,1.0,1.1303,20.0,0.9828,Handling and Moving Objects
1,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Einrichten des eigenen Arbeitsplatzes,ActObj,12.0,3.7614,4.0,2.9967,1.0,2.2433,19.0,1.5339,34.0,1.3064,"Guiding, Directing, and Motivating Subordinates"
2,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Kontrollarbeiten,ActObj,5.0,4.5981,13.0,4.0070,24.0,2.9960,12.0,2.7330,21.0,1.1866,Controlling Machines and Processes
3,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Aussenarbeit,ActObj,23.0,1.9933,25.0,1.9240,4.0,1.7337,32.0,1.2430,1.0,1.0442,Performing Administrative Activities
4,sjmm-22011108302017,"{'id': 'sjmm-22011108302017', 'year': 2011, 'i...","Planung, Durchführung und Auswertung von hocha...",ActObj,17.0,5.0698,0.0,3.3261,11.0,2.0837,12.0,1.8981,19.0,1.5893,"Judging the Qualities of Objects, Services, or..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4581,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",realisierst du herausfordernde Automatisierungen,ActObj,20.0,2.8844,24.0,2.5318,13.0,2.0681,23.0,1.7760,28.0,1.5675,Monitoring and Controlling Resources
4582,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",Entwicklung einer Self-Service Plattform,ActObj,33.0,3.9628,6.0,3.7426,18.0,3.4174,22.0,1.7213,12.0,1.3430,Thinking Creatively
4583,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...","Weiterentwicklung, Optimierung und Erneuerung ...",ActObj,33.0,3.9743,6.0,3.4586,7.0,1.9572,12.0,1.7509,36.0,1.6522,Thinking Creatively
4584,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",Implementierung von Continuous Deployment Prin...,ActObj,33.0,3.5367,36.0,2.8986,12.0,1.7958,13.0,1.6560,6.0,1.4936,Thinking Creatively


In [29]:
# save the predicted file
outfile = '/srv/scratch2/jinq/taskontology/jobad_taskdata/predicted_gold/predicted_gold.csv'
predicted_gold.to_csv(outfile, header=True)

## predicted_goldCon

In [28]:
de_path = '/srv/scratch2/jinq/taskontology/task_to_GWA_IWA_DWA_DE.csv'
dataset = load_dataset('csv', data_files=de_path,split='train')

# encoding labels
y_encoded = LabelEncoder().fit(dataset['GWA Title'])

gwa_titles = y_encoded.inverse_transform(predicted_goldCon['la1'].astype(int)).tolist()
print(len(gwa_titles))
predicted_goldCon['gwa_class'] = gwa_titles
predicted_goldCon

Using custom data configuration default-2bd0bdea745cdb59
Found cached dataset csv (/home/user/jinqli/.cache/huggingface/datasets/csv/default-2bd0bdea745cdb59/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


4586


Unnamed: 0,id,meta,text,label,la1,la1score,la2,la2score,la3,la3score,la4,la4score,la5,la5score,gwa_class
0,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Verpackungsarbeiten allgemein und in Medizinal...,ActObjCont,1.0,3.8359,23.0,2.3428,34.0,2.2222,19.0,1.2557,12.0,1.2552,Assisting and Caring for Others
1,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Einrichten des eigenen Arbeitsplatzes,ActObjCont,12.0,3.7614,4.0,2.9967,1.0,2.2433,19.0,1.5339,34.0,1.3064,"Guiding, Directing, and Motivating Subordinates"
2,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Kontrollarbeiten,ActObjCont,5.0,4.5981,13.0,4.0070,24.0,2.9960,12.0,2.7330,21.0,1.1866,Controlling Machines and Processes
3,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Aussenarbeit,ActObjCont,23.0,1.9933,25.0,1.9240,4.0,1.7337,32.0,1.2430,1.0,1.0442,Performing Administrative Activities
4,sjmm-22011108302017,"{'id': 'sjmm-22011108302017', 'year': 2011, 'i...","Planung, Durchführung und Auswertung von hocha...",ActObjCont,0.0,3.8098,17.0,3.6089,11.0,3.1391,15.0,1.6991,33.0,1.6983,Analyzing Data or Information
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4581,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",realisierst du herausfordernde Automatisierung...,ActObjCont,20.0,4.0462,24.0,1.6639,7.0,1.6495,18.0,1.5068,1.0,1.2773,Monitoring and Controlling Resources
4582,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...","Entwicklung einer Self-Service Plattform mit, ...",ActObjCont,33.0,5.3207,6.0,3.1257,18.0,3.0168,12.0,1.6546,36.0,0.8809,Thinking Creatively
4583,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...","Weiterentwicklung, Optimierung und Erneuerung ...",ActObjCont,33.0,3.9743,6.0,3.4586,7.0,1.9572,12.0,1.7509,36.0,1.6522,Thinking Creatively
4584,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",Implementierung von Continuous Deployment Prin...,ActObjCont,33.0,3.5367,36.0,2.8986,12.0,1.7958,13.0,1.6560,6.0,1.4936,Thinking Creatively


In [30]:
# save the preprocessed file
outfile = '/srv/scratch2/jinq/taskontology/jobad_taskdata/predicted_gold/predicted_goldCon.csv'
predicted_goldCon.to_csv(outfile, header=True)