# Import Libraries

In [1]:
# import all the required packages
import sys
import os
# change according to the status of GPU
# this command must before import pytorch
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # setting the GPUs

In [2]:
import pandas as pd
import numpy as np
import json, csv

import torch
from datasets import load_dataset, load_metric
from transformers import AutoModel, BertForSequenceClassification, BertTokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# check the running environment
print(sys.executable)
print(torch.cuda.is_available())
torch.cuda.device_count()

/home/user/jinqli/anaconda3/envs/new_env/bin/python
True


1

# Load jobads data

In [58]:
#data_path = '/srv/scratch2/jinq/taskontology/jobad_taskdata/task_sample-0.2.jsonl'
#jobads = pd.read_json(data_path,lines=True)
#print('numbers of records in job ads:', jobads.shape[0])
#jobtask = jobads[jobads['label']=='TASK']
#jobtask

gold_path = '/srv/scratch2/jinq/taskontology/jobad_taskdata/goldstandard-subspans-ActObj.jsonl'
goldCont_path = '/srv/scratch2/jinq/taskontology/jobad_taskdata/goldstandard-subspans-ActObjCont.jsonl'

gold_obj = pd.read_json(gold_path,lines=True)
gold_objCont = pd.read_json(goldCont_path,lines=True)

print('numbers of records in job ads:', gold_objCont.shape[0])
gold_obj

numbers of records in job ads: 4586


Unnamed: 0,id,meta,text,label
0,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Verpackungsarbeiten,ActObj
1,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Einrichten des eigenen Arbeitsplatzes,ActObj
2,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Kontrollarbeiten,ActObj
3,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Aussenarbeit,ActObj
4,sjmm-22011108302017,"{'id': 'sjmm-22011108302017', 'year': 2011, 'i...","Planung, Durchführung und Auswertung von hocha...",ActObj
...,...,...,...,...
4581,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",realisierst du herausfordernde Automatisierungen,ActObj
4582,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",Entwicklung einer Self-Service Plattform,ActObj
4583,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...","Weiterentwicklung, Optimierung und Erneuerung ...",ActObj
4584,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",Implementierung von Continuous Deployment Prin...,ActObj


# Import the model for inference

In [59]:

model_checkpoint = 'de_trained_models/m4_multijob_results/checkpoint-2000'

# import the model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, problem_type="multi_label_classification")


In [60]:
print(model.num_labels)

# Put model in evaluation mode to evaluate loss on the validation set
model.eval()

37


XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

In [62]:
# example
inputs = tokenizer(gold_obj['text'].tolist()[0], padding=True, truncation=True, return_tensors="pt")
print(inputs)

# encoded all the text of jobads
encoded_data = [tokenizer(item['text'], 
                              return_tensors="pt", padding=True, truncation=True) for _,item in gold_obj.iterrows()]

print(encoded_data[0])

{'input_ids': tensor([[     0, 230726,      7,  48798,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
{'input_ids': tensor([[     0, 230726,      7,  48798,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}


# Prediction

In [63]:
max_length = 64
def get_prediction(text, top_n: int=5):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    # perform inference to our model
    logits = model(**inputs).logits
    # get output probabilities by doing softmax
    probs = logits[0].softmax(0)
    
    #get the top_n candidates and corresponding prob as score
    value, indices = probs.topk(top_n, sorted=True)
    results = [(id_.item(),round(val.item(),4)) for val,id_ in zip(value, indices)]
    results = [j for item in results for j in item ]
    return results

def apply_classify_on_df(df):
    """
    Apply a function and return multiple values so that you can create multiple columns, return a pd.Series with the values instead:
    Source: https://queirozf.com/entries/pandas-dataframes-apply-examples
    """
    df[['la1', 'la1score', 'la2', 'la2score', 'la3', 'la3score', 'la4', 'la4score', 'la5', 'la5score']] = df.apply(lambda row: pd.Series(get_prediction(row.loc['text'])), axis=1)
    return df

In [64]:
# one example
get_prediction(gold_obj['text'].tolist()[0])

[13, 0.8237, 20, 0.0369, 23, 0.0279, 24, 0.0198, 7, 0.014]

In [65]:
predicted_gold = apply_classify_on_df(gold_obj)
predicted_gold.head()

Unnamed: 0,id,meta,text,label,la1,la1score,la2,la2score,la3,la3score,la4,la4score,la5,la5score
0,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Verpackungsarbeiten,ActObj,13.0,0.8237,20.0,0.0369,23.0,0.0279,24.0,0.0198,7.0,0.014
1,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Einrichten des eigenen Arbeitsplatzes,ActObj,24.0,0.4771,13.0,0.2779,28.0,0.0914,5.0,0.0418,1.0,0.0151
2,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Kontrollarbeiten,ActObj,19.0,0.4072,15.0,0.0987,12.0,0.0661,20.0,0.046,10.0,0.0453
3,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Aussenarbeit,ActObj,31.0,0.311,12.0,0.1819,4.0,0.0697,3.0,0.0696,25.0,0.0347
4,sjmm-22011108302017,"{'id': 'sjmm-22011108302017', 'year': 2011, 'i...","Planung, Durchführung und Auswertung von hocha...",ActObj,15.0,0.5256,0.0,0.1335,9.0,0.0554,5.0,0.0383,33.0,0.0346


In [66]:
predicted_goldCon = apply_classify_on_df(gold_objCont)
predicted_goldCon.head()

Unnamed: 0,id,meta,text,label,la1,la1score,la2,la2score,la3,la3score,la4,la4score,la5,la5score
0,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Verpackungsarbeiten allgemein und in Medizinal...,ActObjCont,13.0,0.6766,24.0,0.0973,20.0,0.0763,28.0,0.0316,23.0,0.0238
1,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Einrichten des eigenen Arbeitsplatzes,ActObjCont,24.0,0.4771,13.0,0.2779,28.0,0.0914,5.0,0.0418,1.0,0.0151
2,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Kontrollarbeiten,ActObjCont,19.0,0.4072,15.0,0.0987,12.0,0.0661,20.0,0.046,10.0,0.0453
3,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Aussenarbeit,ActObjCont,31.0,0.311,12.0,0.1819,4.0,0.0697,3.0,0.0696,25.0,0.0347
4,sjmm-22011108302017,"{'id': 'sjmm-22011108302017', 'year': 2011, 'i...","Planung, Durchführung und Auswertung von hocha...",ActObjCont,15.0,0.4119,0.0,0.1864,11.0,0.0868,9.0,0.0567,33.0,0.0438


## Inverse GWA class to title

In [67]:
de_path = '/srv/scratch2/jinq/taskontology/task_to_GWA_IWA_DWA_DE.csv'
dataset = load_dataset('csv', data_files=de_path,split='train')
# encoding labels
y_encoded = LabelEncoder().fit(dataset['GWA Title'])

Using custom data configuration default-2bd0bdea745cdb59
Found cached dataset csv (/home/user/jinqli/.cache/huggingface/datasets/csv/default-2bd0bdea745cdb59/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


In [68]:
predicted_gold['pred_gwa1'] = y_encoded.inverse_transform(predicted_gold['la1'].astype(int)).tolist()
predicted_gold['pred_gwa2'] = y_encoded.inverse_transform(predicted_gold['la2'].astype(int)).tolist()
predicted_gold['pred_gwa3'] = y_encoded.inverse_transform(predicted_gold['la3'].astype(int)).tolist()
predicted_gold['pred_gwa4'] = y_encoded.inverse_transform(predicted_gold['la4'].astype(int)).tolist()
predicted_gold['pred_gwa5'] = y_encoded.inverse_transform(predicted_gold['la5'].astype(int)).tolist()

predicted_gold

Unnamed: 0,id,meta,text,label,la1,la1score,la2,la2score,la3,la3score,la4,la4score,la5,la5score,pred_gwa1,pred_gwa2,pred_gwa3,pred_gwa4,pred_gwa5
0,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Verpackungsarbeiten,ActObj,13.0,0.8237,20.0,0.0369,23.0,0.0279,24.0,0.0198,7.0,0.0140,Handling and Moving Objects,Monitoring and Controlling Resources,Performing Administrative Activities,Performing General Physical Activities,Documenting/Recording Information
1,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Einrichten des eigenen Arbeitsplatzes,ActObj,24.0,0.4771,13.0,0.2779,28.0,0.0914,5.0,0.0418,1.0,0.0151,Performing General Physical Activities,Handling and Moving Objects,Repairing and Maintaining Mechanical Equipment,Controlling Machines and Processes,Assisting and Caring for Others
2,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Kontrollarbeiten,ActObj,19.0,0.4072,15.0,0.0987,12.0,0.0661,20.0,0.0460,10.0,0.0453,"Monitoring Processes, Materials, or Surroundings","Inspecting Equipment, Structures, or Materials","Guiding, Directing, and Motivating Subordinates",Monitoring and Controlling Resources,Evaluating Information to Determine Compliance...
3,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Aussenarbeit,ActObj,31.0,0.3110,12.0,0.1819,4.0,0.0697,3.0,0.0696,25.0,0.0347,Selling or Influencing Others,"Guiding, Directing, and Motivating Subordinates","Communicating with Supervisors, Peers, or Subo...",Communicating with People Outside the Organiza...,Performing for or Working Directly with the Pu...
4,sjmm-22011108302017,"{'id': 'sjmm-22011108302017', 'year': 2011, 'i...","Planung, Durchführung und Auswertung von hocha...",ActObj,15.0,0.5256,0.0,0.1335,9.0,0.0554,5.0,0.0383,33.0,0.0346,"Inspecting Equipment, Structures, or Materials",Analyzing Data or Information,Estimating the Quantifiable Characteristics of...,Controlling Machines and Processes,Thinking Creatively
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4581,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",realisierst du herausfordernde Automatisierungen,ActObj,33.0,0.8509,36.0,0.0399,6.0,0.0310,18.0,0.0174,13.0,0.0085,Thinking Creatively,Working with Computers,Developing Objectives and Strategies,Making Decisions and Solving Problems,Handling and Moving Objects
4582,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",Entwicklung einer Self-Service Plattform,ActObj,6.0,0.4588,33.0,0.4271,36.0,0.0221,18.0,0.0162,7.0,0.0083,Developing Objectives and Strategies,Thinking Creatively,Working with Computers,Making Decisions and Solving Problems,Documenting/Recording Information
4583,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...","Weiterentwicklung, Optimierung und Erneuerung ...",ActObj,6.0,0.5810,33.0,0.1836,18.0,0.0567,36.0,0.0520,0.0,0.0175,Developing Objectives and Strategies,Thinking Creatively,Making Decisions and Solving Problems,Working with Computers,Analyzing Data or Information
4584,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",Implementierung von Continuous Deployment Prin...,ActObj,6.0,0.6661,33.0,0.0787,18.0,0.0571,36.0,0.0332,0.0,0.0211,Developing Objectives and Strategies,Thinking Creatively,Making Decisions and Solving Problems,Working with Computers,Analyzing Data or Information


In [71]:
# save the predicted file
outfile = '/srv/scratch2/jinq/taskontology/jobad_taskdata/predicted_gold/predicted_gold.jsonl'
predicted_gold.to_json(outfile,force_ascii=False, orient="records", lines=True)

## predicted_goldCon

In [70]:

predicted_goldCon['pred_gwa1'] = y_encoded.inverse_transform(predicted_goldCon['la1'].astype(int)).tolist()
predicted_goldCon['pred_gwa2'] = y_encoded.inverse_transform(predicted_goldCon['la2'].astype(int)).tolist()
predicted_goldCon['pred_gwa3'] = y_encoded.inverse_transform(predicted_goldCon['la3'].astype(int)).tolist()
predicted_goldCon['pred_gwa4'] = y_encoded.inverse_transform(predicted_goldCon['la4'].astype(int)).tolist()
predicted_goldCon['pred_gwa5'] = y_encoded.inverse_transform(predicted_goldCon['la5'].astype(int)).tolist()

# save the preprocessed file
out_goldCon = '/srv/scratch2/jinq/taskontology/jobad_taskdata/predicted_gold/predicted_goldCon.jsonl'
predicted_goldCon.to_json(out_goldCon, force_ascii=False, orient="records", lines=True)

predicted_goldCon.head()

Unnamed: 0,id,meta,text,label,la1,la1score,la2,la2score,la3,la3score,la4,la4score,la5,la5score,pred_gwa1,pred_gwa2,pred_gwa3,pred_gwa4,pred_gwa5
0,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Verpackungsarbeiten allgemein und in Medizinal...,ActObjCont,13.0,0.6766,24.0,0.0973,20.0,0.0763,28.0,0.0316,23.0,0.0238,Handling and Moving Objects,Performing General Physical Activities,Monitoring and Controlling Resources,Repairing and Maintaining Mechanical Equipment,Performing Administrative Activities
1,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Einrichten des eigenen Arbeitsplatzes,ActObjCont,24.0,0.4771,13.0,0.2779,28.0,0.0914,5.0,0.0418,1.0,0.0151,Performing General Physical Activities,Handling and Moving Objects,Repairing and Maintaining Mechanical Equipment,Controlling Machines and Processes,Assisting and Caring for Others
2,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Kontrollarbeiten,ActObjCont,19.0,0.4072,15.0,0.0987,12.0,0.0661,20.0,0.046,10.0,0.0453,"Monitoring Processes, Materials, or Surroundings","Inspecting Equipment, Structures, or Materials","Guiding, Directing, and Motivating Subordinates",Monitoring and Controlling Resources,Evaluating Information to Determine Compliance...
3,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Aussenarbeit,ActObjCont,31.0,0.311,12.0,0.1819,4.0,0.0697,3.0,0.0696,25.0,0.0347,Selling or Influencing Others,"Guiding, Directing, and Motivating Subordinates","Communicating with Supervisors, Peers, or Subo...",Communicating with People Outside the Organiza...,Performing for or Working Directly with the Pu...
4,sjmm-22011108302017,"{'id': 'sjmm-22011108302017', 'year': 2011, 'i...","Planung, Durchführung und Auswertung von hocha...",ActObjCont,15.0,0.4119,0.0,0.1864,11.0,0.0868,9.0,0.0567,33.0,0.0438,"Inspecting Equipment, Structures, or Materials",Analyzing Data or Information,Getting Information,Estimating the Quantifiable Characteristics of...,Thinking Creatively


In [32]:
path_gold = '/srv/scratch2/jinq/taskontology/jobad_taskdata/predicted_gold/predicted_gold.jsonl'
path_goldCon = '/srv/scratch2/jinq/taskontology/jobad_taskdata/predicted_gold/predicted_goldCon.jsonl'


In [72]:
gold = pd.read_json(path_gold,lines=True)
gold.head()

Unnamed: 0,id,meta,text,label,la1,la1score,la2,la2score,la3,la3score,la4,la4score,la5,la5score,pred_gwa1,pred_gwa2,pred_gwa3,pred_gwa4,pred_gwa5
0,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Verpackungsarbeiten,ActObj,13,0.8237,20,0.0369,23,0.0279,24,0.0198,7,0.014,Handling and Moving Objects,Monitoring and Controlling Resources,Performing Administrative Activities,Performing General Physical Activities,Documenting/Recording Information
1,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Einrichten des eigenen Arbeitsplatzes,ActObj,24,0.4771,13,0.2779,28,0.0914,5,0.0418,1,0.0151,Performing General Physical Activities,Handling and Moving Objects,Repairing and Maintaining Mechanical Equipment,Controlling Machines and Processes,Assisting and Caring for Others
2,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Kontrollarbeiten,ActObj,19,0.4072,15,0.0987,12,0.0661,20,0.046,10,0.0453,"Monitoring Processes, Materials, or Surroundings","Inspecting Equipment, Structures, or Materials","Guiding, Directing, and Motivating Subordinates",Monitoring and Controlling Resources,Evaluating Information to Determine Compliance...
3,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Aussenarbeit,ActObj,31,0.311,12,0.1819,4,0.0697,3,0.0696,25,0.0347,Selling or Influencing Others,"Guiding, Directing, and Motivating Subordinates","Communicating with Supervisors, Peers, or Subo...",Communicating with People Outside the Organiza...,Performing for or Working Directly with the Pu...
4,sjmm-22011108302017,"{'id': 'sjmm-22011108302017', 'year': 2011, 'i...","Planung, Durchführung und Auswertung von hocha...",ActObj,15,0.5256,0,0.1335,9,0.0554,5,0.0383,33,0.0346,"Inspecting Equipment, Structures, or Materials",Analyzing Data or Information,Estimating the Quantifiable Characteristics of...,Controlling Machines and Processes,Thinking Creatively


In [None]:
goldCon = pd.read_json(path_goldCon,lines=True)
goldCon.head()