In [4]:
import sys
import os

import pandas as pd
import numpy as np
import json, csv

import torch
from datasets import load_dataset, load_metric
from transformers import AutoModel, AutoTokenizer, BertForSequenceClassification, XLMRobertaForSequenceClassification, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

from train import LitModel

In [5]:
gwa_labels=37
iwa_labels=332
dwa_labels=2085

MAX_LENGTH = 64
LABEL = 'GWA_deepl_de'

# files
path = '../switchdrive/thesis/data/task_to_ALL_DE.csv'
train_file = '../switchdrive/thesis/data/task_train.csv'
val_file = '../switchdrive/thesis/data/task_val.csv'
test_file = '../switchdrive/thesis/data/task_test.csv'

root_dir = '../switchdrive/thesis/'
models_dir = root_dir+'trained_models/deepl_de/'

checkpoint_gbert = models_dir + 'german_model.ckpt'
checkpoint_job = models_dir + 'job_model.ckpt'
checkpoint_multilingual = models_dir + 'multilingual_model.ckpt'
checkpoint_multi_job = models_dir + 'multi_job.ckpt'

data_df = pd.read_csv(path, index_col=0)
train_df = pd.read_csv(train_file, index_col=0)
test_df = pd.read_csv(test_file, index_col=0)
y_encoded = LabelEncoder().fit(train_df[LABEL])

CHECKPOINT = checkpoint_job

In [6]:
def get_prediction(model,tokenizer,text, top_n: int=5):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
    # perform inference to our model
    logits = model(**inputs).logits
    # get output probabilities by doing softmax
    # sigmoid for multi-label
    probs = logits[0].softmax(0)
    
    #get the top_n candidates and corresponding prob as score
    value, indices = probs.topk(top_n, sorted=True)
    # results = [(id_.item(),round(val.item(),4)) for val,id_ in zip(value[0], indices[0])]
    results = [(id_.item(),round(val.item(),4)) for val,id_ in zip(value, indices)]
    results = [j for item in results for j in item ]
    return results

def apply_classify_on_df(model,tokenizer,df):
    """
    Apply a function and return multiple values so that you can create multiple columns, return a pd.Series with the values instead:
    Source: https://queirozf.com/entries/pandas-dataframes-apply-examples
    """
    df[['pred_la1','la1score','pred_la2','la2score','pred_la3','la3score','pred_la4','la4score','pred_la5',
        'la5score']] = df.apply(lambda row: pd.Series(get_prediction(model,tokenizer,row.loc['text'])), axis=1)
    return df


## Loading the model from checkpoint

In [9]:

new_model = LitModel.load_from_checkpoint(checkpoint_path=CHECKPOINT)
tokenizer = new_model.tokenizer
new_model.eval()
print("MODEL reloaded from checkpoint: {0} !".format(CHECKPOINT))

Some weights of the model checkpoint at agne/jobGBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at agne/jobGBERT and are newly ini

MODEL reloaded from checkpoint: ../switchdrive/thesis/trained_models/deepl_de/job_model.ckpt !


## GoldenStandard Jobads

In [25]:
job_path = '../switchdrive/thesis/taskontology/jobad_taskdata/goldstandard-subspans-ActObj.jsonl'
jobCon_path = '../switchdrive/thesis/taskontology/jobad_taskdata/goldstandard-subspans-ActObjCont.jsonl'

job_df = pd.read_json(job_path,lines=True)
jobCon_df = pd.read_json(jobCon_path,lines=True)

In [26]:
def pred_jobads(model, tokenizer, job_df):
    
    pred_df = apply_classify_on_df(model,tokenizer,job_df)

    pred_df['pred_la1'] = y_encoded.inverse_transform(pred_df['pred_la1'].astype(int)).tolist()
    pred_df['pred_la2'] = y_encoded.inverse_transform(pred_df['pred_la2'].astype(int)).tolist()
    pred_df['pred_la3'] = y_encoded.inverse_transform(pred_df['pred_la3'].astype(int)).tolist()
    pred_df['pred_la4'] = y_encoded.inverse_transform(pred_df['pred_la4'].astype(int)).tolist()
    pred_df['pred_la5'] = y_encoded.inverse_transform(pred_df['pred_la5'].astype(int)).tolist()

    return pred_df

In [27]:
pred_job = pred_jobads(new_model, tokenizer, job_df)
job_outfile = root_dir + 'pred_job/jobmodel_deepl_job.csv'
pred_job.to_csv(job_outfile, header=True)

In [28]:
pred_jobCon = pred_jobads(new_model, tokenizer, jobCon_df)

jobCon_outfile = root_dir+'pred_job/jobmodel_deepl_jobCon.csv'
pred_job.to_csv(jobCon_outfile, header=True)

## Check the predicted jobads

In [32]:
pred_df = pd.read_csv(root_dir+'pred_job/jobmodel_deepl_job.csv')
pred_df

Unnamed: 0.1,Unnamed: 0,id,meta,text,label,pred_la1,la1score,pred_la2,la2score,pred_la3,la3score,pred_la4,la4score,pred_la5,la5score
0,0,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Verpackungsarbeiten,ActObj,['Handhabung und Bewegung von Objekten'],0.5904,['Ausführen allgemeiner körperlicher Aktivität...,0.2507,['Reparatur und Wartung von mechanischen Gerät...,0.0398,['Steuerung von Maschinen und Prozessen'],0.0235,['Überwachung und Kontrolle von Ressourcen'],0.0183
1,1,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Einrichten des eigenen Arbeitsplatzes,ActObj,['Handhabung und Bewegung von Objekten'],0.9519,['Ausführen allgemeiner körperlicher Aktivität...,0.0159,['Steuerung von Maschinen und Prozessen'],0.0059,['Reparatur und Wartung von mechanischen Gerät...,0.0048,['Kreativ denken'],0.0039
2,2,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Kontrollarbeiten,ActObj,['Reparatur und Wartung von mechanischen Gerät...,0.2926,['Ausführen allgemeiner körperlicher Aktivität...,0.2763,['Handhabung und Bewegung von Objekten'],0.1256,"['Inspektion von Ausrüstung, Strukturen oder M...",0.1133,"['Überwachung von Prozessen, Materialien oder ...",0.0521
3,3,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Aussenarbeit,ActObj,['Ausführen allgemeiner körperlicher Aktivität...,0.2955,['Anderen helfen und für sie sorgen'],0.0924,"['Führung, Leitung und Motivation von Untergeb...",0.0777,['Handhabung und Bewegung von Objekten'],0.0596,['Verkaufen oder Beeinflussen anderer'],0.0521
4,4,sjmm-22011108302017,"{'id': 'sjmm-22011108302017', 'year': 2011, 'i...","Planung, Durchführung und Auswertung von hocha...",ActObj,"['Inspektion von Ausrüstung, Strukturen oder M...",0.3466,['Schätzung der quantifizierbaren Merkmale von...,0.2203,['Informationen erhalten'],0.0565,['Kreativ denken'],0.0489,['Handhabung und Bewegung von Objekten'],0.0443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4581,4581,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",realisierst du herausfordernde Automatisierungen,ActObj,['Kreativ denken'],0.6306,['Entwicklung von Zielsetzungen und Strategien'],0.0895,['Arbeiten mit Computern'],0.0464,"['Führung, Leitung und Motivation von Untergeb...",0.0328,['Entscheidungen treffen und Probleme lösen'],0.0250
4582,4582,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",Entwicklung einer Self-Service Plattform,ActObj,['Kreativ denken'],0.7178,['Entwicklung von Zielsetzungen und Strategien'],0.1093,['Arbeiten mit Computern'],0.0300,['Dokumentieren/Aufzeichnen von Informationen'],0.0157,"['Organisieren, Planen und Priorisieren der Ar...",0.0148
4583,4583,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...","Weiterentwicklung, Optimierung und Erneuerung ...",ActObj,['Kreativ denken'],0.4547,['Entwicklung von Zielsetzungen und Strategien'],0.2162,['Arbeiten mit Computern'],0.0508,['Dokumentieren/Aufzeichnen von Informationen'],0.0274,['Entscheidungen treffen und Probleme lösen'],0.0257
4584,4584,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",Implementierung von Continuous Deployment Prin...,ActObj,['Kreativ denken'],0.3532,['Arbeiten mit Computern'],0.3342,['Entwicklung von Zielsetzungen und Strategien'],0.0903,['Entscheidungen treffen und Probleme lösen'],0.0226,['Steuerung von Maschinen und Prozessen'],0.0166


In [31]:
pred_df = pd.read_csv(root_dir+'pred_job/jobmodel_deepl_jobCon.csv')
pred_jobCon

Unnamed: 0,id,meta,text,label,pred_la1,la1score,pred_la2,la2score,pred_la3,la3score,pred_la4,la4score,pred_la5,la5score
0,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Verpackungsarbeiten allgemein und in Medizinal...,ActObjCont,['Handhabung und Bewegung von Objekten'],0.6834,['Ausführen allgemeiner körperlicher Aktivität...,0.1804,['Reparatur und Wartung von mechanischen Gerät...,0.0376,['Steuerung von Maschinen und Prozessen'],0.0220,['Überwachung und Kontrolle von Ressourcen'],0.0127
1,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Einrichten des eigenen Arbeitsplatzes,ActObjCont,['Handhabung und Bewegung von Objekten'],0.9519,['Ausführen allgemeiner körperlicher Aktivität...,0.0159,['Steuerung von Maschinen und Prozessen'],0.0059,['Reparatur und Wartung von mechanischen Gerät...,0.0048,['Kreativ denken'],0.0039
2,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Kontrollarbeiten,ActObjCont,['Reparatur und Wartung von mechanischen Gerät...,0.2926,['Ausführen allgemeiner körperlicher Aktivität...,0.2763,['Handhabung und Bewegung von Objekten'],0.1256,"['Inspektion von Ausrüstung, Strukturen oder M...",0.1133,"['Überwachung von Prozessen, Materialien oder ...",0.0521
3,sjmm-22011109412481,"{'id': 'sjmm-22011109412481', 'year': 2011, 'i...",Aussenarbeit,ActObjCont,['Ausführen allgemeiner körperlicher Aktivität...,0.2955,['Anderen helfen und für sie sorgen'],0.0924,"['Führung, Leitung und Motivation von Untergeb...",0.0777,['Handhabung und Bewegung von Objekten'],0.0596,['Verkaufen oder Beeinflussen anderer'],0.0521
4,sjmm-22011108302017,"{'id': 'sjmm-22011108302017', 'year': 2011, 'i...","Planung, Durchführung und Auswertung von hocha...",ActObjCont,"['Inspektion von Ausrüstung, Strukturen oder M...",0.2831,['Schätzung der quantifizierbaren Merkmale von...,0.2284,['Informationen erhalten'],0.0839,['Handhabung und Bewegung von Objekten'],0.0713,['Kreativ denken'],0.0665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4581,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",realisierst du herausfordernde Automatisierung...,ActObjCont,['Kreativ denken'],0.6254,['Entwicklung von Zielsetzungen und Strategien'],0.0829,['Arbeiten mit Computern'],0.0805,"['Führung, Leitung und Motivation von Untergeb...",0.0256,"['Organisieren, Planen und Priorisieren der Ar...",0.0218
4582,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...","Entwicklung einer Self-Service Plattform mit, ...",ActObjCont,['Kreativ denken'],0.7152,['Entwicklung von Zielsetzungen und Strategien'],0.0732,['Arbeiten mit Computern'],0.0705,['Dokumentieren/Aufzeichnen von Informationen'],0.0137,"['Organisieren, Planen und Priorisieren der Ar...",0.0127
4583,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...","Weiterentwicklung, Optimierung und Erneuerung ...",ActObjCont,['Kreativ denken'],0.4547,['Entwicklung von Zielsetzungen und Strategien'],0.2162,['Arbeiten mit Computern'],0.0508,['Dokumentieren/Aufzeichnen von Informationen'],0.0274,['Entscheidungen treffen und Probleme lösen'],0.0257
4584,sjmm_large-9202135719010297,"{'id': 'sjmm_large-9202135719010297', 'year': ...",Implementierung von Continuous Deployment Prin...,ActObjCont,['Kreativ denken'],0.3532,['Arbeiten mit Computern'],0.3342,['Entwicklung von Zielsetzungen und Strategien'],0.0903,['Entscheidungen treffen und Probleme lösen'],0.0226,['Steuerung von Maschinen und Prozessen'],0.0166
