In [1]:
import pandas as pd
import spacy
from spacy import displacy
from tqdm import tqdm
import json

In [2]:
import requests
import re

def lemmatize(token):
    url = f'https://www.deutschestextarchiv.de/demo/cab/query?a=default&fmt=text&clean=1&pretty=1&raw=1&q={token}'
    response = requests.get(url)

    if response.status_code == 200:
        content = response.text
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return(token)
    
    match = re.search(r'\[moot/lemma\] (\S+)', content)
    if match:
        lemma = match.group(1)
        return lemma
    else:
        print("The [moot/lemma] tag was not found.")

In [3]:
# Load spaCy model for tokenization
nlp = spacy.load('de_core_news_lg')

In [4]:
# Load testing data
with open('test_data.json', 'r', encoding='utf-8') as f:
    testing_data = json.load(f)

In [5]:
# Load job titles dictionary
with open('positions_list_lemmatized.txt', 'r', encoding='ANSI') as f:
    positions_dict = [line.strip() for line in f.readlines()]

In [6]:
testing_data[0]

{'text': 'Gute Köchin neben Stuben mädchen gesucht. 2. Bez., Unt. Donaustr. 29, Tür 11. 21856',
 'entities': [[5, 11, 'position']]}

In [7]:
positions_dict[0]

'Abdecker'

In [8]:
predicted_positions = []
exact_matches = ['Mädchen für Alles', 'Mädchen für alles', 'Stütze der Hausfrau']

for entry in tqdm(testing_data):
    text = entry['text']
    positions_row = []
    tokens = text.split()
    
    for exact_match in exact_matches:
        if exact_match in text:
            positions_row.append(exact_match)

    for k in range(len(tokens)):
        tokens[k] = lemmatize(tokens[k])
        for position_name in positions_dict:
            if position_name.lower() in tokens[k].lower():
                positions_row.append(tokens[k])
                
    for i in range(len(positions_row)):
        if 'stelle' in positions_row[i]:
            positions_row[i] = positions_row[i].replace('stelle', '')
            
    predicted_positions.append(positions_row)

  7%|█████▍                                                                         | 44/637 [19:42<4:01:32, 24.44s/it]

Failed to retrieve the page. Status code: 500


  7%|█████▌                                                                         | 45/637 [19:53<3:21:42, 20.44s/it]

Failed to retrieve the page. Status code: 500


  9%|███████▍                                                                       | 60/637 [24:45<2:38:38, 16.50s/it]

Failed to retrieve the page. Status code: 500


 14%|██████████▉                                                                    | 88/637 [37:07<3:22:34, 22.14s/it]

Failed to retrieve the page. Status code: 500


 16%|████████████▋                                                                 | 104/637 [41:38<2:44:27, 18.51s/it]

Failed to retrieve the page. Status code: 500


 28%|█████████████████████▍                                                      | 180/637 [1:12:36<2:41:20, 21.18s/it]

Failed to retrieve the page. Status code: 500


 30%|██████████████████████▊                                                     | 191/637 [1:15:58<1:50:41, 14.89s/it]

Failed to retrieve the page. Status code: 500


 33%|█████████████████████████                                                   | 210/637 [1:21:19<2:00:10, 16.89s/it]

Failed to retrieve the page. Status code: 500


 47%|███████████████████████████████████▉                                        | 301/637 [1:47:28<1:40:30, 17.95s/it]

Failed to retrieve the page. Status code: 500


 59%|████████████████████████████████████████████▊                               | 376/637 [2:10:01<1:20:53, 18.59s/it]

Failed to retrieve the page. Status code: 500


 59%|█████████████████████████████████████████████                               | 378/637 [2:10:37<1:15:27, 17.48s/it]

Failed to retrieve the page. Status code: 500


 61%|██████████████████████████████████████████████▏                             | 387/637 [2:15:04<1:22:55, 19.90s/it]

Failed to retrieve the page. Status code: 500


 61%|██████████████████████████████████████████████▋                             | 391/637 [2:16:55<1:43:21, 25.21s/it]

Failed to retrieve the page. Status code: 500


 63%|███████████████████████████████████████████████▌                            | 399/637 [2:19:08<1:12:21, 18.24s/it]

Failed to retrieve the page. Status code: 500


 64%|████████████████████████████████████████████████▎                           | 405/637 [2:21:42<1:15:45, 19.59s/it]

Failed to retrieve the page. Status code: 500


 67%|██████████████████████████████████████████████████▋                         | 425/637 [2:28:01<1:02:02, 17.56s/it]

Failed to retrieve the page. Status code: 500


 70%|██████████████████████████████████████████████████████▊                       | 448/637 [2:39:10<56:37, 17.98s/it]

Failed to retrieve the page. Status code: 500


 71%|███████████████████████████████████████████████████████▍                      | 453/637 [2:40:51<56:24, 18.40s/it]

Failed to retrieve the page. Status code: 500


 76%|█████████████████████████████████████████████████████████▍                  | 481/637 [2:53:29<1:06:16, 25.49s/it]

Failed to retrieve the page. Status code: 500


 78%|███████████████████████████████████████████████████████████▋                | 500/637 [2:59:55<1:03:02, 27.61s/it]

Failed to retrieve the page. Status code: 500


 81%|███████████████████████████████████████████████████████████████▍              | 518/637 [3:05:45<40:34, 20.46s/it]

Failed to retrieve the page. Status code: 500


100%|██████████████████████████████████████████████████████████████████████████████| 637/637 [4:05:56<00:00, 23.17s/it]


In [9]:
print(predicted_positions)

[['Kochin', 'Kochin'], ['Lehrjunge'], ['Krankenschwester'], ['Hebamme', 'Krankenpflegerin', 'Krankenpflegerin', 'Krankenpflegerin', 'Krankenpflegerin', 'Krankenpflegerin', 'Krankenpflegerin'], ['Gefangenaufsehers'], ['Gärtner', 'Gärtner'], ['Lehrer', 'leger'], ['Wagenschmied', 'Schmiedmstr'], ['Zuschneider'], ['Meierin', 'Landwirtschaft', 'Landwirtschaft', 'Wirtschastsführung', 'Kochen', 'Meierin'], ['Kaufmann'], ['Servierkellnerin'], ['Straßenmeister', 'Straßenmeister', 'provisorisch', 'Straßenmeister', 'Straßenmeister', 'Unteroffizier', 'Unteroffizier'], ['Jackennäherin'], ['Hausadministrator', 'Hausadministrator', 'Staatsbeamte'], ['Schauspieler'], ['Maschinenäherin', 'Knopflochnäherin'], ['Handelsangestellte', 'Handelsangestellte'], ['Hausknecht', 'Hausknecht'], ['Lehrling'], [], [], ['Miedernäherin'], [], ['Mädchen für Alles', 'Uhrmacher'], ['Lehrknabe'], ['Friseurgeschäft', 'Friseurgehilfe'], ['Stütze'], ['Vertreter'], ['Finanz-Prokurators', 'Doktorgrad', 'Doktorgrad', 'Advokat']

In [10]:
annotated_positions = []

for entry in tqdm(testing_data):    
    annotated_positions_row = [entry['text'][start:end] for start, end, _ in entry['entities']]
    
    for i in range(len(annotated_positions_row)):
        if 'stelle' in annotated_positions_row[i]:
            annotated_positions_row[i] = annotated_positions_row[i].replace('stelle', '')
        annotated_positions_row[i] = lemmatize(annotated_positions_row[i])
    
    annotated_positions.append(annotated_positions_row)

100%|████████████████████████████████████████████████████████████████████████████████| 637/637 [12:03<00:00,  1.14s/it]


In [11]:
import pandas as pd
from rapidfuzz import process, fuzz, distance

def conf_mat(pos_list, pred_list, shared_set):
    true_pos = 0
    false_pos = 0
    false_neg = 0
    for el in shared_set:
        pos_match, pos_score, _ = process.extractOne(el, pos_list, scorer=distance.Levenshtein.normalized_similarity)
        pred_match, pred_score, _ = process.extractOne(el, pred_list, scorer=distance.Levenshtein.normalized_similarity)
        if pos_score >= 0.9 and pred_score >= 0.9:
            true_pos += 1
        elif pos_score >= 0.9 and pred_score < 0.9:
            false_neg += 1
        elif pos_score < 0.9 and pred_score >= 0.9:
            false_pos += 1
    return true_pos, false_pos, false_neg

tp_list = []
fp_list = []
fn_list = []

for predicted, annotated in zip(predicted_positions, annotated_positions):
    shared_set = set(annotated + predicted)

    if len(predicted) > 0:
        true_pos, false_pos, false_neg = conf_mat(annotated, predicted, shared_set)
    else:
        true_pos, false_pos, false_neg = (0, 0, len(shared_set))
    tp_list.append(true_pos)
    fp_list.append(false_pos)
    fn_list.append(false_neg)

con_pred_df = pd.DataFrame()
con_pred_df['true_positive'] = tp_list
con_pred_df['false_positive'] = fp_list
con_pred_df['false_negative'] = fn_list

In [12]:
f1 = (2*con_pred_df['true_positive'].sum())/(2*con_pred_df['true_positive'].sum() + con_pred_df['false_positive'].sum() + con_pred_df['false_negative'].sum())
recall = (con_pred_df['true_positive'].sum())/(con_pred_df['true_positive'].sum() + con_pred_df['false_negative'].sum())
precision = (con_pred_df['true_positive'].sum())/(con_pred_df['true_positive'].sum() + con_pred_df['false_positive'].sum())

In [13]:
print('f1: ', f1)
print('recall: ', recall)
print('precision: ', precision)

f1:  0.6315172817809022
recall:  0.6462829736211031
precision:  0.6174112256586484
