In [1]:
import pandas as pd
import spacy
from spacy import displacy
from tqdm import tqdm
import json

# Load the spaCy pipeline 'de_core_news_lg'; `nlp` is applied to every ad text
# further down to obtain tokens with part-of-speech and dependency labels.
nlp = spacy.load('de_core_news_lg')

In [2]:
# Read the annotated test set; each record holds a 'text' string and an
# 'entities' list of [start, end, label] spans.
with open('test_data.json', encoding='utf-8') as json_file:
    testing_data = json.load(json_file)

In [3]:
# Show the first record to check the shape of an entry.
testing_data[0]

{'text': 'Gute Köchin neben Stuben mädchen gesucht. 2. Bez., Unt. Donaustr. 29, Tür 11. 21856',
 'entities': [[5, 11, 'position']]}

In [4]:
import requests
import re

def lemmatize(token, session=None, timeout=30):
    """Look up the lemma of ``token`` via the CAB demo endpoint at deutschestextarchiv.de.

    Parameters
    ----------
    token : str
        Word form (or short phrase) to send as the ``q`` query parameter.
    session : object, optional
        Any object exposing a ``requests``-compatible ``get(url, timeout=...)``
        method, e.g. a ``requests.Session`` to reuse one connection across many
        calls. When omitted, the ``requests`` module itself is used.
    timeout : float, optional
        Seconds to wait for the service; passed straight to ``get``. Without a
        timeout a stalled connection would block the calling loop forever.

    Returns
    -------
    str or None
        The first non-whitespace run following a ``[moot/lemma]`` tag in the
        response body, or ``None`` when the HTTP status is not 200 or the tag
        is absent (a message is printed in both cases).

    Notes
    -----
    Network-level exceptions raised by ``get`` (connection errors, timeouts)
    are not caught here and propagate to the caller.
    """
    from urllib.parse import quote

    # Percent-encode the token so spaces, umlauts and reserved characters such
    # as '&', '#' or '+' cannot corrupt the query string.
    url = (
        'https://www.deutschestextarchiv.de/demo/cab/query'
        f'?a=default&fmt=text&clean=1&pretty=1&raw=1&q={quote(token, safe="")}'
    )

    http = requests if session is None else session
    response = http.get(url, timeout=timeout)

    # Bail out early on HTTP errors; previously execution continued and crashed
    # on the unbound response content.
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

    match = re.search(r'\[moot/lemma\] (\S+)', response.text)
    if match is None:
        print("The [moot/lemma] tag was not found.")
        return None
    return match.group(1)

In [5]:
# Multi-word job titles that are matched verbatim before any parsing-based rule.
FIXED_POSITION_PHRASES = ('Mädchen für Alles', 'Mädchen für alles', 'Stütze der Hausfrau')


def _find_position_candidates(text, doc):
    """Return the raw (not yet lemmatized) position strings found in one ad.

    The rules are tried in order; the first rule whose trigger fires decides
    the result, even when it ends up finding nothing:

    1. one of ``FIXED_POSITION_PHRASES`` occurs verbatim in ``text``;
    2. the text contains "als": take the first NOUN at or after the first
       token whose lower-cased text is "als";
    3. the text contains "stelle": take the first token containing "stelle";
    4. otherwise: take the first NOUN whose dependency label is 'sb', 'ROOT'
       or 'cj', plus a second NOUN joined to it by "und"/"oder".

    Known limitation: apart from the "und"/"oder" case only one position per
    ad is extracted, although an ad may advertise several.

    ``doc`` is the spaCy document produced for ``text``.
    """
    for phrase in FIXED_POSITION_PHRASES:
        if phrase in text:
            return [phrase]

    if ' als ' in text.lower() or 'Als ' in text:
        # The substring test can fire without the tokenizer producing an "als"
        # token. The start index used to be left undefined (or stale from the
        # previous ad) in that case; now such an ad yields no position.
        als_index = next((tok.i for tok in doc if tok.text.lower() == 'als'), None)
        if als_index is None:
            return []
        for i in range(als_index, len(doc)):
            if doc[i].pos_ == 'NOUN':
                return [str(doc[i])]
        return []

    if "stelle" in text:
        for tok in doc:
            if "stelle" in tok.text:
                return [str(tok)]
        return []

    for tok in doc:
        if tok.pos_ == 'NOUN' and tok.dep_ in ('sb', 'ROOT', 'cj'):
            found = [str(tok)]
            # Need two more tokens after the noun for "<noun> und|oder <noun>".
            # The old bound (>=) allowed an IndexError when only one followed.
            if len(doc) > tok.i + 2:
                if doc[tok.i + 1].text in ('und', 'oder') and doc[tok.i + 2].pos_ == 'NOUN':
                    found.append(str(doc[tok.i + 2]))
            return found
    return []


predicted_positions = []

for entry in tqdm(testing_data):
    text = entry['text']
    doc = nlp(text)
    positions_row = []  # positions within one ad

    for candidate in _find_position_candidates(text, doc):
        # "Lehrstelle" -> "Lehr": drop the generic "stelle" part before lemmatizing.
        candidate = candidate.replace('stelle', '')
        lemma = lemmatize(candidate)
        # Fall back to the surface form when no lemma comes back, so that no
        # None ends up in the list handed to the evaluation.
        positions_row.append(lemma if lemma is not None else candidate)

    predicted_positions.append(positions_row)

100%|████████████████████████████████████████████████████████████████████████████████| 637/637 [08:45<00:00,  1.21it/s]


In [6]:
# Dump the predicted positions (one inner list per ad) for a visual check.
print(predicted_positions)

[['Kochin'], ['Lehrjunge'], ['Industriewerk'], ['Krankenpflegerin'], ['Gesangaufseher'], ['Gärtner'], ['Lehre'], ['Wagenschmied'], ['Zuschneider'], ['Meierin'], ['Kaufmann'], ['Servierkellnerin'], ['Straßenmeister'], ['Jackennäherin'], ['Rechnungsführer'], ['Schauspieler'], ['Maschinenäherin'], ['Handelsangestellte'], ['Hausknecht'], ['Lehrling'], ['Wäscherin'], ['Suppeleute'], ['Miedernäherin'], ['Nebenbeschäftigung'], ['Mädchen'], ['Lehrknabe'], ['Friseurgeschäft'], ['Kinderfräulein'], ['Verlag'], ['Finanz-Prokurator'], ['Werkshebamme'], ['Maschinenschlosser'], ['Weingroßhandlung'], ['Maschinenäherin', 'Knopflochnäherin'], ['Bankbeamte'], ['Kochin'], ['Lehrjunge'], ['dienstesn'], ['Friseur-Gehilfe'], ['Büglerin', 'Hilfesarbeiterin'], ['Arbeiter'], [], ['Bedienerin'], ['Arbeits-Vermittlung'], ['Wohnung'], ['Arbeiter'], ['Klavierlehrer'], ['gen'], ['Eisendreher', 'Maschinenschlosser'], ['Praktikant'], ['Sattler'], ['Praktikant'], ['Ausfahrer'], ['Kaffee-Versandhaus'], ['Schuhmacher'], 

In [7]:
# Gold-standard positions, normalized the same way as the predictions:
# strip "stelle", then lemmatize.
annotated_positions = []

for entry in tqdm(testing_data):
    text = entry['text']
    annotated_positions_row = []

    # Each entity is a [start, end, label] triple of character offsets into the text.
    for start, end, _ in entry['entities']:
        surface = text[start:end].replace('stelle', '')
        lemma = lemmatize(surface)
        # Keep the surface form when no lemma comes back, so that a gold
        # position is never replaced by None in the evaluation input.
        annotated_positions_row.append(lemma if lemma is not None else surface)

    annotated_positions.append(annotated_positions_row)

100%|████████████████████████████████████████████████████████████████████████████████| 637/637 [11:24<00:00,  1.07s/it]


In [8]:
# Dump the annotated (gold) positions, one inner list per ad, for comparison.
print(annotated_positions)

[['Kochin'], ['Lehrjunge'], ['Krankenschwester'], ['Hebamme', 'Krankenpflegerin'], ['Gesangaufseher'], ['Gärtner'], ['Lehre'], ['Wagenschmied'], ['Zuschneider'], ['Meierin'], ['Kaufmann'], ['Servierkellnerin'], ['Straßenmeister'], ['Jackennäherin'], ['Kassier', 'Rechnungsführer', 'Hausadministrator', 'Staatsbeamte'], ['Schauspieler'], ['Maschinenäherin', 'Vorrichterin', 'Knopflochnäherin'], ['Handelsangestellte'], ['Hausknecht'], ['Lehrling'], ['Wäscherin'], ['supplenten'], ['Miedernäherin'], ['Schreibe'], ['Mädchen'], ['Lehrknabe'], ['Friseurgehilfe'], ['Kinderfräulein'], ['Vertreter'], ['Finanz-Prokurator'], ['Werkshebamme'], ['Maschinenschlosser'], ['Vertreter'], ['Maschinenäherin', 'Knopflochnäherin'], ['Bankbeamte'], ['Kochin'], ['Lehrjunge'], ['kontrolorspostens', 'Amtsschreibersposten'], ['Friseur-Gehilfe'], ['Büglerin', 'Hilfesarbeiterin'], ['Schlosser'], ['Bonne', 'Haushälterin'], ['Bedienerin'], ['Maler', 'Anstreicher', 'Lackierer'], ['Stallbursch'], ['Kommis'], ['Klavierlehr

In [9]:
import pandas as pd
from rapidfuzz import process, fuzz, distance

def conf_mat(pos_list, pred_list, shared_set, threshold=0.9):
    """Count true positives, false positives and false negatives for one ad.

    Parameters
    ----------
    pos_list : list of str
        Annotated (gold) positions of the ad.
    pred_list : list of str
        Predicted positions of the ad.
    shared_set : set of str
        Union of ``pos_list`` and ``pred_list``; every element is classified.
    threshold : float, optional
        Minimum normalized Levenshtein similarity (0..1) for two strings to
        count as the same position.

    Returns
    -------
    tuple of int
        ``(true_pos, false_pos, false_neg)``.

    Notes
    -----
    * An element matched in both lists is a true positive only when it is
      itself an annotated position. A prediction that merely *resembles* an
      annotated position is that same hit seen from the other side; counting
      it again would record one match as two true positives.
    * Empty lists and ``None`` elements score 0 instead of raising:
      ``process.extractOne`` returns ``None`` in those cases, which cannot be
      unpacked.
    """
    def _best_score(query, choices):
        # Best similarity of `query` against `choices`, 0.0 if nothing to compare.
        if query is None or not choices:
            return 0.0
        best = process.extractOne(
            query, choices, scorer=distance.Levenshtein.normalized_similarity
        )
        return 0.0 if best is None else best[1]

    true_pos = 0
    false_pos = 0
    false_neg = 0
    for el in shared_set:
        pos_score = _best_score(el, pos_list)
        pred_score = _best_score(el, pred_list)
        if pos_score >= threshold and pred_score >= threshold:
            # Count the hit once, via the annotated spelling only.
            if el in pos_list:
                true_pos += 1
        elif pos_score >= threshold:
            false_neg += 1
        elif pred_score >= threshold:
            false_pos += 1
    return true_pos, false_pos, false_neg

# Per-ad counts, kept as three parallel lists (one entry per ad).
tp_list, fp_list, fn_list = [], [], []

for predicted, annotated in zip(predicted_positions, annotated_positions):
    shared_set = set(annotated + predicted)

    # Without any prediction, every distinct annotated position is a miss.
    if not predicted:
        true_pos, false_pos, false_neg = (0, 0, len(shared_set))
    else:
        true_pos, false_pos, false_neg = conf_mat(annotated, predicted, shared_set)

    for bucket, count in ((tp_list, true_pos), (fp_list, false_pos), (fn_list, false_neg)):
        bucket.append(count)

# One row per ad, one column per count type.
con_pred_df = pd.DataFrame({
    'true_positive': tp_list,
    'false_positive': fp_list,
    'false_negative': fn_list,
})

In [10]:
# Show the per-ad counts: one row per (predicted, annotated) pair.
print(con_pred_df)

     true_positive  false_positive  false_negative
0                1               0               0
1                1               0               0
2                0               1               1
3                1               0               1
4                1               0               0
..             ...             ...             ...
632              0               1               1
633              1               0               0
634              0               1               2
635              0               1               1
636              1               0               0

[637 rows x 3 columns]


In [11]:
# Sum the per-ad counts over the whole test set, then derive the scores.
tp_total = con_pred_df['true_positive'].sum()
fp_total = con_pred_df['false_positive'].sum()
fn_total = con_pred_df['false_negative'].sum()

f1 = (2*tp_total)/(2*tp_total + fp_total + fn_total)
recall = (tp_total)/(tp_total + fn_total)
precision = (tp_total)/(tp_total + fp_total)

In [12]:
# Report the three scores, one per line.
for label, value in (('f1: ', f1), ('recall: ', recall), ('precision: ', precision)):
    print(label, value)

f1:  0.6900742741390952
recall:  0.6134453781512605
precision:  0.7885802469135802
