In [1]:
import pandas as pd
import spacy
from spacy import displacy
from tqdm import tqdm
import json

# Load the spaCy pipeline once; its tokens, part-of-speech tags and dependency
# labels drive the position-extraction heuristics further down.
nlp = spacy.load('de_core_news_lg')

In [2]:
# Read the annotated test ads: a list of records, each with a 'text' string and
# an 'entities' list of [start, end, label] character spans.
with open('test_data.json', mode='r', encoding='utf-8') as annotation_file:
    testing_data = json.loads(annotation_file.read())

In [3]:
# Peek at the first record to check the structure ('text' plus 'entities' spans).
testing_data[0]

{'text': 'Gute Köchin neben Stuben mädchen gesucht. 2. Bez., Unt. Donaustr. 29, Tür 11. 21856',
 'entities': [[5, 11, 'position']]}

In [4]:
# Phrases that name a position verbatim; checked in this order, first hit wins.
FIXED_POSITION_PHRASES = ('Mädchen für Alles', 'Mädchen für alles', 'Stütze der Hausfrau')
# Dependency labels accepted by the fallback heuristic
# (presumably subject / root / conjunct -- confirm against the model's label scheme).
HEAD_NOUN_DEPS = ('sb', 'ROOT', 'cj')
# Coordinators that may link a second position noun to the first one.
COORDINATORS = ('und', 'oder')


def _noun_after_als(doc):
    """Return [noun] for the first NOUN at or after the first 'als' token, else []."""
    anchor = next((token.i for token in doc if token.text.lower() == 'als'), None)
    if anchor is None:
        # The substring test on the raw text matched, but the tokenizer produced
        # no standalone 'als' token. Previously this hit an unbound (or stale,
        # from an earlier ad) `index`; now the ad simply yields no prediction.
        return []
    for i in range(anchor, len(doc)):
        if doc[i].pos_ == 'NOUN':
            return [str(doc[i])]
    return []


def _first_stelle_token(doc):
    """Return [token] for the first token whose text contains 'stelle', else []."""
    for token in doc:
        if "stelle" in token.text:
            return [str(token)]
    return []


def _head_noun_with_coordinate(doc):
    """Return the first NOUN with an accepted dependency label, plus a coordinated NOUN.

    A second noun is added only for the exact pattern `NOUN und|oder NOUN`.
    Returns [] when no token qualifies.
    """
    for token in doc:
        if token.pos_ == 'NOUN' and token.dep_ in HEAD_NOUN_DEPS:
            found = [str(token)]
            partner = token.i + 2
            # Strict bound: doc[partner] must exist. The former `>=` check let
            # doc[token.i + 2] raise IndexError when the coordinator was the
            # last token of the ad.
            if (partner < len(doc)
                    and doc[token.i + 1].text in COORDINATORS
                    and doc[partner].pos_ == 'NOUN'):
                found.append(str(doc[partner]))
            return found
    return []


def extract_positions(raw_text):
    """Extract the advertised position(s) from one ad text.

    The long s ('ſ') is replaced by 's' before parsing. Heuristics are tried as
    mutually exclusive branches, in this order: fixed phrases, noun after
    'als', token containing 'stelle', first head noun (optionally with a
    coordinated noun).

    Returns a list of strings; empty when the selected heuristic finds nothing.
    """
    text = raw_text.replace('ſ', 's')
    doc = nlp(text)

    for phrase in FIXED_POSITION_PHRASES:
        if phrase in text:
            return [phrase]

    if ' als ' in text.lower() or 'Als ' in text:
        return _noun_after_als(doc)
    if "stelle" in text:
        return _first_stelle_token(doc)
    return _head_noun_with_coordinate(doc)


positions = []

for entry in tqdm(testing_data):
    positions_row = extract_positions(entry['text'])
    # [False] is the "no prediction" marker that the evaluation cell expects.
    positions.append(positions_row if positions_row else [False])

100%|████████████████████████████████████████████████████████████████████████████████| 637/637 [00:06<00:00, 98.97it/s]


In [5]:
# Dump every per-ad prediction list; [False] marks ads for which no heuristic
# produced a position.
print(positions)

[['Köchin'], ['Lehrjunge'], ['Industriewerk'], ['Krankenpflegerin'], ['Gefangenaufsehersstelle'], ['Gärtner'], ['Lehrstellen'], ['Wagenschmied'], ['Zuschneider'], ['Meierin'], ['Kaufmann'], ['Servierkellnerin'], ['Straßenmeisterstelle'], ['Jackennäherinnen'], ['Rechnungsführer'], ['Schauspieler'], ['Maschinnäherinnen'], ['Handelsangestellter'], ['Hausknecht'], ['Lehrling'], ['Wäscherin'], ['Suppleutenstelle'], ['Miedernäherinnen'], ['Nebenbeschäftigung'], ['Mädchen für Alles'], ['Lehrknabe'], ['Friseurgeschäft'], ['Kinderfräulein'], ['Verlag'], ['Finanz⸗Procuratorsstelle'], ['Werkshebamme'], ['MaschinenSchlosser'], ['Weingroßhandlung'], ['Maschinnäherin', 'Knopflochnäherin'], ['Bankbeamter'], ['Köchin'], ['Lehrjungen'], ['Dienstesstellen'], ['Friseur-Gehilfe'], ['Büglerinnen', 'Hilfsarbeiterinnen'], ['Arbeiter'], [False], ['Bedienerin'], ['Arbeits⸗Vermittlung'], ['Wohnung'], ['Arbeiter'], ['Clavierlehrer'], ['genstelle'], ['Eisendreher', 'Maschinenschlosser'], ['Praktikant'], ['Sattler

In [6]:
# Score the predictions against the annotated spans, one ad at a time.
# `correct_matches` accumulates, per ad, the fraction of distinct annotated
# positions that were predicted verbatim; `total_annotations` counts all spans.
total_annotations = 0
correct_matches = 0

for entry, identified in zip(testing_data, positions):
    # Apply the same long-s normalisation that the extraction step applies to
    # the text; otherwise an annotation containing 'ſ' could never equal a
    # prediction. The replacement is one character for one character, so the
    # annotated offsets stay valid.
    normalized_text = entry['text'].replace('ſ', 's')
    annotated_positions = [normalized_text[start:end] for start, end, _label in entry['entities']]
    total_annotations += len(annotated_positions)

    identified_set = set(identified)
    annotated_set = set(annotated_positions)

    # Number of distinct annotated strings that were also predicted.
    matches = len(identified_set & annotated_set)

    # Divide by the number of *distinct* annotations: `matches` is counted on
    # sets, so dividing by the list length would cap the score below 1 whenever
    # the same position string is annotated twice in one ad.
    # Ads without any annotation score 0.
    match_fraction = matches / len(annotated_set) if annotated_set else 0

    correct_matches += match_fraction

    # Print annotations and predictions for every ad that is not fully matched.
    if match_fraction != 1:
        print(f"Annotations: {annotated_positions}")
        print(f"Predictions: {identified}")
        print()

Annotations: ['Krankenschwestern']
Predictions: ['Industriewerk']

Annotations: ['Hebamme', 'Krankenpflegerin']
Predictions: ['Krankenpflegerin']

Annotations: ['Cassier', 'Rechnungsführer', 'Hausadministrator', 'Staatsbeamter']
Predictions: ['Rechnungsführer']

Annotations: ['Maschinnäherinnen', 'Vorrichterinnen', 'Knopflochnäherinnen']
Predictions: ['Maschinnäherinnen']

Annotations: ['Supplentenstelle']
Predictions: ['Suppleutenstelle']

Annotations: ['Schreibstelle']
Predictions: ['Nebenbeschäftigung']

Annotations: ['Friseurgehilfe']
Predictions: ['Friseurgeschäft']

Annotations: ['Vertreter']
Predictions: ['Verlag']

Annotations: ['Vertreter']
Predictions: ['Weingroßhandlung']

Annotations: ['Kontrolorspostens', 'Amtsschreiberspostens']
Predictions: ['Dienstesstellen']

Annotations: ['Schlosser']
Predictions: ['Arbeiter']

Annotations: ['Bonne', 'Haushälterin']
Predictions: [False]

Annotations: ['Maler', 'Anstreicher', 'Lackirer']
Predictions: ['Arbeits⸗Vermittlung']

Annotation

In [7]:
# Average the per-ad match fractions over all ads; fall back to 0 when there
# are no ads, so the division cannot fail.
if testing_data:
    accuracy = correct_matches / len(testing_data)
else:
    accuracy = 0

print(f'Accuracy: {accuracy:.2%}')

Accuracy: 69.08%
