# Übungen CH11

## Aufgabe 1

In [20]:
from xml.etree.ElementTree import Element

def add_cv_field_after_lx(entry):
    for idx, field in enumerate(list(entry)):
        if field.tag == 'lx':
            cv_elem = Element('cv')
            cv_elem.text = cv(field.text)  # cv() aus §5.1
            entry.insert(idx+1, cv_elem)
            break

## Aufgabe 2


In [21]:
def delete_field(entry, tag):
    """Löscht alle Subelemente mit dem angegebenen Tag aus entry."""
    for field in list(entry):
        if field.tag == tag:
            entry.remove(field)

## Aufgabe 3

In [22]:
import re
from bs4 import BeautifulSoup

LEGAL_POS = {'n', 'v.t.', 'v.i.', 'adj', 'det'}
pattern = re.compile(r"font-size:11.0pt'>([a-z.]+)<")

def report_illegal_pos(html_file):
    doc = open(html_file, encoding="windows-1252").read()
    matches = set(re.findall(pattern, doc))
    illegal = matches - LEGAL_POS
    print("Illegale POS-Felder:", illegal)
    soup = BeautifulSoup(doc, 'html.parser')
    for p in soup.find_all('p'):
        m = pattern.search(str(p))
        if m and m.group(1) in illegal:
            head = p.get_text().split()[0]
            print(" →", head)

## Aufgabe 4

In [23]:
from nltk.corpus import toolbox
from collections import Counter

def rare_pos(lex_file):
    lex = toolbox.xml(lex_file)
    counts = Counter(e.findtext('ps') for e in lex if e.find('ps') is not None)
    return [ps for ps, cnt in counts.items() if cnt < 10]

print(rare_pos('rotokas.dic'))

['CLASS', 'FFP', 'NUM', 'POST', 'EXCL']


## Aufgabe 5

In [24]:
import re
from nltk.corpus import toolbox

def has_partial_redup(word):
    if not word:
        return False
    return bool(re.search(r'(.{2,})\1', word))

words = [e.findtext('lx') for e in toolbox.xml('rotokas.dic')]
candidates = [w for w in words if has_partial_redup(w)]
print(candidates)

['kaakaaro', 'kaakaaviko', 'kaakaavo', 'kaekae', 'kaekae', 'kaekaearo', 'kaekaeo', 'kaekaesoto', 'kaekaevira', 'kaikaio', 'kairiro', 'kaitutu', 'kaitutupie', 'kaitutuvira', 'kakae', 'kakae', 'kakae', 'kakaevira', 'kakapikoa', 'kakapikoto', 'kakapu', 'kakapua', 'kakara', 'kakarau', 'kakata', 'kakate', 'kakatuara', 'kakau', 'kakauoa', 'kakavea', 'kakavoro', 'kakavu', 'kakiaki', 'kakuaku', 'kaokao', 'kaokaoara', 'kaokaoto', 'kapekape', 'kapekapevira', 'kapikapi', 'kapokapo', 'kapokapoa', 'kapokapora', 'kapokaporo', 'kapuasisi', 'karakarao', 'karakaraoa', 'karakaraoto', 'karakaraovira', 'karakuku', 'karara', 'karekare', 'karekare', 'karekarererava', 'karekareto', 'karikari', 'karokaropo', 'karukaru', 'Karuru', 'kasikasi', 'katakatai', 'katakataivira', 'katokato', 'katokatoto', 'katokatovira', 'katoto', 'katukatu', 'kaukau', 'kaukaupie', 'kaukauvira', 'kauokauo', 'kavakavau', 'kavikavi', 'kavikaviru', 'kavikaviru', 'kavokavo', 'kavokavoa', 'kavokavoto', 'kavovoa', 'kavovovira', 'keakea', 'k

## Aufgabe 6

In [25]:
from xml.etree.ElementTree import Element

def update_cv_field(entry):
    # Vorhandenes <cv> entfernen
    for f in list(entry):
        if f.tag == 'cv':
            entry.remove(f)
    # Neues <cv> nach <lx> einfügen
    for idx, field in enumerate(list(entry)):
        if field.tag == 'lx':
            cv_elem = Element('cv')
            cv_elem.text = cv(field.text)
            entry.insert(idx+1, cv_elem)
            break

## Aufgabe 7

In [26]:
import re
from xml.etree.ElementTree import Element

def add_syl_field(entry):
    lx = entry.findtext('lx', '').lower()
    syl_count = len(re.findall(r'[aeiou]+', lx))
    syl_elem = Element('syl')
    syl_elem.text = str(syl_count)
    for idx, field in enumerate(list(entry)):
        if field.tag == 'lx':
            entry.insert(idx+1, syl_elem)
            break

## Aufgabe 8

In [27]:
import nltk
from nltk import Index, edit_distance
from nltk.corpus import toolbox
from xml.etree.ElementTree import ElementTree

def signature(word):
    vowels = set('aeiou')
    return ''.join('V' if c.lower() in vowels else 'C' for c in word if c.isalpha())

def entry_to_sfm(entry):
    lines = []
    for child in entry:
        tag = child.tag
        text = child.text or ''
        lines.append(f"\\{tag} {text}")
    return "\n".join(lines)

entries = toolbox.xml('rotokas.dic')
lexemes = [e.findtext('lx') for e in entries if e.findtext('lx')]

signatures = Index((signature(w), w) for w in lexemes)

def show_entry(lexeme):
    if lexeme in lexemes:
        entry = entries[lexemes.index(lexeme)]
    else:
        sig = signature(lexeme)
        candidates = signatures[sig] or lexemes
        best = min(candidates, key=lambda w: edit_distance(lexeme, w))
        print(f"(verwende '{best}' als ähnlichsten Treffer)")
        entry = entries[lexemes.index(best)]
    print(entry_to_sfm(entry))

show_entry('musci')     
print()
show_entry('redundant')  

(verwende 'Kusi' als ähnlichsten Treffer)
\lx kurutu
\ps N
\pt NT
\ge portion
\ge part of
\tkp ???
\dt 04/Dec/2004
\ex Uuko kurutu vateri eva.
\xp ???
\xe Give that half full cup of water (to him).
\ex Uko kurutu vateri eva ragai-pa.
\xp Yu givim hap wara long mi.
\xe ???

(verwende 'keruiato' als ähnlichsten Treffer)
\lx kerui
\ps V
\pt A
\ge thin
\ge bony
\ge skinny
\tkp bun nating
\vx 1
\dt 28/Oct/2005
\ex Em ro ira viapau sopeiavoi toupa.
\xp Man i no gat mit i blong bodi.
\xe ???
\ex Ragaia keruito ragoa-ia viapau varuaravai toupaveira ora aue tuga ragai vararo-ia.
\xp Mi bun nating man mi nogat mit na gris istap long bodi bilong mi.
\xe ???


## Aufgabe 9

In [28]:
from collections import Counter
from nltk.corpus import toolbox

def freq_field_pairs(lex_file):
    lex = toolbox.xml(lex_file)
    pairs = Counter()
    for entry in lex:
        tags = [f.tag for f in entry]
        pairs.update(zip(tags, tags[1:]))
    return pairs.most_common()

print(freq_field_pairs('rotokas.dic'))

[(('ex', 'xp'), 1532), (('xp', 'xe'), 1526), (('ps', 'pt'), 835), (('ge', 'tkp'), 824), (('pt', 'ge'), 766), (('dt', 'ex'), 758), (('xe', 'ex'), 708), (('lx', 'ps'), 520), (('rt', 'ps'), 356), (('tkp', 'dt'), 327), (('lx', 'rt'), 313), (('ge', 'ge'), 287), (('eng', 'eng'), 143), (('cmt', 'dt'), 143), (('tkp', 'nt'), 130), (('vx', 'dt'), 119), (('arg', 'vx'), 108), (('nt', 'dt'), 107), (('tkp', 'vx'), 102), (('tkp', 'eng'), 82), (('tkp', 'cmt'), 78), (('tkp', 'tkp'), 69), (('tkp', 'arg'), 67), (('ge', 'eng'), 66), (('eng', 'tkp'), 61), (('vx', 'arg'), 59), (('lx', 'alt'), 54), (('ps', 'ge'), 48), (('dt', 'cmt'), 46), (('alt', 'rt'), 46), (('cmt', 'ex'), 45), (('vx', 'cmt'), 43), (('arg', 'dt'), 40), (('vx', 'sc'), 38), (('sf', 'dt'), 36), (('sc', 'dt'), 31), (('eng', 'dt'), 30), (('dx', 'ge'), 30), (('rdp', 'ge'), 28), (('pt', 'rdp'), 25), (('pt', 'dx'), 25), (('nt', 'sf'), 21), (('nt', 'cmt'), 19), (('tkp', 'dcsv'), 18), (('tkp', 'sf'), 18), (('dcsv', 'vx'), 15), (('cmt', 'vx'), 15), (

## Aufgabe 10

In [29]:
import csv

def csv_to_toolbox(csv_file):
    with open(csv_file, newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        for head, pos, gloss in reader:
            print(f"\\lx {head}")
            print(f"\\ps {pos}")
            print(f"\\gl {gloss}")
            print()

## Aufgabe 11

In [30]:
import nltk
from nltk import Index
from xml.etree.ElementTree import ElementTree

def index_shakespeare_words():
    path = nltk.data.find('corpora/shakespeare/merchant.xml')
    merchant = ElementTree().parse(path)
    entries = []
    for i, act in enumerate(merchant.findall('ACT')):
        for j, scene in enumerate(act.findall('SCENE')):
            for k, speech in enumerate(scene.findall('SPEECH')):
                location = (i+1, j+1, k+1)
                text = ' '.join(line.text or '' for line in speech.findall('LINE'))
                tokens = nltk.word_tokenize(text)
                for w in tokens:
                    if any(ch.isalpha() for ch in w):
                        entries.append((w.lower(), location))
    return Index(entries)

idx = index_shakespeare_words()
print(idx['music'])

[(3, 2, 9), (3, 2, 9), (3, 2, 9), (3, 2, 9), (5, 1, 23), (5, 1, 23), (5, 1, 23), (5, 1, 24), (5, 1, 25), (5, 1, 25), (5, 1, 25), (5, 1, 25), (5, 1, 25), (5, 1, 28), (5, 1, 29)]


## Aufgabe 12

In [31]:
import nltk
from nltk import ConditionalFreqDist
from xml.etree.ElementTree import ElementTree

path = nltk.data.find('corpora/shakespeare/merchant.xml')
merchant = ElementTree().parse(path)

cfd = ConditionalFreqDist()
for speech in merchant.findall('ACT/SCENE/SPEECH'):
    speaker = speech.findtext('SPEAKER') or 'UNKNOWN'
    text = ' '.join(line.text or '' for line in speech.findall('LINE'))
    tokens = nltk.word_tokenize(text)
    length = len(tokens)
    cfd[speaker][length] += 1

print(cfd['PORTIA'][12])

2
