In [1]:
import re
from collections import Counter, defaultdict
from typing import List, Tuple
import os
import random
import numpy as np
import re

In [None]:
class ngram:
    def __init__(self, n : int, korpus : str, smoothing = False):
        self.n                          = n
        self.smoothing                  = smoothing
        self.toks                       = self._init_toks(korpus)

        self.vocab, self.vocab_count    = self._init_vocab()

        self.counts                     = self._count()


    def _init_toks(self, korpus):
        # Alle Interpunktionszeichen außer \w \s ; . ? ! entfernen
        text_no_p = re.sub(r"[^\w\s;.?!]", "", korpus)
        # Leerzeichen vor jedem übrigen Punktionszeichen setzen
        text_no_p = re.sub(r"([;.?!])", r" \1", text_no_p)

        # Satzgrenzen
        text_no_p_ends = re.sub(r"([?!;.])", r"\1 </s>|||<s> ", text_no_p)
        text_no_p_ends = "<s> " + text_no_p_ends + " </s>"

        text_no_p_ends = text_no_p_ends.lower()

        text_no_p_ends_list = re.findall(r'\w+|<s>|</s>|[;.!?]', text_no_p_ends)

        # Each sentence gets a whole list
        toks = []
        group = []

        for tok in text_no_p_ends_list:
            group.append(tok)

            if tok == "</s>":
                toks.append(group)
                group = []

        return toks

    def _init_vocab(self):
        # 1) Zähle alle Tokens
        counter = Counter(tok for sentence in self.toks for tok in sentence)
        
        # 2) Sortiere nach Häufigkeit (absteigend)
        items = counter.most_common()    # Liste von (token, count)
        
        # 3) Entpacke in zwei Listen
        vocab, counts = zip(*items)
        
        return list(vocab), list(counts)


    def _count(self):
        """
        counts: Dict[ Tuple(context), Counter(next_word → count) ]
        """
        counts = defaultdict(Counter)
        for sent in self.toks:
            for i in range(len(sent) - self.n + 1):
                ctx  = tuple(sent[i:i + (self.n - 1)])
                nxt  = sent[i + self.n - 1]
                counts[ctx][nxt] += 1

        if self.smoothing:
            # Add-one Smoothing: für jeden Kontext und jedes Vokabel +1
            for ctx in counts:
                for w in self.vocab:
                    counts[ctx][w] += 1

        return counts
    

    def next_word(self, seed):
        """
        Gibt genau ein Wort zurück
        """
        toks = seed.lower().split()
        # Padding mit <s>
        while len(toks) < self.n - 1:
            toks.insert(0, "<s>")
        
        ctx = tuple(toks[-(self.n - 1):])

        # Hol den Counter für diesen Kontext (oder leeren Counter als Fallback)
        counter = self.counts.get(ctx, None)
        if not counter:
            # unbekannter Kontext → gleichverteilt über ganzes Vokabular
            return random.choice(self.vocab)

        # Liste von möglichen Nachfolgern und ihren Counts
        words, weights = zip(*counter.items())

        # Nächste wort
        return random.choices(words, weights=weights, k=1)[0]

    def generate(self, seed: str, length: int) -> str:
        toks = seed.lower().split()
        # Padding links mit <s>
        while self.n > 1 and len(toks) < self.n - 1:
            toks.insert(0, "<s>")

        for _ in range(length):
            nxt = self.next_word(" ".join(toks[-(self.n - 1):]))
            toks.append(nxt)
            if nxt == "</s>":
                break

        # Gib den Text ohne die Start-Markierungen zurück
        return " ".join(toks)
    


In [141]:
# # 10 Random <UNK> for Toks with 1 occourance

# for i, (num, word) in enumerate(num_vocab):
#     if num == 1:
#         begin = i
#         break

# start_end_indexes = list(range(begin, len(num_vocab)))

# choosen = []

# for _ in range(10):
#     rand_idx = random.choice(start_end_indexes)
#     choosen.append(rand_idx)
#     start_end_indexes.remove(rand_idx)

# for i in choosen:
#     num_vocab[i][1] = "<UNK>"

In [3]:
raw_text = ""

for txt in os.listdir("korpus"):
    with open(f"korpus/{txt}", "r", encoding="utf-8") as f:
        content = f.read()

    raw_text += content + " "


In [4]:
LM1 = ngram(1, korpus=raw_text, smoothing=False)

LM2A = ngram(2, korpus=raw_text, smoothing=False)
LM2B = ngram(2, korpus=raw_text, smoothing=True)

LM3A = ngram(3, korpus=raw_text, smoothing=False)
LM3B = ngram(3, korpus=raw_text, smoothing=True)

In [13]:
sum_bi_LM2A = sum(
    len(counter)
    for counter in LM2A.counts
    if counter
)

sum_bi_LM2B = sum(
    len(counter)
    for counter in LM2B.counts
    if counter
)

sum_tri_LM3A = sum(
    len(counter)
    for counter in LM3A.counts
    if counter
)

sum_tri_LM3B = sum(
    len(counter)
    for counter in LM3B.counts
    if counter
)

In [16]:
print("--------Language Model 1--------")
print(f"|V|: {len(LM1.vocab)}")
print(f"|Uni|: {len(LM1.counts.get(()))}")
print("--------Language Model 2--------")
print(f"|Bi|a: {sum_bi_LM2A}")
ratio = sum_bi_LM2A / (len(LM2A.vocab) ** 2)
print(f"Anteil nicht-null: {ratio*100:2f}%") ; print()
print(f"|Bi|b: {sum_bi_LM2B}")
print("--------Language Model 3--------")
print(f"|Tri|a: {sum_tri_LM3A}")
ratio = sum_tri_LM3A / (len(LM3A.vocab) ** 2)
print(f"Anteil nicht-null: {ratio*100:2f}%") ; print()
print(f"|Tri|b: {sum_tri_LM3B}")

--------Language Model 1--------
|V|: 9339
|Uni|: 9339
--------Language Model 2--------
|Bi|a: 9338
Anteil nicht-null: 0.010707%

|Bi|b: 9338
--------Language Model 3--------
|Tri|a: 102882
Anteil nicht-null: 0.117961%

|Tri|b: 102882
