In [1]:
# imports
import re

In [2]:
# read data
with open("../Data/Wikipedia1M/Wikipedia1M.txt", "r", encoding="utf-8") as file:
    text = file.read()

In [3]:
# clean text

def clean_text(text):
    text = re.sub(r'[^a-zA-Z0-9äöüÄÖÜß.,!?]', ' ', text) # remove all special characters
    text = re.sub(r' +', ' ', text) # remove multiple spaces    
    text = text.strip() # remove leading and trailing spaces
    text = text.lower()
    return text

In [4]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('german'))

# split text into sentences
sentences = []

token_count = 0
for sent in text.split("."):
    clean_sent = clean_text(sent)
    tokens = clean_sent.split(" ")
    tokens = [t for t in tokens if t not in stop_words]
    sentences.append(tokens)
    token_count += len(tokens)

print(f"Tokens: {token_count}")
print(f"Sents : {len(sentences)}")

Tokens: 9385290
Sents : 1175980


In [5]:
sentences[:5]

[['0,7', 'prozent', 'stammen', 'zwei', 'mehr', 'ethnien', 'ab'],
 ['0',
  'bedeutet,',
  'strahlengang',
  'frei',
  'ist,',
  'füllstand',
  'grenze',
  'liegt'],
 ['0', 'kb', ',', 'video', 'icty', 'sitzung', '16'],
 ['juni', '2005,', 'http', 'hague'],
 ['0', 'kb', ',', 'video', 'icty', 'sitzung', '20']]

In [6]:
from nltk import ngrams

bigrams = []
unigrams = []

for sent in sentences:
    
    if len(sent) < 2:
        continue
    
    sent.insert(0, "<s>")
    sent.append("</s>")
    
    tmp_bigrams = list(ngrams(sent, 2))
    bigrams += tmp_bigrams
    
    unigrams += sent

print(bigrams[:10])

[('<s>', '0,7'), ('0,7', 'prozent'), ('prozent', 'stammen'), ('stammen', 'zwei'), ('zwei', 'mehr'), ('mehr', 'ethnien'), ('ethnien', 'ab'), ('ab', '</s>'), ('<s>', '0'), ('0', 'bedeutet,')]


In [7]:
from collections import Counter

bigram_counter = Counter(bigrams)
bigram_counter.most_common(25)[:10]

[(('s', '</s>'), 13137),
 (('wurde', '</s>'), 11074),
 (('<s>', 'seit'), 9323),
 (('<s>', '000'), 8889),
 (('<s>', 'jahr'), 8463),
 (('<s>', 'wurde'), 8356),
 (('1', '</s>'), 8054),
 (('<s>', 'jahrhundert'), 6896),
 (('<s>', 'januar'), 5952),
 (('<s>', 'dabei'), 5848)]

In [8]:
import math


def eval_model(model, sentence):
    tokens = sentence.split()
    bigrams = ngrams(tokens, 2)
    
    log_prob = 0
    for bigram in bigrams:
        t_1, t = bigram
        
        try:
            probability = model[t_1][t]
        except:
            print("Prob not found for", bigram)
            probability = 0.00001
        
        log_prob += math.log(probability)
    
    log_prob = math.exp(log_prob)
    print(f"Probability for '{sentence}' is: {log_prob}")
    print("\n\n")


eval_model(bigrams, "Es ist")


Prob not found for ('Es', 'ist')
Probability for 'Es ist' is: 9.999999999999997e-06





In [9]:
from collections import defaultdict

def kneser_ney_smoothing(ngrams, N, vocabulary_size):
    # Erstelle eine Liste von (N-1)-Grammen
    ngrams_count = defaultdict(int)
    for ngram in ngrams:
        ngrams_count[ngram[:-1]] += 1

    # Erstelle eine Liste von N-Grammen, die von jedem (N-1)-Gramm begonnen werden
    context_count = defaultdict(int)
    for ngram in ngrams:
        context_count[ngram[:-1]] += 1

    # Berechne die Kneser-Ney Schätzung der Wahrscheinlichkeit von jedem N-Gramm
    kneser_ney_probabilities = {}
    for ngram in ngrams:
        context = ngram[:-1]
        # Berechne die Anzahl der N-Gramme, die von dem gleichen (N-1)-Gramm begonnen werden
        following = context_count[context]
        # Berechne die Häufigkeit des (N-1)-Gramms
        count = ngrams_count[context]
        # Berechne die Schätzung der Wahrscheinlichkeit mit Kneser-Ney Smoothing
        probability = (max(count - N, 0) / count) + (N * following / count * vocabulary_size)
        kneser_ney_probabilities[ngram] = probability

    return kneser_ney_probabilities

In [10]:
bigramss = [("ich", "bin"), ("bin", "hier"), ("hier", "um"), ("um", "zu"), ("zu", "lernen"), ("ich", "bin"), ("ich", "bin"), ("ich", "bin")]
N = 2
vocabulary_size = 6

kneser_ney_probabilities = kneser_ney_smoothing(bigramss, N, vocabulary_size)
print(kneser_ney_probabilities)

{('ich', 'bin'): 12.5, ('bin', 'hier'): 12.0, ('hier', 'um'): 12.0, ('um', 'zu'): 12.0, ('zu', 'lernen'): 12.0}


In [11]:
kneser_ney_probabilities = kneser_ney_smoothing(bigrams, N, 9385290)
print(kneser_ney_probabilities[:10])

TypeError: unhashable type: 'slice'