In [5]:
from nltk import ngrams
import re
import math
from collections import defaultdict, Counter

In [2]:
# Read in data and clean
# each row is a sentence and the sentences are separated by a newline
with open("./data/wikipedia300K.txt", "r", encoding="utf-8") as inf:
    text = inf.read()
sentences = text.split("\n")
sentences_cleaned = []
for x in sentences:
    x = re.sub(r"[^a-zA-Z0-9äöüÄÖÜß\.,!\?]", " ", x)
    x = re.sub(' +', ' ', x)
    x = x.lower()
    sentences_cleaned.append(x)
sentences = sentences_cleaned

In [6]:
# Create bigrams and unigrams
# bigrams is a dictionary with the bigram as key and the count as value
# unigrams is a dictionary with the unigram as key and the count as value
# add <s> and </s> to each sentence
bigrams = []
unigrams = []
for sent in sentences:
    if len(sent) < 2:
        continue
    sent = "<s>" + sent
    sent = sent + "</s>"
    words = sent.split()
    tmp_bigrams = list(ngrams(words, 2))
    bigrams += tmp_bigrams
    unigrams += words

bigram_counter = Counter(bigrams)
bigrams = dict(bigram_counter)
unigram_counter = Counter(unigrams)
unigrams = dict(unigram_counter)

In [11]:
# Calculate the probability for each bigram
def kneser_ney_smooth(bigram_counts, unigram_counts, d=0.75):
    N1 = defaultdict(int)
    for w1, w2 in bigram_counts:
        N1[w1] += 1
    smoothed_probs = {}
    for w1, w2 in bigram_counts:
        c_w1_w2 = bigram_counts[(w1, w2)]
        c_w1 = unigram_counts[w1]
        smoothed_probs[(w1, w2)] = max(c_w1_w2 - d, 0) / c_w1 + (d * N1[w1] / c_w1)
    return smoothed_probs

smoothed_probs = kneser_ney_smooth(bigrams, unigrams)
count = 0
for key, value in smoothed_probs.items():
    print(key, value)
    count += 1
    if count == 10:
        break

('<s>', '102') 0.34856679286100595
('102', 'gefangene,') 0.6557377049180327
('gefangene,', 'die') 0.8928571428571428
('die', 'am') 0.2230343191845687
('am', 'tor') 0.149655811490601
('tor', 'von') 0.4403114186851211
('von', 'isin') 0.3085801001476999
('isin', 'gefangen') 0.8
('gefangen', 'gesetzt') 0.1818181818181818
('gesetzt', 'wurden,') 0.293859649122807


In [8]:
# Calculate the probability of a sentence
def calc_sentence_prob(sentence, bigram_prob): #Die Wahrscheinlichkeit für den Auftritt des Satzes wird berechnet
    sentence = re.sub(r"[^a-zA-Z0-9öäüÖÄÜß\?!-]", " ", sentence)#Satz bereinigen
    sentence = [t.strip().lower() for t in sentence.split(" ")]#Liste aus allen Wörtern erstellen und dabei alles in Kleinbuchstaben
    sentence.insert(0, "<s>") #Am Satzanfang <s> einfügen
    sentence.append("</s>")#Am Satzende </s> einfügen
    prob = 1#Prob mit Startwert 1
    for i in range(len(sentence)-1): #Über die Liste der Sätze iterieren
        bigram = (sentence[i], sentence[i+1])#Bigram bilden
        if bigram in bigram_prob:#Wenn das Bigram in der Liste der Bigramme vorhanden ist multipliziere mit der Auftrittswahrscheinlichkeit des Bigrammes
            prob *= bigram_prob[bigram]
    return prob #Prob zurückgeben

sentences = ["Für die Reise zum Brandenburger Tor braucht man einige Stunden", "Ich studiere Wirtschaftsinformatik an der Technischen Hoschule Mittelhessen in Friedberg", "Das Wetter ist heute nicht sehr schön.", "Ich habe vor kurzem ein interessantes Buch gelesen.", "Ich werde morgen früh joggen gehen."]

probs = []
for s in sentences:
    probs.append(calc_sentence_prob(s, smoothed_probs)) #Prob für alle Sätze mit dem vorher erstellten Kneser-Ney Bigramm Modell aufrufen

for i in range(5):
    print(f"Wahrscheinlichkeit für '{sentences[i]}': {probs[i]}") #Ergebnisse ausgeben

Wahrscheinlichkeit für 'Für die Reise zum Brandenburger Tor braucht man einige Stunden': 0.000140407844671374
Wahrscheinlichkeit für 'Ich studiere Wirtschaftsinformatik an der Technischen Hoschule Mittelhessen in Friedberg': 0.0034243397997157045
Wahrscheinlichkeit für 'Das Wetter ist heute nicht sehr schön.': 0.0001128991594542684
Wahrscheinlichkeit für 'Ich habe vor kurzem ein interessantes Buch gelesen.': 0.0010275204049260103
Wahrscheinlichkeit für 'Ich werde morgen früh joggen gehen.': 0.05818796525095558


In [9]:
def calc_perplexity(prob):#Funktion zur Berechnung der Perplexity
    if prob == 0:
        return float('inf')
    return 2**(-math.log2(prob))

perplex = []
for i in range(5):
    perplex.append(calc_perplexity(probs[i])) #Über die Sätze iterieren und die Perplexity berechnen

for i in range(5):
    print(f"Perplexity für '{sentences[i]}': {perplex[i]}") #Ergebnisse ausgeben

Perplexity für 'Für die Reise zum Brandenburger Tor braucht man einige Stunden': 7122.109183717691
Perplexity für 'Ich studiere Wirtschaftsinformatik an der Technischen Hoschule Mittelhessen in Friedberg': 292.0270938307648
Perplexity für 'Das Wetter ist heute nicht sehr schön.': 8857.461869811932
Perplexity für 'Ich habe vor kurzem ein interessantes Buch gelesen.': 973.2166828083651
Perplexity für 'Ich werde morgen früh joggen gehen.': 17.1856842851809
