In [None]:
# imports
import re

In [None]:
# read data
with open("../Data/Wikipedia1M/Wikipedia1M.txt", "r", encoding="utf-8") as file:
    text = file.read()

In [None]:
# clean text

def clean_text(text):
    text = re.sub(r'[^a-zA-Z0-9äöüÄÖÜß.,!?]', ' ', text) # remove all special characters
    text = re.sub(r' +', ' ', text) # remove multiple spaces
    text = re.sub(r'[!?]', '.', text) # replace ! and ? with .
    text = re.sub(r'\.+', '.', text) # remove multiple dots
    text = text.strip() # remove leading and trailing spaces
    text = text.lower()
    return text

In [None]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('german'))

# split text into sentences
sentences = []

token_count = 0
for sent in text.split("."):
    clean_sent = clean_text(sent)
    tokens = clean_sent.split(" ")
    # tokens = [t for t in tokens if t not in stop_words]
    sentences.append(tokens)
    token_count += len(tokens)

print(f"Tokens: {token_count}")
print(f"Sents : {len(sentences)}")

In [None]:
sentences[:5]

In [None]:
from nltk import ngrams

bigrams = []
unigrams = []

for sent in sentences:
    
    if len(sent) < 2:
        continue
    
    sent.insert(0, "<s>")
    sent.append("</s>")
    
    tmp_bigrams = list(ngrams(sent, 2))
    bigrams += tmp_bigrams
    
    unigrams += sent

print(bigrams[:10])

In [None]:
from collections import Counter

bigram_counter = Counter(bigrams)
print(bigram_counter.most_common(5), end="\n")

unigram_counter = Counter(unigrams)
print(unigram_counter.most_common(5))

len(unigram_counter)

In [3]:
def kneser_ney_smoothing(ngrams, delta):
    # Calculate the counts of all ngrams
    ngram_counts = {}
    for ngram in ngrams:
        if ngram in ngram_counts:
            ngram_counts[ngram] += 1
        else:
            ngram_counts[ngram] = 1
    # Calculate the total number of ngrams
    N = sum(ngram_counts.values())
    # Calculate the probability of each ngram
    ngram_probs = {}
    for ngram, count in ngram_counts.items():
        # Split the ngram into the prefix and the last word
        prefix = ngram[:-1]
        last_word = ngram[-1]
        # Calculate the probability of the continuation
        if prefix in ngram_counts:
            p_cont = ngram_counts[prefix] / N
        else:
            p_cont = 0
        # Calculate the probability of the ngram using the Kneser-Ney formula
        prob = (count - delta) / N + (delta * p_cont)
        ngram_probs[ngram] = prob
    return ngram_probs

# Example usage
ngrams = ["the cat", "cat sat", "sat on", "on the", "the mat"]
probs = kneser_ney_smoothing(ngrams, 0.75)
print(probs)
# Output: {'the cat': 0.3125, 'cat sat': 0.3125, 'sat on': 0.3125, 'on the': 0.3125, 'the mat': 0.3125}


{'the cat': 0.05, 'cat sat': 0.05, 'sat on': 0.05, 'on the': 0.05, 'the mat': 0.05}


In [None]:

bigramss = [("ich", "bin"), ("bin", "hier"), ("hier", "um"), ("um", "zu"), ("zu", "lernen"), ("ich", "bin"), ("ich", "bin"), ("ich", "bin")]

smoothed = kneser_ney_smoothing(bigrams[:10], 0.75, 6)
# smoothed = kneser_ney_smoothing(bigrams, 0.75, len(unigram_counter))

In [None]:
smoothed

In [None]:
# discount = 0.75
# bigram_prob = {}
# for bigram, freq in bigram_counter.items():
#     w1, w2 = bigram
#     if w1 in bigram_prob:
#         bigram_prob[bigram] = (freq - discount) / unigram_counter[w1]
#     else:
#         bigram_prob[bigram] = freq / sum(unigram_counter.values())

In [None]:
import math


def eval_model(model, sentence):
    tokens = sentence.split()
    bigrams = ngrams(tokens, 2)
    
    log_prob = 0
    for bigram in bigrams:
        t_1, t = bigram
        
        try:
            probability = model[t_1][t]
        except:
            print("Prob not found for", bigram)
            probability = 0.00001
        
        log_prob += math.log(probability)
    
    log_prob = math.exp(log_prob)
    print(f"Probability for '{sentence}' is: {log_prob}")
    print("\n\n")


eval_model(bigrams, "Es ist")


In [None]:
bigramss = [("ich", "bin"), ("bin", "hier"), ("hier", "um"), ("um", "zu"), ("zu", "lernen"), ("ich", "bin"), ("ich", "bin"), ("ich", "bin")]
N = 2
vocabulary_size = 6

kneser_ney_probabilities = kneser_ney_smoothing(bigramss, N, vocabulary_size)
print(kneser_ney_probabilities)

In [None]:
kneser_ney_probabilities = kneser_ney_smoothing(bigrams, N, 9385290)
print(kneser_ney_probabilities[:10])