<a href="https://colab.research.google.com/github/MSimonFRA-UAS/JuniorUni/blob/main/BigramSprachModell.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
from typing import List


class BigramSprachModell:
    """
    Bigram language model backed by plain count dictionaries.

    Attributes:
        bigram_anz: Dict mapping a context (previous) word to a dict of counts of the words seen directly
        after it, e.g. bigram_anz["of"] could be {"the": 125, "my": 23, ...}
        vorh_wort_anz: Dict mapping a word to the number of times it appeared as the "previous" (context) word
        unigram_anz: Dict mapping a word to the number of times it appeared as the "current" word
        total_unigram_anz: Sum of all counts stored in unigram_anz
        use_multiplicative: Flag set to True on construction; it is not read anywhere inside this class
    """
    def __init__(self, bigram_anz: dict, vorh_wort_anz: dict, unigram_anz: dict):
        """
        Stores the count tables and precomputes the total unigram count
        :param bigram_anz: context word -> {next word -> count}
        :param vorh_wort_anz: context word -> count
        :param unigram_anz: word -> count
        """
        self.bigram_anz = bigram_anz
        self.vorh_wort_anz = vorh_wort_anz
        self.unigram_anz = unigram_anz
        self.total_unigram_anz = sum(unigram_anz.values())
        self.use_multiplicative = True

    def get_vokabular(self):
        """
        :return: A view of the keys of unigram_anz, i.e. every word that has a unigram count
        """
        return self.unigram_anz.keys()

    def _get_unigram_wkt(self, wort: str) -> float:
        """
        Helper method to calculate the unigram probability of the given word
        :param wort: The word to get the unigram probability for
        :return: count(wort) / total unigram count, or 0.0 if the word has no count or the model is empty
        """
        # Guard against an empty model so the division below cannot fail.
        if self.total_unigram_anz == 0:
            return 0.0
        return float(self.unigram_anz.get(wort, 0)) / self.total_unigram_anz

    def get_wkt(self, vorh_wort: str, wort: str) -> float:
        """
        Computes the probability P(wort | vorh_wort)
        :param vorh_wort: the previous word
        :param wort: the next word (candidate) to score
        :return: The float bigram probability of wort given vorh_wort; 0.0 if the pair or the context was never counted
        """
        kontext_anz = self.vorh_wort_anz.get(vorh_wort, 0)
        # An unseen context is treated like an unseen next word: probability 0.0 instead of a KeyError.
        if kontext_anz == 0:
            return 0.0
        naechstes_wort_in_kontext_anz = self.bigram_anz.get(vorh_wort, {}).get(wort, 0)
        return naechstes_wort_in_kontext_anz / kontext_anz


def estimate_bigram_lm(train_seqs: List[List[str]]) -> BigramSprachModell:
    """
    Counts adjacent word pairs in the training sequences and builds a BigramSprachModell from the counts
    :param train_seqs: A list of sequences, each sequence being a list of string words
    :return: A BigramSprachModell constructed from the bigram, context-word and current-word counts
    """
    folge_zaehler = {}
    # Context counts and current-word counts are kept apart: the first element of a sequence is only
    # ever a context word, and the last element is only ever a current word.
    kontext_zaehler = {}
    wort_zaehler = {}
    for satz in train_seqs:
        # Walk over every (previous word, current word) pair of neighbours in the sequence.
        for links, rechts in zip(satz, satz[1:]):
            nachfolger = folge_zaehler.setdefault(links, {})
            nachfolger[rechts] = nachfolger.get(rechts, 0) + 1
            kontext_zaehler[links] = kontext_zaehler.get(links, 0) + 1
            wort_zaehler[rechts] = wort_zaehler.get(rechts, 0) + 1
    return BigramSprachModell(folge_zaehler, kontext_zaehler, wort_zaehler)

In [2]:
import numpy as np
from collections import Counter


# Marker tokens that read_wikitext puts at the start and at the end of every line it returns.
BEGIN_SYMBOL = "<S>"
END_SYMBOL = "</S>"


def read_wikitext(path: str) -> List[List[str]]:
    """
    Reads a Wikitext file at the given path.

    Lines that are empty after stripping whitespace are skipped. Every other line is split on single
    spaces, empty pieces are dropped, and the remaining words are wrapped in BEGIN_SYMBOL / END_SYMBOL.
    :param path: The string path of the file to read
    :return: A nested List[List[str]]: The first List is of lines, and the second List is of string words on that line
    """
    print("Started reading from file " + path)
    lines = []
    # The context manager closes the file even if reading fails; the explicit encoding keeps decoding
    # independent of the platform default.
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Skip lines that contain nothing but whitespace
            if not line.strip():
                continue
            stripped_words = (piece.strip() for piece in line.split(" "))
            words = [word for word in stripped_words if word]
            lines.append([BEGIN_SYMBOL] + words + [END_SYMBOL])
    print("Read %i lines" % len(lines))
    return lines

In [18]:
import urllib.request
# Base URL under which the raw corpus files are fetched. Previously both downloads requested the very
# same URL, which named no file at all, so the two local files could not hold the two different corpora.
# NOTE(review): assumes wiki.train.tokens and wiki.valid.tokens sit in the root of the main branch - confirm.
url = 'https://raw.githubusercontent.com/MSimonFRA-UAS/JuniorUni/main'
urllib.request.urlretrieve(url + '/wiki.train.tokens', 'wiki.train.tokens')
urllib.request.urlretrieve(url + '/wiki.valid.tokens', 'wiki.valid.tokens')

('wiki.valid.tokens', <http.client.HTTPMessage at 0x7e3409bf69e0>)

In [7]:
def read_data():
    """
    Reads the two corpus files from the current working directory.
    :return: A tuple with the parsed contents of "wiki.train.tokens" first and "wiki.valid.tokens" second
    """
    train_lines = read_wikitext("wiki.train.tokens")
    valid_lines = read_wikitext("wiki.valid.tokens")
    return train_lines, valid_lines

In [14]:
# Unpack the pair returned by read_data() into its first and second element.
train, test = read_data()


Started reading from file wiki.train.tokens
Read 23767 lines
Started reading from file wiki.valid.tokens
Read 2461 lines


In [15]:
# Build the bigram model from the sequences held in `train`.
lm = estimate_bigram_lm(train)