### ДЗ2  
### Выполнил: _Сидоров Иван_  
### Задание:  
* Сделать полноценные модели для биграмм и триграмм, которые могут оценивать вероятность текста.
* Посчитать perplexity нового текста для обоих моделей. Необходимо учесть случаи незнакомых слов и N-грамм.

In [1]:
import nltk
from nltk.corpus import reuters
from nltk.lm.preprocessing import pad_both_ends
from itertools import product
import math

nltk.download("reuters")
nltk.download("punkt")

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/ivsidorov/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ivsidorov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
class PLM(object):
    def __init__(self, train_data, n):
        self.n = n
        self.tokens = self._preprocess_corpus(train_data, n)
        self.vocab = nltk.FreqDist(self.tokens)
        self.model = {}
        # всевозможные вариации замены токенов в n-грамме на <UNK>
        self.masks = list(product((1, 0), repeat=self.n))

    def fit(self):
        n_grams = nltk.ngrams(self.tokens, self.n)
        vocab = nltk.FreqDist(n_grams)

        prev_grams = nltk.ngrams(self.tokens, self.n - 1)
        prev_vocab = nltk.FreqDist(prev_grams)

        for n_gram, count in vocab.items():
            # Add-1 estimate
            self.model[n_gram] = (count + 1) / (
                prev_vocab[n_gram[:-1]] + len(self.vocab)
            )

        return self

    @staticmethod
    def _pad_sentences(sentences, n):
        padded_sentences = []
        for sent in sentences:
            padded_sent = " ".join(pad_both_ends(sent.split(" "), n=n))
            padded_sentences.append(padded_sent)
        return padded_sentences

    @staticmethod
    def _replace_unknown(tokens):
        vocab = nltk.FreqDist(tokens)
        return [token if vocab[token] > 1 else "<UNK>" for token in tokens]

    @staticmethod
    def _preprocess_corpus(sentences, n, train=True):
        sentences = PLM._pad_sentences(sentences, n)
        sentence_tokens = ' '.join(sentences)
        tokens = sentence_tokens.split(' ')
        if train:
            tokens = PLM._replace_unknown(tokens)
        return tokens

    @staticmethod
    def _mask_unknown(ngram, mask):
        masked_ngram = []
        for token, flag in zip(ngram, mask):
            if flag == 1:
                masked_ngram.append(token)
            else:
                masked_ngram.append("<UNK>")

        return tuple(masked_ngram)

    def _find_oov(self, ngram):
        for mask in self.masks:
            masked_ngram = self._mask_unknown(ngram, mask)
            if masked_ngram in self.model:
                return masked_ngram

        raise ValueError("The n-gram is out of vocabulary.")

    def perplexity(self, test_data):
        test_tokens = self._preprocess_corpus(test_data, self.n, train=False)
        test_ngrams = nltk.ngrams(test_tokens, self.n)
        N = len(test_tokens)
        test_ngrams = (self._find_oov(ngram) for ngram in test_ngrams)
        probs = [self.model[ngram] for ngram in test_ngrams]

        return math.exp((-1 / N) * sum(map(math.log, probs)))

In [3]:
data = []
for tokenized_sentence in reuters.sents():
    data.append(" ".join(tokenized_sentence))

Биграммы:

In [11]:
model = PLM(data, 2)
model.fit()

print(
    "Correct sentence perplexity: ",
    model.perplexity(["SpaceX has launched a new rocket"]),
)
print(
    "Incorrect sentence perplexity: ", model.perplexity(["SpaceX has opened a new restaurant"])
)

Correct sentence perplexity:  306.7723993055946
Incorrect sentence perplexity:  351.6277015176996


Триграммы:

In [12]:
model = PLM(data, 3)
model.fit()

print(
    "Correct sentence perplexity: ",
    model.perplexity(["SpaceX has launched a new rocket"]),
)
print(
    "Incorrect sentence perplexity: ", model.perplexity(["SpaceX has opened a new restaurant"])
)

Correct sentence perplexity:  696.1575093777993
Incorrect sentence perplexity:  746.0573455241924


На триграммах перплексия повыше. Думаю, это происходит из-за небольшого объема обучающего корпуса.