# Assignment #1: PFL067 Statistical NLP

## Exploring Entropy and Language Modeling

---

### 1. Entropy of a Text

In this experiment, you will determine the conditional entropy of the word distribution in a text given the previous word. To do this, you will first have to compute P(i,j), which is the probability that at any position in the text you will find the word i followed immediately by the word j, and P(j|i), which is the probability that if word i occurs in the text then word j will follow. Given these probabilities, the conditional entropy of the word distribution in a text given the previous word can then be computed as:

$$H(J|I) = -\sum_{i \in I, j \in J} P(i,j) \log_2 P(j|i)$$

The perplexity is then computed simply as

$$P_X(P(J|I)) = 2^{H(J|I)}$$

Compute this conditional entropy and perplexity for `TEXTEN1.txt`.

This file has every word on a separate line. (Punctuation is considered a word, as in many other cases.) The i,j above will also span sentence boundaries, where i is the last word of one sentence and j is the first word of the following sentence (but obviously, there will be a full stop at the end of most sentences).

---

In [1]:
import pandas as pd
import numpy as np
import collections as c
import math
import random
from numpy.random import RandomState

In [2]:
# Seed both Python's `random` module and NumPy's global generator with the
# same fixed value.
random.seed(200)
np.random.seed(200)

In [3]:
# Relative paths of the two corpora that the cells below pass to open_text().
english = './TEXTEN1.txt'
czech = './TEXTCZ1.txt'

In [4]:
def open_text(filename):
    """Load a text file into a one-column dataframe, one row per line.

    The file is decoded as ISO-8859-2. Every line has its surrounding
    whitespace (including the newline) stripped and is lower-cased before it
    is stored in the ``words`` column.
    """
    def normalise(raw_line):
        return raw_line.strip().lower()

    with open(filename, encoding='iso-8859-2') as handle:
        raw_lines = handle.readlines()

    frame = pd.DataFrame(raw_lines, columns=['words'])
    frame['words'] = frame['words'].apply(normalise)

    return frame

In [5]:
def process_ngrams(text):
    """Add ``bigrams`` and ``trigrams`` tuple columns to ``text`` and return it.

    Row ``k`` receives ``(word[k-1], word[k])`` and
    ``(word[k-2], word[k-1], word[k])``. Positions before the start of the
    text are padded with ``'<s>'`` (one back) and ``'<ss>'`` (two back).
    The dataframe passed in is modified in place.
    """
    current = text['words']
    one_back = current.shift(1).fillna('<s>')
    two_back = one_back.shift(1).fillna('<ss>')

    text['bigrams'] = [pair for pair in zip(one_back, current)]
    text['trigrams'] = [triple for triple in zip(two_back, one_back, current)]

    return text

In [6]:
# Display the first five rows of the English text with its n-gram columns.
process_ngrams(open_text(english))[:5]

Unnamed: 0,words,bigrams,trigrams
0,when,"(<s>, when)","(<ss>, <s>, when)"
1,on,"(when, on)","(<s>, when, on)"
2,board,"(on, board)","(when, on, board)"
3,h,"(board, h)","(on, board, h)"
4,.,"(h, .)","(board, h, .)"


In [7]:
# Display the first five rows of the Czech text with its n-gram columns.
process_ngrams(open_text(czech))[:5]

Unnamed: 0,words,bigrams,trigrams
0,v,"(<s>, v)","(<ss>, <s>, v)"
1,.,"(v, .)","(<s>, v, .)"
2,laštůvka,"(., laštůvka)","(v, ., laštůvka)"
3,:,"(laštůvka, :)","(., laštůvka, :)"
4,ať,"(:, ať)","(laštůvka, :, ať)"


In [206]:
def language_model(text):
    """Build unigram and bigram count tables from a dataframe.

    ``text`` must have a ``words`` column and a ``bigrams`` column. Returns
    ``(unigram_model, bigram_model)``; each model is a 4-tuple of
    ``(items, counts, total, sorted_distinct_items)``.
    """
    # One extra '<s>' is counted as a word -- presumably so the start padding
    # used as the first bigram's history has a unigram count.
    words = list(text.words)
    words.append('<s>')
    word_counts = c.Counter(words)
    num_words = len(words)
    vocabulary = sorted(word_counts)

    bigrams = list(text.bigrams)
    bigram_counts = c.Counter(bigrams)
    num_bigrams = len(bigrams)
    bigram_vocabulary = sorted(bigram_counts)

    return (
        (words, word_counts, num_words, vocabulary),
        (bigrams, bigram_counts, num_bigrams, bigram_vocabulary),
    )

In [9]:
def Pword(unigram_model, W='', alpha=0.7):
    """Return the additively smoothed unigram probability of ``W``.

    ``alpha`` is added to the count of ``W``, and ``alpha`` times the
    vocabulary size is added to the total word count.
    """
    _words, counts, total, vocab = unigram_model
    numerator = counts[W] + alpha
    denominator = total + alpha * len(vocab)
    return numerator / denominator

In [10]:
def Pbigram(bigram_model, W='', Wprev='', alpha=0.7):
    """Return the additively smoothed joint probability of ``(Wprev, W)``.

    ``alpha`` is added to the bigram's count, and ``alpha`` times the number
    of distinct bigrams is added to the total bigram count.
    """
    _bigrams, counts, total, distinct = bigram_model
    numerator = counts[(Wprev, W)] + alpha
    denominator = total + alpha * len(distinct)
    return numerator / denominator

In [11]:
# Conditional probability from the joint: P(A|B) = P(A,B) / P(B)
def Pwprev(models, W='', Wprev='', alpha=0.7):
    """Estimate P(W | Wprev), the probability that ``W`` follows ``Wprev``.

    ``models`` is the ``(unigram_model, bigram_model)`` pair. The result is
    the smoothed joint probability of the bigram divided by the smoothed
    unigram probability of ``Wprev``, both using the same ``alpha``.
    """
    unigram_model, bigram_model = models
    joint = Pbigram(bigram_model, W=W, Wprev=Wprev, alpha=alpha)
    history = Pword(unigram_model, W=Wprev, alpha=alpha)
    return joint / history

In [230]:
def entropy(models, bigrams, alpha=0.7):
    """Return the conditional entropy H(J|I) in bits over ``bigrams``.

    ``bigrams`` is an iterable of ``(Wprev, W)`` pairs. Each pair contributes
    ``P(Wprev, W) * log2 P(W | Wprev)``; the negated sum is returned.
    """
    _, bigram_model = models

    def weighted_log(history, word):
        joint = Pbigram(bigram_model, W=word, Wprev=history, alpha=alpha)
        conditional = Pwprev(models, W=word, Wprev=history, alpha=alpha)
        return joint * math.log(conditional, 2)

    return - sum(weighted_log(history, word) for history, word in bigrams)

In [40]:
def perplexity(models, bigrams, alpha=0.7):
    """Return the perplexity over ``bigrams``: two raised to their entropy."""
    bits = entropy(models, bigrams, alpha=alpha)
    return 2 ** bits

In [14]:
def text_stats(text, name='', alpha=1e-5):
    """Summarise a text as a one-row dataframe of corpus statistics.

    Args:
        text: dataframe with a ``words`` column. It is passed through
            ``process_ngrams``, so it gains n-gram columns in place.
        name: label stored in the ``language`` column.
        alpha: smoothing constant forwarded to ``entropy``.

    Returns:
        A one-row dataframe with the columns ``language``, ``word_count``,
        ``char_count``, ``most_frequent_words`` (the ten most common words
        with their counts), ``num_words_freq_1``, ``entropy`` and
        ``perplexity``. ``word_count`` and ``char_count`` are taken from the
        word list built by ``language_model``, so they include the one
        ``'<s>'`` token it appends.
    """
    process_ngrams(text)
    models = language_model(text)
    unigram_model, bigram_model = models
    words, word_counts, num_words, _ = unigram_model
    bigram_vocabulary = bigram_model[3]

    # The entropy pass over every distinct bigram dominates the cost, so run
    # it once and derive the perplexity from the result instead of
    # recomputing it.
    H = entropy(models, bigram_vocabulary, alpha=alpha)

    row = [
        name,
        num_words,
        sum(len(word) for word in words),
        word_counts.most_common(10),
        sum(1 for count in word_counts.values() if count == 1),
        H,
        2 ** H,
    ]
    columns = [
        'language', 'word_count', 'char_count', 'most_frequent_words',
        'num_words_freq_1', 'entropy', 'perplexity'
    ]

    return pd.DataFrame([row], columns=columns)

In [15]:
# One-row summary for the English text, using text_stats' default alpha.
text_stats(open_text(english), name='english')

Unnamed: 0,language,word_count,char_count,most_frequent_words,num_words_freq_1,entropy,perplexity
0,english,221098,972917,"[(,, 14721), (the, 13949), (of, 9400), (., 564...",3165,5.332007,40.280431


In [16]:
# One-row summary for the Czech text, using text_stats' default alpha.
text_stats(open_text(czech), name='czech')

Unnamed: 0,language,word_count,char_count,most_frequent_words,num_words_freq_1,entropy,perplexity
0,czech,222412,1030631,"[(,, 13788), (., 12931), (a, 4768), (v, 4653),...",22715,4.909983,30.064369


Next, you will mess up the text and measure how this alters the conditional entropy. For every character in the text, mess it up with a likelihood of 10%. If a character is chosen to be messed up, map it into a randomly chosen character from the set of characters that appear in the text. Since there is some randomness to the outcome of the experiment, run the experiment 10 times, each time measuring the conditional entropy of the resulting text, and give the min, max, and average entropy from these experiments. Be sure to use srand to reset the random number generator seed each time you run it. Also, be sure each time you are messing up the original text, and not a previously messed up text. Do the same experiment for mess up likelihoods of 5%, 1%, .1%, .01%, and .001%.

In [17]:
def charset(words):
    """Return the sorted list of distinct characters occurring in ``words``."""
    seen = set()
    for word in words:
        seen.update(word)
    return sorted(seen)

In [18]:
# First ten entries of the sorted character inventory of the English text.
charset(open_text(english).words)[:10]

['!', '"', '&', "'", '(', ')', ',', '.', '/', '0']

In [19]:
def vocab_list(words):
    """Return the sorted list of distinct entries in ``words``."""
    distinct = set()
    for word in words:
        distinct.add(word)
    return sorted(distinct)

In [20]:
# First ten entries of the sorted vocabulary of the English text.
vocab_list(open_text(english).words)[:10]

['"', '&', '&c', '&e', '(', ')', ',', '.', '000', '1']

In [21]:
def perturb_char(word, charset, prob=0.1):
    """Return ``word`` with each character replaced with probability ``prob``.

    A replaced character is drawn with ``np.random.choice`` from ``charset``.
    Draws come from NumPy's global random state.
    """
    pieces = []
    for char in word:
        # The uniform draw comes first; a replacement is drawn only on a hit,
        # which fixes the order in which random numbers are consumed.
        if np.random.random() < prob:
            pieces.append(np.random.choice(charset))
        else:
            pieces.append(char)
    return ''.join(pieces)

In [22]:
def perturb_word(word, vocabulary, prob=0.1):
    """Return ``word``, or with probability ``prob`` a draw from ``vocabulary``.

    The replacement is drawn with ``np.random.choice``. Draws come from
    NumPy's global random state.
    """
    if np.random.random() < prob:
        return np.random.choice(vocabulary)
    return word

In [23]:
def perturb(words, charset, vocabulary, prob=0.1):
    """Return character-level and word-level perturbed copies of ``words``.

    ``words`` is processed element-wise via ``.apply``. The whole
    character-level pass runs first and the word-level pass second, both on
    the same unmodified input. Returns ``(pchars, pwords)``.
    """
    def scramble_chars(word):
        return perturb_char(word, charset, prob=prob)

    def swap_word(word):
        return perturb_word(word, vocabulary, prob=prob)

    pchars = words.apply(scramble_chars)
    pwords = words.apply(swap_word)

    return pchars, pwords

In [76]:
def perturb_text(text, seed=200, probs=(0, 0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1)):
    """Perturb a text at several probabilities, one column per probability.

    Args:
        text: dataframe with a ``words`` column; it is not modified.
        seed: value used to re-seed NumPy's global generator once, before
            any perturbation is drawn.
        probs: perturbation probabilities to run, in order. Defaults to the
            set previously hard-coded here.

    Returns:
        ``(text_chars, text_words)``: two dataframes holding the
        character-level and word-level perturbed texts. Each column is
        labelled ``str(prob)``.
    """
    np.random.seed(seed)

    chars = charset(text.words)
    vocab = vocab_list(text.words)

    text_chars = pd.DataFrame()
    text_words = pd.DataFrame()

    for prob in probs:
        # Every probability perturbs the original words, never a column that
        # was already perturbed in an earlier iteration.
        text_chars[str(prob)], text_words[str(prob)] = perturb(text.words, chars, vocab, prob=prob)

    return text_chars, text_words

In [77]:
def perturbed_text_stats(text, alpha=1e-5):
    """Return ``[entropy, perplexity]`` for a dataframe of words.

    ``text`` needs a ``words`` column; n-gram columns are added to it in
    place. The entropy is taken over the distinct bigrams of the text, and
    the perplexity is two raised to that entropy.
    """
    process_ngrams(text)
    models = language_model(text)
    _, (_, _, _, distinct_bigrams) = models

    bits = entropy(models, distinct_bigrams, alpha=alpha)

    return [bits, 2 ** bits]

In [78]:
def all_stats(filename):
    """Run the perturbation experiment on one file.

    Reads the file, perturbs it, and measures every perturbed variant.
    Returns ``(char_stats, word_stats)``: two dataframes with the columns
    ``prob``, ``entropy`` and ``perplexity``, one row per variant.
    """
    def summarise(variants):
        table = pd.DataFrame(columns=['prob', 'entropy', 'perplexity'])
        for label in variants:
            # Each column label is the perturbation probability as a string.
            frame = pd.DataFrame(list(variants[label]), columns=['words'])
            table.loc[len(table)] = [float(label)] + perturbed_text_stats(frame)
        return table

    text = open_text(filename)
    text_chars, text_words = perturb_text(text)

    char_stats = summarise(text_chars)
    word_stats = summarise(text_words)

    return char_stats, word_stats

In [83]:
# Run the perturbation experiment on each corpus; all_stats returns the
# character-level table first and the word-level table second.
char_stats_en, word_stats_en = all_stats(english)
char_stats_cz, word_stats_cz = all_stats(czech)

In [84]:
char_stats_en

Unnamed: 0,prob,entropy,perplexity
0,0.0,5.332007,40.280431
1,1e-05,5.331978,40.279614
2,0.0001,5.331495,40.266136
3,0.001,5.328537,40.183652
4,0.01,5.301017,39.42441
5,0.05,5.14134,35.293738
6,0.1,4.842314,28.686777


In [85]:
word_stats_en

Unnamed: 0,prob,entropy,perplexity
0,0.0,5.332007,40.280431
1,1e-05,5.332009,40.280477
2,0.0001,5.332307,40.288794
3,0.001,5.334793,40.35829
4,0.01,5.354069,40.901147
5,0.05,5.435002,43.26121
6,0.1,5.520615,45.906147


In [86]:
char_stats_cz

Unnamed: 0,prob,entropy,perplexity
0,0.0,4.909983,30.064369
1,1e-05,4.909836,30.061314
2,0.0001,4.908833,30.040426
3,0.001,4.900083,29.858771
4,0.01,4.817114,28.190051
5,0.05,4.493063,22.518876
6,0.1,4.15163,17.773175


In [87]:
word_stats_cz

Unnamed: 0,prob,entropy,perplexity
0,0.0,4.909983,30.064369
1,1e-05,4.909961,30.063913
2,0.0001,4.909977,30.064247
3,0.001,4.909434,30.052935
4,0.01,4.900151,29.860191
5,0.05,4.862496,29.090894
6,0.1,4.795332,27.76762
