# Module 3: Infer Language Models

## Efrain Olivares - dpy8wq

* DS5001
* Professor Sree
* Spring 2026

## Use the pandas method

In [1]:
import numpy as np
import pandas as pd

## Use the Polynesian character set (6 letters total)

In [2]:
# The six letters of the Polynesian alphabet; every message is built from these
letters = [*"ptkaiu"]

## Generate all possible messages of length 7

In [3]:
# Every message (book) is exactly L characters long
L = 7

# One copy of the alphabet per character position. Their Cartesian product
# enumerates every length-L sequence (letters may repeat), one row per sequence
# and one column per position.
X = [letters] * L
library = pd.DataFrame(index=pd.MultiIndex.from_product(X)).reset_index()

# Glue the per-position characters of each row together to form the book string
library["book"] = library.astype(str).apply("".join, axis=1)

## Number of messages (books)

In [4]:
# Sanity check: an alphabet of k characters yields k**L distinct messages of length L
assert len(library) == pow(len(letters), L)
print(f"Number of messages = {len(library)}")

Number of messages = 279936


## Let's peek at the first 10 books

In [5]:
# First ten books, in the order the Cartesian product generated them
library["book"].head(10)

0    ppppppp
1    ppppppt
2    ppppppk
3    ppppppa
4    ppppppi
5    ppppppu
6    ppppptp
7    ppppptt
8    ppppptk
9    pppppta
Name: book, dtype: object

## Calculate the probability of one book

In [6]:
# Under a uniform distribution over the library, each book has probability 1/N,
# where N is the number of books
p_book = 1.0 / len(library)
print(f"The probability of a book is 1/len_library: {p_book}")

The probability of a book is 1/len_library: 3.5722450845907635e-06


## Create text by pulling messages separated by spaces

In [7]:
# Weights from the Schütze & Manning example:
# p=1/8, t=1/4, k=1/8, a=1/4, i=1/8, u=1/8
weights = pd.Series(
    [1/8, 1/4, 1/8, 1/4, 1/8, 1/8],
    index=["p", "t", "k", "a", "i", "u"],
)

# Two-column lookup table (character, sampling weight) with a fresh integer index
df_alphabet = weights.rename_axis("char").reset_index(name="weight")
df_alphabet

Unnamed: 0,char,weight
0,p,0.125
1,t,0.25
2,k,0.125
3,a,0.25
4,i,0.125
5,u,0.125


## Create functions to sample a word and generate text (words separated by space)

In [8]:
def sample_word_pandas(alphabet, word_len=7, random_state=None):
    """Sample one word by drawing characters from a weighted alphabet.

    Parameters
    ----------
    alphabet : pandas.DataFrame
        One row per character, with a "char" column holding the character and
        a "weight" column holding its sampling weight.
    word_len : int, default 7
        Number of characters to draw.
    random_state : int, numpy.random.Generator, numpy.random.RandomState or None
        Forwarded unchanged to ``DataFrame.sample``. ``None`` (the default)
        keeps the original unseeded behaviour; pass a seed or a generator to
        get a reproducible word.

    Returns
    -------
    str
        The drawn characters concatenated in draw order.
    """
    drawn = alphabet.sample(
        n=word_len,
        replace=True,  # a character may occur more than once in a word
        weights="weight",
        random_state=random_state,
    )
    return drawn["char"].str.cat()


def generate_text(alphabet, n_words, word_len=7, random_state=None):
    """Generate ``n_words`` sampled words joined by single spaces.

    Parameters
    ----------
    alphabet : pandas.DataFrame
        Weighted alphabet with "char" and "weight" columns, passed through to
        ``sample_word_pandas``.
    n_words : int
        Number of words to generate; 0 yields an empty string.
    word_len : int, default 7
        Number of characters in every word.
    random_state : int, numpy.random.SeedSequence, numpy.random.BitGenerator,
        numpy.random.Generator or None
        Seed for a reproducible text. ``None`` (the default) keeps the
        original unseeded behaviour.

    Returns
    -------
    str
        The words separated by a single space.
    """
    rng = None
    if random_state is not None:
        # One generator is shared and advanced across all words. Handing the
        # same integer seed to every draw would repeat one word n_words times.
        rng = np.random.default_rng(random_state)
    return " ".join(
        sample_word_pandas(alphabet, word_len, random_state=rng)
        for _ in range(n_words)
    )

## Generate a text of 100 words

In [9]:
# Size of the sample text, in words
n_words = 100

# Each word has the same length L as the books in the library.
# No random seed is passed here, so the sampled text differs from run to run.
text = generate_text(df_alphabet, n_words=n_words, word_len=L)

# Preview the first 200 characters only
text[:200]

'autpiut itatpit aaipita kuaktit ituiktt aatpiku pakappa piiptkt kpttitt uiatutk aakkapk atauauu ttutitt atakkuk uuiukap kattutt kapiitt titpaki tttpiat tutttap atuatkp iipautk tpaapat aatiptu ttukaau '

## Calculate entropy

In [10]:
import math

# Entropy of the uniform distribution over the library. Every book has the same
# probability p_book, so the sum over books of p * log(1/p) is written here as
# p * log(1/p) * (number of books).
# H is measured in base len(letters); H2 is measured in base 2 (bits).
H, H2 = (
    p_book * math.log(1 / p_book, base) * len(library)
    for base in (len(letters), 2)
)
H, H2

(7.0, 18.094737505048094)

## Round Trip: Compare the given and observed character frequencies

In [13]:
# Drop the word separators so only alphabet characters are counted
chars = pd.Series([ch for ch in text if ch != " "])

# Relative frequency of each character in the generated text, sorted by character
observed = chars.value_counts(normalize=True).sort_index()

# The sampling weights in the same sorted order, for a side-by-side comparison
given = weights.sort_index()

# Characters absent from either side show up as 0 instead of NaN
pd.DataFrame({"observed": observed, "given": given}).fillna(0)


Unnamed: 0,observed,given
a,0.241429,0.25
i,0.121429,0.125
k,0.135714,0.125
p,0.127143,0.125
t,0.232857,0.25
u,0.141429,0.125
