In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

In [40]:
def relative_frequency(letter, df_bg):
    """Total the bigram counts in which ``letter`` is the first / second element.

    Despite the name, the values returned are absolute totals of the
    ``'count'`` column, not normalised frequencies.

    Parameters
    ----------
    letter : value compared against the ``'first'`` and ``'second'`` columns.
    df_bg : DataFrame with ``'first'``, ``'second'`` and ``'count'`` columns.

    Returns
    -------
    tuple
        ``(n_first, n_second)``: summed counts of the rows where ``letter``
        is the first element, and of the rows where it is the second element.
    """
    tally = df_bg['count']
    as_first = df_bg['first'] == letter
    as_second = df_bg['second'] == letter
    return tally[as_first].sum(), tally[as_second].sum()

In [75]:
class Letter:
    """One state of a bigram (first-order) random letter generator.

    ``followers`` maps every possible successor of this letter -- another
    letter name or the end-of-word marker ``'EOW'`` -- to the probability of
    moving to it.
    """

    def __init__(self, data, name, eps=1e-2):
        """Estimate the transition probabilities out of ``name``.

        Parameters
        ----------
        data : DataFrame with ``'first'``, ``'second'`` and ``'count'`` columns.
        name : str
            The letter this state represents.
        eps : float
            Smoothing factor: ``eps * min(n_second, n_first)`` is added to the
            end-of-word weight.
        """
        self.name = name
        self.followers = defaultdict(float)

        n_first, n_second = relative_frequency(name, data)

        if 'medial' in name:
            # Letters whose name contains 'medial' are never allowed to end a word.
            f_eos = 0.0
        else:
            # End-of-word weight: the surplus of occurrences as second element
            # over occurrences as first element (floored at 0), plus a small
            # smoothing term. Presumably the surplus stands for the times the
            # letter was the last one of a word -- confirm against the data.
            f_eos = max(0, (n_second - n_first)) + eps * min(n_second, n_first)

        # 'EOW' is inserted first so it is also the first candidate examined
        # by random_next().
        self.followers['EOW'] = f_eos
        total = n_first + f_eos

        outgoing = data[data['first'] == name]
        for follower, freq in zip(outgoing['second'], outgoing['count']):
            self.followers[follower] += freq

        print(f'{name}: {total}, {f_eos}')

        if total == 0:
            # No outgoing mass at all (no row with this letter as 'first' and
            # a zero end-of-word weight). Dividing would produce NaNs, so the
            # letter simply ends the word.
            self.followers['EOW'] = 1.0
        else:
            for follower in self.followers:
                self.followers[follower] /= total

    def random_next(self):
        """Draw a successor according to the probabilities in ``followers``.

        Returns the chosen key of ``followers`` (possibly ``'EOW'``), or
        ``None`` only when ``followers`` is empty.
        """
        rand_float = np.random.random()
        follower = None
        for follower in self.followers:
            rand_float -= self.followers[follower]
            if rand_float <= 0:
                return follower
        # The probabilities can add up to slightly less than 1 after floating
        # point normalisation; in that case fall back to the last candidate
        # instead of implicitly returning None.
        return follower
            
            
class EOW_Letter(Letter):
    """Word-boundary state: its followers are the letters that can start a word."""

    def __init__(self, data, name='EOW', eps=1e-2):
        """Estimate the probability of each letter being the first of a word.

        ``Letter.__init__`` is intentionally not called, because the weights
        are computed differently here.

        Parameters
        ----------
        data : DataFrame with ``'first'``, ``'second'`` and ``'count'`` columns.
        name : str
            Name stored on the instance (``'EOW'`` by default).
        eps : float
            Smoothing factor: ``eps * min(n_second, n_first)`` is added to
            every letter's start-of-word weight.
        """
        # Keep the attribute set consistent with Letter.
        self.name = name
        self.followers = defaultdict(float)

        total = 0
        # unique() yields the letters in order of first appearance, so the
        # follower order does not depend on set/hash ordering.
        for letter in data['first'].unique():
            n_first, n_second = relative_frequency(letter, data)
            # Start-of-word weight: surplus of occurrences as first element
            # over occurrences as second element (floored at 0), plus smoothing.
            weight = max(0, (n_first - n_second)) + eps * min(n_second, n_first)
            self.followers[letter] += weight
            total += weight

        # Skip normalisation when there is no mass at all, to avoid a
        # division by zero.
        if total != 0:
            for follower in self.followers:
                self.followers[follower] /= total
            

In [76]:
# Bigram table; the code in this notebook reads its 'first', 'second' and
# 'count' columns.
df_bigrams = pd.read_csv('../data/bigrams.csv')

In [98]:
# Every letter that occurs anywhere in a bigram, as first or as second element.
let_set = set(df_bigrams['first']) | set(df_bigrams['second'])

# One generator state per letter, plus the word-boundary state under 'EOW'.
letters = {'EOW': EOW_Letter(df_bigrams, name='EOW', eps=5e-2)}
for letter in let_set:
    letters[letter] = Letter(df_bigrams, letter, eps=5e-2)


Alef: 1867.4, 22.400000000000002
Yod: 1146.7, 1072.7
Waw: 663.5, 493.5
Het: 87.75, 0.75
Tet: 10.0, 0.0
Dalet: 515.3, 21.3
He: 689.6, 417.6
Taw: 746.25, 741.25
Pe: 91.2, 0.2
Tasdi-final: 29.0, 29.0
Bet: 552.5, 4.5
Nun-final: 646.0, 646.0
Mem: 770.0, 770.0
Qof: 43.9, 0.9
Mem-medial: 501.0, 0.0
Pe-final: 33.0, 33.0
Samekh: 5.0, 5.0
Resh: 102.45, 3.45
Tsadi: 6.2, 2.2
Lamed: 1801.25, 896.25
Kaf: 946.2, 1.2000000000000002
Gimel: 35.55, 0.55
Shin: 149.2, 3.2
Kaf-final: 102.0, 102.0
Nun-medial: 128.0, 0.0
Ayin: 1375.85, 1.85
Zayin: 50.35, 1.35


In [99]:
# Inspect the distribution over the letters that can follow the 'EOW' state.
letters['EOW'].followers

defaultdict(float,
            {'Alef': 0.26926433205600037,
             'Yod': 0.0007019008233106955,
             'Waw': 0.0016124748643624085,
             'Het': 0.013800887809690026,
             'Tet': 0.001897029252191069,
             'Dalet': 0.016940471222066246,
             'He': 0.002579959782979854,
             'Taw': 4.7425731304776726e-05,
             'Pe': 0.01654209507910612,
             'Bet': 0.08773760291383693,
             'Qof': 0.0049133057631748685,
             'Mem-medial': 0.09143680995560952,
             'Resh': 0.006345562848579126,
             'Tsadi': 3.794058504382138e-05,
             'Lamed': 0.008584057366164587,
             'Kaf': 0.17494403763706037,
             'Gimel': 0.004657206814129075,
             'Shin': 0.01616268922866791,
             'Nun-medial': 0.023380885533254923,
             'Ayin': 0.2539837614296012,
             'Zayin': 0.004429563303866146})

In [100]:
# Generate WORDS random words and tally how often each letter is drawn and
# how long the words are.
counts = defaultdict(int)
lengths = defaultdict(int)

# Pre-seed every known letter with 0 so that letters which are never drawn
# still show up in the report below.
counts.update(dict.fromkeys(let_set, 0))

print(len(let_set))

WORDS = 100000
for idx in range(WORDS):
    # Every word starts from the word-boundary state.
    current = word = 'EOW'
    nxt = ''
    length = 0
    while True:
        nxt = letters[current].random_next()
        word += f'_{nxt}'
        current = nxt
        if nxt == 'EOW':
            # Reaching the boundary state again closes the word.
            break
        counts[nxt] += 1
        length += 1
    lengths[length] += 1

for letter in counts:
    # Rescale the raw tally to occurrences per 100 generated words.
    counts[letter] /= WORDS / 100
    # Short names get an extra tab so the numbers line up.
    sep = ':\t' if len(letter) > 2 else ':\t\t'
    print(letter, counts[letter], sep=sep)
print(lengths)


27
Alef:	35.605
Yod:	21.56
Waw:	12.693
Het:	1.692
Tet:	0.182
Dalet:	9.706
He:		13.075
Taw:	14.18
Pe:		1.639
Tasdi-final:	0.56
Bet:	10.394
Nun-final:	12.266
Mem:	14.563
Qof:	0.833
Mem-medial:	9.427
Pe-final:	0.664
Samekh:	0.097
Resh:	1.932
Tsadi:	0.112
Lamed:	34.349
Kaf:	18.124
Gimel:	0.664
Shin:	2.872
Kaf-final:	1.959
Nun-medial:	2.455
Ayin:	26.038
Zayin:	1.001
defaultdict(<class 'int'>, {2: 66536, 4: 7825, 3: 20359, 6: 979, 1: 1399, 5: 2373, 7: 320, 9: 51, 8: 119, 10: 28, 11: 8, 12: 3})


In [56]:
# Probability that each letter is immediately followed by the end of the word.
for letter in let_set:
    print(f"{letter!s}: {letters[letter].followers['EOW']!s}")

Alef: 0.047619047619047616
Yod: 0.9352580927384077
Waw: 0.7404580152671756
Het: 0.04761904761904763
Tet: 0.047619047619047616
Dalet: 0.04761904761904762
He: 0.5976331360946746
Taw: 0.9932975871313673
Pe: 0.047619047619047616
Tasdi-final: 1.0
Bet: 0.04761904761904762
Nun-final: 1.0
Mem: 1.0
Qof: 0.047619047619047616
Mem-medial: 0.0
Pe-final: 1.0
Samekh: 1.0
Resh: 0.047619047619047616
Tsadi: 0.3333333333333333
Lamed: 0.484624145785877
Kaf: 0.047619047619047616
Gimel: 0.047619047619047616
Shin: 0.04761904761904762
Kaf-final: 1.0
Nun-medial: 0.0
Ayin: 0.047619047619047616
Zayin: 0.047619047619047616


In [49]:
# Spot check: summed counts with 'Samekh' as the first vs. the second element
# of a bigram.
relative_frequency('Samekh', df_bigrams)

(0, 5)