In [26]:
######################################
############# MARKOV CHAIN ###################

import random as rd
import itertools as it
# We want to merge all the text into a single stream of printable characters.

#To do this we will use itertools.chain.from_iterable to combine lines into a single stream.

l = [['hello', 'world'],['nice', 'day']]
l1 = it.chain.from_iterable(l)
for el in l1:
    print(el) ## prints the single elements of a list ( so also strings)

hello
world
nice
day


In [27]:
from collections import Counter
# We will use the Counter class to count the number of occurrences of each character in the text.
# The Counter class is a subclass of dict that is used to count hashable objects.

file = './great_expectations_cleaned.txt'
with open(file, 'r', encoding='utf8') as text:
    lines = (line.lower() for line in text)
    characters = it.chain.from_iterable(lines)
    result = list(characters)
    counts = Counter(result)

print(result[:30])
print (counts)
 


['\n', '\n', 'm', 'y', ' ', 'f', 'a', 't', 'h', 'e', 'r', '’', 's', ' ', 'f', 'a', 'm', 'i', 'l', 'y', ' ', 'n', 'a', 'm', 'e', ' ', 'b', 'e', 'i', 'n']
Counter({' ': 168157, 'e': 91702, 't': 68614, 'a': 62849, 'o': 59867, 'i': 54474, 'n': 52701, 'h': 48082, 's': 45132, 'r': 40780, 'd': 36630, 'l': 28004, 'm': 22549, 'u': 21712, '\n': 20244, 'w': 20232, ',': 17046, 'c': 16878, 'g': 16441, 'y': 16061, 'f': 15742, 'p': 12851, 'b': 12167, '.': 8522, 'k': 7535, 'v': 6718, '“': 3945, '”': 3908, '’': 2596, 'j': 1669, ';': 1257, '-': 1221, '?': 1216, '—': 1151, '!': 985, 'x': 938, 'q': 699, '_': 494, '(': 250, ')': 250, 'z': 164, ':': 105, '‘': 95, '2': 3, '4': 1, '\t': 1, 'ô': 1, '&': 1, 'ê': 1, '1': 1})


In [28]:
random_sample = rd.choices(list(counts.keys()), weights=list(counts.values()), k=50)

print(random_sample)
# The random.choices() function is used to return a list with a randomly selected element from the given iterable.

text2=str.join('', random_sample) 
print(text2)
# The str.join() method is used to join the elements of an iterable (such as a list or tuple) into a single string.

print(len(Counter(text2)))

['c', 'e', 'o', 'g', 'e', 'o', ' ', 'r', 'e', 'g', 'c', ' ', 'e', 'e', 'w', ' ', 'r', ' ', 'h', ' ', 'n', 't', ' ', 'h', ' ', 'n', 's', '—', ' ', ' ', 'a', ' ', ' ', 's', 'e', ' ', 'h', 't', 'a', 'e', 'a', 'd', ' ', ' ', 'i', 't', 'g', ' ', ' ', 'i']
ceogeo regc eew r h nt h ns—  a  se htaead  itg  i
15


First order Markov chains

Instead of generating each letter based on its frequency in the text, let's follow these steps:

    look at the last letter in the text that has been generated so far;
    consider in the original text how often each letter follows the chosen one;
    generate the new letter based on this conditional probability;
    consider the last letter generated as new starting point and go back to point 2

In [29]:
print (len(counts))
# first order chain is 50**2 for a pair of characters

to_replace = {'ô': 'o', 'ê': 'i'}

def letter_normalization(letter):
    if letter in to_replace:
        return to_replace[letter] 
    return letter

norm_result = []

with open(file, 'r', encoding='utf8') as text:
    lines = (line.lower() for line in text)
    characters = it.chain.from_iterable(lines)
    result = list(characters)
    for letter in result:
        modified_letter = letter_normalization(letter)
        norm_result.append(modified_letter)

50


We need to take our characters in couples.

If we have a text like "home", we want to obtain the following couples:

(h, o) (o, m) (m, e)


In [30]:
from toolz import sliding_window

seq = "home"
couples = sliding_window(2, seq)
print(list(couples))


couples = list(sliding_window(2, norm_result)) # the whole text is divided into couples of letters
couples[:20] 


[('h', 'o'), ('o', 'm'), ('m', 'e')]


[('\n', '\n'),
 ('\n', 'm'),
 ('m', 'y'),
 ('y', ' '),
 (' ', 'f'),
 ('f', 'a'),
 ('a', 't'),
 ('t', 'h'),
 ('h', 'e'),
 ('e', 'r'),
 ('r', '’'),
 ('’', 's'),
 ('s', ' '),
 (' ', 'f'),
 ('f', 'a'),
 ('a', 'm'),
 ('m', 'i'),
 ('i', 'l'),
 ('l', 'y'),
 ('y', ' ')]

WE RE WRITING AN ALGORYTHM TO BE ABLE TO RECREATE PHRASES

In [31]:
count_couples = Counter(couples)
len(count_couples)
count_couples.most_common(20)

[(('e', ' '), 26238),
 ((' ', 't'), 23005),
 (('t', 'h'), 20522),
 (('d', ' '), 20014),
 ((' ', 'a'), 19685),
 (('h', 'e'), 19622),
 (('t', ' '), 16691),
 ((',', ' '), 14395),
 (('i', 'n'), 14071),
 ((' ', 'i'), 14032),
 ((' ', 'h'), 13476),
 (('a', 'n'), 13002),
 (('e', 'r'), 12975),
 (('s', ' '), 12944),
 ((' ', 'w'), 12708),
 ((' ', 's'), 11562),
 (('n', 'd'), 10741),
 (('n', ' '), 10641),
 (('h', 'a'), 10443),
 ((' ', 'o'), 10028)]



In order to generate the new letter starting from the previous one, we have to select only the part of our dictionary that contains our letter as initial part.

We can use the dictionary items function to get the sequence of key-value pairs, and filter them by their starting point.



In [32]:
def starts_with(sequence, letter):
    return sequence[0]==letter

fake_counts = {('a', 'b'): 1, ('b', 'c'): 2, ('a', 'p'): 3}
letter = 'a'
ks = [k for k, v in fake_counts.items() if starts_with(k, letter)]
print(ks)

letter = 'a'
letters = [k[-1] for k, v in fake_counts.items() if starts_with(k, letter)]
occurrences = [v for k, v in fake_counts.items() if starts_with(k, letter)]
letters, occurrences

[('a', 'b'), ('a', 'p')]


(['b', 'p'], [1, 3])

In [33]:
new_text = [' ']

for i in range(10):
    character = new_text[-1]
    characters = [k[-1] for k, v in count_couples.items() if starts_with(k, character)]
    occurrences = [v for k, v in count_couples.items() if starts_with(k, character)]
    next_character = rd.choices(characters, occurrences)[0]
    new_text.append(next_character)

new_text = str.join('', new_text)
print(new_text)

 s be
llod



2nd order Markov chains?!!!

In [34]:
triplets = list(sliding_window(3, norm_result))
count_triplets = Counter(triplets)

count_triplets.most_common(20)

[((' ', 't', 'h'), 14542),
 (('t', 'h', 'e'), 11599),
 (('h', 'e', ' '), 10105),
 (('a', 'n', 'd'), 8130),
 (('n', 'd', ' '), 7553),
 ((' ', 'a', 'n'), 7497),
 (('i', 'n', 'g'), 6072),
 ((' ', 't', 'o'), 5875),
 (('e', 'd', ' '), 5777),
 (('t', 'o', ' '), 5172),
 (('a', 't', ' '), 5130),
 (('n', 'g', ' '), 4993),
 ((' ', 'i', ' '), 4875),
 ((' ', 'h', 'a'), 4615),
 ((' ', 'o', 'f'), 4586),
 ((' ', 'h', 'e'), 4495),
 ((',', ' ', 'a'), 4359),
 (('a', 's', ' '), 4320),
 ((' ', 'i', 'n'), 4079),
 (('o', 'f', ' '), 4055)]


We can generalize this process by writing a function that allows to generate text of any length, given the order of the Markov chain.


In [35]:

def starts_with(sequence, letter):
    return sequence[:len(letter)]==letter

def generate_text(path_to_file=file, new_text=' ', order=1):
    new_text = list(new_text)
    L = order+1
    
    with open(path_to_file, 'r', encoding='utf8') as text:
        lines = (line.lower() for line in text)
        characters = it.chain.from_iterable(lines)
        substrings = sliding_window(L, characters)
        counts = Counter(substrings)
        
    while True:
        substring = tuple(new_text[-L:])
        letters = [c[-1] for c, v in counts.items() if starts_with(c, substring)]
        occurrences = [v for c, v in counts.items() if starts_with(c, substring)]
        next_letter = rd.choices(letters, occurrences)[0]
        new_text.append(next_letter)
        new_text = new_text[1:]
        yield next_letter



This function is a generator, meaning that it does not produce an output directly. Instead, when you call this function, it returns a generator object that you can iterate over to generate text dynamically.

To see the output, you have to iterate over the generator using a loop or another method for consuming generators.

In our case, we will use the islice function of the itertools library.


In [37]:
generated_text = str.join('', it.islice(generate_text(file, ' ', 3), 500))
print(generated_text)

d’h e a ndthh dmcefhnahtthsvynaerr k rf  sbstsk nsnh eokthgcoe tntlttwnolni o shs lh!soa
tt eatsipfetm  te
t rbtsdeaan o”“fimd,rt sla eouneaexeottsheiegamu ettn mtdot,h dooi iwe
etii,nioeceie.oa  lctaihwtbu  ldi,heoe  , rnhonwn mi i to euhotit.nahvumttwr eb,ewni,h,tkoa mdaslccdh,nrl t  t ee dh eawnlf,r w  diatoei r lvctemd ao  csai tebaeitty ei,sko,eep  dshgb u ertyhbntp vsodussreh det ahnonda.“
nht oi rlsadehta yneetrfreope thensitngsmt’ aeoetnfeoibtmennnhsrrpt
o  on oo teoottaeiee rcraidoei mo


(EXERCISE)

   Genaration Function with Words: 

1) Preprocessecing of the text with words 

2) Use words as items of the lists and 