# Text generator based on Markov Chain

First, we have to create the model that chooses the next word

This time it will be a Markov chain.

It predicts the next state from the current state alone, using the probability of each possible transition out of that state.
More info can be found here: https://en.wikipedia.org/wiki/Markov_chain

In [1]:
import numpy as np

class MarkovChain(object):
    """First-order Markov chain used to generate sequences of states (words).

    Parameters
    ----------
    transition_prob : dict
        Maps each state to a dict ``{next_state: weight}``. Weights may be
        probabilities or raw counts; they are normalised when sampling.
    splitters : sequence of str, optional
        Strings appended after each generated state, e.g.
        ``["", ".", ",", ";", ":", " --"]``. Defaults to ``[" "]``.
    s_prob : sequence of float, optional
        Probability of each splitter, e.g.
        ``[0.8, 0.05, 0.05, 0.025, 0.025, 0.05]``. Defaults to a uniform
        distribution over ``splitters``.
    """

    def __init__(self, transition_prob, splitters=None, s_prob=None):
        self.transition_prob = transition_prob
        self.states = list(transition_prob.keys())
        # None sentinels instead of list defaults: a default list would be
        # shared (and mutable) across every instance of the class.
        if splitters is None:
            splitters = [" "]
        if s_prob is None:
            s_prob = [1 / len(splitters)] * len(splitters)
        self.splitters = splitters
        self.s_prob = s_prob

    def next_state(self, current_state):
        """Sample the state that follows ``current_state``.

        Raises
        ------
        KeyError
            If ``current_state`` has no entry in ``transition_prob``.
        ValueError
            If ``current_state`` has no successor with positive weight.
        """
        successors = self.transition_prob[current_state]
        # Sample among the successors of this state only. Sampling over
        # ``self.states`` would drop successors that never occur as a source
        # state, leaving probabilities that do not sum to 1.
        candidates = list(successors)
        weights = np.array([successors[state] for state in candidates], dtype=float)
        total = weights.sum()
        if not candidates or total <= 0:
            raise ValueError("state %r has no outgoing transitions" % (current_state,))
        # Draw an index rather than a value so the state is returned unchanged.
        index = np.random.choice(len(candidates), p=weights / total)
        return candidates[index]

    def generate_states(self, current_state, no=10):
        """Generate up to ``no`` successive states starting after ``current_state``.

        Each returned item is the state followed by a randomly chosen
        splitter. Generation stops early when it reaches a state that has no
        outgoing transitions; that state is still included in the result.
        """
        future_states = []
        for _ in range(no):
            next_state = self.next_state(current_state)
            splitter = np.random.choice(self.splitters, p=self.s_prob)
            future_states.append(next_state + splitter)
            if not self.transition_prob.get(next_state):
                # Dead end: nothing can follow this state.
                break
            current_state = next_state
        return future_states

The next cell turns the source texts into a list of lowercase words, replacing every non-letter character with a space.

In [2]:
# Read the three source texts into one string. ``with`` closes each file as
# soon as it has been read instead of leaving the handles open.
# NOTE(review): the files are opened with the platform default encoding, as
# before; pass ``encoding=...`` explicitly once the real encoding is confirmed.
sources = ("ViM.txt", "Frai_Maks__Chuchak.txt", "sorokin.txt")
parts = []
for path in sources:
    with open(path) as source_file:
        parts.append(source_file.read())
txt = "".join(parts)

# Keep letters only: every other character (punctuation, digits, whitespace)
# becomes a space, then the result is lower-cased and split into words.
# Punctuation is not kept as a token here; it is counted separately from
# ``txt``. Built with a single join rather than per-character ``+=``.
text = "".join(c if c.isalpha() else " " for c in txt).lower().split()

In [3]:
# Estimate how often each word separator ("splitter") occurs in the raw text.
separator_tokens = [", ", ". ", "... ", "; ", ": ", " -- ", " - ", " "]
sp = {token: txt.count(token) for token in separator_tokens}

# Every "... " ends in exactly one ". " (only the last dot is followed by a
# space), so each ellipsis was counted once as a full stop, not three times.
sp[". "] -= sp["... "]

# A bare space is whatever remains after removing the spaces that belong to
# the other splitters (one each, two for the dashes). Clamped at zero because
# adjacent splitters can share a space, and a negative weight would be an
# invalid probability.
spaces_in_punctuation = (
    sp[", "] + sp[". "] + sp["... "] + sp["; "] + sp[": "]
    + 2 * sp[" -- "] + 2 * sp[" - "]
)
sp[" "] = max(sp[" "] - spaces_in_punctuation, 0)

norm = sum(sp.values())
if norm == 0:
    raise ValueError("no splitters found in the source text")

# Turn the counts into probabilities that sum to 1.
for token in sp:
    sp[token] /= norm

And this one computes the transition probability for each pair of consecutive words (states).

In [4]:
# Transition table: trans_prob[word][following] is the probability that
# `following` comes directly after `word` in the corpus.
# (Author's note: judged to work well enough.)

trans_prob = {}

# First pass: count every adjacent word pair.
for current, following in zip(text, text[1:]):
    followers = trans_prob.setdefault(current, {})
    followers[following] = followers.get(following, 0) + 1

# Second pass: divide each row by its total so it sums to 1.
for followers in trans_prob.values():
    row_total = sum(followers.values())
    for following in followers:
        followers[following] /= row_total

So, it's time to initialize our model

In [5]:
# Word-level chain: transitions come from `trans_prob`, splitter strings and
# their probabilities from the keys and values of `sp`.
word_chain = MarkovChain(
    transition_prob=trans_prob,
    splitters=list(sp),
    s_prob=list(sp.values()),
)

The next cells are examples and tests of the model

In [6]:
# Print a passage of random length (20 to 79 words) from a random start word.
# Each generated item already ends with its splitter, hence sep="".
start_word = np.random.choice(text)
passage_length = np.random.randint(20, 80)
print(*word_chain.generate_states(current_state=start_word, no=passage_length), sep="")

безупречное. и опять, стал девушкой, на: левом фланге: высшее только мешаю хожу тут я непременно, попытался изобразить: в фижмах это уже после слов тяжести: небрежно, разминая ноги все спасибо за, отсылкой бенигсена напрягая все. само собою радовало ее от барабана несколько дней маловато но буксгевден стоял болконский, в: оглядываясь кругом высвободив ногу с дворни но и знала но перевод это объяснить себе. 


In [61]:
# Fixed start word taken from position 3785 of the tokenised corpus.
# NOTE(review): the index looks arbitrary — confirm it is not meant to be a parameter.
word1 = text[3785]

In [10]:
# Generate 20 states starting after `word1`; the notebook displays the returned list.
word_chain.generate_states(current_state = word1, no = 20)

['николая',
 'и',
 'ясно',
 'как',
 'бы',
 'совершенно',
 'откровенна',
 'сказала',
 'она',
 'в',
 'лесу',
 'граф',
 'вдруг',
 'при',
 'удовлетворении',
 'своих',
 'то',
 'услужливо',
 'вынул',
 'из']

In [13]:
# Generate 40 states starting after a randomly chosen corpus word; the notebook displays the returned list.
word_chain.generate_states(current_state = np.random.choice(text), no = 40)

['ты',
 'постой',
 'пожалуста',
 'голубчик',
 'я',
 'без',
 'перевода',
 'нет',
 'андрей',
 'я',
 'здесь',
 'присядем',
 'артиллеристы',
 'сдули',
 'нагоревшие',
 'пальники',
 'офицер',
 'в',
 'редком',
 'взгляде',
 'как',
 'и',
 'та',
 'же',
 'стоявшие',
 'перед',
 'домом',
 'в',
 'нерешительности',
 'итти',
 'сударыня',
 'в',
 'долг',
 'я',
 'ложусь',
 'спать',
 'николай',
 'в',
 'русскую',
 'батарею']

Leftover experimental code

It contains unoptimized versions of some of the parts above, as well as early attempts at building a more sophisticated generator first.

In [1]:
import joblib
import word2vec



In [2]:
# Run word2vec.word2phrase on ViM.txt; the second path (ViM-phrases.txt) is presumably the output file — confirm against the word2vec package docs.
# NOTE(review): absolute paths are specific to the author's machine.
word2vec.word2phrase("/home/vsevolod/Desktop/Inf_project/ViM.txt", "/home/vsevolod/Desktop/Inf_project/ViM-phrases.txt", verbose = True)

Starting training using file /home/vsevolod/Desktop/Inf_project/ViM.txt
Words processed: 500K     Vocab size: 414K  
Vocab size (unigrams + bigrams): 238567
Words in train file: 584772
Words written: 500K

In [3]:
# Train word2vec on the phrases file with size = 100; the second path (ViM.bin) is presumably the output model — confirm against the word2vec package docs.
# NOTE(review): absolute paths are specific to the author's machine.
word2vec.word2vec("/home/vsevolod/Desktop/Inf_project/ViM-phrases.txt", "/home/vsevolod/Desktop/Inf_project/ViM.bin", size = 100, verbose = True)

Starting training using file /home/vsevolod/Desktop/Inf_project/ViM-phrases.txt
Vocab size: 13257
Words in train file: 462253
Alpha: 0.000371  Progress: 99.31%  Words/thread/sec: 269.67k  

In [4]:
# Run word2vec.word2clusters on ViM.txt; the positional 100 is presumably the number of clusters — confirm against the word2vec package docs.
# NOTE(review): this reads the raw ViM.txt, whereas the word2vec training call uses ViM-phrases.txt — confirm that is intended.
word2vec.word2clusters("/home/vsevolod/Desktop/Inf_project/ViM.txt", "/home/vsevolod/Desktop/Inf_project/ViM-clusters.txt", 100, verbose = True)

Starting training using file /home/vsevolod/Desktop/Inf_project/ViM.txt
Vocab size: 12470
Words in train file: 490352
Alpha: 0.000102  Progress: 100.00%  Words/thread/sec: 277.98k  

In [5]:
# Load the ViM.bin file through word2vec.load and keep the result as `model`.
model = word2vec.load("/home/vsevolod/Desktop/Inf_project/ViM.bin")

In [4]:
# Index the corpus (author's note: works fine).
#   wd2ind[word]     -> every position at which `word` occurs in `text`
#   trans_prob[word] -> empty transition row, filled in by a later cell

trans_prob = {}
wd2ind = {}

for position, word in enumerate(text):
    trans_prob.setdefault(word, {})
    wd2ind.setdefault(word, []).append(position)

In [5]:
import multiprocessing as mp

In [5]:
# Fill each transition row from the position index.
# (Author's note on the original version: "works fine, do not touch".)
# The result is the same as before; followers are counted in one pass with
# Counter instead of calling list.count() once per distinct follower.
from collections import Counter

last_index = len(text) - 1
for wd, row in trans_prob.items():
    # Words that follow `wd`; an occurrence at the very end of the text has none.
    follower_counts = Counter(text[i + 1] for i in wd2ind[wd] if i < last_index)
    follower_total = sum(follower_counts.values())
    for w, occurrences in follower_counts.items():
        row[w] = occurrences / follower_total

In [6]:
# Author's note: the parallel experiment that starts here was flagged as "does not work properly".

# Keys of trans_prob as a list, in the dict's iteration order.
keys = list(trans_prob.keys())

def getword(text, wd2ind, trans_prob, start, step):
    """Fill the transition rows of every ``step``-th word, starting at ``start``.

    Intended as a worker: running it for ``start = 0 .. step - 1`` covers all
    words of ``trans_prob`` exactly once.

    Parameters
    ----------
    text : sequence of str
        The tokenised corpus.
    wd2ind : mapping
        Maps each word to the positions at which it occurs in ``text``.
    trans_prob : mapping
        Maps each word to its transition row (a dict); updated in place. May
        be a plain dict or a ``multiprocessing`` manager dict.
    start, step : int
        Offset and stride into the list of words.
    """
    from collections import Counter

    # Take the word list from the argument, not from a module-level global.
    words = list(trans_prob.keys())
    last_index = len(text) - 1
    for word_no in range(start, len(words), step):
        wd = words[word_no]
        followers = Counter(
            text[pos + 1] for pos in wd2ind[wd] if pos < last_index
        )
        if not followers:
            continue
        total = sum(followers.values())
        row = trans_prob[wd]
        for w, occurrences in followers.items():
            row[w] = occurrences / total
        # Store the row back explicitly: a manager dict hands out a copy of
        # the nested dict, so changing that copy alone would be lost.
        trans_prob[wd] = row

In [None]:
# Parallel driver for getword (author's note on the original: "even this...").

# Worker count: about the square root of the corpus length. The exponent is
# written as 0.5 because `** 1/2` parses as `(x ** 1) / 2`, i.e. half the
# corpus length.
# NOTE(review): this can still be far more processes than there are CPU
# cores; consider capping it.
k = int(len(text) ** 0.5) + 1
if __name__ == "__main__":
    man = mp.Manager()
    text_ = man.list(text)
    wd2ind_ = man.dict(wd2ind)
    trans_prob_ = man.dict(trans_prob)
    workers = [
        mp.Process(target=getword, args=(text_, wd2ind_, trans_prob_, j, k))
        for j in range(k)
    ]
    # Start every worker first, then wait for all of them, so the result is
    # read only after the work has finished.
    for p in workers:
        p.start()
    for p in workers:
        p.join()
    # Copy the shared dict back into a plain dict. This stays inside the
    # guard because trans_prob_ only exists when the guard is taken.
    trans_prob = trans_prob_.copy()