In [442]:
from pathlib import Path
from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize
from tokenize_uk.tokenize_uk import tokenize_words
import numpy as np

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import (Embedding, Input, Dense, Flatten, Conv1D, concatenate, 
                          Activation, LSTM, Dropout, Reshape, Lambda, TimeDistributed)
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers, Model, Sequential
from keras.models import load_model, Sequential
import random

#### N-Gram model

In [29]:
filename_linux = "linux_input.txt"
filename_kobzar = "kobzar.txt"

In [7]:
def zipngram(doc, n=2):
    return zip(*[doc[i:] for i in range(n)])

In [361]:
with open(filename_kobzar) as f:
    data = f.read().lower()

In [362]:
tokens = tokenize_words(data)

In [80]:
def build_ngram(tokens, n=1, counts=False):
    if n == 1:
        c = Counter(tokens)
        if counts:
            return c
        total = sum(c.values(), 0.0)
        for key in c:
            c[key] /= total
        return c
    else:
        c = Counter(zipngram(tokens, n))
        if counts:
            return c
        total = sum(c.values(), 0.0)
        for key in c:
            c[key] /= total
        return c

In [81]:
unigrams = build_ngram(tokens, n=1)
bigrams = build_ngram(tokens, n=2, counts=False)
three_grams = build_ngram(tokens, n=3)

In [82]:
bigrams[(",", "що")] / unigrams[","]
bigrams[(",", "і")] / unigrams[","]

0.03358648266122536

0.058276340551818136

In [158]:
def normalize(probs):
    prob_factor = 1 / sum(probs)
    return [prob_factor * p for p in probs]

In [352]:
def sample(ngrams, keys):
    probas = normalize([ngrams[key] for key in keys])
    idx = np.random.choice(range(len(keys)), p=probas)
    return keys[idx]

In [208]:
def generate_bigram(word, bigrams):
    keys = list(filter(lambda x: x[0]==word, bigrams))
    if not keys:
        return None
    bigr = sample(bigrams, keys)
    return bigr

In [203]:
def generate_trigram(words, trigrams):
    keys = list(filter(lambda x: x[:2]==words, trigrams))
    if not keys:
        return None
    trigram = sample(trigrams, keys)
    return trigram

In [360]:
word = "як"
N = 30
sentence = [word]
bigr = None
for i in range(N):
    if bigr:
        trigram = generate_trigram(bigr, three_grams)
        if trigram:
            bigr = trigram[-2:]
            word = trigram[-1]
        else:   
            bigr = generate_bigram(word, bigrams)
            word = bigr[1]
    else:
        bigr = generate_bigram(word, bigrams)
        word = bigr[1]
    sentence.append(word)

print(" ".join(sentence))

як схочеш , серце поховати . може , дасть він мені шепче : « боже ж ти дівся , в . жуковським навесні 1838 викупили молодого поета з кріпацтва . перед


#### Character RNN

In [363]:
import string

In [364]:
alphabet = "а, б, в, г, ґ, д, е, є, ж, з, и, і, ї, й, к, л, м, н, о, п, р, с, т, у, ф, х, ц, ч, ш, щ, ь, ю, я"
alphabet = alphabet.split(", ") + list(string.punctuation+string.digits+string.whitespace)

In [366]:
def build_vocab(data, alphabet):
    char_index = {}
    chars = []
    for char in list(data):
        if char in alphabet:
            char_id = len(char_index) + 1
            char_index[char] = char_index.get(char, char_id)
            chars.append(char_index[char])
    return chars, char_index

In [367]:
data = list(filter(lambda x: x in alphabet, data))

In [369]:
chars = sorted(list(set(data)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# split the corpus into sequences of length=maxlen
#input is a sequence of 40 chars and target is also a sequence of 40 chars shifted by one position
#for eg: if you maxlen=3 and the text corpus is abcdefghi, your input ---> target pairs will be
# [a,b,c] --> [b,c,d], [b,c,d]--->[c,d,e]....and so on
maxlen = 40
step = 1
sentences = []
next_chars = []
for i in range(0, len(data) - maxlen+1, step):
    sentences.append(data[i: i + maxlen]) #input seq is from i to i  + maxlen
    next_chars.append(data[i+1:i +1+ maxlen]) # output seq is from i+1 to i+1+maxlen
    #if i<10 :
       # print (text[i: i + maxlen])
        #print(text[i+1:i +1+ maxlen])
print('nb sequences:', len(sentences))

total chars: 65
nb sequences: 536242


In [418]:
print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool) # y is also a sequence , or  a seq of 1 hot vectors
for i, sentence in enumerate(sentences):
    #for t, char in enumerate(sentence):
    idx = [(i, t, char_indices[c]) for t,c in enumerate(sentence)]
    #X[i, t, char_indices[char]] = 1
    X[tuple(zip(*idx))] = 1

for i, sentence in enumerate(next_chars):
    idx = [(i, t, char_indices[c]) for t,c in enumerate(sentence)]
    y[tuple(zip(*idx))] = 1
    #for t, char in enumerate(sentence):
     #   y[i, t, char_indices[char]] = 1

Vectorization...


In [507]:
input_sent = Input(shape=(None, len(chars),))
x = LSTM(256, input_shape=(None, len(chars)), return_sequences=True)(input_sent)
x = Dropout(0.3)(x)
x = TimeDistributed(Dense(len(chars)))(x)
output = Activation("softmax")(x)

In [508]:
model = Model(inputs=input_sent, outputs=output)
model.compile(loss='categorical_crossentropy',
              optimizer='adagrad',
              metrics=['acc'])

In [509]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_15 (InputLayer)        (None, None, 65)          0         
_________________________________________________________________
lstm_21 (LSTM)               (None, None, 256)         329728    
_________________________________________________________________
dropout_20 (Dropout)         (None, None, 256)         0         
_________________________________________________________________
time_distributed_17 (TimeDis (None, None, 65)          16705     
_________________________________________________________________
activation_15 (Activation)   (None, None, 65)          0         
Total params: 346,433
Trainable params: 346,433
Non-trainable params: 0
_________________________________________________________________


In [518]:
model.fit(X, y, batch_size=128, epochs=1)

Epoch 1/1


<keras.callbacks.History at 0x7f15f6e5f0f0>

In [524]:
model.save("lstm_kobzar.h5")

In [438]:
# model = Sequential()
# #model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(chars))))  # original one
# model.add(LSTM(10, input_shape=(None, len(chars)),return_sequences=True)) #minesh witout specifying the input_length
# model.add(Dropout(0.2))
# model.add(TimeDistributed(Dense(len(chars))))
# model.add(Activation('softmax'))

# model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [439]:
#model.fit(X[:100000], y[:100000], batch_size=128, epochs=1)

In [519]:
def sample_char(probas, indices_char=indices_char):
    idx = np.random.choice(range(len(probas)), p=probas)
    return indices_char[idx]

In [523]:
start_index = random.randint(0, len(data) - maxlen - 1)
generated = ''
sentence = "".join(data[start_index: start_index + maxlen])
generated += sentence
print('Seed: ', sentence, "\n")

for i in range(320):
    x = np.zeros((1, maxlen, len(chars)))
    for t, char in enumerate(sentence):
        x[0, t, char_indices[char]] = 1.
    preds = model.predict(x, verbose=0)[0][-1]
    next_char = sample_char(preds)
    sentence = sentence[1:] + next_char
    generated += next_char

print(generated)

Seed:  єй-богу, ярино,
не жартую!..  та це й сп 

єй-богу, ярино,
не жартую!..  та це й сполавули,
що битьалось ізек,
відив хнапі, ве чубривек ставо!..
бусокнуботь гиматия 
та в'янея, мийеї й,
і? в прамі голяб'ять
як я від кає, теже буде!
дрого, покущоє,
нена чернокого-пропін,
ригу неватвий деня!!. . ж ти моми
і чарочої токоми,
що кбернелисм йне з серок
утитанцем в гаром продикали
в сказі ж вого якої
очтий 
