In [1]:
from os import listdir
from os.path import isfile, join, exists
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import xml.etree.ElementTree as etree
from collections import Counter
import io
from time import localtime, strftime
import numpy as np
from keras.models import Model, Sequential
from keras.layers import Input, Activation, Embedding, TimeDistributed, Dense, Dropout, Reshape, Merge, Highway, \
                        LSTM, Convolution2D, MaxPooling2D, BatchNormalization, SpatialDropout1D, Masking
from keras.optimizers import RMSprop
from keras.callbacks import Callback
import sys
from contextlib import contextmanager
# from nltk import wordpunct_tokenize, sent_tokenize
import re
import string
import pickle
import gensim
from contextlib import redirect_stdout

Using TensorFlow backend.


In [2]:
# Load the pre-computed lemma frequency counter and the (lemma, gr_tag) -> word form dictionary.
# NOTE: pickle.load can execute arbitrary code; only load these files from a trusted source.
# The files are opened in `with` blocks so the handles are closed deterministically.
with open("lemmas_counter.pkl", "rb") as lemmas_file:
    lemmas_counter = pickle.load(lemmas_file)
with open("dictionary.pkl", "rb") as dictionary_file:
    dictionary = pickle.load(dictionary_file)

In [3]:
# Lemma <-> index lookup tables. Index 0 is the empty word, index 1 the end-of-line
# marker; the 50000 most common lemmas follow in frequency order.
EMPTY_WORD = ''
END_OF_LINE = '<eol>'
lemma2index = {EMPTY_WORD: 0, END_OF_LINE: 1}

for lemma, _ in lemmas_counter.most_common(50000):
    # The next free index is always the current size of the table.
    lemma2index[lemma] = len(lemma2index)

# Reverse mapping: index -> lemma.
index2lemma = {index: known_lemma for known_lemma, index in lemma2index.items()}

In [5]:
# One dict per line of ClassesNames.txt (filled below): grammeme name -> one-hot list,
# plus an extra 'UNK' entry for values that are not listed.
GRAMMEMES = []

def binarize(i, length):
    """Return a one-hot list of ``length + 1`` integers with a 1 at position ``i``.

    The vector has one slot more than ``length``; callers in this file use the
    extra last slot (index ``length``) for the 'UNK' value.

    Raises:
        IndexError: if ``i`` is outside ``[-(length + 1), length]``.
    """
    # The builtin ``int`` replaces ``np.int``: the alias was deprecated in
    # NumPy 1.20 and removed in 1.24, and always meant the builtin int anyway.
    vector = np.zeros(length + 1, dtype=int)
    vector[i] = 1
    return list(vector)
    
# Each line of ClassesNames.txt lists the space-separated values of one grammatical
# category. Every value gets a one-hot vector; an extra 'UNK' slot is added last.
with open('ClassesNames.txt', encoding='utf8') as classes_file:
    for raw_line in classes_file:
        grammeme_names = raw_line.strip().split(' ')
        width = len(grammeme_names)
        one_hot_by_name = dict(
            (grammeme, binarize(slot, width)) for slot, grammeme in enumerate(grammeme_names)
        )
        one_hot_by_name[u'UNK'] = binarize(width, width)
        GRAMMEMES.append(one_hot_by_name)

# Total number of entries over all categories (each category counts its values plus 'UNK').
GRAMMEMES_COUNT = sum(map(len, GRAMMEMES))
    
def convert_tags(pos, tags):
    """Encode a part of speech and a '|'-separated tag string as one flat 0/1 list.

    The first segment is the one-hot vector of ``pos`` taken from ``GRAMMEMES[0]``
    ('UNK' when the part of speech is not listed). Then, for every further
    category in ``GRAMMEMES``, the vectors of all tags that belong to that
    category are appended in tag order, or the category's 'UNK' vector when no
    tag matches.
    """
    pos_mapping = GRAMMEMES[0]
    pos_key = pos if pos in pos_mapping else u'UNK'
    # Copy, so the shared one-hot list stored in GRAMMEMES is never mutated.
    encoded = list(pos_mapping[pos_key])

    tag_list = tags.split('|')
    for category in GRAMMEMES[1:]:
        matched = [category[tag] for tag in tag_list if tag in category]
        if not matched:
            matched = [category[u'UNK']]
        for one_hot in matched:
            encoded.extend(one_hot)
    return encoded

# classesList.txt: "<integer id>\t<value>" per line. The last two characters of every
# line are dropped before parsing, and tabs inside the value are turned into spaces.
classesToValue = {}
with open('classesList.txt', encoding='utf-8') as classes_list_file:
    for raw_record in classes_list_file:
        record = raw_record[:-2]
        tab_at = record.index('\t')
        class_id = int(record[:tab_at])
        classesToValue[class_id] = record[tab_at + 1:].replace('\t', ' ')

# Sentinel entry for an unknown class.
classesToValue[-1] = "Unknown"

# classesMapping.txt: tab-separated integer columns. The mapping goes from the
# second column to the first one; classesMappingRev is the inverse.
classesMapping = {}
with open('classesMapping.txt', encoding='utf-8') as mapping_file:
    for row in mapping_file:
        if not row.strip():
            # Tolerate blank lines (e.g. a trailing empty line) instead of crashing.
            continue
        # rstrip('\n') instead of [:-1]: a last line without a trailing newline
        # would otherwise lose the final digit of its second column.
        columns = row.rstrip('\n').split('\t')
        classesMapping[int(columns[1])] = int(columns[0])

classesMappingRev = {value: key for key, value in classesMapping.items()}

space = re.compile('\\s+')
def format_gram_value(gram_val):
    """Normalise a "<POS> <features>" string into "<POS>\\t<feat=val|feat=val|...>".

    Everything before the first space is the part of speech. In the remainder,
    '-' and runs of whitespace act as separators between ``name=value`` features.
    When there is no space, or no features remain after stripping, the feature
    part is '_'.

    For verbs without an explicit 'Tense', ``Tense=Past`` is appended when the
    verb is finite indicative (VerbForm=Fin and Mood=Ind) or a converb
    (VerbForm=Conv).

    Args:
        gram_val: the raw grammatical value string.

    Returns:
        The part of speech and the '|'-joined features, separated by a tab.
    """
    first_space = gram_val.find(' ')
    if first_space == -1:
        return gram_val + '\t_'

    pos, grval = gram_val[:first_space], gram_val[first_space + 1:]
    grval = grval.replace('-', ' ')
    grval = space.sub(' ', grval).strip()
    grval = grval.replace(' ', '|')
    if len(grval) == 0:
        return pos + '\t_'

    tags = {a.split('=')[0]: a.split('=')[1] if a != '_' else '_' for a in grval.split('|')}
    if pos == 'VERB' and 'Tense' not in tags:
        # .get() rather than indexing: verbs lacking VerbForm or Mood used to
        # raise KeyError here; they are now simply left without a Tense.
        verb_form = tags.get("VerbForm")
        if (verb_form == "Fin" and tags.get("Mood") == "Ind") or verb_form == "Conv":
            tags["Tense"] = "Past"
    return pos + '\t' + '|'.join(tag + '=' + value for tag, value in tags.items())

# Pre-compute the grammeme vector for every grammatical-tag index in 0..286.
#   0  -> all zeros (the grammemes input of the model is masked on 0.)
#   1  -> the vector of an empty part of speech with no tags (every category 'UNK')
#   2+ -> the vector of the class description found through classesMapping
# NOTE(review): 286 is presumably the largest tag index in the data -- confirm.
index2tags_vector = {}
for i in range(286 + 1):
    if i == 0:
        # Builtin int instead of np.int, which was removed in NumPy 1.24.
        index2tags_vector[i] = list(np.zeros(GRAMMEMES_COUNT, dtype=int))
    elif i == 1:
        index2tags_vector[i] = convert_tags('', '')
    else:
        pos, tags = format_gram_value(classesToValue[classesMapping[i] - 2]).split('\t')
        index2tags_vector[i] = convert_tags(pos, tags)

In [7]:
class LemmatizedWord(object):
    """A word form together with its lemma and grammatical tag.

    Instances compare equal, and hash equally, when lemma, gr_tag and word_form
    all match, so they can be used as dict keys and set members.
    """
    def __init__(self, lemma, gr_tag, word_form):
        self.lemma = lemma
        self.gr_tag = gr_tag
        self.word_form = word_form

    def _key(self):
        """Tuple that defines the identity of this word for equality and hashing."""
        return (self.lemma, self.gr_tag, self.word_form)

    def __repr__(self):
        return "<Lemma = {}; GrTag = {}; WordForm = {}>".format(self.lemma, self.gr_tag, self.word_form)

    def __eq__(self, other):
        # Stay duck-typed (any object exposing the three attributes is comparable),
        # but return NotImplemented for unrelated objects instead of raising
        # AttributeError, so `word == "text"` or `word == None` is simply False.
        try:
            other_key = (other.lemma, other.gr_tag, other.word_form)
        except AttributeError:
            return NotImplemented
        return self._key() == other_key

    def __hash__(self):
        return hash(self._key())
        
class LemmatizedVocabulary(object):
    """Vocabulary of lemmatized word forms with lookups in several directions.

    The vocabulary is built either from a text file (``voc_path``, one
    "lemma gr_tag word_form" triple per line, separated by single spaces) or,
    when ``voc_path`` is falsy, from ``dictionary`` (a mapping
    ``(lemma, gr_tag) -> word_form``) ordered by ``lemmas_counter``.

    NOTE(review): the two construction paths fill ``word_form2_lemmatization``
    and ``lemmatization2word_form`` with different shapes. The file path stores
    lists keyed by word form / ``(lemma, gr_tag)`` tuple; the dictionary path
    stores single ``LemmatizedWord`` objects / word forms keyed by word form /
    ``LemmatizedWord``. ``get_word_form`` looks up ``(lemma, gr_tag)`` tuples,
    so it can only find entries of a file-built vocabulary.
    """
    def __init__(self, voc_path, lemmas_counter=None, dictionary=None):
        """Build the vocabulary.

        Args:
            voc_path: path to the vocabulary file, or a falsy value to build
                from ``dictionary`` and ``lemmas_counter`` instead.
            lemmas_counter: object with ``most_common()`` yielding
                ``(lemma, count)`` pairs; defines the order of word forms.
                Required when ``voc_path`` is falsy.
            dictionary: mapping ``(lemma, gr_tag) -> word_form``. Required when
                ``voc_path`` is falsy.
        """
        self.word_form2_lemmatization = {}
        self.lemmatization2word_form = {}
        self.lemmatizedWords = []
        self.lemma2lemmatizedWord = {}
        self.lemmatizedWordIndicies = {}
        if voc_path:
            with open(voc_path, encoding='utf8') as f:
                for line in f:
                    lemma, gr_tag, word_form = line.strip().split(' ')
                    gr_tag = int(gr_tag)
                    self.lemmatizedWords.append(LemmatizedWord(lemma, gr_tag, word_form))
                    self.word_form2_lemmatization.setdefault(word_form, []).append((lemma, gr_tag))
                    self.lemmatization2word_form.setdefault((lemma, gr_tag), []).append(word_form)
                    self.lemma2lemmatizedWord.setdefault(lemma, []).append(
                        LemmatizedWord(lemma, gr_tag, word_form))
        else:
            for pair in dictionary:
                self.lemma2lemmatizedWord.setdefault(pair[0], []).append(
                    LemmatizedWord(pair[0], pair[1], dictionary[pair]))
            self.word_form2_lemmatization = {dictionary[pair] : LemmatizedWord(pair[0], pair[1], dictionary[pair])
                                             for pair in dictionary}
            self.lemmatization2word_form = {self.word_form2_lemmatization[x] : x for x in self.word_form2_lemmatization}
            # Word forms are ordered by lemma frequency, most common lemma first.
            for lemma, _ in lemmas_counter.most_common():
                self.lemmatizedWords.extend(self.lemma2lemmatizedWord[lemma])
        self.lemmatizedWordIndicies = {x : i for i, x in enumerate(self.lemmatizedWords)}
        self.index2lemmatizedWord = {i : x for i, x in enumerate(self.lemmatizedWords)}

    def get_word_form(self, lemma, gr_tag):
        """Return the entry stored for ``(lemma, gr_tag)``, or None if there is none."""
        # The original tested membership in the *value* after indexing the dict,
        # which raised KeyError on a miss and returned None on a hit.
        return self.lemmatization2word_form.get((lemma, gr_tag))

    def get_lemmatization(self, word_form):
        """Return the lemmatization stored for ``word_form``, or None if unknown."""
        # Same fix as in get_word_form: look the key up in the dict itself.
        return self.word_form2_lemmatization.get(word_form)

    def choice_word(self):
        """Return a uniformly random word from the vocabulary."""
        return self.lemmatizedWords[np.random.randint(0, len(self.lemmatizedWords))]

    def get_paradigm(self, lemma):
        """Return the list of all known word forms of ``lemma``, or None if unknown."""
        return self.lemma2lemmatizedWord.get(lemma)

    def get_word_form_index(self, lemmatizedWord):
        """Return the index of ``lemmatizedWord``.

        Unknown words get ``len(self.lemmatizedWordIndicies)``, i.e. one past the
        last valid index.
        """
        return self.lemmatizedWordIndicies.get(lemmatizedWord, len(self.lemmatizedWordIndicies))

    def get_word_form_by_index(self, index):
        """Return the word stored at ``index``; raises KeyError for an unknown index."""
        return self.index2lemmatizedWord[index]

In [8]:
# No vocabulary file is given, so the vocabulary is built from the pickled
# dictionary, with word forms ordered by lemmas_counter.
lemmatizedVocabulary = LemmatizedVocabulary(None, lemmas_counter, dictionary)

In [9]:
# Word-form indices below SOFTMAX_SIZE get their own output class; every other word
# is clamped to index SOFTMAX_SIZE (see get_word_index), hence SOFTMAX_SIZE + 1 outputs.
SOFTMAX_SIZE = 60000

In [24]:
from keras import backend as K
from keras.utils.generic_utils import get_custom_objects

def hard_tanh(x):
    """Clamp ``x`` element-wise to the interval [-1, 1] using Keras backend ops."""
    upper = K.constant(1)
    lower = K.constant(-1)
    clipped_below = K.maximum(x, lower)
    return K.minimum(clipped_below, upper)

# Register an Activation layer wrapping hard_tanh in Keras' custom-object registry
# under the key 'custom_activation'.
# NOTE(review): the model below does not refer to the activation by this name --
# confirm whether the registration is still needed (e.g. for loading saved models).
get_custom_objects().update({'custom_activation': Activation(hard_tanh)})

In [25]:
# Grammemes + embeddings model
words = Input(shape=(None,), name='words')
words_embedding = SpatialDropout1D(0.3)(Embedding(len(lemma2index) + 1, 150, name='embeddings')(words))
  
grammemes_input = Input(shape=(None, GRAMMEMES_COUNT), name='grammemes')
grammemes_layer = Masking(mask_value=0.)(grammemes_input)

# Добавить ещё слой, сделать функцию активации
grammemes_layer = Dense(25, activation=Activation(hard_tanh))(grammemes_layer)
grammemes_layer = Dense(25, activation=Activation(hard_tanh))(grammemes_layer)
    
layer = Merge(mode='concat', name='LSTM_input')([words_embedding, grammemes_layer])

layer = LSTM(368, dropout=.2, recurrent_dropout=.2, return_sequences=True, name='LSTM_1')(layer)
layer = LSTM(368, dropout=.2, recurrent_dropout=.2, return_sequences=False, name='LSTM_2')(layer)

output = Dense(SOFTMAX_SIZE + 1, activation='softmax')(layer)

model = Model(inputs=[words, grammemes_input], outputs=[output])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

  ).format(identifier=identifier.__class__.__name__))


In [39]:
# NOTE: `name` is assigned in the training cell further down
# (name = 'Lemmatized_hard_tanh'), so this cell only works after that cell has run.
model.save(name + '.model')

In [11]:
# Maximum number of context words in one training sequence.
SENT_LEN = 10

def get_word_index(word):
    """Return the vocabulary index of ``word``, capped at SOFTMAX_SIZE."""
    index = lemmatizedVocabulary.get_word_form_index(word)
    return SOFTMAX_SIZE if index > SOFTMAX_SIZE else index

class BatchGenerator():
    """Iterates over a lemmatized corpus file and yields training tensors.

    The file holds one word per line as five tab-separated fields
    (word, lemma, pos, tags, index); an empty line ends a sentence.
    Iterating yields ``(X_emb, X_grammemes, y)`` tuples built from
    ``batch_size`` sentences at a time.
    """
    def __init__(self, fname, batch_size):
        """
        Args:
            fname: path to the corpus file.
            batch_size: number of sentences collected before a batch is emitted.
        """
        self.fname = fname
        self.batch_size = batch_size

    @staticmethod
    def __generate_seqs(sents):
        """Turn sentences into (context, target) pairs.

        Each sentence is reversed first. For every position ``i >= 1`` of the
        reversed sentence the context is the up to SENT_LEN words before ``i``
        and the target is the word at ``i``. Targets whose vocabulary index is
        not below SOFTMAX_SIZE are skipped.
        """
        seqs, next_words = [], []
        for sent in sents:
            sent = sent[::-1]
            for i in range(1, len(sent)):
                if lemmatizedVocabulary.get_word_form_index(sent[i]) >= SOFTMAX_SIZE:
                    continue
                seqs.append(sent[max(0, i - SENT_LEN) : i])
                next_words.append(sent[i])
        return seqs, next_words

    @staticmethod
    def __to_tensor(sents):
        """Build left-padded index / grammeme tensors and the target vector.

        Returns:
            ``(X_emb, X_grammemes, y)`` with shapes ``(n, max_len)``,
            ``(n, max_len, GRAMMEMES_COUNT)`` and ``(n,)``. All three are empty
            (``n == 0``) when the sentences produce no sequences.
        """
        sents, next_words = BatchGenerator.__generate_seqs(sents)
        # default=0 keeps an empty batch from raising ValueError in max().
        max_len = max((len(sent) for sent in sents), default=0)
        # Builtin int instead of np.int, which was removed in NumPy 1.24.
        X_emb = np.zeros((len(sents), max_len), dtype=int)
        X_grammemes = np.zeros((len(sents), max_len, GRAMMEMES_COUNT), dtype=int)
        y = np.zeros(len(sents), dtype=int)
        for i in range(len(sents)):
            # Sequences are right-aligned; the left part stays zero padding.
            X_emb[i, -len(sents[i]):] = [get_word_index(x) for x in sents[i]]
            X_grammemes[i, -len(sents[i]):] = [index2tags_vector[x.gr_tag] for x in sents[i]]
            y[i] = get_word_index(next_words[i])
        return X_emb, X_grammemes, y

    @staticmethod
    def to_tensor(words_indices):
        """Build tensors from word indices.

        NOTE(review): ``idx2word`` is not defined anywhere in the visible
        notebook, so this method raises NameError unless it is defined
        elsewhere -- confirm whether the method is still used.
        """
        return BatchGenerator.__to_tensor([idx2word[ind] for ind in words_indices])

    def __iter__(self):
        sents = [[]]
        with open(self.fname, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if len(line) == 0:
                    sents.append([])
                else:
                    word, lemma, pos, tags, index = line.split('\t')
                    sents[-1].append(LemmatizedWord(lemma + '_' + pos, int(index), word))
                if len(sents) >= self.batch_size:
                    batch = BatchGenerator.__to_tensor(sents)
                    sents = [[]]
                    # A batch without any training sequence is skipped.
                    if len(batch[2]) > 0:
                        yield batch
        # Flush the sentences left over after the last full batch; previously
        # they were silently dropped (a file with fewer than batch_size
        # sentences yielded nothing at all).
        batch = BatchGenerator.__to_tensor(sents)
        if len(batch[2]) > 0:
            yield batch

In [14]:
class EvalCallback(Callback):
    """Keras callback that prints one sampled word sequence after every epoch."""
    def __init__(self, name):
        # Initialise the Keras base class so its own attributes are set up.
        super().__init__()
        self._name = name

    def write(self, message):
        """Print ``message`` prefixed with the current local time."""
        print('[{}] {}'.format(strftime("%H:%M:%S", localtime()), message))

    def _sample(self, prob, temperature=1.0):
        """Sample an index from ``prob`` (last entry excluded) at the given temperature."""
        # We do not want to predict UNKNOWN_WORD, which is the last slot.
        # float64 keeps the renormalised vector summing to 1 within
        # np.random.choice's tolerance even when the model outputs float32.
        prob = np.asarray(prob[:-1], dtype=np.float64)
        logits = np.log(prob) / temperature
        # Subtracting the maximum leaves the softmax unchanged but avoids
        # overflow / underflow in exp() for small temperatures.
        weights = np.exp(logits - np.max(logits))
        prob = weights / np.sum(weights)
        return np.random.choice(len(prob), p=prob)

    def _generate(self):
        """Generate 10 words after a random start word and print the result.

        The words are printed in reverse order of generation (the training
        sequences are built from reversed sentences).
        """
        cur_sent = [lemmatizedVocabulary.get_word_form_by_index(np.random.randint(0, SOFTMAX_SIZE))]
        for i in range(10):
            X_emb = np.zeros((1, len(cur_sent)))
            X_gr = np.zeros((1, len(cur_sent), GRAMMEMES_COUNT))
            for ind, word in enumerate(cur_sent):
                X_emb[0, ind] = get_word_index(word)
                X_gr[0, ind] = index2tags_vector[word.gr_tag]

            preds = model.predict([X_emb, X_gr], verbose=0)[0]
            cur_sent.append(lemmatizedVocabulary.get_word_form_by_index(self._sample(preds)))

        print('Sentence', end=': ')
        for word in cur_sent[::-1]:
            print(word.word_form, end=' ')
        print()

    def on_epoch_end(self, epoch, logs=None):
        # logs=None instead of a shared mutable {} default; the argument is unused.
        self._generate()

In [None]:
# Train on the corpus batch by batch; everything printed (Keras progress and the
# sampled sentences of EvalCallback) is redirected into the log file.
batch_generator = BatchGenerator('Data/Poetry_preds.txt_lemmatized_train', 10000)
name = 'Lemmatized_hard_tanh'
# The encoding is given explicitly, like for every other text file in this
# notebook: the sampled sentences are not ASCII, and the platform default
# encoding may be unable to represent them.
with open(name + '_log.txt', 'a', encoding='utf-8') as f:
    with redirect_stdout(f):
        callback = EvalCallback(name)
        for big_epoch in range(1000):
            print('------------Big Epoch {}------------'.format(big_epoch))
            # `epoch` counts the batches within one pass over the corpus.
            for epoch, (X1, X2, y) in enumerate(batch_generator):
                model.fit([X1, X2], y, batch_size=768, epochs=1, verbose=2, callbacks=[callback])
                # Checkpoint the weights every 10th batch.
                if epoch != 0 and epoch % 10 == 0:
                    model.save_weights(name + '_model.h5')
                f.flush()

In [2]:
from rupo.main.vocabulary import Vocabulary
from rupo.generate.lemmatized_vocabulary import LemmatizedVocabulary, LemmatizedWord
from rupo.generate import lstm
from rupo.generate import generator

# Build the poem generator from the rupo package: a Vocabulary loaded from a
# pickle dump, an LSTM container loaded from 'rupo_files/Lemmatized', and a
# Generator combining the two. Progress is printed before each step.
print('loading voc')
vocab_dump_file = "rupo_files/voc.pickle"
vocabulary = Vocabulary(vocab_dump_file)

print('loading lstm')
lstm_container = lstm.LSTM_Container('rupo_files/Lemmatized')

print('loading generator')
gen = generator.Generator(lstm_container, vocabulary)

loading voc
loading lstm


  return cls(**config)


loading generator


In [3]:
# Generate and print one poem with the generator's default arguments.
print(gen.generate_poem())

Так толку мне теперь грустить
Что будет это прожито
Не суждено кружить в пути
Почувствовав боль бомжика



In [5]:
# Generate and print a poem with an explicit 14-character rhyme pattern
# (presumably one letter per line, equal letters rhyming -- confirm in rupo).
print(gen.generate_poem(rhyme_pattern='ababccddeffegg'))

  model /= np.sum(model)
  return choice(range(len(model)), 1, p=model)[0]


Плечом к плечу рукой махнул
Но вот она опять давно
И на удачу распахну
Окно и снова все равно
Придет прощайте мама и
Мне друг за что вам вы скажи
Твои давно по кораблю
Шуршат а я дышу люблю
Тебя пытаться объяснить
Все не твое и жду я и
Мне все решать мне одному
Не крошка ты меня простить
За совесть я хочу в простор
Больно и чтоб твоих обжор



In [6]:
# Generate and print a poem with an explicit 8-character rhyme pattern
# (presumably one letter per line, equal letters rhyming -- confirm in rupo).
print(gen.generate_poem(rhyme_pattern='ababccdd'))

На все кричать и убивать
Она в вине тебя затмит
Ты спишь и хочешь танцевать
Что хочешь для меня найти
Мне хуже будешь ты никак
Кто рядом так ведь он пока
Что то подруги выпили
И волны снова выплюнут

